예제 #1
0
    def consumer(self, que):
        """Wait on the given queues and start or stop the cluster services.

        Runs forever. Each iteration blocks in ``select.select`` until at
        least one object in ``que`` is readable, then takes one item from
        every ready object via ``get()``:

        * ``"start"`` spawns the ``login_and_update`` heartbeat worker and
          starts a ``ClusterServer`` bound to ``CONFIG["master_port"]``.
        * any other item stops the most recently started server and
          heartbeat worker, if there are any.

        Args:
            que: Passed unchanged to ``select.select`` as the read list; every
                element must therefore be selectable and provide ``get()``.
                (Presumably multiprocessing queues/connections -- confirm
                against the caller.)
        """
        heartbeats = []
        servers = []
        while True:
            can_read, _, _ = select.select(que, [], [])
            for r in can_read:
                print(threading.current_thread().name, "============")
                item = r.get()
                print(item)
                if item == "start":
                    print("启动服务")
                    logger.info(
                        "启动程序   HeartSchedule >>> {}    ClusterServer >>>{}".
                        format(":".join(CONFIG["sch_grpc_server"]),
                               CONFIG.get("master_port")))
                    p = spawn(target=login_and_update, name="heartbeat")

                    cs = ClusterServer(
                        addr="0.0.0.0:{}".format(CONFIG.get("master_port")))
                    cs.start()
                    heartbeats.append(p)
                    servers.append(cs)
                else:
                    print("关闭服务")
                    logger.info("关闭程序>>>HeartSchedule>>>ClusterServer")

                    # Guard each list separately: the previous combined
                    # ``if b or a`` check popped from both lists and would
                    # raise IndexError if only one of them had an entry.
                    if servers:
                        servers.pop().stop()
                    if heartbeats:
                        stop_thread(heartbeats.pop())
예제 #2
0
    def start_heart_cluster(self):
        """Poll ``up_cluster_event`` and start/stop heartbeat and server.

        Runs forever, waking every half second. When the event is set and
        the services have not yet been started for this "set" period, the
        heartbeat worker is spawned and a ``ClusterServer`` is started. When
        the event is cleared, the most recently started server and worker
        are stopped (if both exist) and the loop is re-armed.
        """
        # True while the services are down, so one "set" period starts the
        # services only once.
        armed = True
        heartbeat_threads = []
        cluster_servers = []
        while True:
            time.sleep(.5)

            if up_cluster_event.is_set() and armed:
                schedule_addr = ":".join(CONFIG["sch_grpc_server"])
                master_port = CONFIG.get("master_port")
                logger.info(
                    "启动程序   HeartSchedule >>> {}    ClusterServer >>>{}".
                    format(schedule_addr, master_port))
                heartbeat = spawn(target=login_and_update, name="heartbeat")
                server = ClusterServer(addr="0.0.0.0:{}".format(master_port))
                server.start()
                heartbeat_threads.append(heartbeat)
                cluster_servers.append(server)
                armed = False
                continue

            # Event still set but services already running: nothing to do.
            if up_cluster_event.is_set():
                continue

            armed = True
            if heartbeat_threads and cluster_servers:
                logger.info("关闭程序>>>HeartSchedule>>>ClusterServer")
                cluster_servers.pop().stop()
                stop_thread(heartbeat_threads.pop())
예제 #3
0
파일: raft_main.py 프로젝트: alan-mi/8.16
def run_server():
    """Run the Raft gRPC server and the HTTP app, then shut down.

    Spawns the ``vip_load`` worker, starts a gRPC server exposing
    ``RaftService`` on ``CONFIG["raft_grpc_port"]`` and then blocks in
    ``app.run`` on ``CONFIG["raft_http_port"]``. Whatever happens, the
    worker is stopped, the VIP is set to "down" and the process exits.

    Raises:
        SystemExit: always, with the message "退出".
    """
    # Function-scope import: ``sys.exit`` replaces the ``exit`` helper, which
    # is installed by ``site`` for interactive use and may be unavailable.
    import sys

    p = spawn(target=vip_load, name="find_vip")
    try:
        server = grpc.server(ThreadPoolExecutor(40))
        # Register the request handlers with the RPC server.
        raft_grpc_pb2_grpc.add_RaftServiceServicer_to_server(
            raft_grpc_server.RaftService(), server)
        # An insecure port is used here; gRPC also supports TLS/SSL secure
        # connections and various authentication mechanisms.
        server.add_insecure_port("0.0.0.0:{}".format(
            CONFIG.get("raft_grpc_port")))
        server.start()
        # Start serving HTTP.
        # TODO: starting the election in a separate process raises an error.
        app.run(
            host='0.0.0.0',
            # Port 8586; only started on the master.
            port=CONFIG.get("raft_http_port"),
        )
    except Exception as e:
        logger.info(e)

    finally:
        # A failure while stopping the worker must not prevent the VIP from
        # being released or the process from exiting.
        try:
            stop_thread(p)
        except Exception as e:
            logger.info(e)
        dc_vip.vip.set_vip("down")
        sys.exit("退出")
예제 #4
0
 def chooice_use_gpu_by_num(self, need_gpus, task_id):
     """Claim free GPUs for a task and record the claim in the database.

     For every requested model, machines whose ``heartBeat`` is at least
     ``min_time_lead(MONGOHEARTIME)`` are scanned in descending heartbeat
     order. Free GPUs of that model (falsy ``status``) are marked with
     ``task_id`` until the requested count is reached.

     Args:
         need_gpus: Iterable of dicts with ``"model"`` and ``"count"`` keys.
             NOTE: ``"count"`` is decremented in place for every GPU
             claimed, so the caller's dicts are modified.
         task_id: Value written into ``status`` of every claimed GPU.

     Returns:
         dict: ``{"taskID": task_id, "machines": {machineID: [gpu id, ...]}}``.
     """
     mac = {}
     for model in need_gpus:
         rule = {
             "heartBeat": {
                 "$gte": min_time_lead(MONGOHEARTIME)
             },
             "gpus.status": None,
             "gpus.model": model["model"]
         }
         res = self.table.find(rule).sort('heartBeat', pymongo.DESCENDING)
         logger.info(model["count"])
         for machine in res:
             claimed = False
             for gpu in machine["gpus"]:
                 if (gpu.get("model") == model["model"]
                         and not gpu.get("status") and model["count"] > 0):
                     model["count"] -= 1
                     gpu["status"] = task_id
                     mac.setdefault(machine["machineID"], []).append(gpu["id"])
                     claimed = True
             # Write the document back only when a GPU on it was actually
             # claimed; rewriting an untouched snapshot is a redundant
             # whole-document write.
             if claimed:
                 self.table.update({"_id": machine["_id"]}, machine)
             if model["count"] == 0:
                 break
     task = {"taskID": task_id, "machines": mac}
     logger.info(task)
     return task
예제 #5
0
    def TaskStatus(self, request, context):
        """Handle a task status change: "start", "stop" or "finish".

        ``request.body`` is parsed as JSON and must contain ``"status"``.

        * ``"start"``: needs ``"gpus"`` and ``"taskID"``. If
          ``cli.compare_gpu`` accepts the request, GPUs are claimed, the
          claimed machine map is merged into the body and the body is
          forwarded via ``TaskStart`` to a machine holding a GPU for the
          task. Otherwise the reply is ``gpu_not_free``.
        * ``"stop"``: frees the task's GPUs and forwards the body via
          ``TaskStop`` to a machine that held a GPU for the task.
        * ``"finish"``: frees the task's GPUs.

        Args:
            request: RPC request; only its ``body`` attribute is read.
            context: gRPC context (unused).

        Returns:
            The result of ``sch_response(err)``, where ``err`` holds the
            ``msg``/``status`` pair (empty when nothing was reported).
        """
        body = json.loads(request.body)
        err = {}
        status = body["status"]

        if status == "start":
            logger.info("任务开始...")
            gpus = body["gpus"]
            if cli.compare_gpu(gpus):
                all_chooice_machine = cli.chooice_use_gpu_by_num(
                    gpus, task_id=body["taskID"])
                print(all_chooice_machine)
                print("GPU发送到任意机器")
                err.update({"msg": "ok", "status": 2})
                body.update(all_chooice_machine["machines"])
                try:
                    res = cli.table.find_one({"gpus.status": body["taskID"]})
                    if res is None:
                        # find_one returns None when nothing matches;
                        # subscripting it would raise TypeError and skip the
                        # "Not taskID" reply below.
                        err.update(msg="Not taskID")
                    else:
                        with grpc.insecure_channel(
                                res["intranetAddress"]) as channel:
                            stub = agent_pb2_grpc.AgentServerStub(
                                channel=channel)
                            stub.TaskStart(agent_response(body))
                except KeyError:
                    err.update(msg="Not taskID")
                except Exception as e:
                    logger.info(e)
            else:
                err = {"msg": "gpu_not_free", "status": 1}
                logger.info("GPU Not Enough")
        elif status == "stop":
            logger.info("任务停止...")
            print("根据taskid找出任意机器发送停止任务")
            try:
                # Look the machine up before freeing, while the GPUs still
                # carry the task id.
                res = cli.table.find_one({"gpus.status": body["taskID"]})
                cli.free_gpu_by_task_id(body["taskID"])
                if res is None:
                    # No machine holds a GPU for this task.
                    err.update({"msg": "Not taskID", "status": 1})
                else:
                    with grpc.insecure_channel(
                            res["intranetAddress"]) as channel:
                        stub = agent_pb2_grpc.AgentServerStub(channel=channel)
                        stub.TaskStop(agent_response(body))
            except KeyError:
                err.update({"msg": "Not taskID", "status": 1})
            except Exception as e:
                logger.info(e)
        elif status == "finish":
            logger.info("任务完成...")
            print("根据taskid释放gpu发送完成状态给调度")
            cli.free_gpu_by_task_id(body["taskID"])

        return sch_response(err)
예제 #6
0
    def wrapper(self, *args, **kwargs):
        """Ping MongoDB (up to three attempts) before calling ``func``.

        Each failed attempt is logged with its attempt number. After the
        third consecutive failure the process exits; on the first success
        the wrapped ``func`` is called and its result is returned.
        """
        for attempt in range(1, 4):
            try:
                self.client.admin.command("ping")
                logger.info("连接mongo")
            except Exception as exc:
                logger.info("第{}连接数据库失败...{}".format(attempt, exc))
            else:
                break
        else:
            # All three attempts failed.
            exit("退出")

        return func(self, *args, **kwargs)
예제 #7
0
파일: raft_main.py 프로젝트: alan-mi/8.16
def vip_load():
    """Poll the local Raft service and move the VIP with leadership.

    Every three seconds the Raft status is requested over gRPC from
    ``0.0.0.0:<raft_grpc_port>``. Replies whose ``ts`` does not echo the
    request timestamp are ignored.

    * If ``vip_event`` is set and this node is the leader with
      ``state == 2``: the VIP is brought up, ``vip_event`` is cleared,
      ``up_cluster_event`` is set and ``"start"`` is queued on ``q1``.
    * If ``vip_event`` is clear and this node is not the leader: the VIP
      is brought down, ``vip_event`` is set, ``up_cluster_event`` is
      cleared and ``"stop"`` is queued on ``q1``.

    Any exception raised during a round is logged and the loop goes on.
    """
    logger.info(vip_event.is_set())
    target = "0.0.0.0:{}".format(CONFIG.get("raft_grpc_port"))
    with grpc.insecure_channel(target) as chan:
        stub = raft_grpc_pb2_grpc.RaftServiceStub(channel=chan)

        from cluster_master.cluster import q1
        while True:
            time.sleep(3)
            stamp = str(int(time.time()))

            try:
                pending = stub.GetStatus.future(
                    raft_grpc_pb2.GetStatusReq(ts=stamp), timeout=3)
                reply = pending.result()
                # Only act on a reply that echoes this round's timestamp.
                if reply.ts != stamp:
                    continue

                raft_status = json.loads(reply.status)
                leader = raft_status['leader']
                this_node = raft_status['self']
                logger.info(
                    "vip_event {},leader {}, self_node {},isReady {}".
                    format(vip_event.is_set(), leader, this_node,
                           raft_status["isReady"]))

                became_leader = (vip_event.is_set() and leader == this_node
                                 and raft_status["state"] == 2)
                if became_leader:
                    dc_vip.vip.set_vip("up")
                    vip_event.clear()
                    up_cluster_event.set()
                    q1.put("start")

                lost_leadership = (not vip_event.is_set()
                                   and leader != this_node)
                if lost_leadership:
                    dc_vip.vip.set_vip("down")
                    vip_event.set()
                    up_cluster_event.clear()
                    q1.put("stop")
            except Exception as exc:
                logger.info(exc)
예제 #8
0
 def test(self):
     """Log the string "test" once per second in an endless loop."""
     while True:
         # Sleep first, so the first message appears after one second.
         time.sleep(1)
         logger.info("test")
예제 #9
0
 def raft_server(self):
     """Log a start-up message and hand control to ``run_server()``."""
     logger.info("启动raft程序...")
     run_server()