def consumer(self, que):
    """Start or stop the heartbeat thread and cluster server on queue commands.

    Blocks forever. Waits with ``select.select`` until one of the objects in
    *que* is readable, then calls ``get()`` on it. The item ``"start"`` spawns
    the heartbeat worker and starts a ``ClusterServer``; any other item stops
    the most recently started pair.

    Args:
        que: sequence of objects that ``select.select`` accepts and that
            expose ``get()`` (presumably multiprocessing queues/pipes --
            TODO confirm against the caller).
    """
    heartbeat_threads = []
    cluster_servers = []
    while True:
        readable, _, _ = select.select(que, [], [])
        for source in readable:
            # Thread.getName() is deprecated; the ``name`` attribute is the
            # supported spelling and yields the same value.
            print(threading.current_thread().name, "============")
            item = source.get()
            print(item)
            if item == "start":
                print("启动服务")
                logger.info(
                    "启动程序 HeartSchedule >>> {} ClusterServer >>>{}".format(
                        ":".join(CONFIG["sch_grpc_server"]),
                        CONFIG.get("master_port")))
                heartbeat = spawn(target=login_and_update, name="heartbeat")
                cluster_server = ClusterServer(
                    addr="0.0.0.0:{}".format(CONFIG.get("master_port")))
                cluster_server.start()
                heartbeat_threads.append(heartbeat)
                cluster_servers.append(cluster_server)
            else:
                print("关闭服务")
                logger.info("关闭程序>>>HeartSchedule>>>ClusterServer")
                # Both lists are popped, so both must be non-empty; this also
                # makes a "stop" without a preceding "start" a no-op.
                if heartbeat_threads and cluster_servers:
                    cluster_servers.pop().stop()
                    stop_thread(heartbeat_threads.pop())
def start_heart_cluster(self):
    """Poll ``up_cluster_event`` and start/stop the heartbeat and cluster server.

    Runs forever, checking the event every 0.5 seconds. When the event is set
    and no pair has been started for this "set" period, the heartbeat worker
    and a ``ClusterServer`` are started. When the event is cleared, the most
    recently started pair (if any) is stopped.
    """
    # The polling loop needs a flag so that one "set" period of the event
    # starts the services only once.
    may_start = True
    heartbeat_threads = []
    cluster_servers = []
    while True:
        time.sleep(.5)
        if up_cluster_event.is_set() and may_start:
            master_port = CONFIG.get("master_port")
            logger.info(
                "启动程序 HeartSchedule >>> {} ClusterServer >>>{}".format(
                    ":".join(CONFIG["sch_grpc_server"]), master_port))
            heartbeat = spawn(target=login_and_update, name="heartbeat")
            cluster_server = ClusterServer(
                addr="0.0.0.0:{}".format(CONFIG.get("master_port")))
            cluster_server.start()
            heartbeat_threads.append(heartbeat)
            cluster_servers.append(cluster_server)
            may_start = False
        elif not up_cluster_event.is_set():
            # Event cleared: re-arm the start flag and tear down what runs.
            may_start = True
            if heartbeat_threads and cluster_servers:
                logger.info("关闭程序>>>HeartSchedule>>>ClusterServer")
                cluster_servers.pop().stop()
                stop_thread(heartbeat_threads.pop())
def run_server():
    """Run the raft gRPC server and the HTTP app, then shut down and exit.

    Spawns the ``vip_load`` worker, starts a gRPC server exposing
    ``RaftService`` on ``CONFIG["raft_grpc_port"]`` and then blocks in
    ``app.run`` on ``CONFIG["raft_http_port"]``. Whatever happens, the worker
    is stopped, the VIP is set to "down", the gRPC server is stopped and the
    process exits.

    Raises:
        SystemExit: always, with the message "退出".
    """
    # Local import: ``exit()`` is a helper injected by the ``site`` module and
    # is not guaranteed to exist; ``sys.exit`` raises the same SystemExit.
    import sys

    vip_worker = spawn(target=vip_load, name="find_vip")
    server = None
    try:
        server = grpc.server(ThreadPoolExecutor(40))
        # Register the request handlers on the RPC server.
        raft_grpc_pb2_grpc.add_RaftServiceServicer_to_server(
            raft_grpc_server.RaftService(), server)
        # Insecure port is used here; gRPC also supports TLS/SSL connections
        # and various authentication mechanisms.
        server.add_insecure_port("0.0.0.0:{}".format(
            CONFIG.get("raft_grpc_port")))
        server.start()
        # Start the HTTP service (blocks until it returns or raises).
        # TODO: starting the election in a separate process raises an error.
        app.run(
            host='0.0.0.0',
            # Port 8586; only started on the master.
            port=CONFIG.get("raft_http_port"),
        )
    except Exception as e:
        # Keep the traceback: the process is about to exit and this is the
        # only record of why.
        logger.exception(e)
    finally:
        stop_thread(vip_worker)
        dc_vip.vip.set_vip("down")
        if server is not None:
            # Release the listening port and worker threads immediately.
            server.stop(None)
        sys.exit("退出")
def chooice_use_gpu_by_num(self, need_gpus, task_id):
    """Claim free GPUs for a task and record which machines provide them.

    For every requested model, machines whose ``heartBeat`` is at least
    ``min_time_lead(MONGOHEARTIME)`` are scanned, newest heartbeat first.
    Each GPU of the requested model with a falsy ``status`` is claimed by
    writing *task_id* into its ``status`` until the requested count is met.

    Args:
        need_gpus: iterable of dicts with ``"model"`` and ``"count"`` keys.
            ``"count"`` is decremented in place for every GPU claimed, so
            the caller sees how many GPUs could not be satisfied.
        task_id: identifier stored in the ``status`` of each claimed GPU.

    Returns:
        dict: ``{"taskID": task_id, "machines": {machineID: [gpu id, ...]}}``.
    """
    claimed_by_machine = {}
    for model in need_gpus:
        rule = {
            "heartBeat": {
                "$gte": min_time_lead(MONGOHEARTIME)
            },
            "gpus.status": None,
            "gpus.model": model["model"]
        }
        candidates = self.table.find(rule).sort('heartBeat',
                                                pymongo.DESCENDING)
        logger.info(model["count"])
        for machine in candidates:
            changed = False
            for gpu in machine["gpus"]:
                if model["count"] <= 0:
                    # Request satisfied; no need to look at further GPUs.
                    break
                if gpu.get("model") == model["model"] and not gpu.get(
                        "status"):
                    model["count"] -= 1
                    gpu["status"] = task_id
                    claimed_by_machine.setdefault(machine["machineID"],
                                                  []).append(gpu["id"])
                    changed = True
            if changed:
                # Only write back documents that were modified: replacing an
                # untouched document with this snapshot is pointless and may
                # overwrite a newer version written elsewhere in the meantime.
                # NOTE(review): if self.table is a PyMongo Collection,
                # ``update`` is deprecated in PyMongo 3 and removed in 4
                # (``replace_one`` is the successor) -- confirm the pinned
                # driver version before migrating.
                self.table.update({"_id": machine["_id"]}, machine)
            if model["count"] <= 0:
                break
    task = {"taskID": task_id, "machines": claimed_by_machine}
    logger.info(task)
    return task
def TaskStatus(self, request, context):
    """Handle a task status notification carried as JSON in ``request.body``.

    The JSON ``status`` field selects the action:

    * ``"start"``: if ``cli.compare_gpu`` accepts the requested GPUs, claim
      them, merge the machine -> GPU mapping into the body and forward
      ``TaskStart`` to the agent at the ``intranetAddress`` of a machine
      holding the task.
    * ``"stop"``: look up a machine holding the task, free the task's GPUs
      and forward ``TaskStop`` to that machine's agent.
    * ``"finish"``: free the task's GPUs.

    Args:
        request: message whose ``body`` attribute is a JSON document.
        context: gRPC servicer context (unused).

    Returns:
        ``sch_response(err)`` where ``err`` holds ``msg``/``status`` entries
        describing the outcome (empty when nothing was recorded).
    """
    body = json.loads(request.body)
    err = {}
    status = body["status"]
    if status == "start":
        logger.info("任务开始...")
        gpus = body["gpus"]
        if cli.compare_gpu(gpus):
            all_chooice_machine = cli.chooice_use_gpu_by_num(
                gpus, task_id=body["taskID"])
            print(all_chooice_machine)
            print("GPU发送到任意机器")
            err.update({"msg": "ok", "status": 2})
            body.update(all_chooice_machine["machines"])
            try:
                res = cli.table.find_one({"gpus.status": body["taskID"]})
                if res is None:
                    # No machine holds this task. Without this check the
                    # subscript below raises TypeError, which skipped the
                    # "Not taskID" handler and left the reply as "ok".
                    raise KeyError(body["taskID"])
                with grpc.insecure_channel(
                        res["intranetAddress"]) as channel:
                    stub = agent_pb2_grpc.AgentServerStub(channel=channel)
                    stub.TaskStart(agent_response(body))
            except KeyError:
                # NOTE(review): only ``msg`` is replaced here, ``status``
                # stays 2 (the "stop" branch reports status 1) -- confirm
                # which value the scheduler expects.
                err.update(msg="Not taskID")
            except Exception as e:
                logger.exception(e)
        else:
            err = {"msg": "gpu_not_free", "status": 1}
            logger.info("GPU Not Enough")
    elif status == "stop":
        logger.info("任务停止...")
        print("根据taskid找出任意机器发送停止任务")
        try:
            # Look the machine up before freeing: afterwards no GPU carries
            # the task id any more.
            res = cli.table.find_one({"gpus.status": body["taskID"]})
            cli.free_gpu_by_task_id(body["taskID"])
            if res is None:
                # Unknown task: report it instead of failing with TypeError
                # and returning an empty reply.
                raise KeyError(body["taskID"])
            with grpc.insecure_channel(res["intranetAddress"]) as channel:
                stub = agent_pb2_grpc.AgentServerStub(channel=channel)
                stub.TaskStop(agent_response(body))
        except KeyError:
            err.update({"msg": "Not taskID", "status": 1})
        except Exception as e:
            logger.exception(e)
    elif status == "finish":
        logger.info("任务完成...")
        print("根据taskid释放gpu发送完成状态给调度")
        cli.free_gpu_by_task_id(body["taskID"])
    return sch_response(err)
def wrapper(self, *args, **kwargs):
    """Ping MongoDB before delegating to the wrapped ``func``.

    Issues the ``ping`` admin command on ``self.client`` and retries
    immediately on failure. After the third consecutive failure the process
    exits with the message "退出"; on success ``func`` (taken from the
    enclosing scope) is called with the original arguments and its result is
    returned.
    """
    failures = 0
    connected = False
    while not connected:
        try:
            self.client.admin.command("ping")
            logger.info("连接mongo")
            connected = True
        except Exception as exc:
            failures += 1
            logger.info("第{}连接数据库失败...{}".format(failures, exc))
            if failures == 3:
                exit("退出")
    return func(self, *args, **kwargs)
def vip_load():
    """Poll the local raft service and move the VIP with leadership.

    Every 3 seconds ``GetStatus`` is called on the raft gRPC service at
    ``0.0.0.0:<raft_grpc_port>``. Replies whose ``ts`` does not echo the
    request timestamp are ignored.

    * ``vip_event`` set, this node is the leader and ``state == 2``: set the
      VIP "up", clear ``vip_event``, set ``up_cluster_event`` and enqueue
      ``"start"`` on ``q1``.
    * ``vip_event`` cleared and this node is not the leader: set the VIP
      "down", set ``vip_event``, clear ``up_cluster_event`` and enqueue
      ``"stop"`` on ``q1``.

    Any exception raised during a poll is logged and the loop continues.
    """
    logger.info(vip_event.is_set())
    target = "0.0.0.0:{}".format(CONFIG.get("raft_grpc_port"))
    with grpc.insecure_channel(target) as chan:
        stub = raft_grpc_pb2_grpc.RaftServiceStub(channel=chan)
        from cluster_master.cluster import q1
        while True:
            time.sleep(3)
            stamp = str(int(time.time()))
            try:
                pending = stub.GetStatus.future(
                    raft_grpc_pb2.GetStatusReq(ts=stamp), timeout=3)
                reply = pending.result()
                if reply.ts != stamp:
                    # Reply does not belong to this request; skip the round.
                    continue
                raft_status = json.loads(reply.status)
                leader = raft_status['leader']
                self_node = raft_status['self']
                logger.info(
                    "vip_event {},leader {}, self_node {},isReady {}".format(
                        vip_event.is_set(), leader, self_node,
                        raft_status["isReady"]))
                # "state" is looked up lazily, only once the first two
                # conditions hold.
                if (vip_event.is_set() and leader == self_node
                        and raft_status["state"] == 2):
                    dc_vip.vip.set_vip("up")
                    vip_event.clear()
                    up_cluster_event.set()
                    q1.put("start")
                if not vip_event.is_set() and leader != self_node:
                    dc_vip.vip.set_vip("down")
                    vip_event.set()
                    up_cluster_event.clear()
                    q1.put("stop")
            except Exception as exc:
                logger.info(exc)
def test(self):
    """Log the message "test" once per second, forever (never returns)."""
    while True:
        time.sleep(1)
        logger.info("test")
def raft_server(self):
    """Log the raft start-up message, then hand control to ``run_server``."""
    logger.info("启动raft程序...")
    run_server()