def delete_cluster_task(log_type, log_pk):
    """Poll the host-module change of a cluster deletion, then run the delete handler.

    The log record identified by ``log_type``/``log_pk`` is flagged as polling
    (if it is not already) and ``_polling_host_module_once`` is called
    repeatedly until the record is finished, polling is switched off, or
    ``DELETE_POLLING_TIMEOUT`` has elapsed. Afterwards ``delete_handler`` is
    invoked unless the polling step reported termination.

    :param log_type: name handed to ``models.log_factory`` to resolve the log model
    :param log_pk: primary key of the log record
    :return: always ``None``
    """
    log_model = models.log_factory(log_type)
    if not log_model:
        logger.error("log not found for type: %s", log_type)
        return

    record = log_model.objects.filter(pk=log_pk).last()
    if not record:
        logger.error("log not found for pk: %s", log_pk)
        return

    if record.is_finished:
        logger.error("log[%s] has been finished", log_pk)
        return

    # Flag the record as being polled unless it already is.
    if not record.is_polling:
        record.is_polling = True
        record.save()

    deadline = datetime.now() + DELETE_POLLING_TIMEOUT
    terminated = False
    while True:
        if record.is_finished or not record.is_polling:
            break
        time.sleep(POLLING_INTERVAL_SECONDS)
        if datetime.now() > deadline:
            break
        try:
            record, terminated = _polling_host_module_once(record, _("1.修改主机模块"))
        except Exception as err:
            # A failed poll is only logged; the next iteration retries.
            logger.exception(err)

    if not terminated:
        # Execute the actual delete task.
        delete_handler(record)
def common_model_handler(log_type, log_pk, task_id_flag=False):
    """Shared lookup for log models: resolve model and record, then mark it polling.

    :param log_type: name handed to ``models.log_factory`` to resolve the log model
    :param log_pk: primary key of the log record; a falsy value ends the call at once
    :param task_id_flag: when truthy, a record without ``task_id`` is rejected
    :return: ``(model, log)`` when the record can be polled, otherwise ``None``
        (missing pk/model/record, missing task id, or record already finished)
    """
    if not log_pk:
        return None

    log_model = models.log_factory(log_type)
    if not log_model:
        logger.error("log not found for type: %s", log_type)
        return None

    record = log_model.objects.filter(pk=log_pk).last()
    if not record:
        logger.error("log not found for pk: %s", log_pk)
        return None

    if not record.task_id and task_id_flag:
        logger.error("task id is null for pk: %s", log_pk)
        return None

    if record.is_finished:
        logger.info("log[%s] has been finished", log_pk)
        return None

    if not record.is_polling:
        # First poller: persist the polling flag.
        record.is_polling = True
        record.save()
    else:
        # Already flagged as polling: only warn, the record is still returned.
        logger.warning("log[%s] is polling", log_pk)

    return log_model, record
def chain_polling_bke_status(log_info):
    """Install the BKE agent for a node operation and check that it came up.

    A new ``NodeUpdateLog`` with operation type ``BkeInstall`` is created from
    the referenced log record, ``helm_init`` is run in the ``kube-system``
    namespace, and on success ``_polling_bke_status`` is started for the new
    record. Failures are stored through ``bke_log_save`` with the
    ``BkeFailed`` status.

    :param log_info: ``(log_pk, log_type)`` pair of the preceding log record;
        a falsy value makes the call a no-op
    :return: always ``None``
    """
    if not log_info:
        return
    log_pk, log_type = log_info

    model = models.log_factory(log_type)
    if not model:
        logger.error("log not found for type: %s", log_type)
        return
    log = model.objects.filter(pk=log_pk).last()
    if not log:
        # ``.last()`` yields None for an unknown pk; bail out instead of
        # failing with AttributeError while copying its fields below.
        logger.error("log not found for pk: %s", log_pk)
        return

    new_log = models.NodeUpdateLog.objects.create(  # noqa
        project_id=log.project_id,
        cluster_id=log.cluster_id,
        token=log.token,
        node_id=log.node_id,
        params=log.params,
        operator=log.operator,
        oper_type=models.NodeOperType.BkeInstall,
    )

    try:
        bke_cluster_info = helm_init(
            new_log.token, new_log.project_id, new_log.cluster_id, 'kube-system'
        )
    except Exception as err:
        # NOTE(review): the access token is written to the log here — confirm
        # this is acceptable for the deployment's log retention policy.
        logger.error(
            "Install bke error, token: %s, project_id: %s, cluster_id: %s, error detail: %s",
            new_log.token, new_log.project_id, new_log.cluster_id, err,
        )
        # Hand over the error text (not the exception object), matching what
        # ``_polling_bke_status`` passes to ``bke_log_save``.
        bke_log_save(new_log, log_type, models.NodeStatus.BkeFailed, message="%s" % err)
        return

    # Record the failure reported in the response.
    if bke_cluster_info.get("code") != ErrorCode.NoError:
        message = bke_cluster_info.get("message")
        logger.error("Install bke error, detail: %s", message)
        bke_log_save(new_log, log_type, models.NodeStatus.BkeFailed, message)
        return

    _polling_bke_status(new_log.id)
def polling_so_init(old_log, log_pk=None, log_type=None):
    """Poll the SO initialisation of a log record until it ends or times out.

    The record is identified either by ``old_log`` (which takes precedence)
    or by the explicit ``log_pk``/``log_type`` arguments.

    :param old_log: ``(log_pk, log_type)`` pair, or a falsy value to rely on
        the explicit ``log_pk``/``log_type`` arguments
    :param log_pk: primary key of the log record
    :param log_type: name handed to ``models.log_factory`` to resolve the model
    :return: ``(log_pk, log_type)`` when polling ended without the
        ``SoInitialFailed`` status (this includes a timeout); ``None`` when
        the input is incomplete, the model or record cannot be found, or the
        SO initialisation failed
    """
    if not (old_log or (log_pk and log_type)):
        return None
    if old_log:
        log_pk, log_type = old_log

    model = models.log_factory(log_type)
    if not model:
        logger.error("log not found for type: %s", log_type)
        return None

    log = model.objects.filter(pk=log_pk).last()
    if not log:
        logger.error("log not found for pk: %s", log_pk)
        # Nothing to poll; carrying on would raise AttributeError on None.
        return None

    end_time = datetime.now() + POLLING_TIMEOUT
    while not log.is_finished and log.is_polling:
        time.sleep(POLLING_INTERVAL_SECONDS)
        if datetime.now() > end_time:
            break
        try:
            log = _polling_so_initial_once(model, log)
        except Exception as err:
            # Best effort: a failed poll is logged and retried next round.
            logger.exception(err)

    # NOTE(review): ``is_polling`` is not reset after the loop (the reset was
    # commented out in the original code) — confirm this is intended.
    if log.status in [models.CommonStatus.SoInitialFailed]:
        push_sentry(log, _("SO初始化失败"))
        # Propagate the failure to the node or cluster status.
        update_node_cluster_check_status(log, log_type, status=log.status)
        return None

    return log_pk, log_type
def so_init(old_log, request=None):
    """Create an SO-initialisation log record and trigger the host initialisation.

    A new record of the same model as the referenced log is created with
    status ``SoInitial`` and operation type ``SO_INITIAL``; then
    ``so.initial_host`` is called for the master IPs (or, if there are none,
    the keys of ``node_info``) found in the stored params.

    :param old_log: ``(log_pk, log_type)`` pair of the preceding log record;
        a falsy value makes the call a no-op
    :param request: optional request; when given, its user's access token and
        username replace the token/operator stored on the log record
    :return: ``(new_log.id, log_type)`` when the job was started; ``None`` when
        the input, model or record is missing or the job could not be started
    """
    if not old_log:
        return None
    log_pk, log_type = old_log

    model = models.log_factory(log_type)
    if not model:
        logger.error("log not found for type: %s", log_type)
        return None
    log = model.objects.filter(pk=log_pk).last()
    if not log:
        logger.error("log not found for pk: %s", log_pk)
        # Without the source record nothing can be copied; continuing would
        # raise AttributeError on None.
        return None

    user_token = log.token
    username = log.operator
    if request:
        user_token = request.user.token.access_token
        username = request.user.username

    try:
        params = json.loads(log.params)
    except Exception:
        # Unparsable params are treated as empty (best effort).
        params = {}

    save_params = {
        "project_id": log.project_id,
        "cluster_id": log.cluster_id,
        "token": user_token,
        "status": models.CommonStatus.SoInitial,
        "params": log.params,
        "operator": username,
        "oper_type": SO_INITIAL,
    }
    # Only non-cluster logs carry a node id.
    if log_type != "ClusterInstallLog":
        save_params["node_id"] = log.node_id
    new_log = model.objects.create(**save_params)

    # Trigger the initialisation job. ``node_info`` may be absent or null, and
    # a plain list is passed on rather than a dict-keys view.
    ip_list = params.get("master_ips") or list(params.get("node_info") or {})
    resp = so.initial_host(username, ip_list or [])
    task_id = (resp.get("data") or {}).get("job_id")

    if not resp.get("result") or not task_id:
        # The job could not be started: close the record as failed.
        new_log.is_finished = True
        new_log.is_polling = False
        new_log.status = models.CommonStatus.SoInitialFailed
        new_log.log = json.dumps({
            "state": "FAILURE",
            "node_tasks": [{
                "state": "FAILURE",
                "name": f"{_('1.SO初始化失败')}: {resp.get('message')}"
            }]
        })
        new_log.save()
        update_node_cluster_check_status(
            new_log, log_type, status=models.CommonStatus.SoInitialFailed)
        return None

    new_log.task_id = task_id
    new_log.is_polling = True
    new_log.save()
    return new_log.id, log_type
def node_exec_bcs_task(old_log, request=None):
    """Create a ``NodeUpdateLog`` and ask BCS to add the nodes to the cluster.

    :param old_log: ``(log_pk, log_type)`` pair of the preceding log record;
        a falsy value makes the call a no-op
    :param request: optional request; when given, its user's access token and
        username replace the token/operator stored on the log record
    :return: id of the new ``NodeUpdateLog`` when BCS accepted the task;
        ``None`` when the input, model or record is missing, the BCS call
        raised, or BCS answered with an error code
    """
    if not old_log:
        return None
    log_pk, log_type = old_log

    model = models.log_factory(log_type)
    if not model:
        logger.error("log not found for type: %s", log_type)
        return None
    log = model.objects.filter(pk=log_pk).last()
    if not log:
        logger.error("log not found for pk: %s", log_pk)
        # Without the source record nothing can be copied; continuing would
        # raise AttributeError on None.
        return None

    # Parse the stored parameters; unparsable params are treated as empty.
    try:
        params = json.loads(log.params)
    except Exception:
        params = {}
    node_info = params.get("node_info") or {}

    user_token = log.token
    username = log.operator
    project_id = log.project_id
    cluster_id = log.cluster_id
    if request:
        user_token = request.user.token.access_token
        username = request.user.username

    new_log = models.NodeUpdateLog.objects.create(  # noqa
        project_id=project_id,
        cluster_id=cluster_id,
        token=user_token,
        node_id=",".join(node_info.values()),
        params=json.dumps(params),
        operator=username,
    )

    try:
        client = BCSClient(user_token, project_id, cluster_id, None)
        rsp = client.add_cluster_node(
            params.get("kind_name"),
            username,
            list(node_info.keys()),
            params.get("cc_app_id"),
        )
    except Exception as error:
        logger.error("add add_cluster_node error: %s", error)
        # presumably records the failure for the affected node IPs — confirm
        # against ``node_ip_status``.
        node_ip_status(new_log, new_log.project_id, new_log.cluster_id, node_info)
        return None

    if rsp.get("code") != ErrorCode.NoError:
        node_ip_status(new_log, new_log.project_id, new_log.cluster_id, node_info)
        push_sentry(new_log, _("节点初始化失败"))
        return None

    data = rsp.get("data") or {}
    new_log.task_id = data.get("taskID")
    new_log.save()
    return new_log.id
def chain_polling_task(log_pk, log_type):
    """Poll a task's log record until it ends, then run the follow-up steps.

    After polling, ``is_polling`` is reset on the record, ``register_ns`` is
    attempted, and for a cluster install with status ``"normal"`` a cluster
    message is sent; failures of these two follow-ups are only logged.

    :param log_pk: primary key of the log record; a falsy value is a no-op
    :param log_type: name handed to ``models.log_factory`` to resolve the model
    :return: ``(log.id, log_type)`` for a ``NodeUpdateLog`` that ended with
        status ``Normal`` on a cluster whose id contains ``'K8S'``;
        otherwise ``None``
    """
    if not log_pk:
        return None

    log_model = models.log_factory(log_type)
    if not log_model:
        logger.error("log not found for type: %s", log_type)
        return None

    record = log_model.objects.filter(pk=log_pk).last()
    if not record:
        logger.error("log not found for pk: %s", log_pk)
        return None

    if not record.task_id:
        logger.error("task id is null for pk: %s", log_pk)
        return None

    if record.is_finished:
        logger.info("log[%s] has been finished", log_pk)
        return None

    if not record.is_polling:
        record.is_polling = True
        record.save()
    else:
        logger.warning("log[%s] is polling", log_pk)

    deadline = datetime.now() + POLLING_TIMEOUT
    while True:
        if record.is_finished or not record.is_polling:
            break
        time.sleep(POLLING_INTERVAL_SECONDS)
        if datetime.now() > deadline:
            break
        try:
            record = _polling_once(log_model, record)
        except Exception as err:
            logger.exception(err)

    log_model.objects.filter(pk=log_pk).update(is_polling=False)

    # A failure here must not interrupt the flow.
    try:
        register_ns(record)
    except Exception as err:
        logger.error("Register default namespace: %s" % err)

    # TODO: drop the notification below once the op system is online.
    try:
        notify_cluster = (
            log_type == "ClusterInstallLog"
            and record.oper_type in ["initialize", "reinstall"]
            and record.status == "normal"
        )
        if notify_cluster:
            send_msg_for_cluster(record)
    except Exception as err:
        logger.error("Send cluster info failed: %s" % err)

    k8s_node_ready = (
        (log_type == "NodeUpdateLog")
        and (record.status == models.CommonStatus.Normal)
        and ('K8S' in record.cluster_id)
    )
    if k8s_node_ready:
        return record.id, log_type
    return None
def polling_task(log_type, log_pk):
    """Poll a task's log record until it ends and report failures to Sentry.

    :param log_type: name handed to ``models.log_factory`` to resolve the model
    :param log_pk: primary key of the log record
    :return: ``(log.id, log_type)`` for a ``NodeUpdateLog`` that ended with
        status ``Normal``, is not a node removal, and belongs to a cluster
        whose id contains ``'K8S'``; otherwise ``None``
    """
    log_model = models.log_factory(log_type)
    if not log_model:
        logger.error("log not found for type: %s", log_type)
        return None

    record = log_model.objects.filter(pk=log_pk).last()
    if not record:
        logger.error("log not found for pk: %s", log_pk)
        return None

    if not record.task_id:
        logger.error("task id is null for pk: %s", log_pk)
        return None

    if record.is_finished:
        logger.info("log[%s] has been finished", log_pk)
        return None

    if not record.is_polling:
        record.is_polling = True
        record.save()
    else:
        logger.warning("log[%s] is polling", log_pk)

    deadline = datetime.now() + POLLING_TIMEOUT
    while True:
        if record.is_finished or not record.is_polling:
            break
        time.sleep(POLLING_INTERVAL_SECONDS)
        if datetime.now() > deadline:
            break
        try:
            record = _polling_once(log_model, record)
        except Exception as err:
            logger.exception(err)

    log_model.objects.filter(pk=log_pk).update(is_polling=False)

    # Report to Sentry when the task did not end normally.
    if record.status != models.CommonStatus.Normal:
        initial_opers = [INITIAL_CHECK, INITIALIZE, SO_INITIAL, REINSTALL]
        is_cluster_log = log_type == "ClusterInstallLog"
        is_initial_oper = record.oper_type in initial_opers
        if is_cluster_log and is_initial_oper:
            prefix_msg = _("初始化集群失败")
        elif is_cluster_log:
            prefix_msg = _("删除集群失败")
        elif is_initial_oper:
            prefix_msg = _("初始化节点失败")
        else:
            prefix_msg = _("删除节点失败")
        push_sentry(record, prefix_msg)

    added_k8s_node = (
        (log_type == "NodeUpdateLog")
        and (record.status == models.CommonStatus.Normal)
        and (record.oper_type not in [models.NodeOperType.NodeRemove])
        and ('K8S' in record.cluster_id)
    )
    if added_k8s_node:
        return record.id, log_type
    return None
def delete_cluster_node_polling(new_log):
    """Poll the node-deletion task of a log record until it ends or times out.

    :param new_log: ``(log_type, log_id)`` pair; when either element is falsy
        (e.g. ``(None, None)``) the call is a no-op
    :return: always ``None``
    """
    log_type, log_id = new_log
    if not (log_type and log_id):
        return

    model = models.log_factory(log_type)
    if not model:
        logger.error("log not found for type: %s", log_type)
        return
    log = model.objects.filter(id=log_id).last()
    if not log:
        # ``.last()`` yields None for an unknown id; bail out instead of
        # failing with AttributeError in the loop condition.
        logger.error("log not found for pk: %s", log_id)
        return

    end_time = datetime.now() + POLLING_TIMEOUT
    while not log.is_finished and log.is_polling:
        time.sleep(POLLING_INTERVAL_SECONDS)
        if datetime.now() > end_time:
            break
        try:
            log = _polling_once(model, log)
        except Exception as err:
            # Best effort: a failed poll is logged and retried next round.
            logger.exception(err)

    # Polling is over (finished, switched off or timed out): clear the flag.
    model.objects.filter(pk=log_id).update(is_polling=False)
def polling_initial_task(log_type, log_pk):
    """Poll the initial (pre-flight) check of a log record.

    :param log_type: name handed to ``models.log_factory`` to resolve the model
    :param log_pk: primary key of the log record
    :return: ``(log_pk, log_type)`` when polling ended without the
        ``InitialCheckFailed`` status; ``None`` when the model or record is
        missing, the record is already finished, a poll raised, or the check
        failed
    """
    log_model = models.log_factory(log_type)
    if not log_model:
        logger.error("log not found for type: %s", log_type)
        return None

    record = log_model.objects.filter(pk=log_pk).last()
    if not record:
        logger.error("log not found for pk: %s", log_pk)
        return None

    if record.is_finished:
        logger.info("log[%s] has been finished", log_pk)
        return None

    if not record.is_polling:
        record.is_polling = True
        record.save()
    else:
        logger.warning("log[%s] is polling", log_pk)

    deadline = datetime.now() + POLLING_TIMEOUT
    while True:
        if record.is_finished or not record.is_polling:
            break
        time.sleep(POLLING_INTERVAL_SECONDS)
        if datetime.now() > deadline:
            break
        try:
            record = _polling_initial_once(log_model, record)
        except Exception as err:
            # A failing poll aborts the whole task; note that the polling
            # flag is not reset on this path.
            logger.exception(err)
            return None

    log_model.objects.filter(pk=log_pk).update(is_polling=False)

    check_failed = record.status in [models.CommonStatus.InitialCheckFailed]
    if not check_failed:
        return log_pk, log_type

    push_sentry(record, _("前置检查失败"))
    # Update the node or cluster status.
    update_node_cluster_check_status(record, log_type)
    return None
def delete_cluster_node(new_log):
    """Ask BCS to delete the nodes described by a log record.

    :param new_log: ``(log_type, log_id)`` pair; when either element is falsy
        the pair is returned unchanged without doing anything
    :return: ``(log_type, log.id)`` when BCS accepted the task;
        ``(None, None)`` when the model or record cannot be found or BCS
        rejected the request; the unchanged input pair when it was incomplete
    :raises ValueError: if the record's ``params`` is not valid JSON
        (raised by ``json.loads``)
    """
    log_type, log_id = new_log
    if not (log_type and log_id):
        return log_type, log_id

    model = models.log_factory(log_type)
    if not model:
        logger.error("log not found for type: %s", log_type)
        return None, None
    log = model.objects.filter(id=log_id).last()
    if not log:
        # ``.last()`` yields None for an unknown id; report it the same way as
        # a rejected request instead of failing with AttributeError.
        logger.error("log not found for pk: %s", log_id)
        return None, None

    params = json.loads(log.params)

    # Trigger the BCS task. ``nodes`` may be absent or null.
    bcs_client = BCSClient(log.token, log.project_id, log.cluster_id, None)
    resp = bcs_client.delete_cluster_node(
        params.get("kind_name"),
        log.operator,
        list(params.get("nodes") or {}),
    )

    if not resp.get("result"):
        # BCS rejected the request: close the record as failed.
        log.is_finished = True
        log.is_polling = False
        log.status = models.CommonStatus.RemoveFailed
        log.log = json.dumps({
            "state": "remove_failed",
            "node_tasks": [{
                "state": "FAILURE",
                "name": resp.get("message")
            }]
        })
        log.save()
        result = paas_cc.update_node(
            log.token, log.project_id, params["node_id"],
            {"status": models.CommonStatus.RemoveFailed}
        )
        if result.get("code") != ErrorCode.NoError:
            # The outcome is the same either way; make the failure visible.
            logger.error("update node status failed for log[%s]", log_id)
        return None, None

    data = resp.get("data") or {}
    log.task_id = data.get("taskID")
    log.is_polling = True
    log.save()
    return log_type, log.id
def _polling_bke_status(pk, log_type="NodeUpdateLog"):
    """Retry fetching the BKE cluster credential and store the outcome.

    While the record is unfinished and flagged as polling, the cluster
    credential is requested through the client from ``get_bcs_client`` until
    it succeeds or ``BKE_POLLING_TIMEOUT`` elapses. The resulting status
    (``Normal`` or ``BkeFailed``) and the last error text are handed to
    ``bke_log_save``.

    :param pk: primary key of the log record
    :param log_type: name handed to ``models.log_factory`` to resolve the model
    :return: always ``None``
    """
    model = models.log_factory(log_type)
    if not model:
        logger.error("log not found for type: %s", log_type)
        return
    log = model.objects.filter(pk=pk).last()
    if not log:
        logger.error("log not found for pk: %s", pk)
        return

    end_time = datetime.now() + BKE_POLLING_TIMEOUT
    status = models.CommonStatus.Normal
    message = ""
    while not log.is_finished and log.is_polling:
        if datetime.now() > end_time:
            break
        try:
            bke_client = get_bcs_client(log.project_id, log.cluster_id, log.token)
            bke_client.get_cluster_credential()
        except Exception as err:
            status = models.NodeStatus.BkeFailed
            message = "%s" % err
            # Wait before retrying instead of hammering the API in a tight loop.
            time.sleep(POLLING_INTERVAL_SECONDS)
        else:
            # Success: discard any error text left over from earlier attempts.
            status = models.CommonStatus.Normal
            message = ""
            break

    bke_log_save(log, log_type, status, message=message)
def get_task_record(self) -> Optional[ModelLogRecord]:
    """Fetch the task record referenced by ``self.params``.

    ``self.params`` must provide ``model_type`` (task kind: cluster/node) and
    ``pk`` (id of the task record).

    :return: the record — also when it is already finished, which is only
        logged — or ``None`` when the model type is unknown or no record with
        that pk exists
    """
    task_params = self.params
    model_type, record_pk = task_params["model_type"], task_params["pk"]

    record_model = models.log_factory(model_type)
    if not record_model:
        logger.error(f'not found {model_type} task')
        return None

    # Look the record up by primary key.
    try:
        task_record = record_model.objects.get(pk=record_pk)
    except record_model.DoesNotExist:
        logger.error(f'not found task: {record_pk}')
        return None

    # A finished record is still returned; the fact is only logged.
    if task_record.is_finished:
        logger.info(f'record: {record_pk} has been finished')
    return task_record
def polling_task(log_type, log_pk):
    """Poll a task's log record, close it if still unfinished, and sync its status.

    The record is polled with ``_polling_once`` until it is finished, polling
    is switched off, or ``POLLING_TIMEOUT`` elapses. A record that is still
    unfinished afterwards is closed with the status from ``log_status``.
    Finally ``update_status`` is called for the record.

    :param log_type: name handed to ``models.log_factory`` to resolve the model
    :param log_pk: primary key of the log record
    :return: always ``None``
    """
    log_model = models.log_factory(log_type)
    if not log_model:
        logger.error("log not found for type: %s", log_type)
        return

    record = log_model.objects.filter(pk=log_pk).last()
    if not record:
        logger.error("log not found for pk: %s", log_pk)
        return

    if not record.task_id:
        logger.error("task id is null for pk: %s", log_pk)
        return

    if record.is_finished:
        logger.info("log[%s] has been finished", log_pk)
        return

    deadline = datetime.now() + POLLING_TIMEOUT
    while True:
        if record.is_finished or not record.is_polling:
            break
        time.sleep(POLLING_INTERVAL_SECONDS)
        if datetime.now() > deadline:
            break
        try:
            record = _polling_once(log_model, record)
        except Exception as err:
            logger.exception("query task failed, detail: %s" % err)

    # Still unfinished (e.g. the poll timed out): close the record.
    if not record.is_finished:
        record.is_finished = True
        record.is_polling = False
        record.status = log_status(record)
        record.save()

    # Update the status in the configuration centre.
    update_status(log_type, record)
def exec_bcs_task(old_log, request=None):
    """Create a ``ClusterInstallLog`` and ask BCS to create the cluster.

    :param old_log: ``(log_pk, log_type)`` pair of the preceding log record;
        a falsy value makes the call a no-op
    :param request: optional request; when given, its user's access token and
        username replace the token/operator stored on the log record
    :return: id of the new ``ClusterInstallLog`` when BCS accepted the task;
        ``None`` when the input, model or record is missing or BCS answered
        with an error code
    """
    if not old_log:
        return None

    # Decide whether the follow-up can run at all.
    log_pk, log_type = old_log
    log_model = models.log_factory(log_type)
    if not log_model:
        logger.error("log not found for type: %s", log_type)
        return None
    source_log = log_model.objects.filter(pk=log_pk).last()
    if not source_log:
        logger.error("log not found for pk: %s", log_pk)
        return None

    # Assemble the credentials; a request overrides what the log stored.
    user_token, username = source_log.token, source_log.operator
    if request:
        user_token = request.user.token.access_token
        username = request.user.username

    # Parse the stored parameters; unparsable params are treated as empty.
    try:
        params = json.loads(source_log.params)
    except Exception:
        params = {}

    install_log = models.ClusterInstallLog.objects.create(
        project_id=source_log.project_id,
        cluster_id=source_log.cluster_id,
        token=user_token,
        status=models.CommonStatus.Initializing,
        params=source_log.params,
        operator=username
    )

    bcs_client = BCSClient(
        user_token, params.get("project_id"), params.get("cluster_id"), None
    )
    cluster_options = {
        "modules": params.get("module_list", ""),
        "appID": constants.BCS_APP_ID,
        "needNat": params.get("need_nat", True),
    }
    rsp = bcs_client.create_cluster(
        params.get("kind_name"),
        username,
        params.get("master_ips", []),
        data=cluster_options,
    )

    if rsp.get("code") != ErrorCode.NoError:
        install_log.is_finished = True
        install_log.is_polling = False
        install_log.status = models.CommonStatus.InitialFailed
        # Keep the error message in the record's log field.
        install_log.log = json.dumps({
            "state": "FAILURE",
            "node_tasks": [{"state": "FAILURE", "name": rsp.get("message")}]
        })
        install_log.save()
        result = paas_cc.update_cluster(
            user_token, params.get("project_id"), params.get("cluster_id"),
            {"status": models.CommonStatus.InitialFailed}
        )
        # TODO: how to make sure a failed write can be retried.
        # Sentry is only notified when the status write succeeded.
        if result.get("code") == ErrorCode.NoError:
            push_sentry(install_log, _("初始化集群失败"))
        return None

    task_data = rsp.get("data") or {}
    install_log.task_id = task_data.get("taskID")
    install_log.save()

    try:
        cc.host_standard_property(
            username, params.get("master_ips", []), bak_operator_flag=True
        )
    except Exception as err:
        logger.error("Request cc error, detail: %s" % err)

    # The returned id triggers the next task.
    return install_log.id