def schedule(self, cron=None):
    """Register a cron schedule for this flow on the Azkaban server.

    :param cron: cron expression ("sec min hour day month weekday");
        overridden by a per-flow ``cron.<flowId>`` entry from the project
        properties when one exists.
    :return: True when the schedule was accepted, False otherwise.
    """
    # A flow-specific cron in the project config takes precedence over the argument.
    flow_cron = get_project_info(self.prj_name, "cron." + str(self.flowId).strip())
    if flow_cron:
        cron = flow_cron
    if not check_cron(cron):
        # BUG FIX: this branch previously returned False silently (the log
        # line was commented out), hiding misconfigured cron expressions.
        logger.error("{0}设置执行计划失败,请设置正确的cron时间格式.ERROR for:{1}".format(self.prj_name, cron))
        return False
    data = {
        'session.id': self.cookies_fetcher.get_session_id(),
        'ajax': u'scheduleCronFlow',
        'projectName': self.prj_name,
        'flow': self.flowId,
        # alternatives: finishCurrent, cancelImmediately
        'failureAction': 'finishPossible',
        'cronExpression': cron
    }
    response = requests.post(
        azkaban_url + '/schedule',
        data=data
    )
    rs = str(response.content, 'utf-8')
    # Azkaban may answer HTTP 200 with an "error" field in the body, so check both.
    if response.status_code != 200 or 'error' in rs:
        logger.info("秒 分 时 日 月 周 data: \n%s", data)
        logger.error(rs)
        logger.error("{0}设置执行计划失败".format(self.prj_name))
        return False
    logger.info("{0} flow:{1}设置执行计划成功".format(self.prj_name, self.flowId))
    return True
def fetch_schedule(self):
    """Look up this flow's schedule id; return None when no schedule exists.

    :raises Exception: when the schedule listing request fails
    """
    prj_id = get_projects().get(self.prj_name)
    params = {
        'session.id': self.cookies_fetcher.get_session_id(),
        'ajax': 'fetchSchedule',
        'projectId': prj_id,
        'flowId': self.flowId,
    }
    response = requests.get(azkaban_url + '/schedule', params=params)
    if response.status_code != 200:
        logger.info(str(response.content, 'utf-8'))
        raise Exception("{0}获取执行计划列表失败".format(self.prj_name))
    try:
        schd_id = response.json()['schedule']['scheduleId']
    except Exception as e:
        # A missing "schedule" key simply means nothing is scheduled.
        logger.debug(str(e))
        logger.info("{0} flow:{1} 没有设置执行计划".format(self.prj_name, self.flowId))
        return None
    logger.debug("{0} flow:{1}获取执行计划ID是{2}".format(self.prj_name, self.flowId, schd_id))
    return schd_id
def handle_timeout(self):
    """Poll the execution in a loop; when it exceeds the configured timeout,
    kill it and resume the remaining jobs, until a terminal state is reached.

    :return: None
    """
    terminal_messages = {
        'KILLED': "{execid} has been killed.",
        'SUCCEEDED': "{execid} has been SUCCEEDED.",
        'FAILED': "{execid} has been FAILED.",
    }
    while True:
        logger.info('checking to execute flow {flow}, {exec_id}'.format(
            flow=self.flowId, exec_id=self.exec_id))
        info = self.get_flow_exec_info()
        self.refresh_flow_execution()
        started = info['startTime'] / 1000  # Azkaban reports milliseconds
        status = info['status']
        if status in terminal_messages:
            logger.info(terminal_messages[status].format(execid=self.exec_id))
            break
        # Timeout: flow started, still running, and past the allowed minutes.
        if started > 0 and int(time.time()) - started > 60 * self.flow_timeout \
                and info['endTime'] == -1:
            logger.info('reached timeout threshold \n')
            self.cancel()
            time.sleep(60)
            self.resume_flow()
        time.sleep(check_interval)
def cancel(self):
    """Ask the executor to cancel this running execution."""
    url = '%s/executor?ajax=cancelFlow&execid=%s' % (azkaban_url, self.exec_id)
    response = requests.get(url, cookies=self.cookies_fetcher.get_cookies())
    if response.status_code != 200:
        logger.info(str(response.content, 'utf-8'))
def execute(self):
    """Trigger an execution of this flow.

    :return: a FlowExecution wrapping the new execution id
    :raises Exception: when the server rejects the execution
    """
    logger.info('开始执行flow {flow}'.format(flow=self.flowId))
    template = ('{azkaban_url}/executor?ajax=executeFlow&project={project}&flow={flow}'
                + self.disabled + self.flow_override)
    url = template.format(azkaban_url=azkaban_url, project=self.prj_name, flow=self.flowId)
    response = requests.get(url, cookies=self.cookies_fetcher.get_cookies())
    body = str(response.content, 'utf-8')
    # Azkaban may answer HTTP 200 with an "error" field in the body.
    if response.status_code != 200 or 'error' in body:
        logger.error(body)
        raise Exception('执行{flow} 报错'.format(flow=self.flowId))
    exec_id = json.loads(body)['execid']
    logger.info('开始执行{flow},execid是{exec_id}'.format(flow=self.flowId, exec_id=exec_id))
    return FlowExecution(exec_id, self.cookies_fetcher)
def upload_zip(self, zip_file):
    """Upload a project zip archive, backing up the current one first.

    :param zip_file: path of the zip archive to upload
    :raises Exception: when the backup or the upload fails
    """
    if not self.download_zip():
        raise Exception('项目文件{name}:文件备份失败'.format(name=self.name))
    logger.info("备份{0}项目文件成功".format(self.name))
    # BUG FIX: the original leaked the open file object; "with" guarantees
    # the archive handle is closed even when the request raises.
    with open(zip_file, 'rb') as archive:
        files = {
            'file': (os.path.basename(zip_file), archive, 'application/zip')
        }
        upload_data = {
            'project': self.name,
            'ajax': 'upload',
        }
        resp = requests.post(
            "{azkaban_url}/manager".format(azkaban_url=azkaban_url),
            data=upload_data,
            cookies=self.cookies_fetcher.get_cookies(),
            files=files)
    if resp.status_code != 200:
        logger.error(str(resp.content, 'utf-8'))
        raise Exception('上传ZIP文件失败:{name} '.format(name=zip_file))
    logger.info("上传ZIP文件完成:" + self.name)
def download_zip(self):
    """Download the project zip into backup_path/<yyyymmdd>/ as a backup.

    :return: True on success; also True when the project does not exist on
        the server (preserving the original best-effort contract — TODO
        confirm callers rely on this)
    :raises Exception: when the server answers with a non-200 status
    """
    if not self.crt_flag:
        logger.info("项目不存在不能下载")
        return True
    url = ("{azkaban_url}/manager?session.id={id}&project={project}&download=True"
           .format(id=self.cookies_fetcher.get_session_id(),
                   azkaban_url=azkaban_url,
                   project=self.name))
    resp = requests.get(url, stream=True)
    # BUG FIX: fail BEFORE writing — the original wrote the (error) body to
    # disk first, leaving a corrupt backup file behind on failure.
    if resp.status_code != 200:
        raise Exception('下载{project}项目文件失败'.format(project=self.name))
    now_time = get_current_timekey()
    backup_dt_dir = os.path.join(backup_path, now_time[0:8])
    if not os.path.exists(backup_dt_dir):
        os.makedirs(backup_dt_dir)
    file_path = os.path.join(backup_dt_dir, self.name + "_" + now_time + '.zip')
    with open(file_path, "wb") as code:
        code.write(resp.content)
    logger.info("下载ZIP文件完成" + self.name + ": " + file_path)
    return True
def set_sla(self, schedule_id, email, settings):
    """Attach SLA rules to a schedule (rarely used).

    :param schedule_id: Schedule ID.
    :param email: list of e-mail addresses to notify.
    :param settings: comma delimited SLA setting strings made of:
        job name (blank = whole flow), rule (SUCCESS or FINISH),
        duration hh:mm, email action bool, kill action bool.
    """
    logger.debug('Setting SLA for schedule Id %s.', schedule_id)
    payload = {
        'ajax': 'setSla',
        'scheduleId': schedule_id,
        'slaEmails': ','.join(email),
    }
    # Azkaban expects indexed "settings[i]" form fields.
    payload.update(('settings[%s]' % idx, value)
                   for idx, value in enumerate(settings))
    res = extract_json(
        self._request(
            method='POST',
            endpoint='schedule',
            data=payload,
        ))
    logger.info('Set SLAs for schedule Id %s.', schedule_id)
    return res
def resume_flow(self):
    """Re-run the flow with already-SUCCEEDED jobs disabled, and adopt the
    new execution id."""
    # NOTE(review): self.prj_name appears to be an object with a .name
    # attribute here, unlike sibling methods — confirm against the constructor.
    succeeded = get_str_set(self.job_status_dict.get('SUCCEEDED'))
    target = '%s/executor?ajax=executeFlow&project=%s&flow=%s&disabled=%s' % (
        azkaban_url, self.prj_name.name, self.flowId, succeeded)
    resp = requests.get(target, cookies=self.cookies_fetcher.get_cookies())
    new_exec_id = json.loads(resp.content)['execid']
    logger.info('old exec_id {old} to new one {new}'.format(old=self.exec_id, new=new_exec_id))
    self.exec_id = new_exec_id
def del_prj(self):
    """Delete the project after backing it up; refuse when it still has
    schedules.

    :return: self, for chaining
    :raises Exception: when the delete request fails
    """
    if len(self.fetch_flow_schedule()) > 0:
        logger.info("该项目有执行计划,不能删除")
        # BUG FIX: the original logged "cannot delete" but fell through
        # and deleted the project anyway.
        return self
    if self.crt_flag and self.download_zip():
        resp = requests.get(
            "{azkaban_url}/manager?delete=true&project={name}".format(
                azkaban_url=azkaban_url, name=self.name),
            cookies=self.cookies_fetcher.get_cookies())
        if resp.status_code != 200:
            raise Exception('Error happened when delete project {project} to azkaban'.format(project=self.name))
        logger.info('删除Project:' + self.name)
    return self
def fetch_job(self):
    """Return the job nodes of this flow's graph.

    :raises Exception: on a non-200 response
    """
    url = ('{azkaban_url}/manager?ajax=fetchflowgraph&project={project}&flow={flow}'
           .format(azkaban_url=azkaban_url, project=self.prj_name, flow=self.flowId))
    resp = requests.get(url, cookies=self.cookies_fetcher.get_cookies())
    if resp.status_code != 200:
        raise Exception('Error happened when fetch job from {0} in {1}'.format(self.flowId, self.prj_name))
    jobs = json.loads(str(resp.content, 'utf-8'))['nodes']
    logger.info(jobs)
    return jobs
def refresh_flow_execution(self):
    """Refresh the per-status sets of job ids from the latest execution info
    and log a progress line for each status.

    :return: None
    """
    nodes = self.get_flow_exec_info()['nodes']
    for node in nodes:
        # Accumulate job ids under their current status.
        self.job_status_dict.setdefault(node['status'], set()).add(node['id'])
    total = len(nodes)
    for status, ids in self.job_status_dict.items():
        logger.info('%s status: %s : %d/%d \n' % (get_current_timekey(), status, len(ids), total))
def schedule_flows(self, cron=None, flows=None):
    """Schedule every flow of this project (or only those listed in ``flows``).

    :param cron: cron expression; when None, taken from the project config
    :param flows: optional whitelist of flow names to schedule
    """
    all_flows = self.fetch_flow()
    if cron is None:
        # Fall back to the project-level "cron.<name>" property.
        cron = get_project_info(prj_nm=self.name, key_nm="cron." + self.name)
        if cron is None and get_project_info(prj_nm=self.name, key_nm="cron") is None:
            logger.error(self.name + "的 system.properties中没有配置定时器cron请配置")
            return
    if flows is None:
        flows = all_flows
    for flow_name in all_flows:
        if flow_name not in flows:
            continue
        logger.info("设置项目{0}的工作流{1}执行计划".format(self.name, flow_name))
        Flow(self.name, flow_name, self.cookies_fetcher).schedule(cron)
def create_prj(self):
    """Create the project on the server unless it already exists.

    :return: self, for chaining
    :raises Exception: when the create request fails
    """
    if self.crt_flag:
        logger.info("项目已经存在不能创建")
        return self
    payload = {
        'name': self.name,
        'description': self.description
    }
    resp = requests.post(
        "{azkaban_url}/manager?action=create".format(azkaban_url=azkaban_url),
        data=payload,
        cookies=self.cookies_fetcher.get_cookies())
    if resp.status_code != 200:
        raise Exception('项目 {project} 创建失败'.format(project=self.name))
    status = json.loads(str(resp.content, 'utf-8'))['status']
    logger.info('项目 {project} 创建状态 : {status}'.format(project=self.name, status=status))
    return self
def get_properties(self):
    """Parse ``self.fileName`` as a .properties file into ``self.properties``.

    Strips ``#`` comments, splits each line on the first ``=``, and stores
    the pair via ``self.__get_dict`` (nested-key aware).

    :return: the populated properties dict
    :raises Exception: when the file cannot be read or parsed
    """
    try:
        # BUG FIX: mode was 'Ur' — the 'U' flag was removed in Python 3.11;
        # "with" also closes the handle even when parsing raises (the
        # original leaked it on error).
        with open(self.fileName, 'r') as pro_file:
            for line in pro_file:
                line = line.strip().replace('\n', '')
                if line.find("#") != -1:
                    line = line[0:line.find('#')]
                if line.find('=') > 0:
                    strs = line.split('=')
                    # keep everything after the FIRST '=' as the value
                    strs[1] = line[len(strs[0]) + 1:]
                    self.__get_dict(strs[0].strip(), self.properties, strs[1].strip())
    except Exception as e:
        logger.info("本地的本质文件不规范,注意出现了com key值 就不能出现 com.hy的key值")
        raise Exception(e)
    return self.properties
def check_cron(cron):
    """Validate that a cron expression is well-formed.

    :param cron: cron expression string
    :return: True when parsable, False otherwise
    """
    # A valid expression is non-empty and contains at least one space.
    if not cron or " " not in cron:
        logger.warning("cron不能为空或者格式不对,你输入的是{0}".format(cron))
        return False
    try:
        parsed = Cron(cron)
        logger.info("cron格式校验通过,下次执行时间: " + str(parsed.get_next()))
        return True
    except Exception as e:
        logger.error(str(e))
        logger.error("cron不能为空或者格式不对,你输入的是{0}".format(cron))
        return False
def unscheduled(self):
    """Remove this flow's schedule, if one exists.

    :return: True when a schedule was removed, None when none existed
    :raises Exception: when the removal request fails
    """
    schd_id = self.fetch_schedule()
    if not schd_id:
        return
    payload = {
        u'session.id': self.cookies_fetcher.get_session_id(),
        u'action': u'removeSched',
        u'scheduleId': schd_id
    }
    response = requests.post(azkaban_url + '/schedule', data=payload)
    if response.status_code != 200:
        logger.info("Request data: \n%s", payload)
        logger.error(str(response.content, 'utf-8'))
        raise Exception("{0} 取消执行计划失败".format(self.prj_name))
    logger.info("{0} flow:{1} 执行计划id={2}已经被取消".format(
        self.prj_name, self.flowId, schd_id))
    return True
def get_sla(self, schedule_id):
    """Fetch the SLA settings of a schedule (rarely used).

    :param schedule_id: Schedule Id - obtainable from get_schedule
    :raises Exception: when no SLA is configured for the schedule
    """
    logger.debug('Retrieving SLA for schedule ID %s.', schedule_id)
    params = {'ajax': 'slaInfo', 'scheduleId': schedule_id}
    res = extract_json(
        self._request(method='GET', endpoint='schedule', params=params))
    logger.info('Retrieved SLA for schedule ID %s.', schedule_id)
    if 'settings' not in res:
        raise Exception('Failed to get SLA; check that an SLA exists.')
    return res
def cancel_execution(self, exec_id):
    """Cancel a running execution.

    :param exec_id: Execution ID.
    :raises Exception: when the execution is not running
    """
    logger.debug('Cancelling execution %s.', exec_id)
    res = extract_json(
        self._request(
            method='GET',
            endpoint='executor',
            params={
                'execid': exec_id,
                'ajax': 'cancelFlow',
            },
        ))
    if 'error' in res:
        # BUG FIX: Exception does not %-format its arguments the way logger
        # calls do — the original raised with an unformatted message tuple.
        raise Exception('Execution %s is not running.' % exec_id)
    logger.info('Execution %s cancelled.', exec_id)
    return
def exec_shell(shell, logs_print=True):
    """Run a shell command, streaming its merged stdout/stderr to the logger.

    :param shell: command string, executed with ``shell=True`` (callers must
        only pass trusted input)
    :param logs_print: when True, echo each output line via logger.info
    :return: True when the command exits with status 0
    :raises Exception: on non-zero exit, carrying the last error/output line
    """
    logger.info("执行shell:" + shell)
    try:
        proc = subprocess.Popen(shell, shell=True,
                                stdout=subprocess.PIPE,
                                stderr=subprocess.STDOUT)
        last_line = ''
        error_line = ''
        # Poll until exit, remembering the last meaningful line and the last
        # line mentioning "error" for the failure message.
        while proc.poll() is None:
            line = proc.stdout.readline().strip().decode('utf-8')
            if line and len(line) > 2:
                last_line = line
                if 'error' in line.lower():
                    error_line = line
                if logs_print:
                    logger.info(line)
        if proc.returncode != 0:
            raise Exception(error_line + '\n' + last_line)
        logger.info('shell 执行成功')
        return True
    except Exception as e:
        logger.error('shell执行失败:' + str(e))
        raise Exception("shell执行失败:" + str(e))
def restart_azkaban(azkaban_path="/opt/softs/azkaban"):
    """
    Start or restart azkaban.

    :param azkaban_path: azkaban install path (directory containing the
        web-server and exec-server directories)
    :return: None
    """
    # NOTE(review): rebinds module-level globals; if no directory below
    # matches the name patterns, os.chdir(exec_server) raises NameError
    # unless the globals already held values — TODO confirm.
    global exec_server, web_server
    os.chdir(azkaban_path)
    # print(os.getcwd())
    dirs = os.listdir(azkaban_path)
    # Locate the web and exec server directories by name pattern.
    for i in dirs:
        if 'web-' in i.lower() or '-web' in i.lower():
            web_server = os.path.join(azkaban_path, i)
            continue
        if 'exec-' in i.lower() or '-exec' in i.lower():
            exec_server = os.path.join(azkaban_path, i)
            continue
    os.chdir(exec_server)
    tp = os.popen("jps")  # check whether the servers are already running
    jps = tp.readlines()
    tp.close()
    azkaban_exec = False
    azkaban_web = False
    for i in jps:
        # NOTE(review): assumes each jps line is "<pid> <name>"; a line
        # without a space would raise IndexError — TODO confirm.
        tp = i.strip().split(" ")
        if tp[1].strip().upper() == 'AzkabanExecutorServer'.upper():
            azkaban_exec = True
        if tp[1].strip().upper() == 'AzkabanWebServer'.upper():
            azkaban_web = True
    # Shut down first when already running, then (re)start the exec server.
    if azkaban_exec:
        exec_shell("bin/shutdown-exec.sh")
    exec_shell("bin/start-exec.sh")
    time.sleep(10)
    active_executor()
    os.chdir(web_server)
    time.sleep(10)
    logger.info(os.getcwd())
    # Same shutdown-then-start sequence for the web server.
    if azkaban_web:
        exec_shell("bin/shutdown-web.sh")
    exec_shell("bin/start-web.sh")
def active_executor(hosts=None, port=12321):
    """Activate inactive Azkaban executors; with no host given, activate all
    of them using their recorded ports.

    :param hosts: hostname of a single executor to activate, or None for all
    :param port: executor HTTP port
    """
    inactive = get_executor(active=0)
    if len(inactive) < 1:
        logger.info("所有节点都已经激活")
        return
    if not hosts:
        # Fan out: recurse once per inactive executor with its own port.
        host_ports = get_executor(active=0, rstype="port")
        for host in host_ports.keys():
            active_executor(host, host_ports[host])
        return
    if hosts not in inactive:
        logger.info("{0}节点没有部署成功,请确认是否部署成功和hostname".format(hosts))
        return
    url = "http://{0}:{1}/executor?action=activate".format(hosts, port)
    try:
        rs = requests.get(url)
        if str(rs.content, 'utf-8') == "{\"status\":\"success\"}":
            logger.info("Executor : {0} 激活成功".format(hosts))
        else:
            logger.error("激活Executor失败,请确认Executor正确启动")
    except Exception as e:
        logger.error("激活Executor失败,请确认正确的ip和port" + str(e))