def run(self): log.logger.info("Starting SchedulerHelper") exception_count = 0 exception_begin = datetime.now() max_exception_count = 5 max_exception_seconds = 3 * 60 while True: try: self.check_scheduler_result() time.sleep(30) except Exception as e: exception_count += 1 if exception_count == 1: exception_begin = datetime.now() exception_duration = (datetime.now() - exception_begin).total_seconds() alert_msg = "if get exception {} times in {} seconds, " \ "the SchedulerHelper will exit, current:{} times/{} seconds, exception:{}-{}". \ format(max_exception_count, max_exception_seconds, exception_count, exception_duration, type(e), str(e)) if exception_count >= max_exception_count and exception_duration >= max_exception_seconds: alert_msg = "scheduler exit, do somthing {}".format( alert_msg) log.logger.error(alert_msg) process_utils.Alert(alert_msg) return else: log.logger.error(alert_msg) process_utils.Alert(alert_msg) time.sleep(10) log.logger.info("Quit the SchedulerHelper")
def run(self): log.logger.info("Starting the DependencyScheduler") producer = kafka_utils.setup_kafka(config.G_Conf.Common.Broker) def gen_obj(d): return kafka_utils.TaskOverMsg(d['instance_id'], d['task_id'], d['status'], d['execute_date']) exception_count = 0 exception_begin = datetime.now() max_exception_count = 5 max_exception_seconds = 3 * 60 while True: try: # for msg in kafka_utils.TOPIC_TASK_RESULT: if producer.llen(kafka_utils.TOPIC_TASK_RESULT): msg = producer.rpop( kafka_utils.TOPIC_TASK_RESULT ) #这里可以用StrictRedis实例的brpop改善,去掉llen轮询。 msg_obj = gen_obj(json.loads(msg)) log.logger.info("get task result:{}".format(msg_obj)) #kafka_utils.scheduler_consumer.commit() # the worker push msg only success met_task = models.TaskDependency( ).downstream_met_dependency( task_id=msg_obj.task_id, execute_date=msg_obj.execute_date) if len(met_task) > 0: self.run_task(msg_obj.execute_date, met_task) else: log.logger.info("begin fetch waiting_dep task list") self.run_task() log.logger.info("end fetch waiting_dep task list ") time.sleep(10) except Exception as e: exception_count += 1 if exception_count == 1: exception_begin = datetime.now() exception_duration = (datetime.now() - exception_begin).total_seconds() alert_msg = "if get exception {} times in {} seconds, " \ "the DependencyScheduler will exit, current:{} times/{} seconds, exception:{}-{}". \ format(max_exception_count, max_exception_seconds, exception_count, exception_duration, type(e), str(e)) if exception_count >= max_exception_count and exception_duration >= max_exception_seconds: alert_msg = "scheduler exit, do somthing {}".format( alert_msg) log.logger.error(alert_msg) process_utils.Alert(alert_msg) return else: log.logger.error(alert_msg) process_utils.Alert(alert_msg) time.sleep(10) log.logger.info("Quit the DependencyScheduler")
def run(self): """ try to sync file from other node :return: """ log.logger.info("Starting the SyncFileWorker") def gen_obj(d): return kafka_utils.SyncFileMsg(d['file_id']) waiting_seconds = 20 while True: try: for msg in kafka_utils.sync_file_consumer: msg_obj = json.loads(msg.value, object_hook=gen_obj) log.logger.info( "SyncFileWorker file_info:{}".format(msg_obj)) sync_file = models.LoaderResult().get_dumped_file_by_id( msg_obj.file_id) #kafka_utils.sync_file_consumer.commit() if sync_file is None: continue if os.path.exists(sync_file.dumped_file): continue tmp_file = "{}_rsync_tmp".format(sync_file.dumped_file) command = sync_file.gen_command(tmp_file) dumped_path = os.path.dirname(sync_file.dumped_file) if not os.path.exists(dumped_path): os.makedirs(dumped_path) err = None try: subprocess.check_output(command, shell=True) os.rename(tmp_file, sync_file.dumped_file) except subprocess.CalledProcessError as e: err = e if err is not None: alert_msg = "SyncFileWorker run command [{}] with err: [{}]".format( command, err) log.logger.error(alert_msg) process_utils.Alert(alert_msg) else: log.logger.info( "SyncFileWorker sync_file_consumer msg is empyt, waiting {} seconds try again" .format(waiting_seconds)) time.sleep(waiting_seconds) except Exception as e: alert_msg = "SyncFileWorker: {} {} ".format(type(e), e.message) log.logger.error(alert_msg) process_utils.Alert(alert_msg)
def run(self): log.logger.info("Starting the DependencyScheduler") def gen_obj(d): return kafka_utils.TaskOverMsg(d['instance_id'], d['task_id'], d['status'], d['execute_date']) exception_count = 0 exception_begin = datetime.now() max_exception_count = 5 max_exception_seconds = 3 * 60 while True: try: for msg in kafka_utils.scheduler_consumer: msg_obj = json.loads(msg.value, object_hook=gen_obj) log.logger.info("get task result:{}".format(msg_obj)) #kafka_utils.scheduler_consumer.commit() # the worker push msg only success met_task = models.TaskDependency( ).downstream_met_dependency( task_id=msg_obj.task_id, execute_date=msg_obj.execute_date) if len(met_task) > 0: self.run_task(msg_obj.execute_date, met_task) else: log.logger.info("begin fetch waiting_dep task list") self.run_task() log.logger.info("end fetch waiting_dep task list ") except Exception as e: exception_count += 1 if exception_count == 1: exception_begin = datetime.now() exception_duration = (datetime.now() - exception_begin).total_seconds() alert_msg = "if get exception {} times in {} seconds, " \ "the DependencyScheduler will exit, current:{} times/{} seconds, exception:{}-{}". \ format(max_exception_count, max_exception_seconds, exception_count, exception_duration, type(e), e.message) if exception_count >= max_exception_count and exception_duration >= max_exception_seconds: alert_msg = "scheduler exit, do somthing {}".format( alert_msg) log.logger.error(alert_msg) process_utils.Alert(alert_msg) return else: log.logger.error(alert_msg) process_utils.Alert(alert_msg) time.sleep(10) log.logger.info("Quit the DependencyScheduler")
def run(self): log.logger.info("Starting the MainScheduler") kafka_utils.setup_kafka(config.G_Conf.Common.Broker) exception_count = 0 exception_begin = datetime.now() max_exception_count = 5 max_exception_seconds = 3 * 60 while True: try: # check_sub_threading self.check_sub_threading() # get cron execute_timer = self.getMatchedCronTask() for timer in execute_timer: self.run_timer(timer) if timer.task_id in self.crons_conf \ and self.crons_conf[timer.task_id].cron_type != State.CRON_SINGLE: self.addTimer(self.crons_conf[timer.task_id]) # retry timeout worker self.retryZombieInstance() waiting = self._next_wait() time.sleep(waiting) exception_count = 0 except Exception as e: exception_count += 1 if exception_count == 1: exception_begin = datetime.now() exception_duration = (datetime.now() - exception_begin).total_seconds() alert_msg = "if get exception {} times in {} seconds, " \ "the MainScheduler will exit, current:{} times/{} seconds, exception:{}-{}".\ format(max_exception_count, max_exception_seconds, exception_count, exception_duration, type(e), e.message) if exception_count >= max_exception_count and exception_duration >= max_exception_seconds: alert_msg = "scheduler exit, do somthing {}".format( alert_msg) log.logger.error(alert_msg) process_utils.Alert(alert_msg) sys.exit(1) else: log.logger.error(alert_msg) process_utils.Alert(alert_msg) time.sleep(10) log.logger.info("End the scheduler, exit main loop")
def onSlave(self, err_msg):
    self.master_lock_failed = 0
    self.status = SLAVE
    msg = "{}, switch to slave, waiting {} seconds before trying for master again".format(
        err_msg, self.on_slave_wait)
    log.logger.info(msg)
    process_utils.Alert(msg)
    self.terminate()
    time.sleep(self.on_slave_wait)
def update_stat(self, etl_day, session=None):
    """
    update_stat
    :param etl_day: YYYY-MM-DD
    :param session:
    :return (bool): is_all_finished
    """
    all_finished = True
    stat = None
    notice_msg = None
    try:
        stat = session.query(StatResult).filter(
            StatResult.etl_day == etl_day).one()
    except NoResultFound:
        self.init_stat(etl_day)
        stat = session.query(StatResult).filter(
            StatResult.etl_day == etl_day).one()
    # extract status
    if stat.extract_status == State.SUCCESS:
        if stat.extract_notice == 0:
            stat.extract_notice = 1
            notice_msg = " all rules extract success, etl_day: {}, total_count: {}"\
                .format(stat.etl_day, stat.extract_success_count)
    else:
        all_finished = False
        stat.extract_finish_count, stat.extract_success_count = TaskInstance().\
            get_instance_result(etl_day, State.TASK_EXTRACT, session=session)
        if stat.extract_success_count == stat.extract_total_count:
            stat.extract_status = State.SUCCESS
    # job status
    if stat.job_status == State.SUCCESS:
        if stat.job_notice == 0:
            stat.job_notice = 1
            notice_msg = " all scheduler job execute success, etl_day: {}, total_count: {}".\
                format(stat.etl_day, stat.job_total_count)
    else:
        all_finished = False
        stat.job_finish_count, stat.job_success_count = TaskInstance(). \
            get_instance_result(etl_day, State.TASK_JOB, session=session)
        if stat.job_total_count == stat.job_success_count:
            stat.job_status = State.SUCCESS
    session.merge(stat)
    session.commit()
    # create compat job
    if stat.extract_status == State.SUCCESS:
        TaskInstance().create_fake_task_instance(etl_day, session=session)
    if notice_msg is not None:
        process_utils.Alert(notice_msg)
    return all_finished
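# A hypothetical caller sketch for update_stat (wait_for_day, session_factory,
# and the polling cadence are assumptions, not the codebase's API): poll until
# the day's extracts and jobs all report success, closing the session between
# rounds.
import time

def wait_for_day(stat_helper, etl_day, session_factory, poll_seconds=60):
    while True:
        session = session_factory()
        try:
            if stat_helper.update_stat(etl_day, session=session):
                return  # everything finished and notices were sent
        finally:
            session.close()
        time.sleep(poll_seconds)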
def onMaster(self, err_msg):
    self.terminate()
    self.master_lock_failed = 0
    self.status = MASTER
    cmd = ["python"]
    for v in sys.argv:
        if v != "--ha" and v != "-s":
            cmd.append(v)
    full_cmd = " ".join(cmd)
    msg = "{}, switch to master, Running Command [{}]".format(err_msg, full_cmd)
    process_utils.Alert(msg)
    log.logger.info(msg)
    self.process = subprocess.Popen(full_cmd, shell=True)
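# Illustration of the flag stripping in onMaster: the new master re-runs the
# original command line minus the HA flags (the argv values below are
# hypothetical).
argv = ["scheduler.py", "--ha", "-s", "--port", "8080"]
cmd = ["python"] + [v for v in argv if v not in ("--ha", "-s")]
print(" ".join(cmd))  # -> python scheduler.py --port 8080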
def run(self):
    print("Starting the Worker")
    producer = kafka_utils.setup_kafka(config.G_Conf.Common.Broker)
    """
    sync_file_worker = SyncFileWorker()
    sync_file_worker.setDaemon(True)
    sync_file_worker.start()
    """
    waiting_seconds = self.config["orphaned_node_wait_seconds"]
    rejoin_times = self.config["orphaned_node_rejoin_times"]
    node_info = "cflow worker node [{}-{}]".format(
        socket.gethostname(), process_utils.getServerIp())
    already_join_times = 1

    def gen_obj(d):
        print(d)
        return kafka_utils.TaskBeginMsg(d['instance_id'], d['task_id'],
                                        d['execute_date'])

    # main logic
    while True:
        log.logger.info("{} join cluster".format(node_info))
        try:
            instance_msg = None
            while producer.llen(self.task_name):
                # could use brpop on the StrictRedis instance to drop the llen
                # polling (see the brpop sketch after the DependencyScheduler above)
                msg = producer.rpop(self.task_name)
                # print(msg)
                try:
                    instance_msg = gen_obj(json.loads(msg))
                    # parallelism limit
                    self.block_to_run(instance_msg)
                    err, instance = self.prepare_to_run(instance_msg.instance_id)
                    # if the instance does not run successfully, the scheduler will rerun it
                    # kafka_utils.worker_consumer.commit()
                    if err is not None or instance is None:
                        log.logger.error("run instance {}, err_msg {}".format(
                            instance_msg, err))
                        process_utils.ref_counter.unref()
                        continue
                    # start a subprocess that actually runs the instance
                    self.run_single_task(instance)
                except Exception as e:
                    msg = "{} run instance {}, exception: {}, {}".format(
                        node_info, instance_msg, type(e), str(e))
                    log.logger.error(msg)
                    process_utils.Alert(msg)
            else:
                # while-else: the queue drained without a break
                alert_msg = "{} kafka msg is empty, quit then waiting msg".format(node_info)
                log.logger.info(alert_msg)
        except Exception as e:
            alert_msg = "{}, get exception {}, {}".format(node_info, type(e), str(e))
            process_utils.Alert(alert_msg)
        # worker_consumer is blocking; if we get here we should try to rejoin the cluster
        if rejoin_times < 0 or (already_join_times < rejoin_times):
            alert_msg = "{} break from the cluster, waiting {} seconds then join again".format(
                node_info, waiting_seconds)
            log.logger.info(alert_msg)
            # process_utils.Alert(alert_msg)
            time.sleep(waiting_seconds)
            already_join_times += 1
        else:
            alert_msg = "{} exit, reached the max join cluster times".format(node_info)
            process_utils.Alert(alert_msg)
            break
    log.logger.info("Quit the Worker")
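# block_to_run() and process_utils.ref_counter are not shown in this section;
# a plausible minimal sketch of the parallelism gate they appear to implement
# (the RefCounter name, interface, and limit are assumptions, not the
# codebase's API):
import threading

class RefCounter(object):
    def __init__(self, max_parallelism=4):
        self._sem = threading.Semaphore(max_parallelism)

    def ref(self):
        # called before starting an instance; blocks once the cap is reached
        self._sem.acquire()

    def unref(self):
        # called when an instance finishes, or fails to start (as above)
        self._sem.release()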
def run(self):
    # number of execution attempts
    running_times = 0
    msg = None
    try:
        while running_times <= self.retry:
            task_runner = BashTaskRunner(self.instance)
            self.begin_time = time.time()
            self.instance.worker_retry = running_times
            # commit the running state
            should_run = self.instance.start_running(
                retry=(True if running_times > 0 else False))
            if should_run is not None:
                log.logger.info("{}".format(should_run))
                msg = None
                break
            ret = self.inner_run(task_runner, running_times)
            if ret is None:
                self.instance.stop_running(State.SUCCESS)
                kafka_utils.PushMsgWithRetry(
                    kafka_utils.TOPIC_TASK_RESULT,
                    kafka_utils.TaskOverMsg(
                        instance_id=self.instance.id,
                        task_id=self.instance.task_id,
                        status=State.SUCCESS,
                        execute_date=self.instance.etl_day))
                msg = None
                break
            else:
                msg = "run attempt {} failed: {}".format(running_times, ret)
                if self.instance.status == State.KILLED:
                    # if the instance was killed, stop running
                    break
                elif self.instance.status == State.TIMEOUT:
                    self.instance.stop_running(State.TIMEOUT)
                else:
                    self.instance.stop_running(State.FAILED)
                if running_times < self.retry:
                    msg = "{}, will retry in {} seconds (attempt {})".format(
                        msg, self.step_seconds * (running_times + 1),
                        running_times + 1)
                    log.logger.error(msg)
            running_times += 1
            if running_times <= self.retry:
                time.sleep(self.step_seconds * running_times)
            else:
                msg = "reached the max retry times {} with err: {}, stop running".format(
                    self.retry, msg)
                log.logger.info(msg)
    except Exception as e:
        msg = "get Exception {}.{}".format(type(e), str(e))
        log.logger.error(msg)
    finally:
        process_utils.ref_counter.unref()
        if msg is not None:
            keeper = "unknown"
            log.logger.error("run {}, err: {}".format(self.instance, msg))
            if self.instance.task_type == State.TASK_JOB:
                job_list = models.TaskDefine().get_job_by_task_id(
                    [self.instance.task_id])
                if len(job_list) > 0:
                    keeper = "{}({})".format(self.instance.task_id,
                                             job_list[0].keeper)
                else:
                    # should not get here
                    keeper = "{}".format(self.instance.task_id)
            elif self.instance.task_type == State.TASK_EXTRACT:
                keeper = "{}(rule_id:{})".format(self.instance.task_id,
                                                 self.instance.sub_task_id)
            elif self.instance.task_type == State.TASK_CRON:
                keeper = "{}(cron task)".format(self.instance.task_id)
            else:
                pass
            msg = "\nTask: {} \nError: {} \nContext: {}".format(
                keeper, msg, self.instance)
            process_utils.Alert(msg)
    return
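# Retry pacing in the loop above is linear: the sleep before attempt n+1 is
# step_seconds * (n + 1). With step_seconds = 30 and retry = 3 (hypothetical
# values), the waits between attempts are:
step_seconds, retry = 30, 3
print([step_seconds * n for n in range(1, retry + 1)])  # -> [30, 60, 90]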