Exemple #1
0
    def run(self):
        """Poll ``check_scheduler_result`` in a loop until failures persist.

        A successful pass is followed by a 30 second pause. Every failure is
        logged and alerted, then retried after 10 seconds. The loop stops once
        the current streak of consecutive failures has reached
        ``max_exception_count`` attempts and has lasted at least
        ``max_exception_seconds`` seconds.
        """
        log.logger.info("Starting SchedulerHelper")
        exception_count = 0
        exception_begin = datetime.now()
        max_exception_count = 5
        max_exception_seconds = 3 * 60

        while True:
            try:
                self.check_scheduler_result()
                # A clean pass ends the failure streak; without this reset,
                # isolated errors spread over a long uptime would eventually
                # add up to a shutdown.
                exception_count = 0
                time.sleep(30)
            except Exception as e:
                exception_count += 1
                if exception_count == 1:
                    # First failure of a new streak: start the clock.
                    exception_begin = datetime.now()
                exception_duration = (datetime.now() -
                                      exception_begin).total_seconds()
                alert_msg = "if get exception {} times in {} seconds, " \
                            "the SchedulerHelper will exit, current:{} times/{} seconds, exception:{}-{}". \
                    format(max_exception_count, max_exception_seconds,
                           exception_count, exception_duration, type(e), str(e))
                give_up = (exception_count >= max_exception_count
                           and exception_duration >= max_exception_seconds)
                if give_up:
                    alert_msg = "scheduler exit, do somthing {}".format(
                        alert_msg)
                log.logger.error(alert_msg)
                process_utils.Alert(alert_msg)
                if give_up:
                    # Leave the loop (instead of returning) so the final
                    # "Quit" log line below is actually emitted.
                    break
                time.sleep(10)
        log.logger.info("Quit the SchedulerHelper")
Exemple #2
0
    def run(self):
        """Process task-result messages and start downstream tasks.

        Each pass checks the ``kafka_utils.TOPIC_TASK_RESULT`` list on the
        object returned by ``kafka_utils.setup_kafka``. When a message is
        available it is popped, decoded into a ``TaskOverMsg`` and the
        downstream tasks whose dependencies are now met are run. When the
        list is empty, ``run_task()`` is called without arguments and the loop
        pauses for 10 seconds.

        Every failure is logged and alerted, then retried after 10 seconds.
        The loop stops once the current streak of consecutive failures has
        reached ``max_exception_count`` attempts and has lasted at least
        ``max_exception_seconds`` seconds.
        """
        log.logger.info("Starting the DependencyScheduler")
        producer = kafka_utils.setup_kafka(config.G_Conf.Common.Broker)

        def gen_obj(d):
            return kafka_utils.TaskOverMsg(d['instance_id'], d['task_id'],
                                           d['status'], d['execute_date'])

        exception_count = 0
        exception_begin = datetime.now()
        max_exception_count = 5
        max_exception_seconds = 3 * 60

        while True:
            try:
                if producer.llen(kafka_utils.TOPIC_TASK_RESULT):
                    # NOTE: a blocking brpop on the StrictRedis instance could
                    # replace this llen polling.
                    msg = producer.rpop(kafka_utils.TOPIC_TASK_RESULT)
                    msg_obj = gen_obj(json.loads(msg))
                    log.logger.info("get task result:{}".format(msg_obj))

                    # the worker push msg only success
                    met_task = models.TaskDependency(
                    ).downstream_met_dependency(
                        task_id=msg_obj.task_id,
                        execute_date=msg_obj.execute_date)
                    if len(met_task) > 0:
                        self.run_task(msg_obj.execute_date, met_task)
                else:
                    log.logger.info("begin fetch waiting_dep task list")
                    self.run_task()
                    log.logger.info("end fetch waiting_dep task list ")
                    time.sleep(10)
                # A clean pass ends the failure streak; without this reset,
                # isolated errors spread over a long uptime would eventually
                # add up to a shutdown.
                exception_count = 0
            except Exception as e:
                exception_count += 1
                if exception_count == 1:
                    # First failure of a new streak: start the clock.
                    exception_begin = datetime.now()
                exception_duration = (datetime.now() -
                                      exception_begin).total_seconds()
                alert_msg = "if get exception {} times in {} seconds, " \
                            "the DependencyScheduler will exit, current:{} times/{} seconds, exception:{}-{}". \
                    format(max_exception_count, max_exception_seconds,
                           exception_count, exception_duration, type(e), str(e))
                give_up = (exception_count >= max_exception_count
                           and exception_duration >= max_exception_seconds)
                if give_up:
                    alert_msg = "scheduler exit, do somthing {}".format(
                        alert_msg)
                log.logger.error(alert_msg)
                process_utils.Alert(alert_msg)
                if give_up:
                    # Leave the loop (instead of returning) so the final
                    # "Quit" log line below is actually emitted.
                    break
                time.sleep(10)
        log.logger.info("Quit the DependencyScheduler")
Exemple #3
0
    def run(self):
        """
        Try to sync dumped files from other nodes.

        Iterates ``kafka_utils.sync_file_consumer`` forever. For every message
        the referenced dumped-file record is looked up; if the record exists
        and its file is not yet present locally, the command produced by
        ``sync_file.gen_command`` is run against a temporary path which is
        then renamed onto the final path. A failing command is logged and
        alerted, and the loop moves on to the next message.

        :return: does not return; the loop has no exit condition
        """
        log.logger.info("Starting the SyncFileWorker")

        def gen_obj(d):
            return kafka_utils.SyncFileMsg(d['file_id'])

        waiting_seconds = 20
        # Pause after an unexpected error, same value the other worker loops
        # in this code base use.
        retry_seconds = 10
        while True:
            try:
                for msg in kafka_utils.sync_file_consumer:
                    msg_obj = json.loads(msg.value, object_hook=gen_obj)
                    log.logger.info(
                        "SyncFileWorker file_info:{}".format(msg_obj))
                    sync_file = models.LoaderResult().get_dumped_file_by_id(
                        msg_obj.file_id)
                    if sync_file is None:
                        continue

                    # Nothing to do when the file is already on this node.
                    if os.path.exists(sync_file.dumped_file):
                        continue

                    # Fetch into a temporary name first so a partial transfer
                    # never appears under the final path.
                    tmp_file = "{}_rsync_tmp".format(sync_file.dumped_file)
                    command = sync_file.gen_command(tmp_file)
                    dumped_path = os.path.dirname(sync_file.dumped_file)
                    if not os.path.exists(dumped_path):
                        os.makedirs(dumped_path)
                    try:
                        subprocess.check_output(command, shell=True)
                        os.rename(tmp_file, sync_file.dumped_file)
                    except subprocess.CalledProcessError as err:
                        alert_msg = "SyncFileWorker run command [{}] with err: [{}]".format(
                            command, err)
                        log.logger.error(alert_msg)
                        process_utils.Alert(alert_msg)

                # Reached whenever the consumer iteration ends (the loop above
                # never breaks).
                log.logger.info(
                    "SyncFileWorker sync_file_consumer msg is empyt, waiting {} seconds try again"
                    .format(waiting_seconds))
                time.sleep(waiting_seconds)
            except Exception as e:
                # str(e) works on Python 2 and 3; e.message no longer exists
                # on Python 3 and would raise inside this handler.
                alert_msg = "SyncFileWorker: {} {} ".format(type(e), str(e))
                log.logger.error(alert_msg)
                process_utils.Alert(alert_msg)
                # Back off so a persistent failure does not spin the loop and
                # flood the alert channel.
                time.sleep(retry_seconds)
Exemple #4
0
    def run(self):
        """Consume task-result messages and start downstream tasks.

        Iterates ``kafka_utils.scheduler_consumer``; every message is decoded
        into a ``TaskOverMsg`` and the downstream tasks whose dependencies are
        now met are run. Whenever the consumer iteration ends, ``run_task()``
        is called without arguments.

        Every failure is logged and alerted, then retried after 10 seconds.
        The loop stops once the current streak of consecutive failures has
        reached ``max_exception_count`` attempts and has lasted at least
        ``max_exception_seconds`` seconds.
        """
        log.logger.info("Starting the DependencyScheduler")

        def gen_obj(d):
            return kafka_utils.TaskOverMsg(d['instance_id'], d['task_id'],
                                           d['status'], d['execute_date'])

        exception_count = 0
        exception_begin = datetime.now()
        max_exception_count = 5
        max_exception_seconds = 3 * 60

        while True:
            try:
                for msg in kafka_utils.scheduler_consumer:
                    msg_obj = json.loads(msg.value, object_hook=gen_obj)
                    log.logger.info("get task result:{}".format(msg_obj))

                    # the worker push msg only success
                    met_task = models.TaskDependency(
                    ).downstream_met_dependency(
                        task_id=msg_obj.task_id,
                        execute_date=msg_obj.execute_date)
                    if len(met_task) > 0:
                        self.run_task(msg_obj.execute_date, met_task)
                    # A message handled cleanly ends the failure streak, so
                    # isolated errors over a long uptime cannot add up to a
                    # shutdown.
                    exception_count = 0

                # Reached whenever the consumer iteration ends (the loop above
                # never breaks).
                log.logger.info("begin fetch waiting_dep task list")
                self.run_task()
                log.logger.info("end fetch waiting_dep task list ")
                exception_count = 0
            except Exception as e:
                exception_count += 1
                if exception_count == 1:
                    # First failure of a new streak: start the clock.
                    exception_begin = datetime.now()
                exception_duration = (datetime.now() -
                                      exception_begin).total_seconds()
                # str(e) works on Python 2 and 3; e.message no longer exists
                # on Python 3 and would raise inside this handler.
                alert_msg = "if get exception {} times in {} seconds, " \
                            "the DependencyScheduler will exit, current:{} times/{} seconds, exception:{}-{}". \
                    format(max_exception_count, max_exception_seconds,
                           exception_count, exception_duration, type(e), str(e))
                give_up = (exception_count >= max_exception_count
                           and exception_duration >= max_exception_seconds)
                if give_up:
                    alert_msg = "scheduler exit, do somthing {}".format(
                        alert_msg)
                log.logger.error(alert_msg)
                process_utils.Alert(alert_msg)
                if give_up:
                    # Leave the loop (instead of returning) so the final
                    # "Quit" log line below is actually emitted.
                    break
                time.sleep(10)
        log.logger.info("Quit the DependencyScheduler")
Exemple #5
0
    def run(self):
        """Main scheduling loop: fire due cron timers and retry zombie instances.

        Each pass checks the sub-threads, runs every timer returned by
        ``getMatchedCronTask`` (re-adding it unless its cron type is
        ``State.CRON_SINGLE``), calls ``retryZombieInstance`` and then sleeps
        for the number of seconds returned by ``_next_wait``.

        Every failure is logged and alerted, then retried after 10 seconds.
        Once the current streak of consecutive failures has reached
        ``max_exception_count`` attempts and has lasted at least
        ``max_exception_seconds`` seconds, the method calls ``sys.exit(1)``.

        :raises SystemExit: when the failure limit is reached
        """
        log.logger.info("Starting the MainScheduler")
        kafka_utils.setup_kafka(config.G_Conf.Common.Broker)

        exception_count = 0
        exception_begin = datetime.now()
        max_exception_count = 5
        max_exception_seconds = 3 * 60

        while True:
            try:
                # check_sub_threading
                self.check_sub_threading()

                # get cron
                execute_timer = self.getMatchedCronTask()
                for timer in execute_timer:
                    self.run_timer(timer)
                    if timer.task_id in self.crons_conf \
                            and self.crons_conf[timer.task_id].cron_type != State.CRON_SINGLE:
                        self.addTimer(self.crons_conf[timer.task_id])

                # retry timeout worker
                self.retryZombieInstance()
                waiting = self._next_wait()
                time.sleep(waiting)
                # A clean pass ends the current failure streak.
                exception_count = 0
            except Exception as e:
                exception_count += 1
                if exception_count == 1:
                    # First failure of a new streak: start the clock.
                    exception_begin = datetime.now()
                exception_duration = (datetime.now() -
                                      exception_begin).total_seconds()
                # str(e) works on Python 2 and 3; e.message no longer exists
                # on Python 3 and would raise inside this handler.
                alert_msg = "if get exception {} times in {} seconds, " \
                            "the MainScheduler will exit, current:{} times/{} seconds, exception:{}-{}".\
                    format(max_exception_count, max_exception_seconds,
                           exception_count, exception_duration, type(e), str(e))
                give_up = (exception_count >= max_exception_count
                           and exception_duration >= max_exception_seconds)
                if give_up:
                    alert_msg = "scheduler exit, do somthing {}".format(
                        alert_msg)
                log.logger.error(alert_msg)
                process_utils.Alert(alert_msg)
                if give_up:
                    # Emitted here because nothing after the infinite loop can
                    # ever run: sys.exit is the only way out.
                    log.logger.info("End the scheduler, exit main loop")
                    sys.exit(1)

                time.sleep(10)
Exemple #6
0
 def onSlave(self, err_msg):
     """Switch this node to the slave role, then pause before the next master attempt.

     Clears the failed-lock counter, sets the status to ``SLAVE``, logs and
     alerts the transition, calls ``self.terminate()`` and finally sleeps for
     ``self.on_slave_wait`` seconds.

     :param err_msg: text describing why the switch happens; used as the
         prefix of the log/alert message
     """
     self.master_lock_failed = 0
     self.status = SLAVE

     template = "{}, switch to slave, waiting {} seconds try master again"
     notice = template.format(err_msg, self.on_slave_wait)
     log.logger.info(notice)
     process_utils.Alert(notice)

     self.terminate()
     # Block the caller for the configured wait before it tries again.
     time.sleep(self.on_slave_wait)
0
    def update_stat(self, etl_day, session=None):
        """
        Refresh the extract/job statistics row of one ETL day and send the
        one-off "all success" notices.

        The ``StatResult`` row for ``etl_day`` is loaded (and created through
        ``init_stat`` when missing). For each of the two stages (extract, job):
        if the stage is already marked successful, its notice is queued once
        and the notice flag is set; otherwise the finish/success counters are
        refreshed from ``TaskInstance.get_instance_result`` and the stage is
        marked successful when the success count equals the total count.
        The row is then merged and committed.

        :param etl_day: YYYY-MM-DD
        :param session: database session; it is dereferenced unconditionally,
            so it must not be None when the body runs (presumably injected by
            a decorator that is not visible here -- confirm)
        :return (bool): is_all_finished -- True only when both stages were
            already marked successful before this call
        """
        all_finished = True
        # Notices are collected in a list: a single variable would let the job
        # notice overwrite the extract notice when both become due in the same
        # call, while the extract flag is still persisted as "sent".
        notices = []
        try:
            stat = session.query(StatResult).filter(
                StatResult.etl_day == etl_day).one()
        except NoResultFound:
            self.init_stat(etl_day)
            stat = session.query(StatResult).filter(
                StatResult.etl_day == etl_day).one()

        # extract status
        if stat.extract_status == State.SUCCESS:
            if stat.extract_notice == 0:
                stat.extract_notice = 1
                notices.append(
                    " all rules extract success, etl_day: {}, total_count: {}"
                    .format(stat.etl_day, stat.extract_success_count))
        else:
            all_finished = False
            stat.extract_finish_count, stat.extract_success_count = TaskInstance().\
                get_instance_result(etl_day, State.TASK_EXTRACT, session=session)
            if stat.extract_success_count == stat.extract_total_count:
                stat.extract_status = State.SUCCESS

        # job status
        if stat.job_status == State.SUCCESS:
            if stat.job_notice == 0:
                stat.job_notice = 1
                notices.append(
                    " all scheduler job execute success, etl_day:{}, total_count: {}"
                    .format(stat.etl_day, stat.job_total_count))
        else:
            all_finished = False
            stat.job_finish_count, stat.job_success_count = TaskInstance(). \
                get_instance_result(etl_day, State.TASK_JOB, session=session)
            if stat.job_total_count == stat.job_success_count:
                stat.job_status = State.SUCCESS
        session.merge(stat)
        session.commit()

        # create compat job
        if stat.extract_status == State.SUCCESS:
            TaskInstance().create_fake_task_instance(etl_day, session=session)
        # Alerts go out only after the flags have been committed.
        for notice_msg in notices:
            process_utils.Alert(notice_msg)
        return all_finished
Exemple #8
0
 def onMaster(self, err_msg):
     """Switch this node to the master role and start the scheduler process.

     Calls ``self.terminate()``, clears the failed-lock counter, sets the
     status to ``MASTER`` and launches ``python`` with the current command
     line minus the ``--ha`` and ``-s`` flags. The new process handle is
     stored in ``self.process``.

     :param err_msg: text describing why the switch happens; used as the
         prefix of the log/alert message
     """
     self.terminate()
     self.master_lock_failed = 0
     self.status = MASTER
     # Re-run the same command line without the HA/slave flags.
     cmd = ["python"] + [v for v in sys.argv if v not in ("--ha", "-s")]
     # Space-joined form is for the log/alert text only.
     full_cmd = " ".join(cmd)
     msg = "{}, switch to master, Running Command [{}]".format(err_msg, full_cmd)
     process_utils.Alert(msg)
     log.logger.info(msg)
     # Pass the argument list directly instead of a shell string: arguments
     # containing spaces or shell metacharacters stay intact, and
     # self.process refers to the scheduler itself rather than to an
     # intermediate shell.
     self.process = subprocess.Popen(cmd)
0
    def run(self):
        """Worker main loop: pop task messages and run them until rejoin attempts run out.

        Each "join" drains the ``self.task_name`` list on the object returned
        by ``kafka_utils.setup_kafka``: every popped message is decoded into a
        ``TaskBeginMsg``, throttled through ``block_to_run``, prepared with
        ``prepare_to_run`` and finally executed with ``run_single_task``.
        A failure on one message is logged and alerted without stopping the
        drain.

        When the list is empty (or draining raised), the worker sleeps for
        ``orphaned_node_wait_seconds`` and joins again. A negative
        ``orphaned_node_rejoin_times`` means unlimited rejoins; otherwise the
        loop ends once that many joins have been made.
        """
        print("Starting the Worker")
        producer = kafka_utils.setup_kafka(config.G_Conf.Common.Broker)
        # Disabled: starting a daemon SyncFileWorker alongside this worker.
        #   sync_file_worker = SyncFileWorker()
        #   sync_file_worker.setDaemon(True)
        #   sync_file_worker.start()
        waiting_seconds = self.config["orphaned_node_wait_seconds"]
        rejoin_times = self.config["orphaned_node_rejoin_times"]
        node_info = "cflow worker node [{}-{}]".format(
            socket.gethostname(), process_utils.getServerIp())

        already_join_times = 1

        def gen_obj(d):
            print(d)
            return kafka_utils.TaskBeginMsg(d['instance_id'], d['task_id'],
                                            d['execute_date'])

        # main logic
        while True:
            log.logger.info("{} join cluster ".format(node_info))
            try:
                while producer.llen(self.task_name):
                    # Reset per message so a decode failure is not reported
                    # against the previously handled instance.
                    instance_msg = None
                    # NOTE: a blocking brpop on the StrictRedis instance could
                    # replace this llen polling.
                    msg = producer.rpop(self.task_name)
                    try:
                        instance_msg = gen_obj(json.loads(msg))

                        # parallelism limit
                        self.block_to_run(instance_msg)

                        err, instance = self.prepare_to_run(
                            instance_msg.instance_id)

                        # if the instance run not success, scheduler will rerun
                        if err is not None or instance is None:
                            log.logger.error(
                                "run instace {}, err_msg {}".format(
                                    instance_msg, err))
                            process_utils.ref_counter.unref()
                            continue

                        # start a subprocess to actually run the instance
                        self.run_single_task(instance)
                    except Exception as e:
                        msg = "{} run instance {}, execption:{},{}".format(
                            node_info, instance_msg, type(e), str(e))
                        log.logger.error(msg)
                        process_utils.Alert(msg)

                # Reached whenever the list is drained (the loop above never
                # breaks).
                alert_msg = "{} kafka msg is empty, quit then waiting msg ".format(
                    node_info)
                log.logger.info(alert_msg)
            except Exception as e:
                alert_msg = "{}, get exception  {}, {} ".format(
                    node_info, type(e), str(e))
                # Keep a local trace as well; previously this error reached
                # the alert channel only.
                log.logger.error(alert_msg)
                process_utils.Alert(alert_msg)

            # The drain above has ended: wait and join again, unless the
            # configured number of joins has been used up.
            if rejoin_times < 0 or (already_join_times < rejoin_times):
                alert_msg = "{} break from the cluster, waiting {} seconds then join again ".format(
                    node_info, waiting_seconds)
                log.logger.info(alert_msg)
                time.sleep(waiting_seconds)
                already_join_times += 1
            else:
                alert_msg = "{} exit, reach the max join cluster times ".format(
                    node_info)
                process_utils.Alert(alert_msg)
                break

        log.logger.info("Quit the Worker")
0
    def run(self):
        """Execute ``self.instance``, retrying on failure, and alert if it did not succeed.

        Up to ``self.retry + 1`` attempts are made. Before each attempt the
        instance is marked running through ``start_running``; a non-None
        answer from it stops the loop without an alert. A successful attempt
        marks the instance ``State.SUCCESS`` and pushes a ``TaskOverMsg`` to
        ``kafka_utils.TOPIC_TASK_RESULT``. A failed attempt marks the instance
        timed-out or failed (a killed instance stops the loop immediately) and
        the next attempt waits ``self.step_seconds * attempt_number`` seconds.

        Whatever happens, the reference counter is released at the end, and
        when an error message is left over an alert naming the task is sent.
        """
        # number of attempts made so far
        attempt = 0
        failure = None
        try:
            while attempt <= self.retry:
                runner = BashTaskRunner(self.instance)
                self.begin_time = time.time()
                self.instance.worker_retry = attempt
                # commit the running state
                refusal = self.instance.start_running(retry=attempt > 0)
                if refusal is not None:
                    log.logger.info("{}".format(refusal))
                    failure = None
                    break

                outcome = self.inner_run(runner, attempt)
                if outcome is None:
                    self.instance.stop_running(State.SUCCESS)
                    kafka_utils.PushMsgWithRetry(
                        kafka_utils.TOPIC_TASK_RESULT,
                        kafka_utils.TaskOverMsg(
                            instance_id=self.instance.id,
                            task_id=self.instance.task_id,
                            status=State.SUCCESS,
                            execute_date=self.instance.etl_day))
                    failure = None
                    break

                failure = "the {} times running:{}".format(attempt, outcome)
                if self.instance.status == State.KILLED:
                    # a killed instance must not be retried
                    break
                if self.instance.status == State.TIMEOUT:
                    self.instance.stop_running(State.TIMEOUT)
                else:
                    self.instance.stop_running(State.FAILED)

                upcoming = attempt + 1
                if attempt < self.retry:
                    failure = "{}, after {} seconds will try the {} times ".format(
                        failure, self.step_seconds * upcoming, upcoming)
                log.logger.error(failure)

                attempt = upcoming
                if attempt <= self.retry:
                    # linear back-off: the wait grows with the attempt number
                    time.sleep(self.step_seconds * attempt)
            else:
                # the loop ran out of attempts without breaking
                failure = "reach the max retry times {} with err:{}, stop running".format(
                    self.retry, failure)
                log.logger.info(failure)

        except Exception as e:
            failure = "get Exception {}.{}".format(type(e), str(e))
            log.logger.error(failure)
        finally:
            process_utils.ref_counter.unref()
            if failure is not None:
                log.logger.error("run {}, err: {}".format(
                    self.instance, failure))
                # Build a label that tells the alert reader which task failed.
                task_type = self.instance.task_type
                if task_type == State.TASK_JOB:
                    jobs = models.TaskDefine().get_job_by_task_id(
                        [self.instance.task_id])
                    if len(jobs) > 0:
                        owner = "{}({})".format(self.instance.task_id,
                                                jobs[0].keeper)
                    else:
                        # should not come here
                        owner = "{}".format(self.instance.task_id)
                elif task_type == State.TASK_EXTRACT:
                    owner = "{}(rule_id:{})".format(self.instance.task_id,
                                                    self.instance.sub_task_id)
                elif task_type == State.TASK_CRON:
                    owner = "{}(定时任务)".format(self.instance.task_id)
                else:
                    owner = "unknown"
                process_utils.Alert(
                    "\nTask: {} \nError: {} \nContext: {}".format(
                        owner, failure, self.instance))