예제 #1
0
def reload_runnable_spider_job_execution():
    '''
    add periodic job to scheduler
    :return:
    '''
    running_job_ids = set([job.id for job in scheduler.get_jobs()])
    app.logger.debug('[running_job_ids] %s' % ','.join(running_job_ids))
    available_job_ids = set()
    # add new job to schedule
    for job_instance in JobInstance.query.filter_by(enabled=0, run_type="periodic").all():
        job_id = "spider_job_%s:%s" % (job_instance.id, int(time.mktime(job_instance.date_modified.timetuple())))
        available_job_ids.add(job_id)
        if job_id not in running_job_ids:
            scheduler.add_job(run_spider_job,
                              args=(job_instance.id,),
                              trigger='cron',
                              id=job_id,
                              minute=job_instance.cron_minutes,
                              hour=job_instance.cron_hour,
                              day=job_instance.cron_day_of_month,
                              day_of_week=job_instance.cron_day_of_week,
                              month=job_instance.cron_month,
                              second=0,
                              max_instances=999,
                              misfire_grace_time=60 * 60,
                              coalesce=True)
            app.logger.info('[load_spider_job][project:%s][spider_name:%s][job_instance_id:%s][job_id:%s]' % (
                job_instance.project_id, job_instance.spider_name, job_instance.id, job_id))
    # remove invalid jobs
    for invalid_job_id in filter(lambda job_id: job_id.startswith("spider_job_"),
                                 running_job_ids.difference(available_job_ids)):
        scheduler.remove_job(invalid_job_id)
        app.logger.info('[drop_spider_job][job_id:%s]' % invalid_job_id)
예제 #2
0
def reload_runnable_spider_job_execution():
    """
    add periodic job to scheduler
    :return:
    """
    running_job_ids = set([job.id for job in scheduler.get_jobs()])  # 从APScheduler中获取当前正在运行的job
    app.logger.debug('[当前正在运行的任务id有: ] %s' % ','.join(running_job_ids))
    # 可以调度的job_id集合
    available_job_ids = set()
    # add new job to schedule
    # 从数据库里面取出所有可以被调度的任务
    for job_instance in JobInstance.query.filter_by(enabled=0, run_type="periodic").all():
        # 构造job_id字符串 spider_job_1:180230238
        job_id = "spider_job_%s:%s" % (job_instance.id, int(time.mktime(job_instance.date_modified.timetuple())))
        # 插入到可以调度的job_id集合里面
        available_job_ids.add(job_id)
        # 如果job_id不在APScheduler现在调度的job里面, 添加该调度任务
        if job_id not in running_job_ids:
            scheduler.add_job(run_spider_job,
                              args=(job_instance.id,),
                              trigger='cron',
                              id=job_id,
                              minute='{}'.format(job_instance.cron_minutes),
                              hour='{}'.format(job_instance.cron_hour),
                              day='{}'.format(job_instance.cron_day_of_month),
                              day_of_week=job_instance.cron_day_of_week,
                              month='{}'.format(job_instance.cron_month),
                              second=0,
                              max_instances=999,
                              misfire_grace_time=60 * 60,
                              coalesce=True)
            '''
            关于scheduler参数的含义
            一个job可能由于某些情况错过执行时间, 比如上一点提到的, 或者是线程池或进程池用光了,
            或者是当要调度job时, 突然down机了等
            这时可以通过设置job的misfire_grace_time选项来指示之后尝试执行的次数
            当然如果这不符合你的期望, 你可以合并所有错过时间的job到一个job来执行, 通过设定job的coalesce = True
            '''
            app.logger.info('[APScheduler调度器中装载了一个爬虫蜘蛛] [是项目名称为: %s 的] [%s 蜘蛛]'
                            ' [这条任务在数据库中的任务id为: %s] [调度器中的job_id为:%s]'
                            % (job_instance.project_id, job_instance.spider_name, job_instance.id, job_id))

    # 删除无效的job_id
    for invalid_job_id in filter(lambda job_id: job_id.startswith("spider_job_"),
                                 running_job_ids.difference(available_job_ids)):
        scheduler.remove_job(invalid_job_id)
        app.logger.info('[从调度器中删除掉了一个调度任务] [任务id为: %s]' % invalid_job_id)
예제 #3
0
def reload_runnable_spider_job_execution():
    '''
    add periodic job to scheduler
    :return:
    '''
    running_job_ids = set([job.id for job in scheduler.get_jobs()])
    # app.logger.debug('[running_job_ids] %s' % ','.join(running_job_ids))
    available_job_ids = set()
    # add new job to schedule
    for job_instance in JobInstance.query.filter_by(enabled=0, run_type="periodic").all():
        job_id = "spider_job_%s:%s" % (job_instance.id, int(time.mktime(job_instance.date_modified.timetuple())))
        available_job_ids.add(job_id)
        if job_id not in running_job_ids:
            try:
                scheduler.add_job(run_spider_job,
                                  args=(job_instance.id,),
                                  trigger='cron',
                                  id=job_id,
                                  minute=job_instance.cron_minutes,
                                  hour=job_instance.cron_hour,
                                  day=job_instance.cron_day_of_month,
                                  day_of_week=job_instance.cron_day_of_week,
                                  month=job_instance.cron_month,
                                  second=0,
                                  max_instances=999,
                                  misfire_grace_time=60 * 60,
                                  coalesce=True)
            except Exception as e:
                app.logger.error(
                    '[load_spider_job] failed {} {},may be cron expression format error '.format(job_id, str(e)))
            app.logger.info('[load_spider_job][project:%s][spider_name:%s][job_instance_id:%s][job_id:%s]' % (
                job_instance.project_id, job_instance.spider_name, job_instance.id, job_id))
    # remove invalid jobs
    for invalid_job_id in filter(lambda job_id: job_id.startswith("spider_job_"),
                                 running_job_ids.difference(available_job_ids)):
        scheduler.remove_job(invalid_job_id)
        app.logger.info('[drop_spider_job][job_id:%s]' % invalid_job_id)