Exemple #1
0
def job_back_in_time(project_id):
    """Create and launch one-time "back in time" jobs for the selected spiders.

    Reads the submitted form (``spider_name`` list, ``spider_arguments``,
    ``callback``, ``daemon`` and the optional ``priority``), persists one
    ``JobInstance`` per selected spider and passes each of them to
    ``agent.run_back_in_time``. When ``config.BACK_IN_TIME_ENABLED`` is falsy
    nothing is created and the request is only redirected.

    :param project_id: id stored on every created job instance
    :return: a 302 redirect to the referring page
    :raises Exception: any commit failure is re-raised after a rollback
    """
    if not config.BACK_IN_TIME_ENABLED:
        return redirect(request.referrer, code=302)

    for spider in request.form.getlist('spider_name'):
        job_instance = JobInstance()
        job_instance.project_id = project_id
        job_instance.spider_name = spider
        job_instance.priority = request.form.get('priority', 0)
        job_instance.run_type = JobRunType.ONETIME
        job_instance.overlapping = True
        job_instance.enabled = -1

        raw_arguments = request.form['spider_arguments']
        # "".split(",") returns [""], which would put a stray leading comma
        # in the joined argument string, so an empty field starts from [].
        spider_args = raw_arguments.split(",") if raw_arguments else []
        spider_args.append("--callback={}".format(request.form['callback']))
        spider_args.append("SCRAPY_PROJECT=SCRAPY_PROJECT")
        # Daemon chosen manually: append to the list built above instead of
        # rebuilding it from the form, otherwise the callback and
        # SCRAPY_PROJECT entries would be lost.
        if request.form['daemon'] != 'auto':
            spider_args.append("daemon={}".format(request.form['daemon']))
        job_instance.spider_arguments = ','.join(spider_args)

        db.session.add(job_instance)
        try:
            db.session.commit()
        except Exception:
            db.session.rollback()
            # bare raise keeps the original traceback intact
            raise
        agent.run_back_in_time(job_instance)
    return redirect(request.referrer, code=302)
Exemple #2
0
def _run_spider(spider_name, project_id):
    """
    Persist a one-time job for a spider and start it through the agent.

    The throttle value obtained from ``_get_throttle_value`` is stored on the
    job and also passed to the spider as an
    ``AUTOTHROTTLE_TARGET_CONCURRENCY`` setting argument.

    :param spider_name: name of the spider to run
    :param project_id: id of the project the spider belongs to
    :return: None
    """
    job = JobInstance()
    job.spider_name = spider_name
    job.project_id = project_id
    job.run_type = JobRunType.ONETIME
    job.priority = JobPriority.NORMAL
    job.enabled = -1
    job.overlapping = True

    # settings for tempering the requests
    concurrency = _get_throttle_value(spider_name, project_id)
    throttle_setting = "setting=AUTOTHROTTLE_TARGET_CONCURRENCY={}"
    job.spider_arguments = throttle_setting.format(concurrency)
    job.throttle_concurrency = concurrency

    db.session.add(job)
    try:
        db.session.commit()
    except Exception as exc:
        # leave the session usable before propagating the failure
        db.session.rollback()
        raise exc
    else:
        agent.start_spider(job)
Exemple #3
0
 def log_url(self, job_execution):
     """Return the log URL of ``job_execution`` from the service that ran it.

     The service is the first entry of ``self.spider_service_instances``
     whose ``server`` equals ``job_execution.running_on``; ``None`` is
     returned when no entry matches.
     """
     instance = JobInstance.find_job_instance_by_id(job_execution.job_instance_id)
     project = Project.find_project_by_id(instance.project_id)
     service = next(
         (candidate for candidate in self.spider_service_instances
          if candidate.server == job_execution.running_on),
         None)
     if service is None:
         return None
     return service.log_url(project.project_name,
                            instance.spider_name,
                            job_execution.service_job_execution_id)
Exemple #4
0
    def process_finished_jobs(self, job_status, job_execution_dict, log_timeout=10):
        """Mark reported finished jobs as FINISHED and collect their stats.

        For every entry of ``job_status[SpiderStatus.FINISHED]`` whose id is
        present in ``job_execution_dict`` and not already FINISHED, the start
        time, end time and status are copied onto the execution. The tail of
        the job's log is then fetched and, if ``RAW_STATS_REGEX`` matches,
        the raw stats are stored, processed and forwarded to the spider's
        ``SpiderInfo``.

        :param job_status: mapping of status -> list of job info dicts with
            ``id``, ``start_time`` and ``end_time`` keys
        :param job_execution_dict: mapping of job id -> job execution object
        :param log_timeout: seconds to wait for the log request before
            ``requests`` raises a timeout error
        :return: list with the ids of all reported finished jobs
        """
        found_jobs = []

        for job_execution_info in job_status[SpiderStatus.FINISHED]:
            job_id = job_execution_info['id']
            found_jobs.append(job_id)

            job_execution = job_execution_dict.get(job_id)
            if not job_execution or job_execution.running_status == SpiderStatus.FINISHED:
                # the minimum check
                continue

            job_execution.start_time = job_execution_info['start_time']
            job_execution.end_time = job_execution_info['end_time']
            job_execution.running_status = SpiderStatus.FINISHED

            log_url = self.log_url(job_execution)
            if not log_url:
                # Without a URL there is no log to read; requests.get(None)
                # would raise and abort processing of the remaining jobs.
                continue

            # Only the last 4096 bytes are requested; the timeout keeps an
            # unresponsive log server from blocking this loop indefinitely.
            res = requests.get(log_url, headers={"Range": "bytes=-4096"},
                               timeout=log_timeout)
            res.encoding = 'utf8'
            match = re.findall(job_execution.RAW_STATS_REGEX, res.text, re.DOTALL)
            if not match:
                continue

            job_execution.raw_stats = match[0]
            job_execution.process_raw_stats()

            job_instance = JobInstance.find_job_instance_by_id(job_execution.job_instance_id)
            spider_info = SpiderInfo.get_spider_info(job_instance.spider_name, job_instance.project_id)
            spider_info.update_spider_info(job_execution.raw_stats)

        return found_jobs
 def put(self, project_id, spider_id):
     """Create a one-time job for an existing spider and start it.

     Calls ``abort(404)`` when no spider instance matches ``project_id``
     and ``spider_id``. Optional form fields: ``spider_arguments``,
     ``desc``, ``tags`` and ``priority`` (defaults to 0).

     :return: True once the job is committed and handed to the agent
     """
     spider_instance = SpiderInstance.query.filter_by(
         project_id=project_id, id=spider_id).first()
     if not spider_instance:
         abort(404)

     form = request.form
     job = JobInstance()
     job.spider_name = spider_instance.spider_name
     job.project_id = project_id
     job.run_type = JobRunType.ONETIME
     job.enabled = -1
     job.spider_arguments = form.get('spider_arguments')
     job.desc = form.get('desc')
     job.tags = form.get('tags')
     job.priority = form.get('priority', 0)

     db.session.add(job)
     db.session.commit()
     agent.start_spider(job)
     return True
Exemple #6
0
 def cancel_spider(self, job_execution):
     """Cancel ``job_execution`` on the service instance it is running on.

     Only the first service whose ``server`` equals
     ``job_execution.running_on`` is asked. When that service reports
     success, the execution gets an end time, the CANCELED status and the
     session is committed; otherwise nothing is changed.
     """
     job_instance = JobInstance.find_job_instance_by_id(
         job_execution.job_instance_id)
     project = Project.find_project_by_id(job_instance.project_id)

     service = next(
         (candidate for candidate in self.spider_service_instances
          if candidate.server == job_execution.running_on),
         None)
     if service is None:
         return

     cancelled = service.cancel_spider(
         project.project_name, job_execution.service_job_execution_id)
     if not cancelled:
         return

     job_execution.end_time = datetime.datetime.now()
     job_execution.running_status = SpiderStatus.CANCELED
     db.session.commit()
Exemple #7
0
def run_spider_job(job_instance_id):
    """Run a spider by scheduler.

    Looks the job instance up by id, hands it to ``agent.start_spider`` and
    logs the launch. Any exception is logged together with its traceback
    and then swallowed instead of being re-raised.

    :param job_instance_id: id of the job instance to run
    :return: None
    """
    try:
        job_instance = JobInstance.find_job_instance_by_id(job_instance_id)
        agent.start_spider(job_instance)
        app.logger.info(
            '[run_spider_job][project:%s][spider_name:%s][job_instance_id:%s]',
            job_instance.project_id, job_instance.spider_name,
            job_instance.id)
    except Exception as e:
        # logger.exception logs at ERROR level and appends the traceback,
        # which the plain str(e) message used to lose.
        app.logger.exception('[run_spider_job] %s', e)
Exemple #8
0
def run_spider_job(job_instance_id):
    """Run a spider by scheduler, respecting the job's concurrency cap.

    Counts the job's executions that are PENDING or RUNNING and starts at
    most ``job_instance.start_tasks`` new runs, never letting the total
    exceed ``job_instance.max_start_tasks``. Any exception is logged
    together with its traceback and then swallowed instead of being
    re-raised.

    :param job_instance_id: id of the job instance to run
    :return: None
    """
    try:
        job_instance = JobInstance.find_job_instance_by_id(job_instance_id)
        requested_tasks = job_instance.start_tasks

        active_count = JobExecution.query.filter_by(
            job_instance_id=job_instance_id).filter(
                JobExecution.running_status.in_(
                    [SpiderStatus.PENDING, SpiderStatus.RUNNING])).count()

        free_slots = job_instance.max_start_tasks - active_count
        if free_slots <= 0:
            # already at (or above) the allowed number of active executions
            return

        start_tasks = min(requested_tasks, free_slots)
        # i is 1-based so the log shows "run i of start_tasks"
        for i in range(1, start_tasks + 1):
            agent.start_spider(job_instance)
            app.logger.info(
                '[run_spider_job][project:%s][spider_name:%s][job_instance_id:%s]'
                '[start_tasks:%s][i:%s]',
                job_instance.project_id, job_instance.spider_name,
                job_instance.id, start_tasks, i)

    except Exception as e:
        # logger.exception logs at ERROR level and appends the traceback,
        # which the plain str(e) message used to lose.
        app.logger.exception('[run_spider_job] %s', e)
def job_addlist(project_id):
    """Create one job per selected spider from the submitted form.

    One-time jobs are committed and started immediately through
    ``agent.start_spider``; periodic jobs are committed with their cron
    fields. Cron fields come from the individual ``cron_*`` form values
    unless a non-blank ``cron_exp`` is given, which overrides them.

    :param project_id: id stored on every created job instance
    :return: a 302 redirect to the referring page
    :raises ValueError: when ``cron_exp`` does not have exactly five fields
    """
    for spider in request.form.getlist('spider_name'):
        job_instance = JobInstance()
        job_instance.project_id = project_id
        job_instance.spider_name = spider
        job_instance.priority = request.form.get('priority', 0)
        job_instance.run_type = request.form['run_type']

        spider_arguments = request.form['spider_arguments']
        # chose daemon manually
        daemon = request.form['daemon']
        if daemon != 'auto':
            spider_args = spider_arguments.split(",") if spider_arguments else []
            spider_args.append("daemon={}".format(daemon))
            spider_arguments = ','.join(spider_args)
        job_instance.spider_arguments = spider_arguments

        if job_instance.run_type == JobRunType.ONETIME:
            job_instance.enabled = -1
            db.session.add(job_instance)
            db.session.commit()
            agent.start_spider(job_instance)
        elif job_instance.run_type == JobRunType.PERIODIC:
            job_instance.cron_minutes = request.form.get('cron_minutes') or '0'
            job_instance.cron_hour = request.form.get('cron_hour') or '*'
            job_instance.cron_day_of_month = request.form.get(
                'cron_day_of_month') or '*'
            job_instance.cron_day_of_week = request.form.get(
                'cron_day_of_week') or '*'
            job_instance.cron_month = request.form.get('cron_month') or '*'

            # set cron exp manually
            cron_exp = (request.form.get('cron_exp') or '').strip()
            if cron_exp:
                # split() without an argument tolerates repeated spaces,
                # which split(' ') turned into empty fields.
                cron_fields = cron_exp.split()
                if len(cron_fields) != 5:
                    raise ValueError(
                        "cron_exp must have 5 fields, got {}: {!r}".format(
                            len(cron_fields), cron_exp))
                (job_instance.cron_minutes,
                 job_instance.cron_hour,
                 job_instance.cron_day_of_month,
                 job_instance.cron_month,
                 job_instance.cron_day_of_week) = cron_fields
            db.session.add(job_instance)
            db.session.commit()
    return redirect(request.referrer, code=302)
Exemple #10
0
 def post(self, project_id):
     """Create a job instance for ``project_id`` from the posted form.

     Requires the ``spider_name`` and ``run_type`` form fields. For the
     "periodic" run type the cron fields are taken from the form, falling
     back to '0' for minutes and '*' for the others.

     :return: True after the job is committed, None when the form is empty
     """
     form = request.form
     if not form:
         return None

     job = JobInstance()
     job.spider_name = form['spider_name']
     job.project_id = project_id
     job.spider_arguments = form.get('spider_arguments')
     job.desc = form.get('desc')
     job.tags = form.get('tags')
     job.run_type = form['run_type']
     job.priority = form.get('priority', 0)

     if job.run_type == "periodic":
         # each cron attribute shares its name with the form field
         cron_fallbacks = (
             ('cron_minutes', '0'),
             ('cron_hour', '*'),
             ('cron_day_of_month', '*'),
             ('cron_day_of_week', '*'),
             ('cron_month', '*'),
         )
         for field, fallback in cron_fallbacks:
             setattr(job, field, form.get(field) or fallback)

     db.session.add(job)
     db.session.commit()
     return True