def job_back_in_time(project_id):
    """
    Create and launch one-time "back in time" jobs for the selected spiders.

    Form fields read: ``spider_name`` (multi-valued), ``spider_arguments``
    (comma-separated), ``callback``, ``daemon`` and optionally ``priority``.
    One ``JobInstance`` is committed per selected spider and then passed to
    ``agent.run_back_in_time``.

    :param project_id: id of the project the spiders belong to
    :return: a 302 redirect to the referring page
    """
    if not config.BACK_IN_TIME_ENABLED:
        return redirect(request.referrer, code=302)

    form = request.form
    spider_names = form.getlist('spider_name')
    if not spider_names:
        # Nothing selected: no job is created and no other field is read.
        return redirect(request.referrer, code=302)

    # The argument string is identical for every selected spider, so it is
    # built once. An empty field must not be split: "".split(",") yields
    # [""] and would leave a stray leading comma in the stored string.
    raw_arguments = form['spider_arguments']
    spider_args = raw_arguments.split(",") if raw_arguments else []
    spider_args.append("--callback={}".format(form['callback']))
    spider_args.append("SCRAPY_PROJECT=SCRAPY_PROJECT")

    # A manually chosen daemon is appended to the same list so that the
    # callback and project arguments above are kept alongside it.
    daemon = form['daemon']
    if daemon != 'auto':
        spider_args.append("daemon={}".format(daemon))

    spider_arguments = ','.join(spider_args)
    priority = form.get('priority', 0)

    for spider in spider_names:
        job_instance = JobInstance()
        job_instance.project_id = project_id
        job_instance.spider_name = spider
        job_instance.spider_arguments = spider_arguments
        job_instance.priority = priority
        job_instance.run_type = JobRunType.ONETIME
        job_instance.overlapping = True
        job_instance.enabled = -1
        db.session.add(job_instance)
        try:
            db.session.commit()
        except Exception:
            # Roll the session back before propagating the failure.
            db.session.rollback()
            raise
        agent.run_back_in_time(job_instance)

    return redirect(request.referrer, code=302)
def _run_spider(spider_name, project_id):
    """
    Create a one-time job for a spider, persist it and start it.

    :param spider_name: name of the spider to run
    :param project_id: id of the project the spider belongs to
    :return: None
    """
    job = JobInstance()
    job.project_id = project_id
    job.spider_name = spider_name
    job.priority = JobPriority.NORMAL
    job.run_type = JobRunType.ONETIME
    job.overlapping = True
    job.enabled = -1

    # settings for tempering the requests: the throttle value is passed to
    # the spider as a setting argument and also stored on the job itself
    concurrency = _get_throttle_value(spider_name, project_id)
    setting_template = "setting=AUTOTHROTTLE_TARGET_CONCURRENCY={}"
    job.spider_arguments = setting_template.format(concurrency)
    job.throttle_concurrency = concurrency

    db.session.add(job)
    try:
        db.session.commit()
    except Exception:
        # Roll the session back, then let the caller see the failure.
        db.session.rollback()
        raise

    agent.start_spider(job)
def log_url(self, job_execution):
    """
    Return the log URL of a job execution.

    The URL is produced by the spider service instance whose ``server``
    equals ``job_execution.running_on``.

    :param job_execution: execution whose log URL is wanted
    :return: the value of that service instance's ``log_url``, or None
        when no service instance matches
    """
    job = JobInstance.find_job_instance_by_id(job_execution.job_instance_id)
    owner = Project.find_project_by_id(job.project_id)

    # First service instance running on the execution's server, if any.
    service = next(
        (candidate for candidate in self.spider_service_instances
         if candidate.server == job_execution.running_on),
        None,
    )
    if service is None:
        return None
    return service.log_url(owner.project_name,
                           job.spider_name,
                           job_execution.service_job_execution_id)
def process_finished_jobs(self, job_status, job_execution_dict):
    """
    Mark finished executions and collect their stats from the log tail.

    :param job_status: mapping that holds, under ``SpiderStatus.FINISHED``,
        the info dicts (``id``, ``start_time``, ``end_time``) of finished jobs
    :param job_execution_dict: executions looked up by the ``id`` of an
        info dict
    :return: list with the ``id`` of every finished job reported in
        ``job_status``, whether or not a matching execution was found
    """
    finished_ids = []
    for info in job_status[SpiderStatus.FINISHED]:
        execution_id = info['id']
        finished_ids.append(execution_id)

        execution = job_execution_dict.get(execution_id)
        if not execution:
            continue
        if execution.running_status == SpiderStatus.FINISHED:
            # the minimum check: already recorded as finished
            continue

        execution.start_time = info['start_time']
        execution.end_time = info['end_time']
        execution.running_status = SpiderStatus.FINISHED

        # The suffix range asks only for the last 4096 bytes of the log.
        # NOTE(review): the request is made without a timeout.
        response = requests.get(self.log_url(execution),
                                headers={"Range": "bytes=-4096"})
        response.encoding = 'utf8'
        stats_found = re.findall(execution.RAW_STATS_REGEX, response.text,
                                 re.DOTALL)
        if stats_found:
            execution.raw_stats = stats_found[0]
            execution.process_raw_stats()
            owner_job = JobInstance.find_job_instance_by_id(
                execution.job_instance_id)
            info_record = SpiderInfo.get_spider_info(owner_job.spider_name,
                                                     owner_job.project_id)
            info_record.update_spider_info(execution.raw_stats)

    return finished_ids
def put(self, project_id, spider_id):
    """
    Create a one-time job for an existing spider and start it.

    Optional form fields: ``spider_arguments``, ``desc``, ``tags`` and
    ``priority`` (defaults to 0).

    :param project_id: id of the project the spider belongs to
    :param spider_id: id of the spider instance to run
    :return: True once the job has been committed and started; the request
        is aborted with 404 when the spider does not exist in the project
    """
    lookup = SpiderInstance.query.filter_by(project_id=project_id,
                                            id=spider_id)
    spider = lookup.first()
    if not spider:
        abort(404)

    form = request.form
    job = JobInstance()
    job.spider_name = spider.spider_name
    job.project_id = project_id
    job.spider_arguments = form.get('spider_arguments')
    job.desc = form.get('desc')
    job.tags = form.get('tags')
    job.run_type = JobRunType.ONETIME
    job.priority = form.get('priority', 0)
    job.enabled = -1

    db.session.add(job)
    db.session.commit()
    agent.start_spider(job)
    return True
def cancel_spider(self, job_execution):
    """
    Cancel a job execution on the service instance it is running on.

    When the service instance reports a successful cancel, the execution
    gets the current time as ``end_time``, its status is set to
    ``SpiderStatus.CANCELED`` and the session is committed.

    :param job_execution: the execution to cancel
    :return: None
    """
    job = JobInstance.find_job_instance_by_id(job_execution.job_instance_id)
    owner = Project.find_project_by_id(job.project_id)

    for service in self.spider_service_instances:
        if service.server != job_execution.running_on:
            continue

        cancelled = service.cancel_spider(
            owner.project_name, job_execution.service_job_execution_id)
        if cancelled:
            job_execution.end_time = datetime.datetime.now()
            job_execution.running_status = SpiderStatus.CANCELED
            db.session.commit()
        # Only the first matching service instance is tried.
        break
def run_spider_job(job_instance_id):
    '''
    Start the spider of a job instance (invoked by the scheduler).

    Any exception is caught and written to the application log, so it is
    never propagated to the caller.

    :param job_instance_id: id of the job instance to run
    :return: None
    '''
    try:
        job = JobInstance.find_job_instance_by_id(job_instance_id)
        agent.start_spider(job)
        message = (
            '[run_spider_job][project:%s][spider_name:%s][job_instance_id:%s]'
            % (job.project_id, job.spider_name, job.id)
        )
        app.logger.info(message)
    except Exception as error:
        app.logger.error('[run_spider_job] ' + str(error))
def run_spider_job(job_instance_id):
    '''
    Start runs of a job instance within its concurrency limits (invoked by
    the scheduler).

    Executions of the job that are PENDING or RUNNING are counted; when
    that count has reached ``max_start_tasks`` nothing is started.
    Otherwise up to ``start_tasks`` runs are started, capped by the number
    of free slots. Any exception is caught and written to the application
    log, so it is never propagated to the caller.

    :param job_instance_id: id of the job instance to run
    :return: None
    '''
    try:
        job_instance = JobInstance.find_job_instance_by_id(job_instance_id)

        # NOTE: a former guard that returned early when an execution had
        # been RUNNING for more than 30 minutes is disabled.
        active = JobExecution.query.filter_by(
            job_instance_id=job_instance_id).filter(
            JobExecution.running_status.in_(
                [SpiderStatus.PENDING, SpiderStatus.RUNNING])).count()

        free_slots = job_instance.max_start_tasks - active
        if free_slots <= 0:
            return

        start_tasks = min(job_instance.start_tasks, free_slots)

        # The counter is initialised up front so the log line below is
        # valid even when start_tasks is zero or negative.
        started = 0
        for _ in range(start_tasks):
            agent.start_spider(job_instance)
            started += 1

        app.logger.info(
            '[run_spider_job][project:%s][spider_name:%s][job_instance_id:%s]'
            '[start_tasks:%s][i:%s]' % (job_instance.project_id,
                                        job_instance.spider_name,
                                        job_instance.id,
                                        start_tasks, started))
    except Exception as e:
        app.logger.error('[run_spider_job] ' + str(e))
def job_addlist(project_id):
    """
    Create one job per selected spider from the submitted form.

    Form fields read: ``spider_name`` (multi-valued), ``spider_arguments``,
    ``run_type``, ``daemon`` and optionally ``priority``. For periodic jobs
    the ``cron_*`` fields are read as well, and a non-empty ``cron_exp``
    overrides all five of them.

    One-time jobs are committed and started immediately; periodic jobs are
    only committed. A job with any other run type is not persisted.

    :param project_id: id of the project the spiders belong to
    :return: a 302 redirect to the referring page
    :raises ValueError: when ``cron_exp`` does not consist of exactly five
        whitespace-separated fields
    """
    # Result is unused here; the lookup is kept exactly as it was.
    # NOTE(review): confirm whether this call is still needed.
    project = Project.find_project_by_id(project_id)

    form = request.form
    spider_names = form.getlist('spider_name')
    if not spider_names:
        # Nothing selected: no job is created and no other field is read.
        return redirect(request.referrer, code=302)

    # These values are the same for every selected spider.
    spider_arguments = form['spider_arguments']
    priority = form.get('priority', 0)
    run_type = form['run_type']

    # chose daemon manually
    daemon = form['daemon']
    if daemon != 'auto':
        spider_args = spider_arguments.split(",") if spider_arguments else []
        spider_args.append("daemon={}".format(daemon))
        spider_arguments = ','.join(spider_args)

    for spider in spider_names:
        job_instance = JobInstance()
        job_instance.project_id = project_id
        job_instance.spider_name = spider
        job_instance.spider_arguments = spider_arguments
        job_instance.priority = priority
        job_instance.run_type = run_type

        if job_instance.run_type == JobRunType.ONETIME:
            job_instance.enabled = -1
            db.session.add(job_instance)
            db.session.commit()
            agent.start_spider(job_instance)
        elif job_instance.run_type == JobRunType.PERIODIC:
            job_instance.cron_minutes = form.get('cron_minutes') or '0'
            job_instance.cron_hour = form.get('cron_hour') or '*'
            job_instance.cron_day_of_month = form.get(
                'cron_day_of_month') or '*'
            job_instance.cron_day_of_week = form.get(
                'cron_day_of_week') or '*'
            job_instance.cron_month = form.get('cron_month') or '*'

            # set cron exp manually; split() without an argument tolerates
            # repeated, leading and trailing whitespace between the fields
            cron_exp = form.get('cron_exp')
            if cron_exp:
                (job_instance.cron_minutes,
                 job_instance.cron_hour,
                 job_instance.cron_day_of_month,
                 job_instance.cron_month,
                 job_instance.cron_day_of_week) = cron_exp.split()

            db.session.add(job_instance)
            db.session.commit()

    return redirect(request.referrer, code=302)
def post(self, project_id):
    """
    Create a job instance for a project from the submitted form.

    Required form fields: ``spider_name`` and ``run_type``. Optional:
    ``spider_arguments``, ``desc``, ``tags``, ``priority`` (defaults to 0)
    and, for the "periodic" run type, the ``cron_*`` fields.

    :param project_id: id of the project the job belongs to
    :return: True after the job has been committed; None when the form
        is empty
    """
    form = request.form
    if not form:
        return None

    job = JobInstance()
    job.spider_name = form['spider_name']
    job.project_id = project_id
    job.spider_arguments = form.get('spider_arguments')
    job.desc = form.get('desc')
    job.tags = form.get('tags')
    job.run_type = form['run_type']
    job.priority = form.get('priority', 0)

    if job.run_type == "periodic":
        # Missing or empty cron fields fall back to minute '0' and '*'
        # for every other field.
        job.cron_minutes = form.get('cron_minutes') or '0'
        job.cron_hour = form.get('cron_hour') or '*'
        job.cron_day_of_month = form.get('cron_day_of_month') or '*'
        job.cron_day_of_week = form.get('cron_day_of_week') or '*'
        job.cron_month = form.get('cron_month') or '*'

    db.session.add(job)
    db.session.commit()
    return True