def log_url(self, job_execution):
    """Return the log URL of a job execution from the service it ran on.

    Resolves the job instance and project behind ``job_execution``, then
    delegates to the first entry of ``self.spider_service_instances`` whose
    ``server`` equals ``job_execution.running_on``.

    :param job_execution: execution record; its ``job_instance_id``,
        ``running_on`` and ``service_job_execution_id`` attributes are read.
    :return: the value returned by the matching service's ``log_url``, or
        ``None`` when no service instance matches.
    """
    instance = JobInstance.find_job_instance_by_id(job_execution.job_instance_id)
    project = Project.find_project_by_id(instance.project_id)

    target_server = job_execution.running_on
    service = next(
        (svc for svc in self.spider_service_instances
         if svc.server == target_server),
        None,
    )
    if service is None:
        return None
    return service.log_url(
        project.project_name,
        instance.spider_name,
        job_execution.service_job_execution_id,
    )
def process_finished_jobs(self, job_status, job_execution_dict):
    """Mark finished executions and harvest their stats from the log tail.

    For every entry under ``job_status[SpiderStatus.FINISHED]`` the matching
    execution in ``job_execution_dict`` gets its start/end time and status
    updated. The last 4096 bytes of its log are then fetched and searched
    with ``job_execution.RAW_STATS_REGEX``; when stats are found they are
    stored on the execution and pushed into the spider's ``SpiderInfo``.

    :param job_status: mapping of status to a list of dicts; each dict is
        read for the keys ``'id'``, ``'start_time'`` and ``'end_time'``.
    :param job_execution_dict: mapping of service job id to execution record.
    :return: list of the ids of all finished entries seen, including the
        ones that were skipped.
    :raises requests.RequestException: if the log cannot be fetched,
        including when the fetch exceeds the timeout.
    """
    # requests waits forever by default; bound the wait so a single stalled
    # log server cannot block the whole status sync.
    log_fetch_timeout = 10  # seconds

    found_jobs = []
    for job_execution_info in job_status[SpiderStatus.FINISHED]:
        job_id = job_execution_info['id']
        found_jobs.append(job_id)

        job_execution = job_execution_dict.get(job_id)
        # The minimum check: skip executions we do not track and ones that
        # were already recorded as finished.
        if not job_execution or job_execution.running_status == SpiderStatus.FINISHED:
            continue

        job_execution.start_time = job_execution_info['start_time']
        job_execution.end_time = job_execution_info['end_time']
        job_execution.running_status = SpiderStatus.FINISHED

        # Only the tail of the log is requested; the stats are looked for there.
        res = requests.get(
            self.log_url(job_execution),
            headers={"Range": "bytes=-4096"},
            timeout=log_fetch_timeout,
        )
        res.encoding = 'utf8'
        match = re.findall(job_execution.RAW_STATS_REGEX, res.text, re.DOTALL)
        if not match:
            # No stats in the fetched tail: keep the status update, skip stats.
            continue

        job_execution.raw_stats = match[0]
        job_execution.process_raw_stats()

        job_instance = JobInstance.find_job_instance_by_id(job_execution.job_instance_id)
        spider_info = SpiderInfo.get_spider_info(job_instance.spider_name,
                                                 job_instance.project_id)
        spider_info.update_spider_info(job_execution.raw_stats)
    return found_jobs
def cancel_spider(self, job_execution):
    """Ask the service running ``job_execution`` to cancel it.

    The first entry of ``self.spider_service_instances`` whose ``server``
    equals ``job_execution.running_on`` receives the cancel request. When
    that request returns a truthy value, the execution's end time is set to
    the current time, its status becomes ``SpiderStatus.CANCELED`` and the
    session is committed. No other instance is tried after the first match.

    :param job_execution: execution record to cancel.
    :return: ``None``.
    """
    instance = JobInstance.find_job_instance_by_id(job_execution.job_instance_id)
    project = Project.find_project_by_id(instance.project_id)

    for service in self.spider_service_instances:
        if service.server != job_execution.running_on:
            continue

        cancelled = service.cancel_spider(
            project.project_name, job_execution.service_job_execution_id)
        if cancelled:
            job_execution.end_time = datetime.datetime.now()
            job_execution.running_status = SpiderStatus.CANCELED
            db.session.commit()
        # Stop at the first matching server whether or not the cancel worked.
        return
def run_spider_job(job_instance_id):
    """Start the spider of a job instance; entry point used by the scheduler.

    Any exception is caught and logged rather than propagated, so a failing
    job does not raise into the caller.

    :param job_instance_id: id of the job instance to run.
    :return: ``None``.
    """
    try:
        job_instance = JobInstance.find_job_instance_by_id(job_instance_id)
        agent.start_spider(job_instance)
        app.logger.info(
            '[run_spider_job][project:%s][spider_name:%s][job_instance_id:%s]',
            job_instance.project_id, job_instance.spider_name, job_instance.id)
    except Exception as e:
        # Same message text as before; exception() also records the traceback
        # so the failure can actually be diagnosed from the log.
        app.logger.exception('[run_spider_job] %s', e)
def run_spider_job(job_instance_id):
    """Start spiders for a job instance, capped by its concurrency limit.

    Counts the job instance's executions whose status is ``PENDING`` or
    ``RUNNING``. If that count already reaches ``max_start_tasks`` nothing is
    started. Otherwise up to ``start_tasks`` spiders are started, limited to
    the remaining free slots.

    Any exception is caught and logged rather than propagated, so a failing
    job does not raise into the caller.

    :param job_instance_id: id of the job instance to run.
    :return: ``None``.
    """
    try:
        job_instance = JobInstance.find_job_instance_by_id(job_instance_id)

        active_statuses = [SpiderStatus.PENDING, SpiderStatus.RUNNING]
        count = JobExecution.query.filter_by(
            job_instance_id=job_instance_id).filter(
            JobExecution.running_status.in_(active_statuses)).count()
        if count >= job_instance.max_start_tasks:
            return

        # Never start more than the slots still free under the limit.
        slots = job_instance.max_start_tasks - count
        start_tasks = min(job_instance.start_tasks, slots)

        if start_tasks > 0:
            launched = 0
            for launched in range(1, start_tasks + 1):
                agent.start_spider(job_instance)
            app.logger.info(
                '[run_spider_job][project:%s][spider_name:%s][job_instance_id:%s]'
                '[start_tasks:%s][i:%s]',
                job_instance.project_id, job_instance.spider_name,
                job_instance.id, start_tasks, launched)
    except Exception as e:
        # Same message text as before; exception() also records the traceback
        # so the failure can actually be diagnosed from the log.
        app.logger.exception('[run_spider_job] %s', e)