def project_update():
    stat_url = request.form['stat_url']
    project_id = request.form['project_id']
    project = Project.find_project_by_id(project_id)
    project.stat_url = stat_url
    db.session.commit()
    return redirect("/project/manage", code=302)
def start_spider(self, job_instance):
    project = Project.find_project_by_id(job_instance.project_id)
    spider_name = job_instance.spider_name
    arguments = {}
    if job_instance.spider_arguments:
        arguments = dict(
            map(lambda x: x.split("="), job_instance.spider_arguments.split(",")))
    threshold = 0
    daemon_size = len(self.spider_service_instances)
    if job_instance.priority == JobPriority.HIGH:
        threshold = int(daemon_size / 2)
    if job_instance.priority == JobPriority.HIGHEST:
        threshold = int(daemon_size)
    threshold = 1 if threshold == 0 else threshold
    candidates = self.spider_service_instances
    leaders = []
    # TODO optimize some better func to vote the leader
    for i in range(threshold):
        leaders.append(random.choice(candidates))
    for leader in leaders:
        service_job_id = leader.start_spider(project.project_name, spider_name, arguments)
        job_execution = JobExecution()
        job_execution.project_id = job_instance.project_id
        job_execution.service_job_execution_id = service_job_id
        job_execution.job_instance_id = job_instance.id
        job_execution.create_time = datetime.datetime.now()
        job_execution.running_on = leader.server
        db.session.add(job_execution)
        db.session.commit()
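# The comma/equals convention used above turns a spider_arguments string such as
# "depth=2,daemon=server1" into a dict of spider arguments. A minimal standalone
# sketch of the same convention (using maxsplit=1 so values may contain '='; the
# helper name and sample string are hypothetical):
def parse_spider_arguments(spider_arguments):
    """Parse a "k1=v1,k2=v2" string into a dict."""
    if not spider_arguments:
        return {}
    return dict(pair.split("=", 1) for pair in spider_arguments.split(","))

# parse_spider_arguments("depth=2,daemon=server1")
# -> {'depth': '2', 'daemon': 'server1'}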
def log_url(self, job_execution):
    job_instance = JobInstance.find_job_instance_by_id(job_execution.job_instance_id)
    project = Project.find_project_by_id(job_instance.project_id)
    for spider_service_instance in self.spider_service_instances:
        if spider_service_instance.server == job_execution.running_on:
            return spider_service_instance.log_url(project.project_name,
                                                   job_instance.spider_name,
                                                   job_execution.service_job_execution_id)
def job_periodic(project_id):
    project = Project.find_project_by_id(project_id)
    job_instance_list = [job_instance.to_dict() for job_instance in
                         JobInstance.query.filter_by(run_type="periodic", project_id=project_id).order_by(
                             JobInstance.id).all()]  # ADD order_by
    return render_template("job_periodic.html", job_instance_list=job_instance_list)
def download_items(project_id, job_exec_id):
    format = request.args.get('format')
    if format not in ['json', 'csv']:
        abort(404)
    job_execution = JobExecution.query.filter_by(project_id=project_id, id=job_exec_id).first()
    job_instance = JobInstance.find_job_instance_by_id(job_execution.job_instance_id)
    project = Project.find_project_by_id(job_instance.project_id)
    # Fetch the items feed from the agent and split it into one JSON object per line.
    res = requests.get(agent.items_url(job_execution))
    res.encoding = 'utf8'
    json_data = [json.loads(s) for s in filter(None, res.text.split('\n'))]
    filename = '{}-{}.{}'.format(project.project_name, job_instance.spider_name, format)
    if format == 'json':
        with open(os.path.join(app.static_folder, filename), 'w') as f:
            f.write(json.dumps(json_data))
    elif format == 'csv':
        with open(os.path.join(app.static_folder, filename), 'w') as f:
            csvwriter = csv.writer(f)
            count = 0
            for item in json_data:
                if count == 0:
                    # Use the keys of the first item as the CSV header.
                    header = item.keys()
                    csvwriter.writerow(header)
                    count += 1
                csvwriter.writerow(item.values())
    return send_from_directory(app.static_folder, filename, as_attachment=True)
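# The CSV branch above assumes every scraped item has the same keys as the first
# one; items with extra or missing fields would produce misaligned rows. A hedged
# alternative sketch using csv.DictWriter (the helper name is hypothetical; the
# items argument is assumed to be the same list of dicts as json_data above):
import csv

def write_items_csv(path, items):
    """Write a list of dicts to CSV, tolerating items whose keys differ."""
    fieldnames = []
    for item in items:
        for key in item:
            if key not in fieldnames:
                fieldnames.append(key)
    with open(path, 'w', newline='') as f:
        writer = csv.DictWriter(f, fieldnames=fieldnames, restval='')
        writer.writeheader()
        writer.writerows(items)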
def get(self, project_id):
    project = Project.find_project_by_id(project_id)
    return [spider_instance.to_dict() for spider_instance in
            SpiderInstance.query.filter_by(project_id=project_id).all()]
def job_add(project_id):
    project = Project.find_project_by_id(project_id)
    job_instance = JobInstance()
    job_instance.spider_name = request.form['spider_name']
    job_instance.project_id = project_id
    job_instance.spider_arguments = request.form['spider_arguments']
    job_instance.priority = request.form.get('priority', 0)
    job_instance.run_type = request.form['run_type']
    # choose daemon manually
    if request.form['daemon'] != 'auto':
        spider_args = []
        if request.form['spider_arguments']:
            spider_args = request.form['spider_arguments'].split(",")
        spider_args.append("daemon={}".format(request.form['daemon']))
        job_instance.spider_arguments = ','.join(spider_args)
    if job_instance.run_type == JobRunType.ONETIME:
        job_instance.enabled = -1
        db.session.add(job_instance)
        db.session.commit()
        agent.start_spider(job_instance)
    if job_instance.run_type == JobRunType.PERIODIC:
        job_instance.cron_minutes = request.form.get('cron_minutes') or '0'
        job_instance.cron_hour = request.form.get('cron_hour') or '*'
        job_instance.cron_day_of_month = request.form.get('cron_day_of_month') or '*'
        job_instance.cron_day_of_week = request.form.get('cron_day_of_week') or '*'
        job_instance.cron_month = request.form.get('cron_month') or '*'
        # set cron exp manually
        if request.form.get('cron_exp'):
            (job_instance.cron_minutes, job_instance.cron_hour, job_instance.cron_day_of_month,
             job_instance.cron_day_of_week, job_instance.cron_month) = \
                request.form['cron_exp'].split(' ')
        db.session.add(job_instance)
        db.session.commit()
    return redirect(request.referrer, code=302)
def log_url_slave(self, job_execution):
    """
    Fetch the log of a slave spider; only one log URL is needed.
    :param job_execution: JobExecution object
    :return: the log URL
    """
    job_instance = JobInstance.find_job_instance_by_id(job_execution.job_instance_id)
    project = Project.find_project_by_id(job_instance.project_id)
    # String holding the master/slave scrapyd job ids, separated by '>'
    service_job_execution_id = job_execution.service_job_execution_id.split('>')
    # List of slave job ids
    slave_service_job_execution_id = service_job_execution_id[1].split(',')
    # Servers the spiders are running on
    running_on = job_execution.running_on.split('>')
    slave_running_on = running_on[1].split(',')
    # Resolve the slave spider name and fetch its log
    spider_name_slave_obj = SpiderInstance.query.filter_by(
        spider_name=job_instance.spider_name,
        project_id=job_instance.project_id).first()
    spider_name_slave = spider_name_slave_obj.spider_name_slave
    for spider_service_instance in self.spider_service_instances_slave:
        for job_execution_id, running_on_ in zip(slave_service_job_execution_id, slave_running_on):
            if spider_service_instance.server == running_on_:
                slave_log_url = spider_service_instance.log_url(
                    project.project_name, spider_name_slave, job_execution_id)
                return slave_log_url
def job_add(project_id):
    # Save the uploaded file and store its path in job_instance.spider_arguments
    dst = ''
    if 'file' in request.files:
        file = request.files['file']
        # If the user does not select a file, the browser still
        # submits an empty part without a filename.
        if file.filename == '':
            pass
        if file and allowed_seed(file.filename):
            filename = secure_filename(file.filename)
            dst = os.path.join(
                app.config['UPLOAD_DIR'],
                datetime.datetime.now().strftime("%Y-%m-%d-%H-%M-%S-") + filename)
            file.save(dst)
    project = Project.find_project_by_id(project_id)
    job_instance = JobInstance()
    job_instance.spider_name = request.form['spider_name']
    job_instance.project_id = project_id
    job_instance.spider_arguments = request.form['spider_arguments']
    if dst:
        if job_instance.spider_arguments:
            job_instance.spider_arguments += ",seed={}".format(dst)
        else:
            job_instance.spider_arguments = "seed={}".format(dst)
    job_instance.priority = request.form.get('priority', 0)
    job_instance.run_type = request.form['run_type']
    # choose daemon manually
    if request.form['daemon'] != 'auto':
        spider_args = []
        if request.form['spider_arguments']:
            spider_args = request.form['spider_arguments'].split(",")
        spider_args.append("daemon={}".format(request.form['daemon']))
        job_instance.spider_arguments = ','.join(spider_args)
    if job_instance.run_type == JobRunType.ONETIME:
        job_instance.enabled = -1
        db.session.add(job_instance)
        db.session.commit()
        agent.start_spider(job_instance)
    if job_instance.run_type == JobRunType.PERIODIC:
        job_instance.cron_minutes = request.form.get('cron_minutes') or '0'
        job_instance.cron_hour = request.form.get('cron_hour') or '*'
        job_instance.cron_day_of_month = request.form.get('cron_day_of_month') or '*'
        job_instance.cron_day_of_week = request.form.get('cron_day_of_week') or '*'
        job_instance.cron_month = request.form.get('cron_month') or '*'
        # set cron exp manually
        if request.form.get('cron_exp'):
            (job_instance.cron_minutes, job_instance.cron_hour, job_instance.cron_day_of_month,
             job_instance.cron_day_of_week, job_instance.cron_month) = \
                request.form['cron_exp'].split(' ')
        db.session.add(job_instance)
        db.session.commit()
    return redirect(request.referrer, code=302)
def cancel_spider(self, job_execution):
    job_instance = JobInstance.find_job_instance_by_id(job_execution.job_instance_id)
    project = Project.find_project_by_id(job_instance.project_id)
    for spider_service_instance in self.spider_service_instances:
        if spider_service_instance.server == job_execution.running_on:
            if spider_service_instance.cancel_spider(project.project_name,
                                                     job_execution.service_job_execution_id):
                job_execution.end_time = datetime.datetime.now()
                job_execution.running_status = SpiderStatus.CANCELED
                db.session.commit()
            break
def project_delete(project_id):
    project = Project.find_project_by_id(project_id)
    agent.delete_project(project)
    try:
        db.session.delete(project)
        db.session.commit()
    except:
        db.session.rollback()
        raise
    return redirect("/project/manage", code=302)
def cancel_spider(self, job_execution):
    job_instance = JobInstance.find_job_instance_by_id(job_execution.job_instance_id)
    project = Project.find_project_by_id(job_instance.project_id)
    # TODO multi service
    for spider_service_instance in self.spider_service_instances:
        if spider_service_instance.server == job_execution.running_on:
            if spider_service_instance.cancel_spider(project.project_name,
                                                     job_execution.service_job_execution_id):
                job_execution.running_status = SpiderStatus.CANCELED
                db.session.commit()
            break
def get_all_spiders_info():
    """
    Return information about every spider.
    :return: JSON data in the following format:
        {
            "last_run_status": "success",
            "last_run_time": "2018-09-19 03:30:45",
            "project_alias": "深圳市共享开放平台",
            "project_id": 1,
            "project_name": "opendata_sz",
            "spider_alias": "所有数据",
            "spider_id": 1,
            "spider_name": "list"
        }
    """
    # Temporary list holding the collected data
    data = []
    # Walk the job instance table to collect spider information
    job_instance = JobInstance.query.order_by(
        JobInstance.date_created).group_by(JobInstance.project_id).all()
    for spider in job_instance:
        # Dictionary representation of the instance
        _temp = spider.to_dict()
        # Look up the Project table by project id for the project name and alias
        project_base_info = Project.find_project_by_id(_temp['project_id'])
        instance_to_job_execution = JobExecution.query.filter_by(
            job_instance_id=_temp['job_instance_id']).all()
        if instance_to_job_execution:
            # Latest running status
            _status = instance_to_job_execution[-1].running_status
            # Map the status code to a readable value
            status = switcher.get(_status, "CANCELED")
            # Last run time of this instance
            last_run_time = instance_to_job_execution[-1].end_time
            service_job_execution_id = instance_to_job_execution[-1].service_job_execution_id
        else:
            status = 'PENDING'
            last_run_time = None
            service_job_execution_id = None
        # Pack the information into a dictionary
        _dict = dict(project_id=_temp['project_id'],
                     project_name=project_base_info.project_name,
                     project_alias=project_base_info.project_alias,
                     spider_id=SpiderInstance.query.filter_by(
                         project_id=_temp['project_id']).first().id,
                     spider_name=_temp['spider_name'],
                     spider_alias=_temp['desc'],
                     last_run_status=status,
                     last_run_time=str(last_run_time).split('.')[0],
                     run_type=_temp['run_type'],
                     job_exec_id=service_job_execution_id,
                     is_msd=project_base_info.is_msd)
        data.append(_dict)
    return json.dumps({"code": 200, 'data': data})
def inject_project():
    project_context = {}
    project_context['project_list'] = Project.query.all()
    if project_context['project_list'] and (not session.get('project_id')):
        project = Project.query.first()
        session['project_id'] = project.id
    if session.get('project_id'):
        project_context['project'] = Project.find_project_by_id(session['project_id'])
        project_context['spider_list'] = [spider_instance.to_dict() for spider_instance in
                                          SpiderInstance.query.filter_by(
                                              project_id=session['project_id']).all()]
    else:
        project_context['project'] = {}
    return project_context
def cancel_spider(self, job_execution):
    job_instance = JobInstance.find_job_instance_by_id(job_execution.job_instance_id)
    project = Project.find_project_by_id(job_instance.project_id)
    for spider_service_instance in self.spider_service_instances:
        if spider_service_instance.server == job_execution.running_on:
            if spider_service_instance.cancel_spider(project.project_name,
                                                     job_execution.service_job_execution_id):
                job_execution.end_time = datetime.datetime.now()
                job_execution.running_status = SpiderStatus.CANCELED
                try:
                    db.session.commit()
                except:
                    db.session.rollback()
                    raise
            break
def start_spider(self, job_instance):
    project = Project.find_project_by_id(job_instance.project_id)
    # job_execution = JobExecution.find_job_by_service_id(job_instance.project_id)
    spider_name = job_instance.spider_name
    arguments = {}
    if job_instance.spider_arguments:
        arguments = dict(
            map(lambda x: x.split("="), job_instance.spider_arguments.split(",")))
    threshold = 0
    daemon_size = len(self.spider_service_instances)
    if job_instance.priority == JobPriority.HIGH:
        threshold = int(daemon_size / 2)
    if job_instance.priority == JobPriority.HIGHEST:
        threshold = int(daemon_size)
    threshold = 1 if threshold == 0 else threshold
    candidates = self.spider_service_instances
    leaders = []
    if 'daemon' in arguments:
        for candidate in candidates:
            if candidate.server == arguments['daemon']:
                leaders = [candidate]
    else:
        # TODO optimize some better func to vote the leader
        for i in range(threshold):
            leaders.append(random.choice(candidates))
    for leader in leaders:
        # add more arguments to scrapyd to run a spider
        arguments['project_id'] = job_instance.project_id
        arguments['project_name'] = project.project_name
        arguments['job_instance_id'] = job_instance.job_instance_id
        arguments['priority'] = job_instance.priority
        arguments['args'] = job_instance.spider_arguments
        arguments['execute_ip'] = leader.server
        arguments['create_time'] = datetime.datetime.now()
        service_job_id = leader.start_spider(project.project_name, spider_name, arguments)
        job_execution = JobExecution()
        job_execution.project_id = job_instance.project_id
        job_execution.service_job_execution_id = service_job_id
        job_execution.job_instance_id = job_instance.job_instance_id
        job_execution.create_time = arguments['create_time']
        job_execution.running_on = leader.server
        db.session.add(job_execution)
        db.session.commit()
def spider_egg_upload(project_id):
    project = Project.find_project_by_id(project_id)
    if 'file' not in request.files:
        flash('No file part')
        return redirect(request.referrer)
    file = request.files['file']
    # If the user does not select a file, the browser still
    # submits an empty part without a filename.
    if file.filename == '':
        flash('No selected file')
        return redirect(request.referrer)
    if file:
        filename = secure_filename(file.filename)
        dst = os.path.join(tempfile.gettempdir(), filename)
        file.save(dst)
        agent.deploy(project, dst)
        flash('deploy success!')
    return redirect(request.referrer)
def clear_jobexecution(job_execution):
    """
    Check whether a JobExecution still exists on the scrapyd servers
    and delete it if it no longer does.
    :param job_execution:
    :return:
    """
    job_instance = JobInstance.find_job_instance_by_id(job_execution.job_instance_id)
    project = Project.find_project_by_id(job_instance.project_id)
    if not check_job_existed(running_on=job_execution.running_on,
                             project_name=project.project_name,
                             spider_name=job_instance.spider_name,
                             job_id=job_execution.service_job_execution_id):
        db.session.delete(job_execution)
        db.session.commit()
def start_spider(self, job_instance):
    project = Project.find_project_by_id(job_instance.project_id)
    spider_name = job_instance.spider_name
    # arguments = {}
    # if job_instance.spider_arguments:
    #     arguments = dict(map(lambda x: x.split("="), job_instance.spider_arguments.split(",")))
    from collections import defaultdict
    # Allow the same argument key to appear more than once by collecting values in lists.
    arguments = defaultdict(list)
    if job_instance.spider_arguments:
        for k, v in list(map(lambda x: x.split('=', 1), job_instance.spider_arguments.split(','))):
            arguments[k].append(v)
    # threshold = 0
    # daemon_size = len(self.spider_service_instances)
    # if job_instance.priority == JobPriority.HIGH:
    #     threshold = int(daemon_size / 2)
    # if job_instance.priority == JobPriority.HIGHEST:
    #     threshold = int(daemon_size)
    # threshold = 1 if threshold == 0 else threshold
    threshold = 1
    candidates = self.spider_service_instances
    leaders = []
    if 'daemon' in arguments:
        for candidate in candidates:
            # arguments['daemon'] is a list of values because of the defaultdict above
            if candidate.server in arguments['daemon']:
                leaders = [candidate]
    else:
        # TODO optimize some better func to vote the leader
        for i in range(threshold):
            leaders.append(random.choice(candidates))
    for leader in leaders:
        service_job_id = leader.start_spider(project.project_name, spider_name, arguments)
        job_execution = JobExecution()
        job_execution.project_id = job_instance.project_id
        job_execution.service_job_execution_id = service_job_id
        job_execution.job_instance_id = job_instance.id
        job_execution.create_time = datetime.datetime.now()
        job_execution.running_on = leader.server
        try:
            db.session.add(job_execution)
            db.session.commit()
        except:
            db.session.rollback()
            raise
def log_url_master(self, job_execution):
    """
    Fetch the log of the master spider.
    :param job_execution: JobExecution object
    :return: the log URL
    """
    job_instance = JobInstance.find_job_instance_by_id(job_execution.job_instance_id)
    project = Project.find_project_by_id(job_instance.project_id)
    # String holding the master/slave scrapyd job ids, separated by '>'
    service_job_execution_id = job_execution.service_job_execution_id.split('>')
    # Master job id
    master_service_job_execution_id = service_job_execution_id[0]
    # Servers the spiders are running on
    running_on = job_execution.running_on.split('>')
    master_running_on = running_on[0]
    # Fetch the master spider's log
    for spider_service_instance in self.spider_service_instances_master:
        if spider_service_instance.server == master_running_on:
            master_log_url = spider_service_instance.log_url(
                project.project_name, job_instance.spider_name,
                master_service_job_execution_id)
            return master_log_url
def start_spider(self, job_instance):
    project = Project.find_project_by_id(job_instance.project_id)
    spider_name = job_instance.spider_name
    # arguments = {}
    # if job_instance.spider_arguments:
    #     arguments = dict(map(lambda x: x.split("="), job_instance.spider_arguments.split(",")))
    from collections import defaultdict
    # Allow the same argument key to appear more than once by collecting values in lists.
    arguments = defaultdict(list)
    if job_instance.spider_arguments:
        for k, v in list(map(lambda x: x.split('=', 1), job_instance.spider_arguments.split(','))):
            arguments[k].append(v)
    threshold = 0
    daemon_size = len(self.spider_service_instances)
    if job_instance.priority == JobPriority.HIGH:
        threshold = int(daemon_size / 2)
    if job_instance.priority == JobPriority.HIGHEST:
        threshold = int(daemon_size)
    threshold = 1 if threshold == 0 else threshold
    candidates = self.spider_service_instances
    leaders = []
    if 'daemon' in arguments:
        for candidate in candidates:
            # arguments['daemon'] is a list of values because of the defaultdict above
            if candidate.server in arguments['daemon']:
                leaders = [candidate]
    else:
        # TODO optimize some better func to vote the leader
        for i in range(threshold):
            leaders.append(random.choice(candidates))
    for leader in leaders:
        service_job_id = leader.start_spider(project.project_name, spider_name, arguments)
        job_execution = JobExecution()
        job_execution.project_id = job_instance.project_id
        job_execution.service_job_execution_id = service_job_id
        job_execution.job_instance_id = job_instance.id
        job_execution.create_time = datetime.datetime.now()
        job_execution.running_on = leader.server
        db.session.add(job_execution)
        db.session.commit()
def job_add(project_id):
    project = Project.find_project_by_id(project_id)
    job_instance = JobInstance()
    job_instance.spider_name = request.form['spider_name']
    job_instance.project_id = project_id
    job_instance.spider_arguments = request.form['spider_arguments']
    job_instance.priority = request.form.get('priority', 0)
    job_instance.run_type = request.form['run_type']
    if job_instance.run_type == JobRunType.ONETIME:
        job_instance.enabled = -1
        db.session.add(job_instance)
        db.session.commit()
        agent.start_spider(job_instance)
    if job_instance.run_type == JobRunType.PERIODIC:
        job_instance.cron_minutes = request.form.get('cron_minutes') or '0'
        job_instance.cron_hour = request.form.get('cron_hour') or '*'
        job_instance.cron_day_of_month = request.form.get('cron_day_of_month') or '*'
        job_instance.cron_day_of_week = request.form.get('cron_day_of_week') or '*'
        job_instance.cron_month = request.form.get('cron_month') or '*'
        db.session.add(job_instance)
        db.session.commit()
    return redirect(request.referrer, code=302)
def job_add(project_id):
    project = Project.find_project_by_id(project_id)
    job_instance = JobInstance()
    job_instance.spider_name = request.form['spider_name']
    job_instance.project_id = project_id
    job_instance.spider_arguments = request.form['spider_arguments']
    job_instance.priority = request.form.get('priority', 0)
    job_instance.run_type = request.form['run_type']
    spider_url = request.form['spider_url']
    spider_models = request.form['spider_models']
    if job_instance.spider_name == "news":
        allowed_domains = spider_url.split('/')[2]
        a = "allowed_domains=" + allowed_domains
        a = a + "," + "model=" + spider_models
        job_instance.spider_arguments = a
        r = CRedis()
        r.lpush(allowed_domains + ':start_urls', spider_url)
    elif job_instance.spider_name == 'jd':
        r = CRedis()
        r.lpush('jd:start_urls', spider_url)
    if job_instance.run_type == JobRunType.ONETIME:
        job_instance.enabled = -1
        db.session.add(job_instance)
        db.session.commit()
        agent.start_spider(job_instance)
    if job_instance.run_type == JobRunType.PERIODIC:
        job_instance.cron_minutes = request.form.get('cron_minutes') or '0'
        job_instance.cron_hour = request.form.get('cron_hour') or '*'
        job_instance.cron_day_of_month = request.form.get('cron_day_of_month') or '*'
        job_instance.cron_day_of_week = request.form.get('cron_day_of_week') or '*'
        job_instance.cron_month = request.form.get('cron_month') or '*'
        db.session.add(job_instance)
        db.session.commit()
    return redirect(request.referrer, code=302)
def spider_deploy(project_id):
    project = Project.find_project_by_id(project_id)
    spider_instance_list = agent.get_spider_list(project)
    SpiderInstance.update_spider_instances(spider_instance_list)
    return render_template("spider_deploy.html")
def service_stats(project_id):
    project = Project.find_project_by_id(project_id)
    run_stats = JobExecution.list_run_stats_by_hours(project_id)
    return render_template("server_stats.html", run_stats=run_stats)
def get(self, project_id):
    project = Project.find_project_by_id(project_id)
    return agent.get_spider_list(project) if project else []
def spider_deploy(project_id):
    project = Project.find_project_by_id(project_id)
    return render_template("spider_deploy.html")
def project_stats(project_id):
    project = Project.find_project_by_id(project_id)
    # run_stats = JobExecution.list_run_stats_by_hours(project_id)
    run_stats = JobExecution.list_run_stats_by_days(project_id)
    return render_template("project_stats.html", run_stats=run_stats)
def project_delete(project_id):
    project = Project.find_project_by_id(project_id)
    agent.delete_project(project)
    db.session.delete(project)
    db.session.commit()
    return redirect("/project/manage", code=302)
def start_spider(self, job_instance):
    """
    Start the spiders: start the slave spiders first, and start the
    master spider only after at least one slave has started successfully.
    :param job_instance: JobInstance object
    :return: None
    """
    project = Project.find_project_by_id(job_instance.project_id)
    if project.is_msd == '0':
        # Standalone (single-machine) spider
        spider_name = job_instance.spider_name
        for leader in self.spider_service_instances_master:
            service_job_id = leader.start_spider(project.project_name, spider_name)
            # If the spider started successfully
            if service_job_id:
                job_execution = JobExecution()
                job_execution.project_id = job_instance.project_id
                job_execution.service_job_execution_id = service_job_id + '>'
                job_execution.job_instance_id = job_instance.id
                job_execution.create_time = datetime.datetime.now()
                job_execution.running_on = leader.server + '>'
                db.session.add(job_execution)
                db.session.commit()
                break
    else:
        # Master spider name
        spider_name_master = job_instance.spider_name
        spider_instance = SpiderInstance.query.filter_by(
            project_id=job_instance.project_id,
            spider_name=spider_name_master).first()
        # Slave spider name
        spider_name_slave = spider_instance.spider_name_slave
        # Flag marking that at least one slave server started successfully
        slave_flag = False
        # Scrapyd job ids of the slave spiders
        service_job_id_slave = []
        # Servers the slave spiders are running on
        running_on_slave = []
        # Walk the slave servers
        for leader in self.spider_service_instances_slave:
            # Start the spider; a job id is returned on success, otherwise None
            service_job_id = leader.start_spider(project.project_name, spider_name_slave)
            if service_job_id:
                slave_flag = True
                # Record the job id and server so the logs can be fetched later
                service_job_id_slave.append(service_job_id)
                running_on_slave.append(leader.server)
        # Join the lists into strings
        service_job_id_slave_str = ','.join(service_job_id_slave)
        running_on_slave_str = ','.join(running_on_slave)
        # Start the master spider only if at least one slave started successfully
        if slave_flag:
            for leader in self.spider_service_instances_master:
                service_job_id = leader.start_spider(project.project_name, spider_name_master)
                # If the master spider started successfully
                if service_job_id:
                    job_execution = JobExecution()
                    job_execution.project_id = job_instance.project_id
                    job_execution.service_job_execution_id = service_job_id + '>' + service_job_id_slave_str
                    job_execution.job_instance_id = job_instance.id
                    job_execution.create_time = datetime.datetime.now()
                    job_execution.running_on = leader.server + '>' + running_on_slave_str
                    db.session.add(job_execution)
                    db.session.commit()
                    break
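# The master/slave variant above packs both job ids and both server lists into single
# columns, using '>' to separate the master entry from the slaves and ',' to separate
# the slave entries, which is how log_url_master and log_url_slave later split them.
# A minimal standalone sketch of that convention (helper names and sample values are
# hypothetical):
def encode_execution(master_id, slave_ids):
    """Pack 'master-id' and ['slave-id-1', 'slave-id-2'] into one string."""
    return master_id + '>' + ','.join(slave_ids)

def decode_execution(encoded):
    """Return (master_id, [slave_ids]); the slave part may be empty."""
    master_part, slave_part = encoded.split('>')
    slave_ids = slave_part.split(',') if slave_part else []
    return master_part, slave_ids

# encode_execution('abc123', ['def456', 'ghi789'])  -> 'abc123>def456,ghi789'
# decode_execution('abc123>def456,ghi789')          -> ('abc123', ['def456', 'ghi789'])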
def get(self, project_id):
    project = Project.find_project_by_id(project_id)
    return agent.get_job_status(project)
def start_spider(self, job_instance):
    project = Project.find_project_by_id(job_instance.project_id)
    spider_name = job_instance.spider_name
    task_id = job_instance.id
    if job_instance.keywords is not None:
        keywords_list = job_instance.keywords.strip(',').split(',')
        for keywords in keywords_list:
            arguments = {}
            if job_instance.spider_arguments:
                arguments = dict(map(lambda x: x.split("="),
                                     job_instance.spider_arguments.split(",")))
            threshold = 0  # selection threshold
            leaders = []
            arguments['keywords'] = keywords
            arguments['video_time_short'] = job_instance.video_time_short
            arguments['video_time_long'] = job_instance.video_time_long
            if job_instance.upload_time_type == '设定区间':
                # Automatically set the optimal time parameters within the task's run period
                arguments['startDate'] = dts2ts(job_instance.upload_time_start_date)
                arguments['endDate'] = dts2ts(job_instance.upload_time_end_date)
            else:
                arguments['startDate'] = int(time.time()) - 3600 * 24 * job_instance.spider_freq - 3600 * 24
                arguments['endDate'] = int(time.time())
            # Pass the task id to the spider
            arguments['task_id'] = task_id
            daemon_size = len(self.spider_service_instances)
            if job_instance.priority == JobPriority.HIGH:
                threshold = int(daemon_size / 2)
            if job_instance.priority == JobPriority.HIGHEST:
                threshold = int(daemon_size)
            threshold = 1 if threshold == 0 else threshold
            candidates = self.spider_service_instances
            if 'daemon' in arguments:
                for candidate in candidates:
                    if candidate.server == arguments['daemon']:
                        leaders = [candidate]
            else:
                # TODO optimize some better func to vote the leader
                for i in range(threshold):
                    leaders.append(random.choice(candidates))
            for leader in leaders:
                print(project.project_name, spider_name, arguments)
                service_job_id = leader.start_spider(project.project_name, spider_name, arguments)
                job_execution = JobExecution()
                job_execution.project_id = job_instance.project_id
                job_execution.service_job_execution_id = service_job_id
                job_execution.job_instance_id = job_instance.id
                job_execution.create_time = datetime.datetime.now()
                job_execution.running_on = leader.server
                db.session.add(job_execution)
                db.session.commit()
    else:
        arguments = {}
        if job_instance.spider_arguments:
            arguments = dict(map(lambda x: x.split("="),
                                 job_instance.spider_arguments.split(",")))
        threshold = 0  # selection threshold
        leaders = []
        arguments['video_time_short'] = job_instance.video_time_short
        arguments['video_time_long'] = job_instance.video_time_long
        arguments['startDate'] = dts2ts(job_instance.upload_time_start_date)
        arguments['endDate'] = dts2ts(job_instance.upload_time_end_date)
        # Pass the task id to the spider
        arguments['task_id'] = task_id
        daemon_size = len(self.spider_service_instances)
        if job_instance.priority == JobPriority.HIGH:
            threshold = int(daemon_size / 2)
        if job_instance.priority == JobPriority.HIGHEST:
            threshold = int(daemon_size)
        threshold = 1 if threshold == 0 else threshold
        candidates = self.spider_service_instances
        if 'daemon' in arguments:
            for candidate in candidates:
                if candidate.server == arguments['daemon']:
                    leaders = [candidate]
        else:
            # TODO optimize some better func to vote the leader
            for i in range(threshold):
                leaders.append(random.choice(candidates))
        for leader in leaders:
            print(project.project_name, spider_name, arguments)
            service_job_id = leader.start_spider(project.project_name, spider_name, arguments)
            job_execution = JobExecution()
            job_execution.project_id = job_instance.project_id
            job_execution.service_job_execution_id = service_job_id
            job_execution.job_instance_id = job_instance.id
            job_execution.create_time = datetime.datetime.now()
            job_execution.running_on = leader.server
            db.session.add(job_execution)
            db.session.commit()