def start_spider(self, job_instance):
    """Launch *job_instance* on one or more scrapyd daemons.

    Higher-priority jobs fan out to more daemons (HIGH: half of them,
    HIGHEST: all of them); at least one daemon always runs the job.
    A JobExecution row is committed for every daemon the job started on.

    :param job_instance: the JobInstance describing the job to run
    :return: None
    """
    project = Project.find_project_by_id(job_instance.project_id)
    spider_name = job_instance.spider_name
    arguments = {}
    if job_instance.spider_arguments:
        # Split on the first '=' only so values may themselves contain
        # '=' (e.g. "url=http://host/?a=b").
        arguments = dict(
            arg.split("=", 1) for arg in job_instance.spider_arguments.split(","))
    threshold = 0
    daemon_size = len(self.spider_service_instances)
    if job_instance.priority == JobPriority.HIGH:
        threshold = int(daemon_size / 2)
    if job_instance.priority == JobPriority.HIGHEST:
        threshold = int(daemon_size)
    # Always dispatch to at least one daemon.
    threshold = 1 if threshold == 0 else threshold
    candidates = self.spider_service_instances
    leaders = []
    # TODO optimize some better func to vote the leader
    # NOTE(review): random.choice samples with replacement, so the same
    # daemon may be picked (and the job started on it) more than once.
    for _ in range(threshold):
        leaders.append(random.choice(candidates))
    for leader in leaders:
        service_job_id = leader.start_spider(project.project_name, spider_name, arguments)
        job_execution = JobExecution()
        job_execution.project_id = job_instance.project_id
        job_execution.service_job_execution_id = service_job_id
        job_execution.job_instance_id = job_instance.id
        job_execution.create_time = datetime.datetime.now()
        job_execution.running_on = leader.server
        db.session.add(job_execution)
        db.session.commit()
def create_job_execution(self, job, project_id):
    """Record a scrapyd job that is not yet known to SpiderKeeper.

    Creates a one-time JobInstance plus a matching JobExecution from the
    raw scrapyd job dict *job*. No-op when an execution with the same
    service_job_execution_id already exists.

    :param job: raw job dict from the scrapyd daemon (keys 'id', 'spider',
        'start_time', 'end_time')
    :param project_id: id of the SpiderKeeper project the job belongs to
    :return: None
    """
    # Local imports avoid a circular dependency at module import time.
    from SpiderKeeper.app.spider.model import JobExecution, JobInstance, JobRunType
    from SpiderKeeper.app import agent
    from SpiderKeeper.app import db
    execution_id = job.get('id', 0)
    if JobExecution.query.filter_by(
            service_job_execution_id=execution_id).first():
        return
    job_instance = JobInstance()
    job_instance.spider_name = job.get('spider', 'unknown')
    job_instance.project_id = project_id
    job_instance.spider_arguments = ''
    job_instance.priority = 0
    job_instance.run_type = JobRunType.ONETIME
    db.session.add(job_instance)
    # Commit now so job_instance.id is populated before it is used below.
    db.session.commit()
    job_execution = JobExecution()
    job_execution.project_id = project_id
    job_execution.service_job_execution_id = execution_id
    job_execution.create_time = self.convert_time(job, 'start_time')
    job_execution.end_time = self.convert_time(job, 'end_time')
    # NOTE(review): assumes at least one spider service instance is
    # registered with the agent — verify against startup code.
    job_execution.running_on = agent.spider_service_instances[0].server
    job_execution.job_instance = job_instance
    job_execution.job_instance_id = job_instance.id
    db.session.add(job_execution)
    db.session.commit()
def start_spider(self, job_instance):
    """Launch *job_instance* on one or more scrapyd daemons.

    A 'daemon' spider argument pins the job to that specific daemon;
    otherwise leaders are chosen at random, with priority controlling how
    many daemons are used. Bookkeeping arguments (project/job ids,
    priority, execute_ip, create_time) are forwarded to scrapyd so the
    spider can report back against the right execution record.

    :param job_instance: the JobInstance describing the job to run
    :return: None
    """
    project = Project.find_project_by_id(job_instance.project_id)
    spider_name = job_instance.spider_name
    arguments = {}
    if job_instance.spider_arguments:
        # Split on the first '=' only so values may themselves contain '='.
        arguments = dict(
            arg.split("=", 1) for arg in job_instance.spider_arguments.split(","))
    threshold = 0
    daemon_size = len(self.spider_service_instances)
    if job_instance.priority == JobPriority.HIGH:
        threshold = int(daemon_size / 2)
    if job_instance.priority == JobPriority.HIGHEST:
        threshold = int(daemon_size)
    # Always dispatch to at least one daemon.
    threshold = 1 if threshold == 0 else threshold
    candidates = self.spider_service_instances
    leaders = []
    if 'daemon' in arguments:
        # Pin the job to the explicitly requested daemon.
        for candidate in candidates:
            if candidate.server == arguments['daemon']:
                leaders = [candidate]
                break  # first match wins
    else:
        # TODO optimize some better func to vote the leader
        for _ in range(threshold):
            leaders.append(random.choice(candidates))
    for leader in leaders:
        # Extra bookkeeping arguments forwarded to scrapyd alongside the
        # user-supplied spider arguments.
        arguments['project_id'] = job_instance.project_id
        arguments['project_name'] = project.project_name
        # NOTE(review): this variant reads job_instance.job_instance_id
        # (not .id) — presumably the model differs here; confirm.
        arguments['job_instance_id'] = job_instance.job_instance_id
        arguments['priority'] = job_instance.priority
        arguments['args'] = job_instance.spider_arguments
        arguments['execute_ip'] = leader.server
        arguments['create_time'] = datetime.datetime.now()
        service_job_id = leader.start_spider(project.project_name, spider_name, arguments)
        job_execution = JobExecution()
        job_execution.project_id = job_instance.project_id
        job_execution.service_job_execution_id = service_job_id
        job_execution.job_instance_id = job_instance.job_instance_id
        job_execution.create_time = arguments['create_time']
        job_execution.running_on = leader.server
        db.session.add(job_execution)
        db.session.commit()
def start_spider(self, job_instance):
    """Launch *job_instance* on a single scrapyd daemon.

    Spider arguments are parsed as multi-valued: repeated keys accumulate
    their values into a list. A 'daemon' argument pins the job to the
    requested daemon; otherwise one daemon is chosen at random
    (priority-based fan-out is disabled in this variant).

    :param job_instance: the JobInstance describing the job to run
    :return: None
    """
    project = Project.find_project_by_id(job_instance.project_id)
    spider_name = job_instance.spider_name
    from collections import defaultdict
    # Multi-valued arguments: repeated keys collect all their values.
    arguments = defaultdict(list)
    if job_instance.spider_arguments:
        # Split on the first '=' only so values may themselves contain '='.
        for k, v in (arg.split('=', 1) for arg in job_instance.spider_arguments.split(',')):
            arguments[k].append(v)
    # Priority-based fan-out is intentionally disabled: always one daemon.
    threshold = 1
    candidates = self.spider_service_instances
    leaders = []
    if 'daemon' in arguments:
        for candidate in candidates:
            # arguments['daemon'] is a *list* of requested servers, so a
            # membership test (not ==) is required for the pin to match.
            if candidate.server in arguments['daemon']:
                leaders = [candidate]
                break  # first match wins
    else:
        # TODO optimize some better func to vote the leader
        for _ in range(threshold):
            leaders.append(random.choice(candidates))
    for leader in leaders:
        service_job_id = leader.start_spider(project.project_name, spider_name, arguments)
        job_execution = JobExecution()
        job_execution.project_id = job_instance.project_id
        job_execution.service_job_execution_id = service_job_id
        job_execution.job_instance_id = job_instance.id
        job_execution.create_time = datetime.datetime.now()
        job_execution.running_on = leader.server
        try:
            db.session.add(job_execution)
            db.session.commit()
        except BaseException:
            # Undo the partial insert, then surface the failure to the
            # caller (re-raised for every exception type, as before).
            db.session.rollback()
            raise
def start_spider(self, job_instance):
    """Launch *job_instance* on one or more scrapyd daemons.

    Spider arguments are parsed as multi-valued: repeated keys accumulate
    their values into a list. A 'daemon' argument pins the job to the
    requested daemon; otherwise leaders are chosen at random, with
    priority controlling how many daemons are used.

    :param job_instance: the JobInstance describing the job to run
    :return: None
    """
    project = Project.find_project_by_id(job_instance.project_id)
    spider_name = job_instance.spider_name
    from collections import defaultdict
    # Multi-valued arguments: repeated keys collect all their values.
    arguments = defaultdict(list)
    if job_instance.spider_arguments:
        # Split on the first '=' only so values may themselves contain '='.
        for k, v in (arg.split('=', 1) for arg in job_instance.spider_arguments.split(',')):
            arguments[k].append(v)
    threshold = 0
    daemon_size = len(self.spider_service_instances)
    if job_instance.priority == JobPriority.HIGH:
        threshold = int(daemon_size / 2)
    if job_instance.priority == JobPriority.HIGHEST:
        threshold = int(daemon_size)
    # Always dispatch to at least one daemon.
    threshold = 1 if threshold == 0 else threshold
    candidates = self.spider_service_instances
    leaders = []
    if 'daemon' in arguments:
        for candidate in candidates:
            # arguments['daemon'] is a *list* of requested servers, so a
            # membership test (not ==) is required for the pin to match.
            if candidate.server in arguments['daemon']:
                leaders = [candidate]
                break  # first match wins
    else:
        # TODO optimize some better func to vote the leader
        for _ in range(threshold):
            leaders.append(random.choice(candidates))
    for leader in leaders:
        service_job_id = leader.start_spider(project.project_name, spider_name, arguments)
        job_execution = JobExecution()
        job_execution.project_id = job_instance.project_id
        job_execution.service_job_execution_id = service_job_id
        job_execution.job_instance_id = job_instance.id
        job_execution.create_time = datetime.datetime.now()
        job_execution.running_on = leader.server
        db.session.add(job_execution)
        db.session.commit()
def start_spider(self, job_instance):
    """Launch *job_instance*, once per keyword when keywords are set.

    When the instance carries a comma-separated keyword list, one crawl is
    dispatched per keyword with that keyword injected into the spider
    arguments; otherwise a single crawl is dispatched. In both cases the
    video-length filters, the date window and the task id are forwarded to
    the spider, and a JobExecution row is recorded per launched crawl.

    :param job_instance: the JobInstance describing the job to run
    :return: None
    """
    project = Project.find_project_by_id(job_instance.project_id)
    spider_name = job_instance.spider_name
    task_id = job_instance.id
    if job_instance.keywords is not None:
        # One dispatch per keyword; a leading/trailing comma is tolerated.
        for keywords in job_instance.keywords.strip(',').split(','):
            arguments = self._build_arguments(job_instance, task_id)
            arguments['keywords'] = keywords
            if job_instance.upload_time_type == '设定区间':
                # Explicit upload-date window configured on the instance.
                arguments['startDate'] = dts2ts(job_instance.upload_time_start_date)
                arguments['endDate'] = dts2ts(job_instance.upload_time_end_date)
            else:
                # Rolling window: one crawl period plus one day of slack.
                arguments['startDate'] = int(time.time()) - 3600 * 24 * job_instance.spider_freq - 3600 * 24
                arguments['endDate'] = int(time.time())
            leaders = self._elect_leaders(arguments, job_instance.priority)
            self._dispatch(leaders, project, spider_name, arguments, job_instance)
    else:
        arguments = self._build_arguments(job_instance, task_id)
        arguments['startDate'] = dts2ts(job_instance.upload_time_start_date)
        arguments['endDate'] = dts2ts(job_instance.upload_time_end_date)
        leaders = self._elect_leaders(arguments, job_instance.priority)
        self._dispatch(leaders, project, spider_name, arguments, job_instance)

def _build_arguments(self, job_instance, task_id):
    """Parse spider_arguments and add the filter params common to all crawls."""
    arguments = {}
    if job_instance.spider_arguments:
        # Split on the first '=' only so values may themselves contain '='.
        arguments = dict(
            arg.split("=", 1) for arg in job_instance.spider_arguments.split(","))
    arguments['video_time_short'] = job_instance.video_time_short
    arguments['video_time_long'] = job_instance.video_time_long
    # Pass the task id through so the spider can report against it.
    arguments['task_id'] = task_id
    return arguments

def _elect_leaders(self, arguments, priority):
    """Pick the scrapyd daemons that will run the job."""
    daemon_size = len(self.spider_service_instances)
    threshold = 0
    if priority == JobPriority.HIGH:
        threshold = int(daemon_size / 2)
    if priority == JobPriority.HIGHEST:
        threshold = int(daemon_size)
    # Always dispatch to at least one daemon.
    threshold = 1 if threshold == 0 else threshold
    candidates = self.spider_service_instances
    leaders = []
    if 'daemon' in arguments:
        # Pin the job to the explicitly requested daemon.
        for candidate in candidates:
            if candidate.server == arguments['daemon']:
                leaders = [candidate]
                break  # first match wins
    else:
        # TODO optimize some better func to vote the leader
        for _ in range(threshold):
            leaders.append(random.choice(candidates))
    return leaders

def _dispatch(self, leaders, project, spider_name, arguments, job_instance):
    """Start the spider on every leader and record a JobExecution for each."""
    for leader in leaders:
        print(project.project_name, spider_name, arguments)
        service_job_id = leader.start_spider(project.project_name, spider_name, arguments)
        job_execution = JobExecution()
        job_execution.project_id = job_instance.project_id
        job_execution.service_job_execution_id = service_job_id
        job_execution.job_instance_id = job_instance.id
        job_execution.create_time = datetime.datetime.now()
        job_execution.running_on = leader.server
        db.session.add(job_execution)
        db.session.commit()
def start_spider(self, job_instance):
    """Start a crawl, launching the slave spiders first.

    The master spider is started only after at least one slave spider has
    started successfully. Single-machine projects (is_msd == '0') skip the
    slave phase entirely. The '>'-separated service_job_execution_id /
    running_on fields encode "master>slave1,slave2,..." so log retrieval
    can later locate every participating daemon.

    :param job_instance: the JobInstance describing the job to run
    :return: None
    """
    project = Project.find_project_by_id(job_instance.project_id)
    if project.is_msd == '0':
        # Single-machine spider: start it on the first master daemon that
        # accepts it, record the execution, and stop.
        spider_name = job_instance.spider_name
        for leader in self.spider_service_instances_master:
            serviec_job_id = leader.start_spider(project.project_name, spider_name)
            # Started successfully (daemon returned a job id).
            if serviec_job_id:
                job_execution = JobExecution()
                job_execution.project_id = job_instance.project_id
                # Trailing '>' keeps the master>slaves format with an
                # empty slave part.
                job_execution.service_job_execution_id = serviec_job_id + '>'
                job_execution.job_instance_id = job_instance.id
                job_execution.create_time = datetime.datetime.now()
                job_execution.running_on = leader.server + '>'
                db.session.add(job_execution)
                db.session.commit()
                break
    else:
        # Master spider name.
        spider_name_master = job_instance.spider_name
        # NOTE(review): assumes a SpiderInstance row exists for this
        # project/spider pair — .first() returning None would raise below.
        spider_instance = SpiderInstance.query.filter_by(
            project_id=job_instance.project_id, spider_name=spider_name_master).first()
        # Slave spider name.
        spider_name_slave = spider_instance.spider_name_slave
        # Whether at least one slave daemon started successfully.
        slave_flag = False
        # Slave job ids, collected for later log retrieval.
        serviec_job_id_slave = []
        # Servers the slaves are running on, for later log retrieval.
        running_on_slave = []
        # Try to start the slave spider on every slave daemon.
        for leader in self.spider_service_instances_slave:
            # Returns the daemon's job id on success, None otherwise.
            serviec_job_id = leader.start_spider(project.project_name, spider_name_slave)
            if serviec_job_id:
                slave_flag = True
                serviec_job_id_slave.append(serviec_job_id)
                running_on_slave.append(leader.server)
        # Flatten the lists for storage on the execution record.
        serviec_job_id_slave_str = ','.join(serviec_job_id_slave)
        running_on_slave_str = ','.join(running_on_slave)
        # Start the master only if at least one slave is up.
        if slave_flag:
            for leader in self.spider_service_instances_master:
                serviec_job_id = leader.start_spider(project.project_name, spider_name_master)
                # Started successfully: record master and slave ids together.
                if serviec_job_id:
                    job_execution = JobExecution()
                    job_execution.project_id = job_instance.project_id
                    job_execution.service_job_execution_id = serviec_job_id+'>'+serviec_job_id_slave_str
                    job_execution.job_instance_id = job_instance.id
                    job_execution.create_time = datetime.datetime.now()
                    job_execution.running_on = leader.server+'>'+running_on_slave_str
                    db.session.add(job_execution)
                    db.session.commit()
                    break