Example #1
 def start_spider(self, job_instance):
     project = Project.find_project_by_id(job_instance.project_id)
     spider_name = job_instance.spider_name
     arguments = {}
     if job_instance.spider_arguments:
         arguments = dict(
             map(lambda x: x.split("="),
                 job_instance.spider_arguments.split(",")))
     threshold = 0
     daemon_size = len(self.spider_service_instances)
     if job_instance.priority == JobPriority.HIGH:
         threshold = int(daemon_size / 2)
     if job_instance.priority == JobPriority.HIGHEST:
         threshold = int(daemon_size)
     threshold = 1 if threshold == 0 else threshold
     candidates = self.spider_service_instances
     leaders = []
     # TODO: use a better voting function to pick the leaders
     for i in range(threshold):
         leaders.append(random.choice(candidates))
     for leader in leaders:
         serviec_job_id = leader.start_spider(project.project_name,
                                              spider_name, arguments)
         job_execution = JobExecution()
         job_execution.project_id = job_instance.project_id
         job_execution.service_job_execution_id = serviec_job_id
         job_execution.job_instance_id = job_instance.id
         job_execution.create_time = datetime.datetime.now()
         job_execution.running_on = leader.server
         db.session.add(job_execution)
         db.session.commit()
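
To follow the dispatch logic in Example #1: spider_arguments is a comma-separated key=value string, and priority decides how many daemons receive the job. Below is a minimal, self-contained sketch of those two steps; the JobPriority values here are assumed stand-ins rather than imports from SpiderKeeper.

import random

class JobPriority:
    # Assumed stand-ins for SpiderKeeper's JobPriority constants.
    LOW, NORMAL, HIGH, HIGHEST = -1, 0, 1, 2

def parse_arguments(spider_arguments):
    # "depth=2,region=eu" -> {'depth': '2', 'region': 'eu'}
    # Note: a value containing '=' makes dict() raise ValueError here,
    # which is why Examples #4 and #5 switch to split('=', 1).
    if not spider_arguments:
        return {}
    return dict(pair.split("=") for pair in spider_arguments.split(","))

def pick_leaders(instances, priority):
    # HIGH -> half of the daemons, HIGHEST -> all of them, anything else -> one.
    threshold = 0
    if priority == JobPriority.HIGH:
        threshold = len(instances) // 2
    if priority == JobPriority.HIGHEST:
        threshold = len(instances)
    threshold = threshold or 1
    # random.choice samples with replacement, so the same daemon can be picked twice.
    return [random.choice(instances) for _ in range(threshold)]

print(parse_arguments("depth=2,region=eu"))
print(pick_leaders(["daemon-1", "daemon-2", "daemon-3", "daemon-4"], JobPriority.HIGH))
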
Example #2
    def create_job_execution(self, job, project_id):
        from SpiderKeeper.app.spider.model import JobExecution, JobInstance, JobRunType
        from SpiderKeeper.app import agent
        from SpiderKeeper.app import db

        execution_id = job.get('id', 0)

        if JobExecution.query.filter_by(
                service_job_execution_id=execution_id).first():
            return

        job_instance = JobInstance()
        job_instance.spider_name = job.get('spider', 'unknown')
        job_instance.project_id = project_id
        job_instance.spider_arguments = ''
        job_instance.priority = 0
        job_instance.run_type = JobRunType.ONETIME
        db.session.add(job_instance)
        db.session.commit()

        job_execution = JobExecution()
        job_execution.project_id = project_id
        job_execution.service_job_execution_id = execution_id
        job_execution.job_instance_id = 0
        job_execution.create_time = self.convert_time(job, 'start_time')
        job_execution.end_time = self.convert_time(job, 'end_time')
        job_execution.running_on = agent.spider_service_instances[0].server
        job_execution.job_instance = job_instance
        job_execution.job_instance_id = job_instance.id
        db.session.add(job_execution)
        db.session.commit()
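
Example #2 calls a convert_time helper that is not shown above. Here is a minimal sketch of what such a helper presumably does, assuming scrapyd-style timestamp strings in the job dict; the accepted formats are an assumption.

import datetime

def convert_time(job, key):
    # Hypothetical: parse '2023-01-05 12:30:00.123456' (or the same without
    # microseconds) into a datetime, returning None if the field is missing.
    value = job.get(key)
    if not value:
        return None
    for fmt in ('%Y-%m-%d %H:%M:%S.%f', '%Y-%m-%d %H:%M:%S'):
        try:
            return datetime.datetime.strptime(value, fmt)
        except ValueError:
            continue
    return None

print(convert_time({'start_time': '2023-01-05 12:30:00.123456'}, 'start_time'))
print(convert_time({}, 'end_time'))  # None
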
Example #3
 def start_spider(self, job_instance):
     project = Project.find_project_by_id(job_instance.project_id)
     # job_execution = JobExecution.find_job_by_service_id(job_instance.project_id)
     spider_name = job_instance.spider_name
     arguments = {}
     if job_instance.spider_arguments:
         arguments = dict(
             map(lambda x: x.split("="),
                 job_instance.spider_arguments.split(",")))
     threshold = 0
     daemon_size = len(self.spider_service_instances)
     if job_instance.priority == JobPriority.HIGH:
         threshold = int(daemon_size / 2)
     if job_instance.priority == JobPriority.HIGHEST:
         threshold = int(daemon_size)
     threshold = 1 if threshold == 0 else threshold
     candidates = self.spider_service_instances
     leaders = []
     if 'daemon' in arguments:
         for candidate in candidates:
             if candidate.server == arguments['daemon']:
                 leaders = [candidate]
     else:
         # TODO: use a better voting function to pick the leaders
         for i in range(threshold):
             leaders.append(random.choice(candidates))
     for leader in leaders:
         # add more arguments to scrapyd to run a spider
         arguments['project_id'] = job_instance.project_id
         arguments['project_name'] = project.project_name
         arguments['job_instance_id'] = job_instance.job_instance_id
         arguments['priority'] = job_instance.priority
         arguments['args'] = job_instance.spider_arguments
         arguments['execute_ip'] = leader.server
         arguments['create_time'] = datetime.datetime.now()
         serviec_job_id = leader.start_spider(project.project_name,
                                              spider_name, arguments)
         job_execution = JobExecution()
         job_execution.project_id = job_instance.project_id
         job_execution.service_job_execution_id = serviec_job_id
         job_execution.job_instance_id = job_instance.job_instance_id
         job_execution.create_time = arguments['create_time']
         job_execution.running_on = leader.server
         db.session.add(job_execution)
         db.session.commit()
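
Example #3 forwards extra bookkeeping fields (project_id, job_instance_id, execute_ip, ...) to leader.start_spider. The proxy class behind spider_service_instances is not shown in these examples, so the sketch below is only a rough approximation of how a scrapyd-backed instance could schedule the job; apart from scrapyd's schedule.json endpoint, everything here is an assumption.

import requests

def start_spider(server, project_name, spider_name, arguments):
    # Hypothetical proxy method: POST to scrapyd's schedule.json with the
    # project, the spider and the extra arguments as spider kwargs.
    data = {'project': project_name, 'spider': spider_name}
    data.update({k: str(v) for k, v in arguments.items()})
    try:
        response = requests.post('http://%s/schedule.json' % server, data=data, timeout=10)
        result = response.json()
        return result.get('jobid') if result.get('status') == 'ok' else None
    except requests.RequestException:
        # Unreachable daemon: callers treat a falsy return as "not started".
        return None

# job_id = start_spider('localhost:6800', 'my_project', 'my_spider', {'project_id': 1})
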
Example #4
 def start_spider(self, job_instance):
     project = Project.find_project_by_id(job_instance.project_id)
     spider_name = job_instance.spider_name
     #arguments = {}
     #if job_instance.spider_arguments:
     #    arguments = dict(map(lambda x: x.split("="), job_instance.spider_arguments.split(",")))
     from collections import defaultdict
     arguments = defaultdict(list)
     if job_instance.spider_arguments:
         for k, v in list(map(lambda x: x.split('=', 1), job_instance.spider_arguments.split(','))):
             arguments[k].append(v)
     # threshold = 0
     # daemon_size = len(self.spider_service_instances)
     # if job_instance.priority == JobPriority.HIGH:
     #     threshold = int(daemon_size / 2)
     # if job_instance.priority == JobPriority.HIGHEST:
     #     threshold = int(daemon_size)
     # threshold = 1 if threshold == 0 else threshold
     threshold = 1
     candidates = self.spider_service_instances
     leaders = []
     if 'daemon' in arguments:
         for candidate in candidates:
             if candidate.server in arguments['daemon']:  # values are lists under defaultdict(list)
                 leaders = [candidate]
     else:
         # TODO: use a better voting function to pick the leaders
         for i in range(threshold):
             leaders.append(random.choice(candidates))
     for leader in leaders:
         serviec_job_id = leader.start_spider(project.project_name, spider_name, arguments)
         job_execution = JobExecution()
         job_execution.project_id = job_instance.project_id
         job_execution.service_job_execution_id = serviec_job_id
         job_execution.job_instance_id = job_instance.id
         job_execution.create_time = datetime.datetime.now()
         job_execution.running_on = leader.server
         try:
             db.session.add(job_execution)
             db.session.commit()
         except:
             db.session.rollback()
             raise
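
Example #4 (and Example #5 below) replaces the plain-dict parsing with a defaultdict(list) and splits on the first '=' only, so values may themselves contain '=' and repeated keys accumulate instead of being dropped. A small sketch of the difference, using a made-up argument string:

from collections import defaultdict

raw = "url=http://example.com/page?a=1,tag=python,tag=scrapy"

# Plain-dict parsing (Examples #1-#3) fails on this input:
# dict(map(lambda x: x.split("="), raw.split(","))) raises ValueError,
# because "url=http://example.com/page?a=1" splits into three parts.

# defaultdict parsing (Examples #4-#5): split on the first '=' only, collect per key.
arguments = defaultdict(list)
for k, v in (pair.split('=', 1) for pair in raw.split(',')):
    arguments[k].append(v)

print(dict(arguments))
# {'url': ['http://example.com/page?a=1'], 'tag': ['python', 'scrapy']}
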
Example #5
 def start_spider(self, job_instance):
     project = Project.find_project_by_id(job_instance.project_id)
     spider_name = job_instance.spider_name
     #arguments = {}
     #if job_instance.spider_arguments:
     #    arguments = dict(map(lambda x: x.split("="), job_instance.spider_arguments.split(",")))
     from collections import defaultdict
     arguments = defaultdict(list)
     if job_instance.spider_arguments:
         for k, v in list(map(lambda x: x.split('=', 1), job_instance.spider_arguments.split(','))):
             arguments[k].append(v)
     threshold = 0
     daemon_size = len(self.spider_service_instances)
     if job_instance.priority == JobPriority.HIGH:
         threshold = int(daemon_size / 2)
     if job_instance.priority == JobPriority.HIGHEST:
         threshold = int(daemon_size)
     threshold = 1 if threshold == 0 else threshold
     candidates = self.spider_service_instances
     leaders = []
     if 'daemon' in arguments:
         for candidate in candidates:
             if candidate.server in arguments['daemon']:  # values are lists under defaultdict(list)
                 leaders = [candidate]
     else:
         # TODO: use a better voting function to pick the leaders
         for i in range(threshold):
             leaders.append(random.choice(candidates))
     for leader in leaders:
         serviec_job_id = leader.start_spider(project.project_name, spider_name, arguments)
         job_execution = JobExecution()
         job_execution.project_id = job_instance.project_id
         job_execution.service_job_execution_id = serviec_job_id
         job_execution.job_instance_id = job_instance.id
         job_execution.create_time = datetime.datetime.now()
         job_execution.running_on = leader.server
         db.session.add(job_execution)
         db.session.commit()
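
One caveat shared by these examples: random.choice in a loop samples with replacement, so when the threshold is greater than one the same daemon can be drawn twice and the job is started on it twice. A hedged sketch of a duplicate-free alternative for the TODO above (not what these examples do, just one option):

import random

def pick_leaders(candidates, threshold):
    # Pick `threshold` distinct daemons; cap at the pool size so sample() never fails.
    threshold = min(max(threshold, 1), len(candidates))
    return random.sample(candidates, threshold)

print(pick_leaders(['daemon-1', 'daemon-2', 'daemon-3'], 2))  # two distinct daemons
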
Example #6
 def start_spider(self, job_instance):
     project = Project.find_project_by_id(job_instance.project_id)
     spider_name = job_instance.spider_name
     task_id = job_instance.id
     if job_instance.keywords is not None:
         keywords_list = job_instance.keywords.strip(',').split(',')
         for keywords in keywords_list:
             arguments = {}
             if job_instance.spider_arguments:
                 arguments = dict(map(lambda x: x.split("="), job_instance.spider_arguments.split(",")))
             threshold = 0       # threshold: how many daemons get the job
             leaders = []
             arguments['keywords'] = keywords
             arguments['video_time_short'] = job_instance.video_time_short
             arguments['video_time_long'] = job_instance.video_time_long
             if job_instance.upload_time_type == '设定区间':                 # fixed, user-defined time window; otherwise derive it from the task's run frequency
                 arguments['startDate'] = dts2ts(job_instance.upload_time_start_date)
                 arguments['endDate'] = dts2ts(job_instance.upload_time_end_date)
             else:
                 arguments['startDate'] = int(time.time()) - 3600*24*job_instance.spider_freq - 3600*24
                 arguments['endDate'] = int(time.time())
             arguments['task_id'] = task_id   # pass the task id to the spider
             daemon_size = len(self.spider_service_instances)
             if job_instance.priority == JobPriority.HIGH:
                 threshold = int(daemon_size / 2)
             if job_instance.priority == JobPriority.HIGHEST:
                 threshold = int(daemon_size)
             threshold = 1 if threshold == 0 else threshold
             candidates = self.spider_service_instances
             if 'daemon' in arguments:
                 for candidate in candidates:
                     if candidate.server == arguments['daemon']:
                         leaders = [candidate]
             else:
                 # TODO: use a better voting function to pick the leaders
                 for i in range(threshold):
                     leaders.append(random.choice(candidates))
             for leader in leaders:
                 print(project.project_name, spider_name, arguments)
                 serviec_job_id = leader.start_spider(project.project_name, spider_name, arguments)
                 job_execution = JobExecution()
                 job_execution.project_id = job_instance.project_id
                 job_execution.service_job_execution_id = serviec_job_id
                 job_execution.job_instance_id = job_instance.id
                 job_execution.create_time = datetime.datetime.now()
                 job_execution.running_on = leader.server
                 db.session.add(job_execution)
                 db.session.commit()
     else:
         arguments = {}
         if job_instance.spider_arguments:
             arguments = dict(map(lambda x: x.split("="), job_instance.spider_arguments.split(",")))
         threshold = 0  # threshold: how many daemons get the job
         leaders = []
         arguments['video_time_short'] = job_instance.video_time_short
         arguments['video_time_long'] = job_instance.video_time_long
         arguments['startDate'] = dts2ts(job_instance.upload_time_start_date)
         arguments['endDate'] = dts2ts(job_instance.upload_time_end_date)
         arguments['task_id'] = task_id  # pass the task id to the spider
         daemon_size = len(self.spider_service_instances)
         if job_instance.priority == JobPriority.HIGH:
             threshold = int(daemon_size / 2)
         if job_instance.priority == JobPriority.HIGHEST:
             threshold = int(daemon_size)
         threshold = 1 if threshold == 0 else threshold
         candidates = self.spider_service_instances
         if 'daemon' in arguments:
             for candidate in candidates:
                 if candidate.server == arguments['daemon']:
                     leaders = [candidate]
         else:
             # TODO: use a better voting function to pick the leaders
             for i in range(threshold):
                 leaders.append(random.choice(candidates))
         for leader in leaders:
             print(project.project_name, spider_name, arguments)
             serviec_job_id = leader.start_spider(project.project_name, spider_name, arguments)
             job_execution = JobExecution()
             job_execution.project_id = job_instance.project_id
             job_execution.service_job_execution_id = serviec_job_id
             job_execution.job_instance_id = job_instance.id
             job_execution.create_time = datetime.datetime.now()
             job_execution.running_on = leader.server
             db.session.add(job_execution)
             db.session.commit()
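
The dispatcher above relies on a dts2ts helper that is not shown in these examples. A minimal sketch of what it presumably does, assuming it turns a datetime (or a 'YYYY-MM-DD HH:MM:SS' string) into an integer Unix timestamp; the accepted input formats are an assumption.

import datetime
import time

def dts2ts(value):
    # Hypothetical: normalise a datetime or a 'YYYY-MM-DD HH:MM:SS' string to an
    # integer Unix timestamp, matching the int seconds used for startDate/endDate.
    if isinstance(value, str):
        value = datetime.datetime.strptime(value, '%Y-%m-%d %H:%M:%S')
    return int(time.mktime(value.timetuple()))

print(dts2ts('2023-01-05 00:00:00'))
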
 def start_spider(self, job_instance):
     """
     功能: 启动爬虫,首先启动从爬虫, 至少有一个从爬虫启动成功后启动主爬虫
     :param job_instance: job_instance对象
     :return: None
     """
     project = Project.find_project_by_id(job_instance.project_id)
     if project.is_msd == '0':  # standalone (single-machine) spider
         spider_name = job_instance.spider_name
         for leader in self.spider_service_instances_master:
             serviec_job_id = leader.start_spider(project.project_name, spider_name)
             # if the start succeeded
             if serviec_job_id:
                 job_execution = JobExecution()
                 job_execution.project_id = job_instance.project_id
                 job_execution.service_job_execution_id = serviec_job_id + '>'
                 job_execution.job_instance_id = job_instance.id
                 job_execution.create_time = datetime.datetime.now()
                 job_execution.running_on = leader.server + '>'
                 db.session.add(job_execution)
                 db.session.commit()
                 break
     else:
         # master spider name
         spider_name_master = job_instance.spider_name
         spider_instance = SpiderInstance.query.filter_by(
             project_id=job_instance.project_id, spider_name=spider_name_master).first()
         # slave spider name
         spider_name_slave = spider_instance.spider_name_slave
         # flag: set to True once at least one slave spider server starts successfully
         slave_flag = False
         # the slave spiders' job execution id list
         serviec_job_id_slave = []
         # list of the servers the slave spiders are running on
         running_on_slave = []
         # iterate over the slave spider servers
         for leader in self.spider_service_instances_slave:
             # start the spider: returns a job id on success, otherwise None
             serviec_job_id = leader.start_spider(project.project_name, spider_name_slave)
             # if the start succeeded
             if serviec_job_id:
                 # mark that at least one slave started
                 slave_flag = True
                 # record the job id, so its logs can be fetched later
                 serviec_job_id_slave.append(serviec_job_id)
                 # record the server it is running on, also for log retrieval
                 running_on_slave.append(leader.server)
         # join the lists into strings
         serviec_job_id_slave_str = ','.join(serviec_job_id_slave)
         running_on_slave_str = ','.join(running_on_slave)
         # if at least one slave spider started successfully, start the master spider
         if slave_flag:
             for leader in self.spider_service_instances_master:
                 serviec_job_id = leader.start_spider(project.project_name, spider_name_master)
                 # if the start succeeded
                 if serviec_job_id:
                     job_execution = JobExecution()
                     job_execution.project_id = job_instance.project_id
                     job_execution.service_job_execution_id = serviec_job_id+'>'+serviec_job_id_slave_str
                     job_execution.job_instance_id = job_instance.id
                     job_execution.create_time = datetime.datetime.now()
                     job_execution.running_on = leader.server+'>'+running_on_slave_str
                     db.session.add(job_execution)
                     db.session.commit()
                     break
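
The master/slave variant above packs the master job id and the comma-joined slave job ids into a single service_job_execution_id string separated by '>', and does the same for running_on. A small sketch of how such a packed value can be unpacked later; the packing format follows the code above, while the helper itself is only illustrative.

def split_master_slave(packed):
    # 'master_id>slave_id1,slave_id2' -> ('master_id', ['slave_id1', 'slave_id2'])
    # 'master_id>' (the single-machine case) -> ('master_id', [])
    master, _, slaves = packed.partition('>')
    return master, [s for s in slaves.split(',') if s]

print(split_master_slave('abc123>def456,ghi789'))
print(split_master_slave('abc123>'))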