Example 1
 def start(self):
     try:
         self._processor.logger.info("START %s SUCCESS" % self._spider_id)
         self._spider_status = 'start'
         self._queue = PriorityQueue(self._processor)
         # print(self._processor.spider_start_requests)
         if not self._processor.start_requests:
             self._processor.init_start_requests()
         print('START_REQUESTS COUNT ::: %s' %
               str(len(self._processor.start_requests)))
         for start_request in self._processor.start_requests:
             if self._should_follow(start_request):
                 start_request.duplicate_remove = False
                 self._queue.push(start_request)
                 self._processor.logger.info("start request:>>>>>>" +
                                             str(start_request))
         # Remove the start_requests already pushed to the queue so they do not pollute later processors
         self._processor.start_requests.clear()
         for batch in self._batch_requests():
             if len(batch) > 0:
                 self._crawl(batch)
                 if self.test:
                     if self._process_count > 0:
                         return
             if self._spider_status == 'stopping':
                 break
         self._spider_status = 'stopped'
         self._processor.logger.info("STOP %s SUCCESS" % self._spider_id)
     except Exception:
         self._processor.logger.info(
             "%s -- Exception -- Stopped -- %s" %
             (self._spider_id, traceback.format_exc()))
         self._spider_status = 'stopped'
Example 2
    def start(self):
        try:
            FetchManLogger.logger.info("START %s SUCCESS" % self._spider_id)
            self._spider_status = 'start'
            # Log the spider start

            self._queue = PriorityQueue(self._processor)
            if not self._processor.start_requests:
                self._processor.init_start_requests()
            for start_request in self._processor.start_requests:
                if self._should_follow(start_request):
                    start_request.duplicate_remove = False
                    self._queue.push(start_request)
                    FetchManLogger.logger.info("start request:" +
                                               str(start_request))
            for batch in self._batch_requests():
                print(batch)
                if len(batch) > 0:
                    self._crawl(batch)
                    if self.test:
                        if self._process_count > 0:
                            return
                if self._spider_status == 'stopping':
                    break
            # Loop finished; log the spider stop

            self._process_count = 0
            self._spider_status = 'stopped'
            FetchManLogger.logger.info("STOP %s SUCCESS" % self._spider_id)
        except Exception:
            FetchManLogger.logger.info(
                "%s -- Exception -- Stopped -- %s" %
                (self._spider_id, traceback.format_exc()))
            self._spider_status = 'stopped'
Example 3
    def __init__(self, processor=None, downloader=None, use_proxy=False, scheduler=None, batch_size=None,
                 time_sleep=None, test=False):
        # For testing: stop after the first item is crawled successfully
        self.test = test
        self._processor = processor
        FetchManLogger.init_logger(processor.spider_id)
        self._host_regex = self._get_host_regex()
        self._spider_status = 'stopped'
        self._pipelines = {}
        self._time_sleep = time_sleep
        if time_sleep:
            self._batch_size = 0
        else:
            if isinstance(downloader, SeleniumDownLoader):
                self._batch_size = default_settings.DRIVER_POOL_SIZE - 1
            else:
                if batch_size:
                    self._batch_size = batch_size - 1
                else:
                    self._batch_size = 9
        self._spider_id = processor.spider_id
        self._process_count = 0

        if not downloader:
            self._downloader = RequestsDownLoader(use_proxy=use_proxy)
        elif isinstance(downloader, SeleniumDownLoader):
            self._downloader = downloader
            self._batch_size = default_settings.DRIVER_POOL_SIZE - 1
        else:
            self._downloader = downloader

        if not scheduler:
            self._queue = PriorityQueue(self._processor)
        else:
            self._queue = scheduler
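
The constructor above derives the effective batch size from time_sleep, the downloader type, and the explicit batch_size argument; note that this version stores batch_size - 1 (default 9), while the variant in Example 5 stores the count directly. A small stand-alone sketch of that branching, with the Selenium flag and pool size as made-up inputs:

def effective_batch_size(time_sleep=None, is_selenium=False,
                         batch_size=None, driver_pool_size=5):
    # Mirrors the branching in __init__ (Example 3): sleep-throttled spiders
    # use a minimal batch, Selenium is capped by the driver pool, otherwise
    # the explicit batch_size (minus one in this version) or a default applies.
    if time_sleep:
        return 0
    if is_selenium:
        return driver_pool_size - 1
    if batch_size:
        return batch_size - 1
    return 9

print(effective_batch_size(time_sleep=1))      # 0
print(effective_batch_size(is_selenium=True))  # 4
print(effective_batch_size(batch_size=20))     # 19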
Example 4
 def push_request(cls, requests):
     if isinstance(requests, list):
         queue = PriorityQueue(cls)
         for request in requests:
             if isinstance(request, Request):
                 request.duplicate_remove = False
                 queue.push(request)
                 FetchManLogger.logger.info("push request to queue :" + str(request))
             else:
                 FetchManLogger.logger.info("param is not Request!")
     elif isinstance(requests, Request):
         queue = PriorityQueue(cls)
         requests.duplicate_remove = False
         queue.push(requests)
         FetchManLogger.logger.info("push request to queue :" + str(requests))
     else:
         FetchManLogger.logger.info("param is not Request!")
Example 5
class SpiderCore(object):
    def __init__(self,
                 processor=None,
                 downloader=None,
                 use_proxy=False,
                 scheduler=None,
                 batch_size=None,
                 time_sleep=None,
                 test=False):
        # For testing: stop after the first item is crawled successfully
        self.test = test
        self._processor = processor
        self._host_regex = self._get_host_regex()
        self._spider_status = 'stopped'
        # self._pipelines = {}
        self._time_sleep = time_sleep
        self._batch_size = 1
        if time_sleep:
            self._batch_size = 1
        else:
            if isinstance(downloader, SeleniumDownLoader):
                self._batch_size = default_settings.DRIVER_POOL_SIZE - 1
            else:
                if batch_size:
                    self._batch_size = batch_size
                else:
                    self._batch_size = 10
        self._spider_id = processor.spider_id
        self._process_count = 0

        if not downloader:
            self._downloader = RequestsDownLoader(use_proxy=use_proxy)
        elif isinstance(downloader, SeleniumDownLoader):
            self._downloader = downloader
            self._batch_size = default_settings.DRIVER_POOL_SIZE - 1
        else:
            self._downloader = downloader

        if not scheduler:
            self._queue = PriorityQueue(self._processor)
        else:
            self._queue = scheduler

    def create(self, processor):
        self._processor = processor
        return self

    def set_scheduler(self, scheduler):
        self._queue = scheduler
        return self

    def set_downloader(self, downloader):
        self._downloader = downloader
        if isinstance(downloader, SeleniumDownLoader):
            self._batch_size = default_settings.DRIVER_POOL_SIZE - 1
        return self

    def stop(self):
        if self._spider_status == 'stopped':
            self._processor.logger.info("STOP %s SUCCESS" % self._spider_id)
            return
        elif self._spider_status == 'stopping':
            while self._spider_status == 'stopping':
                pass
        elif self._spider_status == 'start':
            self._spider_status = 'stopping'
            while self._spider_status == 'stopping':
                pass

    def start(self):
        try:
            self._processor.logger.info("START %s SUCCESS" % self._spider_id)
            self._spider_status = 'start'
            self._queue = PriorityQueue(self._processor)
            # print(self._processor.spider_start_requests)
            if not self._processor.start_requests:
                self._processor.init_start_requests()
            print('START_REQUESTS COUNT ::: %s' %
                  str(len(self._processor.start_requests)))
            for start_request in self._processor.start_requests:
                if self._should_follow(start_request):
                    start_request.duplicate_remove = False
                    self._queue.push(start_request)
                    self._processor.logger.info("start request:>>>>>>" +
                                                str(start_request))
            # Remove the start_requests already pushed to the queue so they do not pollute later processors
            self._processor.start_requests.clear()
            for batch in self._batch_requests():
                if len(batch) > 0:
                    self._crawl(batch)
                    if self.test:
                        if self._process_count > 0:
                            return
                if self._spider_status == 'stopping':
                    break
            self._spider_status = 'stopped'
            self._processor.logger.info("STOP %s SUCCESS" % self._spider_id)
        except Exception:
            self._processor.logger.info(
                "%s -- Exception -- Stopped -- %s" %
                (self._spider_id, traceback.format_exc()))
            self._spider_status = 'stopped'

    def restart(self):
        self._queue = PriorityQueue(self._processor)
        self._queue.clear()
        self.start()

    def _batch_requests(self):
        batch = []
        count = 0
        retry = 0
        try:
            while True:
                count += 1
                if len(batch) >= self._batch_size or count >= self._batch_size:
                    batch.sort(key=_priority_compare_key, reverse=True)
                    yield batch
                    batch = []
                    count = 0
                temp_request = self._queue.pop()
                queue_count = len(self._queue)
                if temp_request is not None:
                    if not temp_request.callback:
                        temp_request.callback = self._processor.process
                    batch.append(temp_request)
                elif len(batch) == 0 and queue_count == 0:  # wait a few rounds to see whether the queue gets more requests
                    time.sleep(self._time_sleep)
                    retry += 1
                    if retry >= 3:
                        return []
        except KeyboardInterrupt:
            pass

    def _crawl(self, batch):
        responses = self._downloader.download(batch)
        if self._time_sleep:
            time.sleep(self._time_sleep)
        for response in responses:
            self._processor.logger.info(response)
            callback = response.request.callback(response)
            if isinstance(callback, types.GeneratorType):
                pipe = self._queue.get_pipe()
                for item in callback:
                    if isinstance(item, Request):
                        # logger.info("push request to queue..." + str(item))
                        if self._should_follow(item):
                            self._queue.push_pipe(item, pipe)
                    elif isinstance(item, PipeItem):
                        # If the yielded object is a PipeItem, handle it with the matching pipelines
                        self._process_count += 1
                        for pipe_name in item.pipe_names:
                            queue_job(PIPELINE_TASK,
                                      PipelineArgs(pipe_name, item.result),
                                      queue=PIPELINE)
                        if self.test:
                            if self._process_count > 0:
                                return
                    elif isinstance(
                            item,
                            Violet):  # a Violet carries the detail-page processor and its request
                        queue_job(CRAWLER_TASK,
                                  CrawlArgs(item.processor, item.request),
                                  queue=CRAWLER)
                    else:
                        raise Exception('not return correct value!!!')
                pipe.execute()
            elif isinstance(callback, Request):
                # logger.info("push request to queue..." + str(back))
                if self._should_follow(callback):
                    self._queue.push(callback)
            elif isinstance(callback, PipeItem):
                # If the returned object is a PipeItem, handle it with the matching pipelines
                self._process_count += 1
                for pipe_name in callback.pipe_names:
                    queue_job(PIPELINE_TASK,
                              PipelineArgs(pipe_name, callback.result),
                              queue=PIPELINE)
            elif isinstance(callback,
                            Violet):  # a Violet carries the detail-page processor and its request
                queue_job(CRAWLER_TASK,
                          CrawlArgs(callback.processor, callback.request),
                          queue=CRAWLER)
            else:
                # Any other return type is treated as an error
                raise Exception('not return correct value!!!')

    def _should_follow(self, request):
        regex = self._host_regex
        # hostname can be None for wrong urls (like javascript links)
        host = urlparse_cached(request).hostname or ''
        return bool(regex.search(host))

    def _get_host_regex(self):
        """Override this method to implement a different offsite policy"""
        allowed_domains = getattr(self._processor, 'allowed_domains', None)
        if not allowed_domains:
            return re.compile('')  # allow all by default
        regex = r'^(.*\.)?(%s)$' % '|'.join(
            re.escape(d) for d in allowed_domains if d is not None)
        return re.compile(regex)
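
The _get_host_regex / _should_follow pair at the end of Example 5 implements the offsite filter: a request is followed only if its hostname matches one of the processor's allowed_domains, and an empty domain list allows every host. A minimal self-contained sketch of the same idea, using only the standard library (the domain list and URLs below are made up for illustration):

import re
from urllib.parse import urlparse

def build_host_regex(allowed_domains):
    # An empty list means "allow everything", mirroring _get_host_regex above.
    if not allowed_domains:
        return re.compile('')
    pattern = r'^(.*\.)?(%s)$' % '|'.join(
        re.escape(d) for d in allowed_domains if d is not None)
    return re.compile(pattern)

def should_follow(url, host_regex):
    # hostname can be None for malformed URLs (e.g. javascript: links).
    host = urlparse(url).hostname or ''
    return bool(host_regex.search(host))

regex = build_host_regex(['example.com'])                  # hypothetical domain list
print(should_follow('https://www.example.com/a', regex))   # True
print(should_follow('https://evil.org/b', regex))          # False

Subdomains match because of the optional (.*\.) prefix in the generated pattern.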
Example 6
 def restart(self):
     self._queue = PriorityQueue(self._processor)
     self._queue.clear()
     self.start()
Example 7
class SpiderCore(object):
    def __init__(self,
                 processor=None,
                 downloader=None,
                 use_proxy=False,
                 scheduler=None,
                 batch_size=None,
                 time_sleep=None,
                 test=False):
        # For testing: stop after the first item is crawled successfully
        self.test = test
        self._processor = processor
        FetchManLogger.init_logger(processor.spider_id)
        self._host_regex = self._get_host_regex()
        self._spider_status = 'stopped'
        self._pipelines = {}
        self._time_sleep = time_sleep
        if time_sleep:
            self._batch_size = 0
        else:
            if isinstance(downloader, SeleniumDownLoader):
                self._batch_size = default_settings.DRIVER_POOL_SIZE - 1
            else:
                if batch_size:
                    self._batch_size = batch_size - 1
                else:
                    self._batch_size = 9
        self._spider_id = processor.spider_id
        self._process_count = 0

        if not downloader:
            self._downloader = RequestsDownLoader(use_proxy=use_proxy)
        elif isinstance(downloader, SeleniumDownLoader):
            self._downloader = downloader
            self._batch_size = default_settings.DRIVER_POOL_SIZE - 1
        else:
            self._downloader = downloader

        if not scheduler:
            self._queue = PriorityQueue(self._processor)
        else:
            self._queue = scheduler

    def create(self, processor):
        self._processor = processor
        return self

    def set_scheduler(self, scheduler):
        self._queue = scheduler
        return self

    def set_downloader(self, downloader):
        self._downloader = downloader
        if isinstance(downloader, SeleniumDownLoader):
            self._batch_size = default_settings.DRIVER_POOL_SIZE - 1
        return self

    def set_pipeline(
        self,
        pipeline=None,
        pipeline_name=None,
    ):
        if not pipeline_name:
            pipeline_name = str(uuid.uuid1())
        self._pipelines[pipeline_name] = pipeline
        return self

    def stop(self):
        if self._spider_status == 'stopped':
            FetchManLogger.logger.info("STOP %s SUCCESS" % self._spider_id)
            return
        elif self._spider_status == 'stopping':
            while self._spider_status == 'stopping':
                pass
        elif self._spider_status == 'start':
            self._spider_status = 'stopping'
            while self._spider_status == 'stopping':
                pass

    def start(self):
        try:
            FetchManLogger.logger.info("START %s SUCCESS" % self._spider_id)
            self._spider_status = 'start'
            # Log the spider start

            self._queue = PriorityQueue(self._processor)
            if not self._processor.start_requests:
                self._processor.init_start_requests()
            for start_request in self._processor.start_requests:
                if self._should_follow(start_request):
                    start_request.duplicate_remove = False
                    self._queue.push(start_request)
                    FetchManLogger.logger.info("start request:" +
                                               str(start_request))
            for batch in self._batch_requests():
                print(batch)
                if len(batch) > 0:
                    self._crawl(batch)
                    if self.test:
                        if self._process_count > 0:
                            return
                if self._spider_status == 'stopping':
                    break
            # Loop finished; log the spider stop

            self._process_count = 0
            self._spider_status = 'stopped'
            FetchManLogger.logger.info("STOP %s SUCCESS" % self._spider_id)
        except Exception:
            FetchManLogger.logger.info(
                "%s -- Exception -- Stopped -- %s" %
                (self._spider_id, traceback.format_exc()))
            self._spider_status = 'stopped'

    def restart(self):
        self._queue = PriorityQueue(self._processor)
        self._queue.clear()
        self.start()

    def _batch_requests(self):
        batch = []
        count = 0
        while True:
            count += 1
            temp_request = self._queue.pop()
            if temp_request:
                if not temp_request.callback:
                    temp_request.callback = self._processor.process
                batch.append(temp_request)
            if len(batch) > self._batch_size or count > self._batch_size:
                if sys.version_info < (3, 0):
                    batch.sort(_priority_compare)
                else:
                    batch.sort(key=_priority_compare_key, reverse=True)
                if len(batch) > 0:
                    yield batch
                else:
                    break
                batch = []
                count = 0

    def _crawl(self, batch):
        responses = self._downloader.download(batch)
        if self._time_sleep:
            time.sleep(self._time_sleep)
        for response in responses:
            callback = response.request.callback(response)
            if isinstance(callback, types.GeneratorType):
                pipe = self._queue.get_pipe()
                for item in callback:
                    if isinstance(item, Request):
                        # logger.info("push request to queue..." + str(item))
                        if self._should_follow(item):
                            self._queue.push_pipe(item, pipe)
                    else:
                        if isinstance(item, pipeItem):
                            # If the yielded object is a pipeItem, handle it with the matching pipelines
                            self._process_count += 1
                            for pipename in item.pipenames:
                                if pipename in self._pipelines:
                                    self._pipelines[pipename].process_item(
                                        item.result)
                            if self.test:
                                if self._process_count > 0:
                                    return
                        else:
                            # If the yielded object is not a pipeItem, process it with every pipeline by default
                            self._process_count += 1
                            for pipeline in self._pipelines.values():
                                pipeline.process_item(item)
                            if self.test:
                                if self._process_count > 0:
                                    return
                pipe.execute()
            elif isinstance(callback, Request):
                # logger.info("push request to queue..." + str(back))
                if self._should_follow(callback):
                    self._queue.push(callback)
            elif isinstance(callback, pipeItem):
                # If the returned object is a pipeItem, handle it with the matching pipelines
                self._process_count += 1
                for pipename in callback.pipenames:
                    if pipename in self._pipelines:
                        self._pipelines[pipename].process_item(callback.result)
            else:
                # If the returned object is not a pipeItem, process it with every pipeline by default
                self._process_count += 1
                for pipeline in self._pipelines.values():
                    pipeline.process_item(callback)
                if self.test:
                    if self._process_count > 0:
                        return

    def _should_follow(self, request):
        regex = self._host_regex
        # hostname can be None for wrong urls (like javascript links)
        host = urlparse_cached(request).hostname or ''
        return bool(regex.search(host))

    def _get_host_regex(self):
        """Override this method to implement a different offsite policy"""
        allowed_domains = getattr(self._processor, 'allowed_domains', None)
        if not allowed_domains:
            return re.compile('')  # allow all by default
        regex = r'^(.*\.)?(%s)$' % '|'.join(
            re.escape(d) for d in allowed_domains if d is not None)
        return re.compile(regex)
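
In _crawl above, a processor callback may either return a single object (a Request, a pipeItem, or anything else) or be a generator that yields many of them; the two cases are told apart with isinstance(result, types.GeneratorType). A simplified, self-contained sketch of that dispatch pattern (process_one and process_many are hypothetical stand-ins, not fetchman callbacks):

import types

def process_one(response):
    # Plain callback: returns a single item.
    return {'url': response, 'items': 1}

def process_many(response):
    # Generator callback: yields several items for one response.
    for i in range(3):
        yield {'url': response, 'item': i}

def dispatch(callback, response):
    result = callback(response)
    if isinstance(result, types.GeneratorType):
        # Consume every yielded item, as _crawl does for generator callbacks.
        for item in result:
            print('yielded:', item)
    else:
        print('returned:', result)

dispatch(process_one, 'http://example.com/a')
dispatch(process_many, 'http://example.com/b')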
Example 8
 def push_start_request(cls):
     queue = PriorityQueue(cls)
     for start_request in cls.start_requests:
         start_request.duplicate_remove = False
         queue.push(start_request)
         FetchManLogger.logger.info("push start request to queue :" + str(start_request))