Example #1
 def processing_products(self, spider):
     if self != spider or self.skus_parsed:
         return
     request = Request(self.start_urls[0], callback=self.yield_products, dont_filter=True)
     self.crawler.engine.crawl(request, spider)
     raise DontCloseSpider()
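All of these handlers assume they have been connected to the spider_idle signal beforehand. As a minimal sketch of that wiring (the spider and method names below are illustrative, not taken from Example #1):

from scrapy import Spider, signals
from scrapy.exceptions import DontCloseSpider

class KeepAliveSpider(Spider):
    name = "keep_alive_sketch"  # hypothetical spider, for illustration only

    @classmethod
    def from_crawler(cls, crawler, *args, **kwargs):
        spider = super().from_crawler(crawler, *args, **kwargs)
        # register the idle handler; Scrapy calls it whenever the scheduler runs dry
        crawler.signals.connect(spider.spider_idle, signal=signals.spider_idle)
        return spider

    def spider_idle(self, spider):
        # raising DontCloseSpider vetoes the shutdown and keeps the spider open
        raise DontCloseSpider("waiting for more work")

Note that self.crawler.engine.crawl(request, spider), as used in Example #1, reflects an older engine API; newer Scrapy releases have deprecated the explicit spider argument in favour of engine.crawl(request), so check the version you run.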
Example #2
    def spider_idle(self, spider):
        for request in self._read_queue():
            self.crawler.engine.crawl(request, self)

        raise DontCloseSpider()
Example #3
 def spider_idle(cls, spider):
     if cls.requests.get(spider):
         spider.log("delayed requests pending, not closing spider")
         raise DontCloseSpider()
Example #4
 def keep_spider_alive(self, spider):
     raise DontCloseSpider("keeping it open")
Example #5
 def _spider_idle(self):
     self._schedule_next_request()
     raise DontCloseSpider()
Example #6
 def _logout(self, spider):
     if spider != self: return
     if self.logout_done: return
     if not self.logout_url: return
     self.crawler.engine.schedule(self.logout(), spider)
     raise DontCloseSpider('logout scheduled')
Example #7
 def spider_idle(self, spider):
     if spider != self: return
     if self.items:
         self.crawler.engine.schedule(Request('http://' + self.allowed_domains[0], callback=self.yield_product, dont_filter=True), spider)
         raise DontCloseSpider('Found pending requests')
Example #8
 def spider_idle(self, spider):
     raise DontCloseSpider('waiting for process')
Example #9
    def parse_article(self, response):

        comment_item = CommentItem()
        post_item = PostItem()

        # define the next page url
        try:
            next_page = response.xpath(
                "//div[@id='action-bar-container']//a[contains(text(), '上頁')]/@href"
            )[0]
        except IndexError:
            next_page = None
            logging.error('Cannot load the next page')
        # see the number of articles
        try:
            article_list = response.css('.r-list-container > div')
        except Exception:
            logging.error('No articles were loaded')
        else:
            logging.info('the number of articles is {}'.format(
                len(article_list)))
            # examine each div in turn
            while len(article_list) > 0:
                div = article_list.pop(0)
                try:
                    # get the class name of the div
                    slot_name = div.xpath('@class')[0].extract()
                    # get the canonical url of the article
                    canonicalUrl = response.urljoin(
                        div.css('.title a::attr(href)')[0].extract())
                    # get the author of the article
                    author_str = div.css('.author::text')[0].extract()
                except IndexError:
                    logging.error('Failed to extract the url, author and class of the entry')

                else:
                    if slot_name == 'r-list-sep':
                        '''
                        Once receiving class='r-list-sep', we are going to crawl the next page.
                        '''
                        if next_page:  # the next page exists
                            logging.warning('redirecting to {}'.format(
                                canonicalUrl))
                            yield scrapy.Request(canonicalUrl,
                                                 callback=self.parse_article,
                                                 dont_filter=True)
                        else:  # no next page: stop the spider
                            raise CloseSpider('page exceeded')
                        break  # no need to scan the remaining divs

                    else:  # 'r-ent' or 'search-bar'
                        '''
                        Load each article url and extract information such as its content, comments and author.
                        '''
                        if slot_name != 'r-ent': continue  # skip the 'search-bar' div

                        # only keep articles that fall within the period we defined
                        date_str = div.css('.date::text')[0].extract()
                        m_d = tuple(map(int, date_str.split('/')))
                        m_d = datetime(self.year, m_d[0], m_d[1])
                        in_period = self.start <= m_d <= self.end
                        logging.debug(
                            'the date is {}/{}, and reach or not {}'.format(
                                m_d.month, m_d.day, in_period))

                        if in_period:
                            # First fill in the item fields available on the list page
                            try:
                                post_item['canonicalUrl'] = canonicalUrl
                                comment_item['canonicalUrl'] = canonicalUrl
                                post_item['authorId'] = author_str
                                post_item['title'] = div.css(
                                    '.title a::text')[0].extract()
                                post_item['publishedTime'] = date_str
                                post_item['board'] = self.board

                            except IndexError:
                                logging.error('Failed to populate post_item')

                            # Now follow the url of the article itself
                            try:
                                url = response.urljoin(
                                    div.css('.title a::attr(href)')
                                    [0].extract())
                                logging.info(
                                    'load url inside every article:{}'.format(
                                        url))

                                yield scrapy.Request(
                                    url,
                                    meta={
                                        'post': post_item,
                                        'comment': comment_item
                                    },
                                    callback=self.parse_comment,
                                    dont_filter=True)

                            except IndexError:
                                logging.error('Cannot load article')
                        else:
                            self.maximum_missing_count -= 1

                if len(article_list) == 0:
                    '''
                    Let's go to the next page since we have explored every article
                    '''
                    if next_page and self.maximum_missing_count > 0:  # the next page exists
                        url = response.urljoin(next_page.extract())
                        logging.warning('redirecting to {}'.format(url))
                        yield scrapy.Request(url,
                                             callback=self.parse_article,
                                             dont_filter=True)
                    else:  # no next page: keep the spider alive and wait for new requests
                        logging.error('No next page available')
                        # keep the spider running indefinitely
                        raise DontCloseSpider('page exceeded')
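Example #9 mixes the two related exceptions: CloseSpider, raised from a regular callback, asks the engine to shut the spider down, while DontCloseSpider only has an effect inside a spider_idle handler, where it vetoes the shutdown. A minimal sketch of the distinction (the spider name and condition are illustrative only):

import scrapy
from scrapy.exceptions import CloseSpider, DontCloseSpider

class SketchSpider(scrapy.Spider):
    name = "sketch"  # hypothetical spider, for illustration only

    def parse(self, response):
        # raised from a normal callback: the spider is closed gracefully
        if not response.css('.r-list-container'):
            raise CloseSpider('expected markup not found')

    def spider_idle(self, spider):
        # raised from a spider_idle handler (once connected to the signal):
        # the pending shutdown is cancelled and the spider stays alive
        raise DontCloseSpider('still waiting for new work')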
Example #10
 def _dont_close_me(self, spider):
     raise DontCloseSpider("..I prefer live spiders.")
Example #11
 def spider_idle(self, spider):
     """当spider收到idle信号时, 去ZooKeeper上获取一个任务, 再抛出DontCloseSpider异常"""
     self.request_new_job(spider)
     raise DontCloseSpider()
Example #12
 def signal_dispatcher(self, signal):
     if self.signals_callback:
         if signal == signals.spider_idle or signal == signals.spider_error:
             raise DontCloseSpider('I prefer live spiders')
         elif signal == signals.spider_opened:
             self.signals_callback(signal, spider=self)
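The dispatcher in Example #12 belongs to an extension rather than a spider. A rough sketch of how such an extension could be wired up (the class name and module path are assumptions, not the code behind this example):

from scrapy import signals
from scrapy.exceptions import DontCloseSpider

class KeepAliveExtension:

    @classmethod
    def from_crawler(cls, crawler):
        ext = cls()
        # both signals are sent with the spider as a keyword argument
        crawler.signals.connect(ext.spider_opened, signal=signals.spider_opened)
        crawler.signals.connect(ext.spider_idle, signal=signals.spider_idle)
        return ext

    def spider_opened(self, spider):
        spider.logger.info("keep-alive extension enabled for %s", spider.name)

    def spider_idle(self, spider):
        raise DontCloseSpider("I prefer live spiders")

Enabling it would go through the EXTENSIONS setting, e.g. EXTENSIONS = {"myproject.extensions.KeepAliveExtension": 500}, where the module path is an assumption.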
Example #13
 def leftover_requests(self, spider):
     if self.resume_iter:
         self.defrost_in_batch(spider)
         raise DontCloseSpider()
 def spider_idle(self, spider):
     if self.rountine_interval:
         self.schedule_rountine_requests(spider)
         raise DontCloseSpider()