Beispiel #1
0
    def parse_index(self, response):
        """
        Schedule a detail request for every listed book, then the next page.

        Every ``.item`` node contributes one detail request built from its
        ``.top a`` link; items without a link are skipped. The next index page
        is derived from the ``page/<n>`` number in the current response URL.

        :param response: response for an index page
        :return: yields SeleniumRequest objects (details first, then next page)
        """
        for item in response.css('.item'):
            href = item.css('.top a::attr(href)').extract_first()
            if not href:
                # Joining an empty/missing href yields the page's own base URL,
                # so the index page would be fetched again and handed to
                # parse_detail. Skip such items instead.
                continue
            yield SeleniumRequest(response.urljoin(href),
                                  callback=self.parse_detail,
                                  wait_for='.item .name',
                                  priority=2,
                                  screenshot={'selector': '.item'})

        # next page: only possible when the current URL carries a page number
        match = re.search(r'page/(\d+)', response.url)
        if not match:
            return
        page = int(match.group(1)) + 1
        next_url = f'{self.base_url}/page/{page}'
        yield SeleniumRequest(
            next_url,
            callback=self.parse_index,
            wait_for='.item .name',
        )
 def start_requests(self):
     """
     Kick off the crawl from the first index page.

     :return: yields a single SeleniumRequest for ``<base_url>/page/1``
     """
     first_page = f'{self.base_url}/page/1'
     logger.info('crawling %s', first_page)
     request = SeleniumRequest(
         first_page,
         callback=self.parse_index,
         wait_for='.item .name',
     )
     yield request
Beispiel #3
0
 def parse_index(self, response):
     """
     Schedule a detail request for every movie listed on an index page.

     :param response: response for an index page
     :return: yields one SeleniumRequest per ``.item`` node that has a link
     """
     for item in response.css('.item'):
         href = item.css('a::attr(href)').extract_first()
         if not href:
             # Joining an empty/missing href yields the page's own base URL,
             # so the index page would be fetched again and handed to
             # parse_detail. Skip such items instead.
             continue
         detail_url = response.urljoin(href)
         logger.info('detail url %s', detail_url)
         yield SeleniumRequest(detail_url,
                               callback=self.parse_detail,
                               wait_for='.item')
Beispiel #4
0
 def start_requests(self):
     """
     Seed the crawl with every index page from 1 up to ``self.max_page``.

     :return: yields one SeleniumRequest per index page
     """
     last_page = self.max_page
     for number in range(1, last_page + 1):
         page_url = f'{self.base_url}/page/{number}'
         logger.debug('start url %s', page_url)
         # A fresh cookies dict is built for each request, so requests never
         # share one mutable mapping.
         request_options = {
             'callback': self.parse_index,
             'priority': 10,
             'wait_for': '.item',
             'pretend': True,
             'cookies': {'name': 'germey'},
         }
         yield SeleniumRequest(page_url, **request_options)