def parse_index(self, response):
    """
    Extract each book's detail page and schedule the next index page.
    :param response:
    :return:
    """
    items = response.css('.item')
    for item in items:
        href = item.css('.top a::attr(href)').extract_first()
        detail_url = response.urljoin(href)
        yield SeleniumRequest(detail_url, callback=self.parse_detail, wait_for='.item .name',
                              priority=2, screenshot={'selector': '.item'})

    # next page: bump the page number found in the current URL
    match = re.search(r'page/(\d+)', response.url)
    if not match:
        return
    page = int(match.group(1)) + 1
    next_url = f'{self.base_url}/page/{page}'
    yield SeleniumRequest(next_url, callback=self.parse_index, wait_for='.item .name')
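The parse_detail callback requested above is not part of this excerpt. A minimal sketch follows, assuming the detail page carries a `.name` element (consistent with the wait_for selector) and that the middleware exposes the captured screenshot through response.meta['screenshot']; that meta key and any extra field selectors are assumptions, not confirmed by this code.

def parse_detail(self, response):
    """
    Extract one book from its detail page (sketch).
    :param response:
    :return:
    """
    # '.item .name' matches the selector the request waits for; the
    # 'screenshot' meta key is an assumption for illustration only.
    name = response.css('.item .name::text').extract_first()
    screenshot = response.meta.get('screenshot')  # assumed meta key; could be persisted in a pipeline
    yield {'name': name, 'url': response.url}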
def start_requests(self):
    """
    first page
    :return:
    """
    start_url = f'{self.base_url}/page/1'
    logger.info('crawling %s', start_url)
    yield SeleniumRequest(start_url, callback=self.parse_index, wait_for='.item .name')
def parse_index(self, response):
    """
    Extract movie detail links from the index page.
    :param response:
    :return:
    """
    items = response.css('.item')
    for item in items:
        href = item.css('a::attr(href)').extract_first()
        detail_url = response.urljoin(href)
        logger.info('detail url %s', detail_url)
        yield SeleniumRequest(detail_url, callback=self.parse_detail, wait_for='.item')
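As with the book spider, parse_detail is referenced here but not shown. A brief sketch under assumed selectors (the movie detail markup is not in this excerpt):

def parse_detail(self, response):
    """
    Extract one movie from its detail page (sketch).
    :param response:
    :return:
    """
    # The field selector below is a hypothetical placeholder.
    yield {
        'url': response.url,
        'name': response.css('.item .name::text').extract_first(),
    }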
def start_requests(self):
    """
    Schedule all index pages up front.
    :return:
    """
    for page in range(1, self.max_page + 1):
        url = f'{self.base_url}/page/{page}'
        logger.debug('start url %s', url)
        cookies = {'name': 'germey'}
        yield SeleniumRequest(url, callback=self.parse_index, priority=10, wait_for='.item',
                              pretend=True, cookies=cookies)
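Both spiders lean on scaffolding the excerpt omits: a module-level logger, the SeleniumRequest import, and the class attributes reached through self. Here is a sketch of that scaffolding for the movie spider, with the import path, spider name, host, and page count all assumed rather than taken from this code; the matching Selenium downloader middleware would also need to be enabled in the project settings.

import logging
import re

from scrapy import Spider
from gerapy_selenium import SeleniumRequest  # assumed import path

logger = logging.getLogger(__name__)


class MovieSpider(Spider):
    name = 'movie'                    # hypothetical spider name
    base_url = 'https://example.com'  # placeholder host, not given in this excerpt
    max_page = 10                     # hypothetical page count

    # start_requests, parse_index, and parse_detail as defined above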