def parse(self, response):
    """Handle a listing page or a detail page response.

    When ``response.meta`` carries the ``'detail_page'`` key, the response
    is handed to ``self.process_detail_page`` and its result is yielded.
    Otherwise the response is treated as a listing: every ``<tr>`` with an
    ``offerid`` attribute that has no ``vip_an`` div is scanned for offer
    links, and a detail-page request is yielded for each link whose path
    is accepted by ``self.process_url_value``. Finally, if a pager block
    contains a link whose text equals the next page number, a request for
    that page is yielded with ``'current_page'`` stored in its meta.
    """
    print("%s" % response.url)
    selector = Selector(response)

    if 'detail_page' in response.meta:
        yield self.process_detail_page(selector, response)
        return

    for row in selector.xpath('//tr[@offerid]'):
        # Rows that contain a "vip_an" div are skipped entirely.
        if row.xpath('td/div/div[@class="vip_an"]'):
            continue
        hrefs = row.xpath('td[@class="txt"]/a[@offerid]/@href').extract()
        for href in hrefs:
            # TODO: a check is needed here
            if self.process_url_value(get_url_path(href)):
                yield Request(href, callback=self.parse,
                              meta={'detail_page': True})

    pager = selector.xpath('//div[@class="pager"]')
    if not pager:
        return

    # A response without 'current_page' in its meta is treated as page 1,
    # so the page to follow is 2.
    if 'current_page' in response.meta:
        next_page = int(response.meta['current_page']) + 1
    else:
        next_page = 2
    print(next_page)

    next_hrefs = pager.xpath('a[text()="%s"]/@href' % next_page).extract()
    if next_hrefs:
        yield Request(next_hrefs[0], callback=self.parse,
                      meta={'current_page': next_page})
def _create_spyder_meta(self, spider, item, status, e=None):
    """Create or update the ``SpiderMeta`` record for a scraped item.

    The record is fetched (or created) by ``spider`` and the result of
    ``get_url_path(item['link'])``; its status, phone details, estate and
    full URL are then overwritten and the record is saved.

    Args:
        spider: Value used for the ``spider`` lookup field.
        item: Mapping-like scraped item. ``'link'`` is required;
            ``'phone'``, ``'phone_filename'`` and ``'phone_guess'`` are
            optional.
        status: Value assigned to ``spider_meta.status``.
        e: Value assigned to ``spider_meta.estate`` (defaults to ``None``).

    Returns:
        The saved ``SpiderMeta`` instance.
    """
    link = item['link']
    spider_meta, _created = SpiderMeta.objects.get_or_create(
        spider=spider, url=get_url_path(link))
    spider_meta.status = status

    # Read 'phone' with .get(), like the other optional phone fields, so an
    # item that never had the field populated does not raise KeyError.
    phone = item.get('phone')
    if phone:
        spider_meta.phone = phone[0]
    spider_meta.phone_filename = item.get('phone_filename')
    spider_meta.phone_guess = item.get('phone_guess')
    spider_meta.estate = e

    # 'link' may be a single string or a sequence; in the latter case the
    # first element is stored as the full URL.
    spider_meta.full_url = link if isinstance(link, basestring) else link[0]
    spider_meta.save()
    return spider_meta
def parse(self, response):
    """Handle a listing page or a detail page response.

    When ``response.meta`` carries the ``'detail_page'`` key, the response
    is handed to ``self.process_detail_page`` and its result is yielded.
    Otherwise the response is treated as a listing: every ``<tr>`` with an
    ``offerid`` attribute that has no ``vip_an`` div is scanned for offer
    links, and a detail-page request is yielded for each link whose path
    is accepted by ``self.process_url_value``. Finally, if a pager block
    contains a link whose text equals the next page number, a request for
    that page is yielded with ``'current_page'`` stored in its meta.
    """
    print("%s" % response.url)
    selector = Selector(response)

    if 'detail_page' in response.meta:
        yield self.process_detail_page(selector, response)
        return

    for row in selector.xpath('//tr[@offerid]'):
        # Rows that contain a "vip_an" div are skipped entirely.
        if row.xpath('td/div/div[@class="vip_an"]'):
            continue
        hrefs = row.xpath('td[@class="txt"]/a[@offerid]/@href').extract()
        for href in hrefs:
            # TODO: a check is needed here
            if self.process_url_value(get_url_path(href)):
                yield Request(href, callback=self.parse,
                              meta={'detail_page': True})

    pager = selector.xpath('//div[@class="pager"]')
    if not pager:
        return

    # A response without 'current_page' in its meta is treated as page 1,
    # so the page to follow is 2.
    if 'current_page' in response.meta:
        next_page = int(response.meta['current_page']) + 1
    else:
        next_page = 2
    print(next_page)

    next_hrefs = pager.xpath('a[text()="%s"]/@href' % next_page).extract()
    if next_hrefs:
        yield Request(next_hrefs[0], callback=self.parse,
                      meta={'current_page': next_page})