def scrape_content_items(self, response):
    """Parse one FracFocus search-results page and yield scraped items.

    Logs the grid's current page number (read from a hidden form field),
    builds one item per row of the results table, and records
    new-vs-existing counts in the crawler stats.

    NOTE(review): this file contains a second, identical definition of
    ``scrape_content_items`` further down; in Python the later definition
    shadows this one — one of the two should be removed.

    Args:
        response: Scrapy Response for a results page. ``response.meta``
            must carry ``'cookiejar'`` (used here only as a log prefix).

    Yields:
        FracFocusScrape items for table rows that produced an ``api``
        field and are not already present in ``self.db``.
    """
    hxs = HtmlXPathSelector(response)
    stats = self.crawler.stats

    # The ASP.NET grid stores its current page number in a hidden input.
    page_num = hxs.select(
        '//*[@id="MainContent_DocumentList1_GridView1_PageCurrent"]/@value'
    ).extract()
    if page_num:
        self.log('%s Scraping page %s'
                 % (response.meta['cookiejar'], page_num[0]), log.INFO)
    else:
        self.log('%s No page number found'
                 % (response.meta['cookiejar']), log.WARNING)

    stats.inc_value('_pages', spider=self)

    rows = hxs.select(
        '//table[@id="MainContent_DocumentList1_GridView1"]//tr')
    for row in rows:
        # E741: renamed the original ambiguous local `l` to `loader`.
        loader = XPathItemLoader(FracFocusScrape(), row)
        # Truncate these free-text fields to 20 chars — presumably to
        # fit fixed-width DB columns; confirm against the schema.
        loader.state_in = lambda values: [v[:20] for v in values]
        loader.county_in = lambda values: [v[:20] for v in values]
        for field_name, params in FracFocusScrape.fields.items():
            loader.add_xpath(field_name, params['xpath'])
        item = loader.load_item()

        # Header/filler rows yield no 'api' value and are skipped.
        if item.get('api'):
            if self.db.itemExists(item):
                stats.inc_value('_existing_count', spider=self)
            else:
                stats.inc_value('_new_count', spider=self)
                # print item['operator']
                # NOTE(review): yield reconstructed inside the new-item
                # branch (only unseen records are emitted) — confirm
                # against the original formatting.
                yield item

    # Stats are cumulative across pages, so this warning fires only
    # while the crawl as a whole has produced no records at all.
    if (not stats.get_value('_existing_count')
            and not stats.get_value('_new_count')):
        self.log('%s No records found'
                 % (response.meta['cookiejar']), log.WARNING)
def scrape_content_items(self, response):
    """Scrape a single results page of the FracFocus document grid.

    Reads the current page number from the grid's hidden form field,
    loads an item from every table row, classifies each as new or
    already stored, and yields the new ones.

    :param response: results-page Response; ``response.meta['cookiejar']``
        is used as a prefix on every log line.
    :yields: FracFocusScrape items not yet present in the database.
    """
    selector = HtmlXPathSelector(response)
    stats = self.crawler.stats
    jar = response.meta['cookiejar']

    # Current page number lives in a hidden ASP.NET form field.
    current = selector.select(
        '//*[@id="MainContent_DocumentList1_GridView1_PageCurrent"]/@value'
    ).extract()
    if not current:
        self.log('%s No page number found' % jar, log.WARNING)
    else:
        self.log('%s Scraping page %s' % (jar, current[0]), log.INFO)

    stats.inc_value('_pages', spider=self)

    table_rows = selector.select(
        '//table[@id="MainContent_DocumentList1_GridView1"]//tr')
    for table_row in table_rows:
        loader = XPathItemLoader(FracFocusScrape(), table_row)
        # Clamp these two text fields to 20 characters on input.
        truncate = lambda values: [value[:20] for value in values]
        loader.state_in = truncate
        loader.county_in = truncate
        for field, spec in FracFocusScrape.fields.items():
            loader.add_xpath(field, spec['xpath'])
        item = loader.load_item()

        if not item.get('api'):
            continue  # header / empty row — nothing scraped
        if self.db.itemExists(item):
            stats.inc_value('_existing_count', spider=self)
        else:
            stats.inc_value('_new_count', spider=self)
            # print item['operator']
            yield item

    seen_any = (stats.get_value('_existing_count')
                or stats.get_value('_new_count'))
    if not seen_any:
        self.log('%s No records found' % jar, log.WARNING)