Exemple #1
0
    def scrape_content_items(self, response):
        """Yield the not-yet-stored report items found in one results page.

        Logs the current page number (read from the ``value`` attribute of
        the ``MainContent_DocumentList1_GridView1_PageCurrent`` element),
        bumps the ``_pages`` stat, then builds one ``FracFocusScrape`` item
        per ``<tr>`` of the ``MainContent_DocumentList1_GridView1`` table.
        Rows whose item has no ``api`` value are skipped.  Items for which
        ``self.db.itemExists`` is true only increment ``_existing_count``;
        all others increment ``_new_count`` and are yielded.

        A warning is logged when this response produced no usable rows.
        The check uses a per-response counter: the ``_existing_count`` and
        ``_new_count`` stats live on the crawler and accumulate over every
        response, so they cannot tell whether *this* page was empty.

        :param response: page response; ``response.meta['cookiejar']`` is
            used as the prefix of every log line.
        :returns: generator of new ``FracFocusScrape`` items.
        """
        def truncate_values(values):
            # Keep at most the first 20 characters of each extracted string.
            return [value[:20] for value in values]

        hxs = HtmlXPathSelector(response)
        stats = self.crawler.stats
        session = response.meta['cookiejar']

        page_num = hxs.select('//*[@id="MainContent_DocumentList1_GridView1_PageCurrent"]/@value').extract()
        if page_num:
            self.log('%s Scraping page %s' % (session, page_num[0]), log.INFO)
        else:
            self.log('%s No page number found' % (session,), log.WARNING)

        stats.inc_value('_pages', spider=self)
        reports = hxs.select('//table[@id="MainContent_DocumentList1_GridView1"]//tr')

        records_on_page = 0
        for report in reports:
            loader = XPathItemLoader(FracFocusScrape(), report)
            # Set as the loader's ``<field>_in`` attributes so that state and
            # county values are truncated as they are loaded.
            loader.state_in = truncate_values
            loader.county_in = truncate_values
            for name, params in FracFocusScrape.fields.items():
                loader.add_xpath(name, params['xpath'])
            item = loader.load_item()

            if not item.get('api'):
                # No API value was extracted from this row; ignore it.
                continue

            records_on_page += 1
            if self.db.itemExists(item):
                stats.inc_value('_existing_count', spider=self)
            else:
                stats.inc_value('_new_count', spider=self)
                yield item

        if not records_on_page:
            self.log('%s No records found' % (session,), log.WARNING)
Exemple #2
0
    def scrape_content_items(self, response):
        """Parse one results page and yield the items not already in the db.

        Steps, in order:

        1. Read the current page number from the ``value`` attribute of the
           ``MainContent_DocumentList1_GridView1_PageCurrent`` element and
           log it (or warn when it is missing).
        2. Increment the ``_pages`` stat.
        3. For every ``<tr>`` in the ``MainContent_DocumentList1_GridView1``
           table, load a ``FracFocusScrape`` item using the ``xpath`` entry
           of each declared field.  Rows without an ``api`` value are
           skipped; rows for which ``self.db.itemExists`` is true bump
           ``_existing_count``; the rest bump ``_new_count`` and are
           yielded.
        4. Warn if the page contributed no rows with an ``api`` value.

        The final warning is driven by counts local to this call.  The
        ``_existing_count`` / ``_new_count`` stats are held by the crawler
        and keep growing across responses, so testing them would silence
        the warning for every empty page after the first record is seen.

        :param response: page response; ``response.meta['cookiejar']``
            prefixes every log message.
        :returns: generator of new ``FracFocusScrape`` items.
        """
        def first_twenty(strings):
            # Truncate every extracted string to 20 characters.
            return [text[:20] for text in strings]

        hxs = HtmlXPathSelector(response)
        stats = self.crawler.stats
        jar = response.meta['cookiejar']

        page_num = hxs.select(
            '//*[@id="MainContent_DocumentList1_GridView1_PageCurrent"]/@value'
        ).extract()
        if page_num:
            self.log('%s Scraping page %s' % (jar, page_num[0]), log.INFO)
        else:
            self.log('%s No page number found' % (jar,), log.WARNING)

        stats.inc_value('_pages', spider=self)
        reports = hxs.select(
            '//table[@id="MainContent_DocumentList1_GridView1"]//tr')

        existing_here = 0
        new_here = 0
        for report in reports:
            loader = XPathItemLoader(FracFocusScrape(), report)
            # Assigned to the loader's ``<field>_in`` attributes so that the
            # state and county values are shortened while loading.
            loader.state_in = first_twenty
            loader.county_in = first_twenty
            for name, params in FracFocusScrape.fields.items():
                loader.add_xpath(name, params['xpath'])
            item = loader.load_item()

            if item.get('api'):
                if self.db.itemExists(item):
                    existing_here += 1
                    stats.inc_value('_existing_count', spider=self)
                else:
                    new_here += 1
                    stats.inc_value('_new_count', spider=self)
                    yield item

        if existing_here == 0 and new_here == 0:
            self.log('%s No records found' % (jar,), log.WARNING)