Example 1
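# Note: this excerpt relies on names defined elsewhere in the module
# (Crawler, Scraper, makeTree, get_image_set, LOGGER) plus the
# standard-library time and traceback modules.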
class NewsCrawler(Crawler):
    def __init__(self, config, cache=None):
        super().__init__(config, cache)
        self.scraper = Scraper(config)
        self.template_complete = False
        self.data = {}
        self.templates_done = 0

    async def save_response(self, html_code, url, headers, crawl_date):
        # Parse the page, update the domain template, and store the scraped record.
        try:
            # Let the indexer save the files as usual; the first `max_templates`
            # pages also contribute their elements to the domain template.
            tree = makeTree(html_code, self.scraper.domain)
            if self.templates_done < self.scraper.config["max_templates"]:
                self.templates_done += 1
                self.scraper.domain_nodes_dict.add_template_elements(tree)
                self.scraper.url_to_headers_mapping[url] = headers
            self.data[url] = self.scraper.process(url, tree, False,
                                                  ["cleaned"])
            self.data[url]["crawl_date"] = crawl_date
            scrape_date = time.strftime("%Y-%m-%dT%H:%M:%S",
                                        time.localtime(time.time()))
            self.data[url]["scrape_date"] = scrape_date
        except Exception as e:
            LOGGER.error(
                "CRITICAL ERROR IN SCRAPER for url %r: %r, stack %r",
                url,
                str(e),
                traceback.format_exc(),
            )
        return

    def save_data(self, data):
        raise NotImplementedError("save_data has to be implemented")

    def save_bulk_data(self, data):
        raise NotImplementedError("save_bulk_data has to be implemented")

    def finish_leftovers(self):
        LOGGER.info("finish leftovers")
        if self.data:
            image_set = get_image_set(self.data)
            LOGGER.info("saving number of documents: %r", len(self.data))
            LOGGER.info("found num unique images: %r", len(image_set))
            LOGGER.info("saving status code: %r",
                        self.save_bulk_data(self.data))
        return dict(self.scraper.domain_nodes_dict)
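
Neither save_data nor save_bulk_data is implemented above; they are the hooks a storage backend is expected to provide. A minimal sketch of such a subclass, assuming a JSON-on-disk backend (the JsonNewsCrawler name and the "data_dir" config key are illustrative assumptions, not part of the original code):

import json
import os


class JsonNewsCrawler(NewsCrawler):
    """Hypothetical subclass that persists scraped records as JSON on disk."""

    def save_data(self, data):
        # A single record, i.e. the dict built by save_response for one URL.
        return self._dump(data, "item.json")

    def save_bulk_data(self, data):
        # The whole self.data mapping, flushed by finish_leftovers().
        return self._dump(data, "bulk.json")

    def _dump(self, data, filename):
        out_dir = self.scraper.config.get("data_dir", ".")  # assumed config key
        path = os.path.join(out_dir, filename)
        with open(path, "w") as fh:
            json.dump(data, fh, default=str)
        return 200  # status-like value; finish_leftovers logs it as the "saving status code"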
Example 2
class NewsCrawler(Crawler):

    def __init__(self, config, cache=None):
        super(NewsCrawler, self).__init__(config, cache)
        self.scraper = Scraper(config)
        self.template_complete = False
        self.data = {}
        self.templates_done = 0

    @asyncio.coroutine
    def save_response(self, html_code, url, headers, crawl_date):
        # Parse the page, update the domain template, and store the scraped record.
        try:
            # Let the indexer save the files as usual; the first `max_templates`
            # pages also contribute their elements to the domain template.
            tree = makeTree(html_code, self.scraper.domain)
            if self.templates_done < self.scraper.config['max_templates']:
                self.templates_done += 1
                self.scraper.domain_nodes_dict.add_template_elements(tree)
                self.scraper.url_to_headers_mapping[url] = headers
            self.data[url] = self.scraper.process(url, tree, False, ['cleaned'])
            self.data[url]['crawl_date'] = crawl_date
            scrape_date = time.strftime('%Y-%m-%dT%H:%M:%S', time.localtime(time.time()))
            self.data[url]['scrape_date'] = scrape_date
        except Exception as e:
            LOGGER.error("CRITICAL ERROR IN SCRAPER for url %r: %r, stack %r",
                         url, str(e), traceback.format_exc())
        return

    def save_data(self, data):
        raise NotImplementedError('save_data has to be implemented')

    def save_bulk_data(self, data):
        raise NotImplementedError('save_bulk_data has to be implemented')

    def finish_leftovers(self):
        LOGGER.info('finish leftovers')
        if self.data:
            image_set = get_image_set(self.data)
            LOGGER.info('saving number of documents: %r', len(self.data))
            LOGGER.info('found num unique images: %r', len(image_set))
            LOGGER.info('saving status code: %r', self.save_bulk_data(self.data))
        return dict(self.scraper.domain_nodes_dict)
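
Example 2 is the pre-Python 3.5 spelling of the same class: the asyncio.coroutine decorator turns save_response into a generator-based coroutine that the event loop drives just like the async def version in Example 1. The decorator was deprecated in Python 3.8 and removed in 3.11, so on current interpreters only the first form works. A small, self-contained comparison of the two spellings (the function names are illustrative only):

import asyncio


@asyncio.coroutine              # legacy spelling; removed in Python 3.11+
def fetch_old():
    yield from asyncio.sleep(0)
    return "old-style result"


async def fetch_new():          # native coroutine, Python 3.5+
    await asyncio.sleep(0)
    return "new-style result"


loop = asyncio.new_event_loop()
print(loop.run_until_complete(fetch_old()))   # -> old-style result
print(loop.run_until_complete(fetch_new()))   # -> new-style result
loop.close()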