import time
import traceback

# Crawler, Scraper, makeTree, get_image_set and LOGGER are defined/imported
# elsewhere in this package.


class NewsCrawler(Crawler):
    """Crawler that feeds each fetched page to a Scraper and buffers the results."""

    def __init__(self, config, cache=None):
        super(NewsCrawler, self).__init__(config, cache)
        self.scraper = Scraper(config)
        self.template_complete = False
        self.data = {}
        self.templates_done = 0

    async def save_response(self, html_code, url, headers, crawl_date):
        try:
            # Let the indexer save the files as normal, and also collect
            # template elements from the first `max_templates` pages.
            tree = makeTree(html_code, self.scraper.domain)
            if self.templates_done < self.scraper.config["max_templates"]:
                self.templates_done += 1
                self.scraper.domain_nodes_dict.add_template_elements(tree)
            self.scraper.url_to_headers_mapping[url] = headers
            self.data[url] = self.scraper.process(url, tree, False, ["cleaned"])
            self.data[url]["crawl_date"] = crawl_date
            scrape_date = time.strftime(
                "%Y-%m-%dT%H:%M:%S", time.localtime(time.time())
            )
            self.data[url]["scrape_date"] = scrape_date
        except Exception as e:
            LOGGER.error(
                "CRITICAL ERROR IN SCRAPER for url %r: %r, stack %r",
                url,
                str(e),
                traceback.format_exc(),
            )

    def save_data(self, data):
        raise NotImplementedError("save_data has to be implemented")

    def save_bulk_data(self, data):
        raise NotImplementedError("save_bulk_data has to be implemented")

    def finish_leftovers(self):
        LOGGER.info("finish leftovers")
        if self.data:
            image_set = get_image_set(self.data)
            LOGGER.info("saving number of documents: %r", len(self.data))
            LOGGER.info("found num unique images: %r", len(image_set))
            LOGGER.info("saving status code: %r", self.save_bulk_data(self.data))
        return dict(self.scraper.domain_nodes_dict)
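# A minimal sketch of a concrete subclass, showing how the abstract
# save_data/save_bulk_data hooks are meant to be filled in. The JSON-lines
# sink, the "output_file" config key, and the returned status value are
# illustrative assumptions, not part of the original API; the only confirmed
# contract is that finish_leftovers logs save_bulk_data's return value as a
# "status code" and that self.data maps URL -> scraped document dict.
import json


class JsonLinesNewsCrawler(NewsCrawler):
    """Hypothetical sink that appends scraped documents as JSON lines."""

    def _output_path(self):
        # Assumed config key; falls back to a local file for illustration.
        return self.scraper.config.get("output_file", "scraped.jsonl")

    def save_data(self, data):
        # `data` is a single scraped document dict.
        with open(self._output_path(), "a") as fh:
            fh.write(json.dumps(data) + "\n")
        return 200  # assumed status convention, logged by finish_leftovers

    def save_bulk_data(self, data):
        # `data` maps URL -> scraped document dict (the shape of self.data).
        with open(self._output_path(), "a") as fh:
            for url, doc in data.items():
                fh.write(json.dumps(doc) + "\n")
        return 200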