def scrape_data(self):
    # Create boilerplate recognizer
    skindex = Scraper(self.crawl_config)
    skindex.load_local_pages()
    skindex.add_template_elements()
    # Process all by removing boilerplate and extracting information
    return skindex.process_all(exclude_data=['cleaned', 'author'])
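A minimal sketch of consuming the dictionary returned by scrape_data. It assumes the return value maps each URL to a dict of extracted fields; 'body' and 'images' appear in the handler code further below, while the helper name summarize_scrape is purely illustrative and not part of the original code.

def summarize_scrape(results):
    """Print a one-line summary per scraped URL.

    `results` is assumed to be the dict returned by scrape_data():
    URL -> dict of extracted fields ('body', 'images' are field names
    used elsewhere in this code; treat them as assumptions, not a schema).
    """
    for url, doc in results.items():
        body_parts = doc.get('body', [])
        images = doc.get('images', [])
        print(url, '| body chunks:', len(body_parts), '| images:', len(images))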
class NewsCrawler(Crawler):
    def __init__(self, config, cache=None):
        super(NewsCrawler, self).__init__(config, cache)
        self.scraper = Scraper(config)
        self.template_complete = False
        self.data = {}
        self.templates_done = 0

    async def save_response(self, html_code, url, headers, crawl_date):
        try:
            # Let the indexer save the files as normal and also build up template knowledge
            tree = makeTree(html_code, self.scraper.domain)
            if self.templates_done < self.scraper.config["max_templates"]:
                self.templates_done += 1
                self.scraper.domain_nodes_dict.add_template_elements(tree)
            self.scraper.url_to_headers_mapping[url] = headers
            self.data[url] = self.scraper.process(url, tree, False, ["cleaned"])
            self.data[url]["crawl_date"] = crawl_date
            scrape_date = time.strftime("%Y-%m-%dT%H:%M:%S", time.localtime(time.time()))
            self.data[url]["scrape_date"] = scrape_date
        except Exception as e:
            LOGGER.error(
                "CRITICAL ERROR IN SCRAPER for url %r: %r, stack %r",
                url,
                str(e),
                traceback.format_exc(),
            )
        return

    def save_data(self, data):
        raise NotImplementedError("save_data has to be implemented")

    def save_bulk_data(self, data):
        raise NotImplementedError("save_bulk_data has to be implemented")

    def finish_leftovers(self):
        LOGGER.info("finish leftovers")
        if self.data:
            image_set = get_image_set(self.data)
            LOGGER.info("saving number of documents: %r", len(self.data))
            LOGGER.info("found num unique images: %r", len(image_set))
            LOGGER.info("saving status code: %r", self.save_bulk_data(self.data))
        return dict(self.scraper.domain_nodes_dict)
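NewsCrawler leaves save_data and save_bulk_data abstract. Below is a minimal sketch of a concrete subclass that writes documents to a newline-delimited JSON file; the class name, file path, and return values are illustrative assumptions, not part of the original project.

import json


class JsonNewsCrawler(NewsCrawler):  # hypothetical subclass, not in the original code
    """Persists scraped documents as newline-delimited JSON."""

    def __init__(self, config, cache=None, out_path="news_data.jsonl"):
        super().__init__(config, cache)
        self.out_path = out_path  # assumed output location, adjust as needed

    def save_data(self, data):
        # Append a single document as one JSON line.
        with open(self.out_path, "a") as fh:
            fh.write(json.dumps(data, default=str) + "\n")
        return 200  # status-like return value; finish_leftovers logs whatever this returns

    def save_bulk_data(self, data):
        # `data` is the url -> document dict accumulated in save_response.
        with open(self.out_path, "a") as fh:
            for url, doc in data.items():
                fh.write(json.dumps(doc, default=str) + "\n")
        return 200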
# Imports assumed from the sky package layout (the exact module paths are an assumption;
# adjust them to your installation).
from sky.configs import DEFAULT_CRAWL_CONFIG
from sky.crawler import crawl
from sky.scraper import Scraper

# Crawling
CRAWL_CONFIG = DEFAULT_CRAWL_CONFIG
CRAWL_CONFIG.update({
    'seed_urls': ['http://www.techcrunch.com/'],
    'collections_path': '/Users/pascal/egoroot/sky_collections',
    'collection_name': 'techie',  # Optional
    'crawl_filter_regexps': [],
    'crawl_required_regexps': ['2015', '2014'],
    'index_filter_regexps': [],
    'index_required_regexps': ['2015', '2014'],
    'max_saved_responses': 100,
    'max_workers': 10,
})

crawl.start(CRAWL_CONFIG)

# Indexing
SCRAPE_CONFIG = CRAWL_CONFIG.copy()
SCRAPE_CONFIG.update({
    'template_proportion': 0.09,
    'max_templates': 1000,
})

skindex = Scraper(SCRAPE_CONFIG)
skindex.load_local_pages()
skindex.add_template_elements()
res = skindex.process_all(remove_visuals=True)
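process_all returns a dict keyed by URL. A small sketch of persisting that result to disk as JSON; the output filename is illustrative and not part of the original example.

import json

# res maps each crawled URL to its extracted document (a dict of fields).
with open('techie_scrape.json', 'w') as fh:    # filename is an assumption
    json.dump(res, fh, indent=2, default=str)  # default=str guards against non-serializable values
print('saved', len(res), 'documents')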
def post(self):
    CRAWL_CONFIG = DEFAULT_CRAWL_CONFIG
    CRAWL_CONFIG.update({
        'collections_path': os.path.join(os.path.expanduser('~'), 'sky_collections/'),
        # 'max_workers': 10,
    })
    args = self.request.arguments
    print(args)
    for arg in args:
        value = args[arg][0].decode('utf8')
        if value and arg != 'url' and arg != 'checkboxcache':
            print('pre', arg, CRAWL_CONFIG[arg])
            if isinstance(CRAWL_CONFIG[arg], list):
                CRAWL_CONFIG[arg] = [int(value)] if is_numeric(value) else [value]
            else:
                CRAWL_CONFIG[arg] = int(value) if is_numeric(value) else value
            print('post', arg, CRAWL_CONFIG[arg])
    url = self.get_argument('url', '')
    use_cache = self.get_argument('checkboxcache', '')
    domain = extractDomain(url)
    CRAWL_CONFIG['seed_urls'] = [url]
    CRAWL_CONFIG['collection_name'] = domain[7:]
    if use_cache != 'on':
        col_path = os.path.join(CRAWL_CONFIG['collections_path'],
                                CRAWL_CONFIG['collection_name'])
        print(col_path)
        if os.path.exists(col_path):
            shutil.rmtree(col_path)
    crawl.start(CRAWL_CONFIG)
    SCRAPE_CONFIG = CRAWL_CONFIG.copy()
    SCRAPE_CONFIG.update({
        'template_proportion': 0.4,
        'max_templates': 100,
    })
    skindex = Scraper(SCRAPE_CONFIG)
    skindex.load_local_pages()
    skindex.add_template_elements()
    res = skindex.process_all(remove_visuals=True, maxn=CRAWL_CONFIG['max_saved_responses'])
    items = []
    for num, url in enumerate(res):
        if num == CRAWL_CONFIG['max_saved_responses']:
            break
        dc = res[url]
        dc['url'] = url
        dc['source_name'] = domain
        dc['images'] = [x for x in reversed(dc['images'][:5])]
        # dc['blobs'] = [TextBlob(x) for x in dc['body'] if dc['body']]
        items.append(dc)
    # this is quite out of place like this
    print('num unique images', len(get_image_set({x['url']: x for x in items})))
    if items and 'money' in items[0]:
        items = sorted(items, key=lambda x: len(x['money']), reverse=True)
    self.render('page_template.html', items=items, cached=False)
def post(self):
    CRAWL_CONFIG = DEFAULT_CRAWL_CONFIG.copy()
    CRAWL_CONFIG.update(
        {
            "collections_path": os.path.join(os.path.expanduser("~"), "sky_view_collections/"),
            # 'max_workers': 10,
        }
    )
    args = self.request.arguments
    print(args)
    for arg in args:
        value = args[arg][0].decode("utf8")
        if value and arg != "url" and arg != "checkboxcache":
            print("pre", arg, CRAWL_CONFIG[arg])
            if isinstance(CRAWL_CONFIG[arg], list):
                CRAWL_CONFIG[arg] = [int(value)] if is_numeric(value) else value.split(", ")
            else:
                CRAWL_CONFIG[arg] = int(value) if is_numeric(value) else value.split(", ")[0]
            print("post", arg, CRAWL_CONFIG[arg])
    url = self.get_argument("url", "")
    use_cache = self.get_argument("checkboxcache", "")
    domain = extractDomain(url)
    CRAWL_CONFIG["seed_urls"] = [url]
    CRAWL_CONFIG["collection_name"] = domain[7:]
    if use_cache != "on":
        col_path = os.path.join(CRAWL_CONFIG["collections_path"], CRAWL_CONFIG["collection_name"])
        print(col_path)
        if os.path.exists(col_path):
            shutil.rmtree(col_path)
    crawl.start(CRAWL_CONFIG)
    SCRAPE_CONFIG = CRAWL_CONFIG.copy()
    SCRAPE_CONFIG.update({"template_proportion": 0.4, "max_templates": 100})
    skindex = Scraper(SCRAPE_CONFIG)
    skindex.load_local_pages()
    skindex.add_template_elements()
    res = skindex.process_all(remove_visuals=True, maxn=CRAWL_CONFIG["max_saved_responses"])
    items = []
    for num, url in enumerate(res):
        if num == CRAWL_CONFIG["max_saved_responses"]:
            break
        dc = res[url]
        dc["url"] = url
        dc["source_name"] = domain
        dc["images"] = [x for x in reversed(dc["images"][:5])]
        # dc['blobs'] = [TextBlob(x) for x in dc['body'] if dc['body']]
        items.append(dc)
    # this is quite out of place like this
    print("num unique images", len(get_image_set({x["url"]: x for x in items})))
    if items and "money" in items[0]:
        items = sorted(items, key=lambda x: len(x["money"]), reverse=True)
    self.render("page_template.html", items=items, cached=False)
def post(self):
    CRAWL_CONFIG = DEFAULT_CRAWL_CONFIG.copy()
    CRAWL_CONFIG.update({
        'collections_path': os.path.join(os.path.expanduser('~'), 'sky_view_collections/'),
        # 'max_workers': 10,
    })
    args = self.request.arguments
    print(args)
    for arg in args:
        value = args[arg][0].decode('utf8')
        if value and arg != 'url' and arg != 'checkboxcache':
            print('pre', arg, CRAWL_CONFIG[arg])
            if isinstance(CRAWL_CONFIG[arg], list):
                CRAWL_CONFIG[arg] = [int(value)] if is_numeric(value) else value.split(', ')
            else:
                CRAWL_CONFIG[arg] = int(value) if is_numeric(value) else value.split(', ')[0]
            print('post', arg, CRAWL_CONFIG[arg])
    url = self.get_argument('url', '')
    use_cache = self.get_argument('checkboxcache', '')
    domain = extractDomain(url)
    CRAWL_CONFIG['seed_urls'] = [url]
    if domain.startswith("http"):
        CRAWL_CONFIG['collection_name'] = domain.split("/")[2]
    else:
        CRAWL_CONFIG['collection_name'] = domain.split("/")[0]
    if use_cache != 'on':
        col_path = os.path.join(CRAWL_CONFIG['collections_path'],
                                CRAWL_CONFIG['collection_name'])
        print(col_path)
        if os.path.exists(col_path):
            shutil.rmtree(col_path)
    crawl.start(CRAWL_CONFIG)
    SCRAPE_CONFIG = CRAWL_CONFIG.copy()
    SCRAPE_CONFIG.update({
        'template_proportion': 0.4,
        'max_templates': 100,
    })
    skindex = Scraper(SCRAPE_CONFIG)
    skindex.load_local_pages()
    skindex.add_template_elements()
    res = skindex.process_all(remove_visuals=True, maxn=CRAWL_CONFIG['max_saved_responses'])
    items = []
    for num, url in enumerate(res):
        if num == CRAWL_CONFIG['max_saved_responses']:
            break
        dc = res[url]
        dc['url'] = url
        dc['source_name'] = domain
        dc['images'] = [x for x in reversed(dc['images'][:5])]
        # dc['blobs'] = [TextBlob(x) for x in dc['body'] if dc['body']]
        items.append(dc)
    # this is quite out of place like this
    print('num unique images', len(get_image_set({x['url']: x for x in items})))
    if items and 'money' in items[0]:
        items = sorted(items, key=lambda x: len(x['money']), reverse=True)
    self.render('page_template.html', items=items, cached=False)
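The handlers above call an is_numeric helper whose definition is not shown. A minimal sketch of what such a helper could look like follows; its exact behavior here is an assumption, not the original implementation.

def is_numeric(value):
    # Assumed behavior: True when the form-field string parses as an int,
    # so numeric config values (e.g. max_workers) get cast back from text.
    try:
        int(value)
        return True
    except ValueError:
        return False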