def store(self): """Store the parsed data in the sqlalchemy scoped session.""" assert self.session, 'No database session. Turning down.' with self.db_lock: num_results = 0 serp = SearchEngineResultsPage( search_engine_name=self.search_engine, scrapemethod=self.scrapemethod, page_number=self.current_page, requested_at=self.current_request_time, requested_by=self.ip, query=self.current_keyword, num_results_for_keyword=self.parser.search_results['num_results'], ) self.scraper_search.serps.append(serp) parse_serp(serp=serp, parser=self.parser) self.session.add(serp) self.session.commit() output_format = Config['GLOBAL'].get('output_format', 'stdout') output_file = Config['GLOBAL'].get('output_filename', 'google_scraper') if output_format == 'stdout': out(self.parser, lvl=2) elif output_format == 'json': if not hasattr(self, 'json_outfile'): self.json_outfile = open(output_file + '.json', 'a') obj = self._get_serp_obj() obj['requested_at'] = obj['requested_at'].isoformat() json.dump(obj, self.json_outfile, indent=2, sort_keys=True) elif output_format == 'csv': if not hasattr(self, 'csv_outfile'): self.csv_outfile = csv.DictWriter(open(output_file + '.csv', 'a'), fieldnames=('link', 'title', 'snippet', 'visible_link', 'num_results', 'query', 'search_engine_name', 'requested_by', 'scrapemethod', 'page_number', 'requested_at')) self.csv_outfile.writeheader() rows = [] for result_type, value in self.parser.search_results.items(): if isinstance(value, list): for link in value: rows.append(link) obj = self._get_serp_obj() obj['num_results'] = self.parser.search_results['num_results'] for row in rows: row.update(obj) self.csv_outfile.writerow(row)
def run(self):
    while True:
        self.get_requests()

        if not self.requests:
            break

        self.results = self.loop.run_until_complete(
            asyncio.wait([r()() for r in self.requests]))

        for task in self.results[0]:
            scrape = task.result()

            if scrape:
                if self.cache_manager:
                    self.cache_manager.cache_results(scrape.parser, scrape.query,
                                                     scrape.search_engine_name,
                                                     scrape.scrape_method,
                                                     scrape.page_number)

                if scrape.parser:
                    serp = parse_serp(self.config, parser=scrape.parser,
                                      scraper=scrape, query=scrape.query)

                    if self.scraper_search:
                        self.scraper_search.serps.append(serp)

                    if self.session:
                        self.session.add(serp)
                        self.session.commit()

                    store_serp_result(serp, self.config)

def run(self):
    print("----------------------------start def run(self):-----------------------------")

    while True:
        self.get_requests()

        if not self.requests:
            break

        self.results = self.loop.run_until_complete(
            asyncio.wait([r() for r in self.requests]))

        for task in self.results[0]:
            scrape = task.result()

            if scrape:
                if self.cache_manager:
                    self.cache_manager.cache_results(scrape.parser, scrape.query,
                                                     scrape.search_engine_name,
                                                     scrape.scrape_method,
                                                     scrape.page_number)

                if scrape.parser:
                    serp = parse_serp(self.config, parser=scrape.parser,
                                      scraper=scrape, query=scrape.query)

                    if self.scraper_search:
                        self.scraper_search.serps.append(serp)

                    if self.session:
                        self.session.add(serp)
                        self.session.commit()

                    store_serp_result(serp, self.config)

    print("----------------------------end def run(self):-----------------------------")

def store(self): """Store the parsed data in the sqlalchemy scoped session.""" assert self.session, 'No database session.' if self.html: self.parser.parse(self.html) else: print("Nothing to parse for {keyword}! (page len = {pagelen})". format(keyword=self.query, pagelen=len(self.html))) self.parser = None with self.db_lock: serp = parse_serp(parser=self.parser, scraper=self, query=self.query) self.scraper_search.serps.append(serp) self.session.add(serp) self.session.commit() store_serp_result(serp) if serp.num_results: return True else: return False
def run(self):
    while True:
        self.get_requests()

        if not self.requests:
            break

        self.results = self.loop.run_until_complete(
            asyncio.wait([r()() for r in self.requests]))

        for task in self.results[0]:
            scrape = task.result()

            if scrape:
                cache_results(scrape.parser, scrape.query, scrape.search_engine_name,
                              scrape.scrape_method, scrape.page_number)

                if scrape.parser:
                    serp = parse_serp(parser=scrape.parser, scraper=scrape,
                                      query=scrape.query)
                    self.scraper_search.serps.append(serp)
                    self.session.add(serp)
                    self.session.commit()
                    store_serp_result(serp)

def store(self): """Store the parsed data in the sqlalchemy scoped session.""" assert self.session, 'No database session.' if self.html: self.parser.parse(self.html) else: self.parser = None with self.db_lock: serp = parse_serp(self.config, parser=self.parser, scraper=self, query=self.query) self.scraper_search.serps.append(serp) self.session.add(serp) self.session.commit() store_serp_result(serp, self.config) if serp.num_results: return True else: return False
def parse_again(fname, search_engine, scrapemethod, query):
    html = read_cached_file(get_path(fname))
    return parse_serp(html=html, search_engine=search_engine, scrapemethod=scrapemethod,
                      current_page=0, current_keyword=query)

def store(self): """Store the parsed data in the sqlalchemy scoped session.""" assert self.session, 'No database session. Turning down.' with self.db_lock: num_results = 0 serp = SearchEngineResultsPage( search_engine_name=self.search_engine, scrapemethod=self.scrapemethod, page_number=self.current_page, requested_at=self.current_request_time, requested_by=self.ip, query=self.current_keyword, num_results_for_keyword=self.parser. search_results['num_results'], ) self.scraper_search.serps.append(serp) parse_serp(serp=serp, parser=self.parser) self.session.add(serp) self.session.commit() output_format = Config['GLOBAL'].get('output_format', 'stdout') output_file = Config['GLOBAL'].get('output_filename', 'google_scraper') def results(): rows = [] for result_type, value in self.parser.search_results.items(): if isinstance(value, list): for link in value: rows.append(link) return rows if output_format == 'stdout': out(self.parser, lvl=2) elif output_format == 'json': obj = self._get_serp_obj() obj['results'] = results() json.dump(obj, self.json_outfile, indent=2, sort_keys=True) self.json_outfile.write(',') elif output_format == 'csv': obj = self._get_serp_obj() for row in results(): row.update(obj) self.csv_outfile.writerow(row)
def store(self): """Store the parsed data in the sqlalchemy scoped session.""" assert self.session, 'No database session. Turning down.' with self.db_lock: num_results = 0 serp = SearchEngineResultsPage( search_engine_name=self.search_engine, scrapemethod=self.scrapemethod, page_number=self.current_page, requested_at=self.current_request_time, requested_by=self.ip, query=self.current_keyword, num_results_for_keyword=self.parser.search_results['num_results'], ) self.scraper_search.serps.append(serp) parse_serp(serp=serp, parser=self.parser) self.session.add(serp) self.session.commit() output_format = Config['GLOBAL'].get('output_format', 'stdout') output_file = Config['GLOBAL'].get('output_filename', 'google_scraper') def results(): rows = [] for result_type, value in self.parser.search_results.items(): if isinstance(value, list): for link in value: rows.append(link) return rows if output_format == 'stdout': out(self.parser, lvl=2) elif output_format == 'json': obj = self._get_serp_obj() obj['results'] = results() json.dump(obj, self.json_outfile, indent=2, sort_keys=True) self.json_outfile.write(',') elif output_format == 'csv': obj = self._get_serp_obj() for row in results(): row.update(obj) self.csv_outfile.writerow(row)
def parse_again(fname, search_engine, scrapemethod, query):
    html = read_cached_file(get_path(fname))
    return parse_serp(
        html=html,
        search_engine=search_engine,
        scrapemethod=scrapemethod,
        current_page=0,
        current_keyword=query
    )

def parse_again(fname, search_engine, scrape_method, query): """ @todo: `scrape_method` is not used here -> check if scrape_method is passed to this function and remove it """ html = read_cached_file(get_path(fname)) return parse_serp( html=html, search_engine=search_engine, query=query )
def parse_again(self, fname, search_engine, scrape_method, query): """ @todo: `scrape_method` is not used here -> check if scrape_method is passed to this function and remove it """ path = os.path.join(self.config.get('cachedir', '.scrapecache'), fname) html = self.read_cached_file(path) return parse_serp(self.config, html=html, search_engine=search_engine, query=query)
def store(self): """Store the parsed data in the sqlalchemy scoped session.""" assert self.session, 'No database session. Turning down.' with self.db_lock: num_results = 0 serp = SearchEngineResultsPage( search_engine_name=self.search_engine, scrapemethod=self.scrapemethod, page_number=self.current_page, requested_at=datetime.datetime.utcnow(), requested_by=self.ip, query=self.current_keyword, num_results_for_keyword=self.parser.search_results['num_results'], ) self.scraper_search.serps.append(serp) parse_serp(serp=serp, parser=self.parser) self.session.add(serp) self.session.commit()
def parse_again(self, fname, search_engine, scrape_method, query): """ @todo: `scrape_method` is not used here -> check if scrape_method is passed to this function and remove it """ path = os.path.join(self.config.get('cachedir', '.scrapecache'), fname) html = self.read_cached_file(path) return parse_serp( self.config, html=html, search_engine=search_engine, query=query )
def store(self): """Store the parsed data in the sqlalchemy scoped session.""" assert self.session, 'No database session. Turning down.' with self.db_lock: serp = SearchEngineResultsPage( search_engine_name=self.search_engine, scrapemethod=self.scrapemethod, page_number=self.current_page, requested_at=self.current_request_time, requested_by=self.ip, query=self.current_keyword, num_results_for_keyword=self.parser. search_results['num_results'], ) self.scraper_search.serps.append(serp) serp, parser = parse_serp(serp=serp, parser=self.parser) self.session.add(serp) self.session.commit() store_serp_result(dict_from_scraping_object(self), self.parser)
def store(self): """Store the parsed data in the sqlalchemy scoped session.""" assert self.session, 'No database session.' if self.html: self.parser.parse(self.html) else: self.parser = None with self.db_lock: serp = parse_serp(parser=self.parser, scraper=self, query=self.query) self.scraper_search.serps.append(serp) self.session.add(serp) self.session.commit() store_serp_result(serp) if serp.num_results: return True else: return False
def parse_again(fname, search_engine, scrape_method, query): """ @todo: `scrape_method` is not used here -> check if scrape_method is passed to this function and remove it """ html = read_cached_file(get_path(fname)) return parse_serp(html=html, search_engine=search_engine, query=query)
def parse_all_cached_files(keywords, search_engines, session, scraper_search, try_harder=False):
    """Walk recursively through the cachedir (as given by the Config) and parse all cached files.

    Args:
        identifying_list: A list of lists with elements that identify the result. They consist
            of the keyword, search_engine and scrapemode.
        session: An sqlalchemy session to add the entities.
        try_harder: If there is a cache file that cannot be mapped to a keyword, read it again
            and try to extract the search query from the html.

    Returns:
        A list of keywords that couldn't be parsed and which need to be scraped anew.
    """
    google_query_needle = re.compile(r'<title>(?P<kw>.*?) - Google Search</title>')

    files = _get_all_cache_files()
    mapping = {}
    scrapemethod = Config['SCRAPING'].get('scrapemethod')
    num_cached = 0

    for kw in keywords:
        for search_engine in search_engines:
            key = cached_file_name(kw, search_engine, scrapemethod)

            out('Params(keyword="{kw}", search_engine="{se}", scrapemethod="{sm}" yields {hash}'.format(
                kw=kw,
                se=search_engine,
                sm=scrapemethod,
                hash=key
            ), lvl=5)

            mapping[key] = (kw, search_engine)

    for path in files:
        # strip the extension off the path if it has any
        fname = os.path.split(path)[1]
        clean_filename = fname
        for ext in ALLOWED_COMPRESSION_ALGORITHMS:
            if fname.endswith(ext):
                clean_filename = fname.rstrip('.' + ext)

        query = search_engine = None
        val = mapping.get(clean_filename, None)

        if val:
            query, search_engine = val

        if query and search_engine:
            # We found a file that contains the keyword, search engine name and
            # searchmode that fits our description. Let's see if there is already
            # a record in the database and link it to our new ScraperSearch object.
            try:
                serp = session.query(SearchEngineResultsPage).filter(
                    SearchEngineResultsPage.query == query,
                    SearchEngineResultsPage.search_engine_name == search_engine,
                    SearchEngineResultsPage.scrapemethod == scrapemethod).one()
            except NoResultFound as e:
                # that shouldn't happen:
                # we have a cache file that matches the above identifying information,
                # but it was never stored to the database.
                logger.error('No entry for file {} found in database. Will parse again.'.format(clean_filename))
                html = read_cached_file(get_path(fname))
                serp = parse_serp(
                    html=html,
                    search_engine=search_engine,
                    scrapemethod=scrapemethod,
                    current_page=0,
                    current_keyword=query
                )
            except MultipleResultsFound as e:
                raise e

            if serp:
                scraper_search.serps.append(serp)

            mapping.pop(clean_filename)
            num_cached += 1

    out('{} cache files found in {}'.format(len(files), Config['GLOBAL'].get('cachedir')), lvl=1)
    out('{}/{} keywords have been cached and are ready to get parsed. {} remain to get scraped.'.format(
        num_cached, len(keywords), len(keywords) - num_cached), lvl=1)

    session.add(scraper_search)
    session.commit()

    # return the remaining keywords to scrape
    return [e[0] for e in mapping.values()]

def parse_all_cached_files(keywords, session, scraper_search, try_harder=False):
    """Walk recursively through the cachedir (as given by the Config) and parse all cached files.

    Args:
        identifying_list: A list of lists with elements that identify the result. They consist
            of the keyword, search_engine and scrapemode.
        session: An sqlalchemy session to add the entities.
        try_harder: If there is a cache file that cannot be mapped to a keyword, read it again
            and try to extract the search query from the html.

    Returns:
        A list of keywords that couldn't be parsed and which need to be scraped anew.
    """
    google_query_needle = re.compile(
        r'<title>(?P<kw>.*?) - Google Search</title>')

    files = _get_all_cache_files()
    mapping = {}
    search_engine = Config['SCRAPING'].get('search_engine')
    scrapemethod = Config['SCRAPING'].get('scrapemethod')

    for kw in keywords:
        key = cached_file_name(kw, search_engine, scrapemethod)

        out('Params(keyword="{kw}", search_engine="{se}", scrapemethod="{sm}" yields {hash}'
            .format(kw=kw, se=search_engine, sm=scrapemethod, hash=key), lvl=5)

        mapping[key] = kw

    for path in files:
        # strip the extension off the path if it has any
        fname = os.path.split(path)[1]
        clean_filename = fname
        for ext in ALLOWED_COMPRESSION_ALGORITHMS:
            if fname.endswith(ext):
                clean_filename = fname.rstrip('.' + ext)

        query = mapping.get(clean_filename, None)

        if query:
            # We found a file that contains the keyword, search engine name and
            # searchmode that fits our description. Let's see if there is already
            # a record in the database and link it to our new ScraperSearch object.
            try:
                serp = session.query(SearchEngineResultsPage).filter(
                    SearchEngineResultsPage.query == query,
                    SearchEngineResultsPage.search_engine_name == search_engine,
                    SearchEngineResultsPage.scrapemethod == scrapemethod).one()
            except NoResultFound as e:
                # that shouldn't happen:
                # we have a cache file that matches the above identifying information,
                # but it was never stored to the database.
                logger.error(
                    'No entry for file {} found in database. Will parse again.'
                    .format(clean_filename))
                serp = parse_serp(html=read_cached_file(get_path(fname)),
                                  search_engine=search_engine,
                                  scrapemethod=scrapemethod,
                                  current_page=0,
                                  current_keyword=query)
            except MultipleResultsFound as e:
                raise e
            finally:
                scraper_search.serps.append(serp)
                mapping.pop(clean_filename)

    # TODO: support query detection for all supported search engines
    # by parsing the keyword, search engine from the raw html

    out('{} cache files found in {}'.format(len(files), Config['GLOBAL'].get('cachedir')), lvl=1)
    out('{}/{} keywords have been cached and are ready to get parsed. {} remain to get scraped.'
        .format(len(keywords) - len(mapping), len(keywords), len(mapping)), lvl=1)

    session.add(scraper_search)
    session.commit()

    # return the remaining keywords to scrape
    return mapping.values()

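# Hedged usage sketch: parse_all_cached_files() above returns the keywords that could not
# be served from the cache, so a caller would typically scrape only those and reuse the
# cached SERPs for everything else. scrape_keyword() is a hypothetical callback that
# stands in for whatever scraping routine the caller uses; it is not part of the code above.
def scrape_with_cache(keywords, session, scraper_search, scrape_keyword):
    remaining = parse_all_cached_files(keywords, session, scraper_search)
    # only keywords without a usable cache entry are scraped anew
    for keyword in remaining:
        scrape_keyword(keyword)
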
def parse_again(fname, search_engine, scrape_method, query):
    html = read_cached_file(get_path(fname))
    return parse_serp(html=html, search_engine=search_engine, query=query)