def parse_serp(html=None, search_engine=None, scrapemethod=None, current_page=None,
               requested_at=None, requested_by='127.0.0.1', current_keyword=None,
               parser=None, serp=None):
    """Parse a SERP page and map its results onto ORM objects.

    If no ``parser`` is supplied, one is looked up for ``search_engine`` and
    run over ``html``. If no ``serp`` is supplied, a fresh
    SearchEngineResultsPage is built from the request metadata. One Link is
    created per parsed result; links attach themselves to the SERP through
    the ``serp=`` relationship.

    Args:
        html: Raw HTML of the results page (only parsed when ``parser`` is None).
        search_engine: Name of the search engine the page came from.
        scrapemethod: How the page was obtained.
        current_page: Page number of the SERP.
        requested_at: Timestamp of the request.
        requested_by: IP address the request originated from.
        current_keyword: The query that produced this SERP.
        parser: Optional pre-run parser instance.
        serp: Optional existing SERP row to attach links to.

    Returns:
        Tuple ``(serp, parser)`` with the populated SERP and the parser used.
    """
    if not parser:
        parser = get_parser_by_search_engine(search_engine)
        parser = parser()
        parser.parse(html)

    out(parser, lvl=2)

    num_results = 0

    if not serp:
        serp = SearchEngineResultsPage(
            search_engine_name=search_engine,
            scrapemethod=scrapemethod,
            page_number=current_page,
            requested_at=requested_at,
            requested_by=requested_by,
            query=current_keyword,
            num_results_for_keyword=parser.search_results['num_results'],
        )

    # Every list-valued entry of search_results holds parsed result dicts;
    # the key itself is not needed here.
    for value in parser.search_results.values():
        if isinstance(value, list):
            rank = 1
            for link in value:
                parsed = urlparse(link['link'])
                # Fill missing optional fields with None to prevent KeyErrors
                # (was a side-effecting list comprehension).
                for field in ('snippet', 'title', 'visible_link'):
                    link.setdefault(field, None)
                # Constructing the Link is enough: serp= wires it into the
                # SERP's relationship collection.
                Link(
                    link=link['link'],
                    snippet=link['snippet'],
                    title=link['title'],
                    visible_link=link['visible_link'],
                    domain=parsed.netloc,
                    rank=rank,
                    serp=serp,
                )
                num_results += 1
                rank += 1

    serp.num_results = num_results

    return (serp, parser)
def parse_serp(html=None, search_engine=None, scrapemethod=None, current_page=None,
               requested_at=None, requested_by='127.0.0.1', current_keyword=None,
               parser=None, serp=None):
    """Parse a SERP page and map its results onto ORM objects.

    If no ``parser`` is supplied, one is looked up for ``search_engine`` and
    run over ``html``. If no ``serp`` is supplied, a fresh
    SearchEngineResultsPage is built from the request metadata. One Link is
    created per parsed result; links attach themselves to the SERP through
    the ``serp=`` relationship.

    Args:
        html: Raw HTML of the results page (only parsed when ``parser`` is None).
        search_engine: Name of the search engine the page came from.
        scrapemethod: How the page was obtained.
        current_page: Page number of the SERP.
        requested_at: Timestamp of the request.
        requested_by: IP address the request originated from.
        current_keyword: The query that produced this SERP.
        parser: Optional pre-run parser instance.
        serp: Optional existing SERP row to attach links to.

    Returns:
        The parsed SERP object.
    """
    if not parser:
        parser = get_parser_by_search_engine(search_engine)
        parser = parser()
        parser.parse(html)

    out(parser, lvl=2)

    num_results = 0

    if not serp:
        serp = SearchEngineResultsPage(
            search_engine_name=search_engine,
            scrapemethod=scrapemethod,
            page_number=current_page,
            requested_at=requested_at,
            requested_by=requested_by,
            query=current_keyword,
            num_results_for_keyword=parser.search_results['num_results'],
        )

    # Only list-valued entries of search_results contain parsed result dicts.
    for value in parser.search_results.values():
        if isinstance(value, list):
            rank = 1
            for link in value:
                parsed = urlparse(link['link'])
                # Fill missing optional fields with None to prevent KeyErrors
                # (was a side-effecting list comprehension).
                for field in ('snippet', 'title', 'visible_link'):
                    link.setdefault(field, None)
                # Constructing the Link is enough: serp= wires it into the
                # SERP's relationship collection.
                Link(
                    link=link['link'],
                    snippet=link['snippet'],
                    title=link['title'],
                    visible_link=link['visible_link'],
                    domain=parsed.netloc,
                    rank=rank,
                    serp=serp,
                )
                num_results += 1
                rank += 1

    serp.num_results = num_results

    return serp
def store(self):
    """Write the parsed results into the sqlalchemy scoped session.

    Builds one SearchEngineResultsPage for the current request plus one
    Link row per parsed result, then commits the whole batch at once.
    """
    assert self.session, 'You need to pass a sqlalchemy scoped session to SearchEngineScrape instances'

    serp = SearchEngineResultsPage(
        search_engine_name=self.search_engine,
        page_number=self.current_page,
        requested_at=datetime.datetime.utcnow(),
        requested_by='127.0.0.1',
        query=self.current_keyword,
        num_results_for_keyword=self.parser.search_results['num_results'],
        search=self.scraper_search,
    )
    self.session.add(serp)

    # Only list-valued entries of search_results contain parsed links.
    for result_type, results in self.parser.search_results.items():
        if not isinstance(results, list):
            continue
        for result in results:
            self.session.add(Link(
                url=result['link'],
                snippet=result['snippet'],
                title=result['title'],
                visible_link=result['visible_link'],
                serp=serp,
            ))

    self.session.commit()
def parse_serp(html=None, search_engine=None, scrapemethod=None, current_page=None,
               requested_at=None, requested_by='127.0.0.1', current_keyword=None):
    """Parse a SERP page and build the corresponding ORM objects.

    Looks up a parser for ``search_engine``, runs it on ``html``, builds a
    SearchEngineResultsPage from the request metadata, and creates one Link
    per parsed result (attached via the ``serp=`` relationship).

    Args:
        html: Raw HTML of the results page.
        search_engine: Name of the search engine the page came from.
        scrapemethod: How the page was obtained.
        current_page: Page number of the SERP.
        requested_at: Timestamp of the request.
        requested_by: IP address the request originated from.
        current_keyword: The query that produced this SERP.

    Returns:
        The parsed SERP object.
    """
    parser = get_parser_by_search_engine(search_engine)
    parser = parser()
    parser.parse(html)

    num_results = 0

    serp = SearchEngineResultsPage(
        search_engine_name=search_engine,
        scrapemethod=scrapemethod,
        page_number=current_page,
        requested_at=requested_at,
        requested_by=requested_by,
        query=current_keyword,
        num_results_for_keyword=parser.search_results['num_results'],
    )

    # Only list-valued entries of search_results contain parsed result dicts.
    for value in parser.search_results.values():
        if isinstance(value, list):
            rank = 1
            for link in value:
                # 'link' is mandatory; snippet/title/visible_link may be
                # absent for some result types — default them to None
                # instead of raising KeyError (matches the sibling
                # parse_serp versions that None-fill these fields).
                Link(
                    url=link['link'],
                    snippet=link.get('snippet'),
                    title=link.get('title'),
                    visible_link=link.get('visible_link'),
                    rank=rank,
                    serp=serp,
                )
                num_results += 1
                rank += 1

    serp.num_results = num_results

    return serp
def parse_serp(config, html=None, parser=None, scraper=None, search_engine=None, query=''):
    """Build a SERP object from either a live scraper or cached html.

    If no parser is supplied then we are expected to parse again with the
    provided html. May be called from scraping and from caching; when called
    from caching, some info is lost (like the current page number).

    Args:
        TODO: A whole lot

    Returns:
        The parsed SERP object.
    """
    # Re-parse only when we got raw html but no ready-made parser.
    if html and not parser:
        parser_class = get_parser_by_search_engine(search_engine)
        parser = parser_class(config, query=query)
        parser.parse(html)

    serp = SearchEngineResultsPage()

    if query:
        serp.query = query
    if parser:
        serp.set_values_from_parser(parser)
    if scraper:
        serp.set_values_from_scraper(scraper)

    return serp
def store(self):
    """Store the parsed data in the sqlalchemy scoped session.

    Builds one SearchEngineResultsPage for the current request, attaches it
    to the running scraper_search, then adds one Link per parsed result.
    Commits twice: once after registering the SERP, once after all links
    and the final result count are in.

    NOTE(review): this is a coroutine-style method — the db_lock is acquired
    with ``yield from``, so callers presumably drive it with asyncio/yield —
    TODO confirm against the scheduler that invokes store().
    """
    assert self.session, 'You need to pass a sqlalchemy scoped session to SearchEngineScrape instances'

    num_results = 0
    # Report the proxy's ip when scraping through a proxy, else localhost.
    ip = '127.0.0.1'
    if self.proxy:
        ip = self.proxy.ip

    serp = SearchEngineResultsPage(
        search_engine_name=self.search_engine,
        scrapemethod=self.scrapemethod,
        page_number=self.current_page,
        requested_at=datetime.datetime.utcnow(),
        requested_by=ip,
        query=self.current_keyword,
        num_results_for_keyword=self.parser.search_results['num_results'],
    )

    # The lock serializes session access across concurrent scrapers.
    with (yield from self.db_lock):
        self.scraper_search.serps.append(serp)
        self.session.add(serp)
        self.session.add(self.scraper_search)
        self.session.commit()

        # Only list-valued entries of search_results contain parsed links.
        for key, value in self.parser.search_results.items():
            if isinstance(value, list):
                rank = 1
                for link in value:
                    # Link attaches to the SERP via serp=; also added to
                    # the session explicitly.
                    l = Link(
                        url=link['link'],
                        snippet=link['snippet'],
                        title=link['title'],
                        visible_link=link['visible_link'],
                        rank=rank,
                        serp=serp
                    )
                    self.session.add(l)
                    num_results += 1
                    rank += 1

        serp.num_results = num_results
        self.session.add(serp)
        self.session.commit()
def parse_serp(config, html=None, parser=None, scraper=None, search_engine=None, query=''):
    """Create a SERP object, parsing ``html`` first if necessary.

    If no parser is supplied then we are expected to parse again with the
    provided html. This function may be called from scraping and caching;
    when called from caching, some info is lost (like current page number).

    Args:
        TODO: A whole lot

    Returns:
        The parsed SERP object.
    """
    if not parser and html:
        # No ready parser: instantiate one for this engine and run it now.
        factory = get_parser_by_search_engine(search_engine)
        parser = factory(config, query=query)
        parser.parse(html)

    serp = SearchEngineResultsPage()

    # Copy whatever context we actually have onto the SERP.
    if query:
        serp.query = query
    if parser:
        serp.set_values_from_parser(parser)
    if scraper:
        serp.set_values_from_scraper(scraper)

    return serp
def store(self):
    """Store the parsed data in the sqlalchemy scoped session.

    Under the db_lock: builds one SearchEngineResultsPage for the current
    request, attaches it to the running scraper_search, adds one Link per
    parsed result, records the total result count, and commits once.
    """
    assert self.session, 'No database session. Turning down.'

    with self.db_lock:
        num_results = 0

        serp = SearchEngineResultsPage(
            search_engine_name=self.search_engine,
            scrapemethod=self.scrapemethod,
            page_number=self.current_page,
            requested_at=datetime.datetime.utcnow(),
            requested_by=self.ip,
            query=self.current_keyword,
            num_results_for_keyword=self.parser.search_results['num_results'],
        )
        self.scraper_search.serps.append(serp)

        # Only list-valued entries of search_results contain parsed links;
        # the key itself was unused, so iterate values() directly.
        for value in self.parser.search_results.values():
            if isinstance(value, list):
                rank = 1
                for link in value:
                    self.session.add(Link(
                        url=link['link'],
                        snippet=link['snippet'],
                        title=link['title'],
                        visible_link=link['visible_link'],
                        rank=rank,
                        serp=serp,
                    ))
                    num_results += 1
                    rank += 1

        serp.num_results = num_results
        self.session.add(serp)
        self.session.commit()
def store(self):
    """Persist the parsed SERP and emit it in the configured output format.

    Under the db_lock: builds and commits the SERP (links are attached by
    parse_serp), then writes the results to stdout, a JSON stream, or a CSV
    writer depending on the GLOBAL output_format setting.
    """
    assert self.session, 'No database session. Turning down.'

    with self.db_lock:
        num_results = 0

        serp = SearchEngineResultsPage(
            search_engine_name=self.search_engine,
            scrapemethod=self.scrapemethod,
            page_number=self.current_page,
            requested_at=self.current_request_time,
            requested_by=self.ip,
            query=self.current_keyword,
            num_results_for_keyword=self.parser.search_results['num_results'],
        )
        self.scraper_search.serps.append(serp)

        # parse_serp attaches the Link rows to the serp in place.
        parse_serp(serp=serp, parser=self.parser)

        self.session.add(serp)
        self.session.commit()

        output_format = Config['GLOBAL'].get('output_format', 'stdout')
        output_file = Config['GLOBAL'].get('output_filename', 'google_scraper')

        def results():
            # Flatten every list-valued entry of search_results into rows.
            return [
                link
                for value in self.parser.search_results.values()
                if isinstance(value, list)
                for link in value
            ]

        if output_format == 'stdout':
            out(self.parser, lvl=2)
        elif output_format == 'json':
            obj = self._get_serp_obj()
            obj['results'] = results()
            json.dump(obj, self.json_outfile, indent=2, sort_keys=True)
            # Comma-separate consecutive objects in the JSON output stream.
            self.json_outfile.write(',')
        elif output_format == 'csv':
            obj = self._get_serp_obj()
            for row in results():
                row.update(obj)
                self.csv_outfile.writerow(row)
def store(self):
    """Persist the current SERP and hand the results to the output writer.

    Under the db_lock: builds the SERP from the request metadata, lets
    parse_serp attach the parsed links, commits, and finally forwards a
    dict snapshot of this scraper to store_serp_result.
    """
    assert self.session, 'No database session. Turning down.'

    with self.db_lock:
        serp = SearchEngineResultsPage(
            search_engine_name=self.search_engine,
            scrapemethod=self.scrapemethod,
            page_number=self.current_page,
            requested_at=self.current_request_time,
            requested_by=self.ip,
            query=self.current_keyword,
            num_results_for_keyword=self.parser.search_results['num_results'],
        )
        self.scraper_search.serps.append(serp)

        # parse_serp fills the serp with Link rows and returns it along
        # with the parser it used.
        serp, parser = parse_serp(serp=serp, parser=self.parser)

        self.session.add(serp)
        self.session.commit()

        store_serp_result(dict_from_scraping_object(self), self.parser)