def _get_parse_links(self, data, kw, only_results=False, page_num=1, ip='127.0.0.1'):
    """Act the same as _parse_links, but just return the db data instead of
    inserting data into a connection or building actual queries.

    [[lastrowid]] needs to be replaced with the last rowid from the database
    when inserting. Not secure against sql injections from google ~_~
    """
    parser = GoogleParser(data)
    if only_results:
        return parser

    results = parser.links
    first = (page_num,
             time.asctime(),
             len(results),
             parser.num_results() or '',
             kw,
             ip)

    second = []
    for result in results:
        second.append([
            result.link_title,
            result.link_url.geturl(),
            result.link_snippet,
            result.link_position,
            result.link_url.hostname
        ])

    return (first, second)
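
# Usage sketch (illustrative only, not part of the scraper): the two tuples
# returned by _get_parse_links() line up with the INSERT statements in
# parse_links() below. The serp_id column of each link row -- the
# "[[lastrowid]]" placeholder mentioned in the docstring -- has to be filled
# in with cursor.lastrowid after the serp_page insert. This assumes a plain
# sqlite3 connection with the serp_page/link schema used by parse_links();
# 'scraper' stands for whatever object owns _get_parse_links().
#
#   import sqlite3
#   conn = sqlite3.connect('results.db')
#   cursor = conn.cursor()
#   first, second = scraper._get_parse_links(html, 'some keyword')
#   cursor.execute('''INSERT INTO serp_page
#                     (page_number, requested_at, num_results,
#                      num_results_for_kw_google, search_query, requested_by)
#                     VALUES(?, ?, ?, ?, ?, ?)''', first)
#   serp_id = cursor.lastrowid
#   cursor.executemany('''INSERT INTO link
#                         (title, url, snippet, rank, domain, serp_id)
#                         VALUES(?, ?, ?, ?, ?, ?)''',
#                      [row + [serp_id] for row in second])
#   conn.commit()
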
def parse_links(data, conn, kw, page_num=1, ip='127.0.0.1'):
    """Insert parsed data into the database. High level parsing function.

    Args:
        conn -- Either a sqlite3 cursor or connection object. If called in
                threads, make sure to wrap this function in some kind of
                synchronization functionality.
    """
    parser = GoogleParser(data)
    results = parser.links
    conn.execute('''
        INSERT INTO serp_page
            (page_number, requested_at, num_results,
             num_results_for_kw_google, search_query, requested_by)
        VALUES(?, ?, ?, ?, ?, ?)''',
        (page_num, time.asctime(), len(results), parser.num_results() or '', kw, ip))
    lastrowid = conn.lastrowid
    #logger.debug('Inserting in link: search_query={}, title={}, url={}'.format(kw, ))
    conn.executemany('''
        INSERT INTO link
            (title, url, snippet, rank, domain, serp_id)
        VALUES(?, ?, ?, ?, ?, ?)''',
        [(result.link_title,
          result.link_url.geturl(),
          result.link_snippet,
          result.link_position,
          result.link_url.hostname) + (lastrowid,)
         for result in results])
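
# Schema sketch (assumed; column names come straight from the INSERT statements
# above, but the column types and the location where the project actually
# creates its tables are guesses): parse_links() expects a serp_page table and
# a link table whose serp_id references serp_page.
#
#   conn.execute('''CREATE TABLE IF NOT EXISTS serp_page (
#                       id INTEGER PRIMARY KEY AUTOINCREMENT,
#                       page_number INTEGER,
#                       requested_at TEXT,
#                       num_results INTEGER,
#                       num_results_for_kw_google TEXT,
#                       search_query TEXT,
#                       requested_by TEXT)''')
#   conn.execute('''CREATE TABLE IF NOT EXISTS link (
#                       id INTEGER PRIMARY KEY AUTOINCREMENT,
#                       title TEXT,
#                       url TEXT,
#                       snippet TEXT,
#                       rank INTEGER,
#                       domain TEXT,
#                       serp_id INTEGER,
#                       FOREIGN KEY(serp_id) REFERENCES serp_page(id))''')
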
def build_search(self):
    """Build the headers and params for the search request for the search engine."""
    self.search_params = {}

    # Don't set the offset parameter explicitly if the default search (no offset) is correct.
    # Pages are 1-based, so page n starts after (n - 1) full result pages.
    start_search_position = None if self.current_page == 1 else str(
        int(self.num_results_per_page) * (int(self.current_page) - 1))

    if self.search_engine == 'google':
        self.parser = GoogleParser()
        self.search_params['q'] = self.current_keyword
        self.search_params['num'] = str(self.num_results_per_page)
        self.search_params['start'] = start_search_position

        if self.search_type == 'image':
            self.search_params.update({
                'oq': self.current_keyword,
                'site': 'imghp',
                'tbm': 'isch',
                'source': 'hp',
                #'sa': 'X',
                'biw': 1920,
                'bih': 881
            })
        elif self.search_type == 'video':
            self.search_params.update({
                'tbm': 'vid',
                'source': 'lnms',
                'sa': 'X',
                'biw': 1920,
                'bih': 881
            })
        elif self.search_type == 'news':
            self.search_params.update({
                'tbm': 'nws',
                'source': 'lnms',
                'sa': 'X'
            })

    elif self.search_engine == 'yandex':
        self.parser = YandexParser()
        self.search_params['text'] = self.current_keyword
        self.search_params['p'] = start_search_position

        if self.search_type == 'image':
            self.base_search_url = 'http://yandex.ru/images/search?'

    elif self.search_engine == 'bing':
        self.parser = BingParser()
        self.search_params['q'] = self.current_keyword
        self.search_params['first'] = start_search_position

    elif self.search_engine == 'yahoo':
        self.parser = YahooParser()
        self.search_params['p'] = self.current_keyword
        self.search_params['b'] = start_search_position
        self.search_params['ei'] = 'UTF-8'

    elif self.search_engine == 'baidu':
        self.parser = BaiduParser()
        self.search_params['wd'] = self.current_keyword
        self.search_params['pn'] = start_search_position
        self.search_params['ie'] = 'utf-8'

    elif self.search_engine == 'duckduckgo':
        self.parser = DuckduckgoParser()
        self.search_params['q'] = self.current_keyword
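
# Illustration (hypothetical values, not part of build_search): for the google
# branch with num_results_per_page=10 and current_page=2, the params built
# above encode roughly as shown below. The base URL is an assumption -- the
# real one comes from the project's configuration -- and requests drops params
# whose value is None, so page 1 simply omits 'start'.
#
#   from urllib.parse import urlencode
#   params = {'q': 'keyword', 'num': '10', 'start': '10'}
#   print('https://www.google.com/search?' + urlencode(params))
#   # https://www.google.com/search?q=keyword&num=10&start=10
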
def _search(self, searchtype='normal'):
    """The actual search and parsing of the results.

    Private, internal method. Parsing is done with lxml and cssselect.
    The html structure of the Google Search results may change over time.
    Effective: February 2014

    There are several parts of a SERP results page the average user is most
    likely interested in (probably in this order):

    - Non-advertisement links, as well as their little snippet and title
    - The message that indicates how many results were found. For example:
      "About 834,000,000 results (0.39 seconds)"
    - Advertisement search results (links, titles, snippets like above)

    Problem: This data comes in a wide range of different formats, depending
    on the parameters set in the search. Investigations over the different
    formats are done in the directory tests/serp_formats.
    """
    self._build_query(searchtype)

    # After building the query, all parameters are set, so we know what we're requesting.
    logger.debug("Created new GoogleScrape object with searchparams={}".format(
        pprint.pformat(self.search_params)))

    html = get_cached(self.search_query,
                      Config['GLOBAL'].get('base_search_url'),
                      params=self.search_params)
    self.search_results['cache_file'] = os.path.join(
        Config['GLOBAL'].get('cachedir'),
        cached_file_name(self.search_query,
                         Config['GLOBAL'].get('base_search_url'),
                         self.search_params))

    if not html:
        try:
            r = self.requests.get(Config['GLOBAL'].get('base_search_url'),
                                  headers=self._HEADERS,
                                  params=self.search_params,
                                  timeout=3.0)

            logger.debug("Scraped with url: {} and User-Agent: {}".format(
                r.url, self._HEADERS['User-Agent']))

        except self.requests.ConnectionError as ce:
            logger.error('Network problem occurred {}'.format(ce))
            raise ce
        except self.requests.Timeout as te:
            logger.error('Connection timeout {}'.format(te))
            raise te

        if not r.ok:
            logger.error('HTTP Error: {}'.format(r.status_code))
            if str(r.status_code)[0] == '5':
                print('Maybe google recognizes you as sneaky spammer after'
                      ' you requested their services too inexhaustibly :D')
            return False

        html = r.text

        if Config['HTTP'].getboolean('view', False):
            self.browserview(html)

        # cache fresh results
        cache_results(html, self.search_query,
                      url=Config['GLOBAL'].get('base_search_url'),
                      params=self.search_params)
        self.search_results['cache_file'] = os.path.join(
            Config['GLOBAL'].get('cachedir'),
            cached_file_name(self.search_query,
                             Config['GLOBAL'].get('base_search_url'),
                             self.search_params))

    self.parser = GoogleParser(html, searchtype=self.searchtype)
    self.search_results.update(self.parser.all_results)
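
# Configuration sketch (the section and key names are exactly the ones _search()
# reads above; the example values are made up). Config appears to behave like a
# configparser-style mapping, so an INI file along these lines would satisfy the
# lookups for the base search URL, the cache directory and the browser view flag:
#
#   [GLOBAL]
#   base_search_url = http://www.google.com/search
#   cachedir = .scrapecache
#
#   [HTTP]
#   view = False
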