def save_html(self, url):
    """Fetch *url*, archive its visible text on disk and record the
    document in the ``documents`` table of ``self.cur_db``.

    Parameters
    ----------
    url : str
        Address of the page to fetch.

    Decoding errors are reported and skipped; connection errors are
    silently skipped. Duplicate documents (IntegrityError) are ignored.
    """
    req = Requester()
    try:
        res = req.get(url)
    except ContentDecodingError as e:
        print("Error at saving html: {0}\nURL: {1}".format(e.message, url))
        return
    except ConnectionError:
        return

    soup = BeautifulSoup(res.content, 'html.parser')
    raw_text = soup.get_text("\n", strip=True)
    # NOTE(review): despite the name this counts *lines*, not words --
    # kept as-is so stored figures stay comparable with existing rows.
    total_word = len(raw_text.split('\n'))

    # File name is derived from db name + url so re-saving the same page
    # overwrites the same file.
    hash_filename = hashlib.sha1(self.db_name + '//' + url).hexdigest()
    path = self.db_name + '/' + hash_filename + '.html'
    with codecs.open(path, 'w', encoding='utf8') as fp:
        fp.write(raw_text)

    cur_db_cursor = self.cur_db.cursor()
    try:
        # Parameterized statement instead of str.format: avoids SQL
        # injection and quoting bugs in the stored values.
        cur_db_cursor.execute(
            "insert into documents (url, total_word, path) values (?, ?, ?)",
            (quote(url), total_word, path))
        self.cur_db.commit()
    except sqlite3.OperationalError as e:
        print("Error: {0}\nURL: {1}\nTotal Words: {2}\nPath: {3}".format(
            e.message, url, total_word, hash_filename))
    except sqlite3.IntegrityError:
        # Document already recorded -- best-effort, ignore duplicates.
        pass
class Google(base.SourceBase):
    """Google search source.

    Scrapes the first two pages of a Google web search and collects the
    organic result links into a :class:`Result`.
    """

    def __init__(self):
        super(Google, self).__init__()
        self.web_requester = Requester()
        self.url = 'https://www.google.com/search?q={}&start={}'
        self._query = None

    @property
    def query(self):
        """Search phrase submitted to Google."""
        return self._query

    @query.setter
    def query(self, query):
        self._query = query

    def get_result(self):
        """Fetch up to two result pages and return the collected URLs."""
        result = Result('Google')
        for offset in (0, 10):
            page = self.web_requester.get(self.url.format(self.query, offset))
            parsed = BeautifulSoup(page.content, "html.parser")
            # No '#res' container means Google returned no result page.
            if not parsed.select('#res'):
                break
            anchors = parsed.select('.srg .rc .r a[onmousedown]')
            result.add_urls([a['href'] for a in anchors])
        return result
class Bing(base.SourceBase):
    """Bing search source.

    Scrapes the first two pages of a Bing web search and collects the
    result links into a :class:`Result`.
    """

    def __init__(self):
        super(Bing, self).__init__()
        self.web_requester = Requester()
        self.url = 'https://www.bing.com/search?q={}&first={}'
        self._query = None

    @property
    def query(self):
        """Search phrase submitted to Bing."""
        return self._query

    @query.setter
    def query(self, query):
        self._query = query

    def get_result(self):
        """Fetch two result pages and return the collected URLs."""
        result = Result('Bing')
        # Bing pages results via the 1-based 'first' parameter.
        for offset in (1, 11):
            page = self.web_requester.get(self.url.format(self.query, offset))
            parsed = BeautifulSoup(page.content, "html.parser")
            anchors = parsed.select('li.b_algo h2 a')
            result.add_urls([a['href'] for a in anchors])
        return result
def _compute_location_ambiguity(self, df):
    """Score the ambiguity of each 'Location' entity in *df*.

    For every distinct location name, queries the Google geocoding API
    (``self.google_api_url``) and writes the highest ambiguity score of
    the returned address types into the row's ``ambiguity`` column.

    Parameters
    ----------
    df : pandas.DataFrame
        Expected to contain 'e_type', 'entity' and 'ambiguity' columns.

    Returns
    -------
    pandas.DataFrame
        The same frame, mutated in place.

    Raises
    ------
    RuntimeError
        If the API answers with a status other than OK / ZERO_RESULTS.
    """
    location_df = df[df['e_type'] == 'Location']
    location_se = location_df.drop_duplicates()['entity']
    # .values replaces the deprecated Series.get_values() (removed in
    # modern pandas); iteration order is unchanged.
    for location_name in location_se.values:
        url = self.google_api_url.format(location_name.encode('utf-8'))
        req = Requester()
        res = req.get(url)
        api_result = json.loads(res.content)
        if api_result['status'] == 'ZERO_RESULTS':
            location_score = 0
        else:
            if api_result['status'] != 'OK':
                raise RuntimeError('{}: {}'.format(api_result['status'],
                                                   api_result['error_message']))
            location_types = api_result['results'][0]['address_components'][0]['types']
            if not isinstance(location_types, list) or len(location_types) == 0:
                continue
            location_score = max(
                self.__measure_location_ambiguity(location_type)
                for location_type in location_types)
        # Only the first row matching this entity gets the score, as in
        # the original code. .at replaces the removed DataFrame.set_value.
        location_index = location_se[location_se == location_name].index[0]
        df.at[location_index, 'ambiguity'] = location_score
    return df
def __init__(self):
    """Set up the Bing source: HTTP client, URL template, empty query."""
    super(Bing, self).__init__()
    self._query = None
    self.url = 'https://www.bing.com/search?q={}&first={}'
    self.web_requester = Requester()
def __init__(self):
    """Set up the Google source: HTTP client, URL template, empty query."""
    super(Google, self).__init__()
    self._query = None
    self.url = 'https://www.google.com/search?q={}&start={}'
    self.web_requester = Requester()
def __init__(self):
    """Set up the Pipl API source: HTTP client, URL template, empty query."""
    super(Pipl, self).__init__()
    self._query = None
    self.url = 'https://api.pipl.com/search/v4/?key=sample_key&no_sponsored=true&{}'
    self.web_requester = Requester()
class Pipl(base.SourceBase):
    """Pipl people-search source.

    Builds one or more Pipl API query strings from either a raw string
    or a dict of person attributes, then merges every API response into
    a single :class:`Person`.
    """

    def __init__(self):
        super(Pipl, self).__init__()
        self.web_requester = Requester()
        self.url = 'https://api.pipl.com/search/v4/?key=sample_key&no_sponsored=true&{}'
        self._query = None

    @property
    def query(self):
        """Either a raw query string or a list of built query strings."""
        return self._query

    @query.setter
    def query(self, query):
        if isinstance(query, str):
            self._query = query
        elif isinstance(query, dict):
            # Translate recognized dict keys into Pipl query parameters.
            parts = list()
            if 'FIRST_NAME' in query and 'LAST_NAME' in query:
                parts.append("first_name={}&last_name={}".format(
                    query['FIRST_NAME'], query['LAST_NAME']))
            if 'EMAIL' in query:
                parts.append("email={}".format(query['EMAIL']))
            if 'USERNAME' in query:
                parts.append("username={}".format(query['USERNAME']))
            self._query = parts

    def get_result(self):
        """Query the Pipl API for every built query and merge the answers.

        Raises
        ------
        TypeError
            If no query has been set.
        RuntimeError
            When the API reports its call limit was exceeded (HTTP 403).
        """
        if isinstance(self.query, str):
            queries = [self.query]
        elif isinstance(self.query, list):
            queries = self.query
        else:
            raise TypeError("A query is required")

        # Issue all requests up front, then scan the answers.
        responses = [self.web_requester.get(self.url.format(q))
                     for q in queries]

        person = Person()
        for response in responses:
            data = response.json()
            status = data['@http_status_code']
            if status == 403:
                raise RuntimeError('API calls limitation exceeds')
            if status != 200:
                continue
            if 'person' in data:
                person.parse_json(data['person'])
            if 'possible_persons' in data:
                for candidate in data['possible_persons']:
                    person.parse_json(candidate)
        return person