Ejemplo n.º 1
0
    def save_html(self, url):
        """Fetch *url*, store its visible text under a hashed filename,
        and record the document in the ``documents`` table.

        Decoding failures are logged and abort the save; connection
        errors are silently skipped (best-effort crawling).
        """
        req = Requester()
        try:
            res = req.get(url)
        except ContentDecodingError as e:
            # BUG FIX: exceptions have no ``.message`` attribute in Python 3;
            # formatting the exception itself is portable.
            print("Error at saving html: {0}\nURL: {1}".format(e, url))
            return
        except ConnectionError:
            # Unreachable host: skip this document entirely.
            return
        soup = BeautifulSoup(res.content, 'html.parser')
        raw_text = soup.get_text("\n", strip=True)
        # One entry per extracted text line; stored as total_word to match
        # the existing table schema.
        total_word = len(raw_text.split('\n'))

        # BUG FIX: hashlib requires bytes under Python 3, not str.
        hash_filename = hashlib.sha1(
            (self.db_name + '//' + url).encode('utf-8')).hexdigest()
        path = self.db_name + '/' + hash_filename + '.html'
        with codecs.open(path, 'w', encoding='utf8') as fp:
            fp.write(raw_text)
        cur_db_cursor = self.cur_db.cursor()
        try:
            # SECURITY FIX: the URL is untrusted input — use a parameterized
            # query instead of building SQL by string formatting.
            cur_db_cursor.execute(
                "insert into documents (url, total_word, path) values (?, ?, ?)",
                (quote(url), total_word, path))
            self.cur_db.commit()
        except sqlite3.OperationalError as e:
            print("Error: {0}\nURL: {1}\nTotal Words: {2}\nPath: {3}".format(
                e, url, total_word, hash_filename))
        except sqlite3.IntegrityError:
            # URL already recorded (unique constraint); nothing to do.
            pass
Ejemplo n.º 2
0
class Google(base.SourceBase):
    """Search source backed by Google's public result pages."""

    def __init__(self):
        super(Google, self).__init__()
        self.web_requester = Requester()
        self.url = 'https://www.google.com/search?q={}&start={}'
        self._query = None

    @property
    def query(self):
        """Current search term (None until assigned)."""
        return self._query

    @query.setter
    def query(self, query):
        self._query = query

    def get_result(self):
        """Scrape up to two result pages and collect their organic links."""
        result = Result('Google')
        for offset in range(0, 20, 10):
            page = self.web_requester.get(self.url.format(self.query, offset))
            soup = BeautifulSoup(page.content, "html.parser")
            # Stop paging once Google no longer renders a results container.
            if not soup.select('#res'):
                break
            anchors = soup.select('.srg .rc .r a[onmousedown]')
            result.add_urls([anchor['href'] for anchor in anchors])
        return result
Ejemplo n.º 3
0
class Bing(base.SourceBase):
    """Search source backed by Bing's public result pages."""

    def __init__(self):
        super(Bing, self).__init__()
        self.web_requester = Requester()
        self.url = 'https://www.bing.com/search?q={}&first={}'
        self._query = None

    @property
    def query(self):
        """Current search term (None until assigned)."""
        return self._query

    @query.setter
    def query(self, query):
        self._query = query

    def get_result(self):
        """Scrape the first two result pages and collect organic links."""
        result = Result('Bing')
        for first in range(1, 21, 10):
            page = self.web_requester.get(self.url.format(self.query, first))
            soup = BeautifulSoup(page.content, "html.parser")
            result.add_urls([a['href'] for a in soup.select('li.b_algo h2 a')])
        return result
Ejemplo n.º 4
0
 def _compute_location_ambiguity(self, df):
     """Score every 'Location' entity in *df* by how ambiguous its name is.

     Each distinct location is looked up through ``self.google_api_url``;
     the resulting score is written into the 'ambiguity' column of *df*,
     which is also returned.

     Raises RuntimeError on any API status other than OK / ZERO_RESULTS.
     """
     from urllib.parse import quote  # local import: URL-escape the place name

     location_df = df[df['e_type'] == 'Location']
     location_se = location_df.drop_duplicates()['entity']
     # BUG FIX: Series.get_values() was removed from pandas; .values is the
     # long-supported equivalent.
     for location_name in location_se.values:
         # BUG FIX: formatting ``str.encode('utf-8')`` into a URL yields a
         # literal "b'...'" under Python 3; percent-encode the name instead.
         url = self.google_api_url.format(quote(location_name))
         res = Requester().get(url)
         api_result = json.loads(res.content)
         if api_result['status'] == 'ZERO_RESULTS':
             location_score = 0
         else:
             if api_result['status'] != 'OK':
                 raise RuntimeError('{}: {}'.format(
                     api_result['status'], api_result['error_message']))
             location_types = api_result['results'][0]['address_components'][0]['types']
             if not isinstance(location_types, list) or len(location_types) == 0:
                 continue
             location_score = max(
                 self.__measure_location_ambiguity(location_type)
                 for location_type in location_types)
         location_index = location_se[location_se == location_name].index[0]
         # BUG FIX: DataFrame.set_value() was removed from pandas; .at is the
         # supported scalar setter.
         df.at[location_index, 'ambiguity'] = location_score
     return df
Ejemplo n.º 5
0
 def __init__(self):
     """Set up the Bing source: base state, requester, and paged search URL."""
     super(Bing, self).__init__()
     self._query = None
     self.url = 'https://www.bing.com/search?q={}&first={}'
     self.web_requester = Requester()
Ejemplo n.º 6
0
 def __init__(self):
     """Set up the Google source: base state, requester, and paged search URL."""
     super(Google, self).__init__()
     self._query = None
     self.url = 'https://www.google.com/search?q={}&start={}'
     self.web_requester = Requester()
Ejemplo n.º 7
0
 def __init__(self):
     """Set up the Pipl source: base state, requester, and API endpoint."""
     super(Pipl, self).__init__()
     self._query = None
     self.url = 'https://api.pipl.com/search/v4/?key=sample_key&no_sponsored=true&{}'
     self.web_requester = Requester()
Ejemplo n.º 8
0
class Pipl(base.SourceBase):
    """Person-search source backed by the Pipl v4 API."""

    def __init__(self):
        super(Pipl, self).__init__()
        self.web_requester = Requester()
        self.url = 'https://api.pipl.com/search/v4/?key=sample_key&no_sponsored=true&{}'
        self._query = None

    @property
    def query(self):
        """Either a raw query string or a list of encoded parameter sets."""
        return self._query

    @query.setter
    def query(self, query):
        if isinstance(query, str):
            self._query = query
        elif isinstance(query, dict):
            # Translate the attribute dict into one encoded query string per
            # supported search criterion.
            parts = []
            if 'FIRST_NAME' in query and 'LAST_NAME' in query:
                parts.append("first_name={}&last_name={}".format(
                    query['FIRST_NAME'], query['LAST_NAME']))
            if 'EMAIL' in query:
                parts.append("email={}".format(query['EMAIL']))
            if 'USERNAME' in query:
                parts.append("username={}".format(query['USERNAME']))
            self._query = parts

    def get_result(self):
        """Run each pending query against the API and merge every hit into
        a single Person object."""
        if isinstance(self.query, str):
            responses = [self.web_requester.get(self.url.format(self.query)), ]
        elif isinstance(self.query, list):
            responses = [self.web_requester.get(self.url.format(q))
                         for q in self.query]
        else:
            raise TypeError("A query is required")

        person = Person()
        for response in responses:
            data = response.json()
            status = data['@http_status_code']
            if status == 403:
                raise RuntimeError('API calls limitation exceeds')
            if status != 200:
                continue
            if 'person' in data:
                person.parse_json(data['person'])
            for possible_person in data.get('possible_persons', []):
                person.parse_json(possible_person)

        return person