Beispiel #1
0
    def __init__(self):
        """Set up one crawler window: thread state, Chrome driver, DB handles.

        Takes the next sequential id from the class-wide ``CrawlWindow.count``,
        launches a Chrome instance configured with a randomly chosen user
        agent, and stores the connection/cursor pair returned by
        ``connect_to_gcp()`` on the instance.
        """
        threading.Thread.__init__(self)

        # The class-level counter doubles as the source of per-window ids.
        CrawlWindow.count += 1
        self.id = CrawlWindow.count

        chrome_options = Options()
        # NOTE(review): Chrome's switch is usually written
        # "--window-size=width,height"; confirm the "x" separator is honoured.
        chrome_options.add_argument("--window-size=1024x768")
        # chrome_options.add_argument("--headless")
        chrome_options.add_argument('log-level=3')
        # Read ua.random exactly once: a second read would presumably draw a
        # different agent, so the logged value would not match the one
        # actually handed to Chrome.
        agent = ua.random
        print('Window {0} using user agent {1}'.format(self.id, agent))
        chrome_options.add_argument('user-agent={0}'.format(agent))
        self.driver = webdriver.Chrome(options=chrome_options)

        self.conn, self.cur = connect_to_gcp()
        print('Window {0} successfully connected to DB'.format(self.id))
Beispiel #2
0
#!/usr/bin/env python3

import gcp
from bs4 import BeautifulSoup

# Module-level configuration and a shared database handle, set up at import
# time (importing or running this file opens the connection immediately).
version = 12  # increment version to go through pages we are uncertain about again
# NOTE(review): presumably the number of rows handled per round trip; its use
# is outside this excerpt -- confirm.
batch_size = 20

# connect_to_gcp() returns a pair; judging by the names and the message below
# these are a DB connection and cursor -- verify against the gcp module.
conn, cur = gcp.connect_to_gcp()

print('Connected to DB')


def handle_featured_snippet(snippet):
    short_answer_div = snippet.find('div', attrs={'data-tts': 'answers'})
    short_answer = None
    if short_answer_div:
        short_answer = short_answer_div.get_text()
        short_answer_div.parent.decompose(
        )  # make it easier to find long answer
    long_div = snippet.find('div', attrs={'role': 'heading'})
    if long_div and long_div.span:
        long_answer = long_div.span.get_text()
        return 'feat_snip', short_answer, long_answer
    else:
        ol = snippet.find('ol')
        ul = snippet.find('ul')
        if ol and not ol.has_attr('role'):  # see 6916 and 4143239
            long_list = [x.get_text() for x in ol.find_all('li')]
            return 'rich_list', short_answer, str(long_list)
        elif ul: