def __init__(self):
    """Initialize one crawler window.

    Starts the Thread base class, assigns a unique sequential window id,
    launches a Chrome WebDriver configured with a random user agent, and
    opens a connection to the GCP database.
    """
    threading.Thread.__init__(self)
    # Class-level counter doubles as a unique id generator for windows.
    CrawlWindow.count += 1
    self.id = CrawlWindow.count
    chrome_options = Options()
    chrome_options.add_argument("--window-size=1024x768")
    # chrome_options.add_argument("--headless")  # deliberate toggle: uncomment for headless runs
    chrome_options.add_argument('log-level=3')  # quiet Chrome's console output
    # BUG FIX: ua.random produces a *different* random agent on every access,
    # so the original code printed one agent but configured Chrome with
    # another. Capture it once and reuse the same value for both.
    agent = ua.random
    print('Window {0} using user agent {1}'.format(self.id, agent))
    chrome_options.add_argument('user-agent={0}'.format(agent))
    self.driver = webdriver.Chrome(options=chrome_options)
    # NOTE(review): presumably connect_to_gcp() returns (connection, cursor) —
    # consistent with how it is unpacked here; verify against its definition.
    self.conn, self.cur = connect_to_gcp()
    print('Window {0} successfully connected to DB'.format(self.id))
#!/usr/bin/env python3 import gcp from bs4 import BeautifulSoup version = 12 # increment version to go through pages we are uncertain about again batch_size = 20 conn, cur = gcp.connect_to_gcp() print('Connected to DB') def handle_featured_snippet(snippet): short_answer_div = snippet.find('div', attrs={'data-tts': 'answers'}) short_answer = None if short_answer_div: short_answer = short_answer_div.get_text() short_answer_div.parent.decompose( ) # make it easier to find long answer long_div = snippet.find('div', attrs={'role': 'heading'}) if long_div and long_div.span: long_answer = long_div.span.get_text() return 'feat_snip', short_answer, long_answer else: ol = snippet.find('ol') ul = snippet.find('ul') if ol and not ol.has_attr('role'): # see 6916 and 4143239 long_list = [x.get_text() for x in ol.find_all('li')] return 'rich_list', short_answer, str(long_list) elif ul: