# assumed context: `logger`, OFFSET_RANGE, HLTV_BASE_URL, ENVIRONMENT,
# DB_CREDENTIALS and the transcription/DB helpers are defined earlier
# in the module
import random
import time

from selenium import webdriver

driver = webdriver.Chrome(executable_path='/usr/local/bin/chromedriver')

# iteratively load website / raw table data, 100 results per page
for offset in range(OFFSET_RANGE[0], OFFSET_RANGE[1], 100):
    driver.get(HLTV_BASE_URL + str(offset))
    driver.execute_script("window.scrollTo(0, document.body.scrollHeight);")

    # transcribe raw html to condensed tabular data
    headers = driver.find_elements_by_class_name('standard-headline')
    header_text = [header.text for header in headers]
    match_time = calc_average_header_date(header_text)
    table = driver.find_elements_by_class_name('result')
    result_text = [row.text for row in table]
    match_data = transcribe_table_data(result_text, match_time)
    logger.info('Finished processing %s rows for an offset of %s.',
                len(table), offset)

    # insert to db
    if ENVIRONMENT == 'PRODUCTION' and len(table) > 0:
        logger.info('Upserting %s rows into database.', len(table))
        postgres_db_upsert(match_data, DB_CREDENTIALS)
    elif len(table) == 0:
        logger.warning('HLTV data scrape produced 0 data points.')
    else:
        # log the transcribed rows rather than the raw WebElement objects
        logger.info('Produced data: %s', match_data)

    # sleep to not spam the website between page loads
    time.sleep(random.uniform(1, 3))
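# The upsert helper itself lives elsewhere in the repo; below is a minimal
# sketch of what postgres_db_upsert could look like with psycopg2. The
# `matches` table, its columns, and the conflict key are assumptions for
# illustration, not the project's actual schema.
import psycopg2

def postgres_db_upsert(rows, credentials):
    """Insert match rows, refreshing score columns on duplicate keys."""
    conn = psycopg2.connect(**credentials)  # credentials: dict of host/dbname/user/password
    try:
        with conn, conn.cursor() as cur:
            cur.executemany(
                """
                INSERT INTO matches (team_one, team_two, match_time, score_one, score_two)
                VALUES (%s, %s, %s, %s, %s)
                ON CONFLICT (team_one, team_two, match_time)
                DO UPDATE SET score_one = EXCLUDED.score_one,
                              score_two = EXCLUDED.score_two;
                """,
                rows,  # iterable of 5-tuples matching the column order above
            )
    finally:
        conn.close()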
import time

from selenium import webdriver

# assumed context: `logger`, RIVALRY_URL, ENVIRONMENT, DB_CREDENTIALS and the
# transcription/DB helpers are defined earlier in the module

if __name__ == '__main__':
    logger.info('Starting scrape job for rivalry table data.')

    # initialize headless selenium webdriver
    chrome_options = webdriver.ChromeOptions()
    chrome_options.add_argument('--headless')
    chrome_options.add_argument('--no-sandbox')
    chrome_options.add_argument('--disable-dev-shm-usage')
    driver = webdriver.Chrome(chrome_options=chrome_options)

    # load website / raw table data
    driver.get(RIVALRY_URL)
    time.sleep(5)  # give the webpage time to load the table
    driver.execute_script("window.scrollTo(0, document.body.scrollHeight);")  # scroll down to load dynamic content
    time.sleep(1)
    table = driver.find_element_by_id('__nuxt')
    table = table.text.split('\n')
    table = transcribe_table_data(table)
    logger.info('Finished processing %s rows.', len(table))

    # insert to db
    if ENVIRONMENT == 'PRODUCTION' and len(table) > 0:
        logger.info('Inserting %s rows into database.', len(table))
        postgres_db_insert(table, DB_CREDENTIALS)
    elif len(table) == 0:
        logger.warning('Rivalry data scrape produced 0 data points.')
    else:
        logger.info('Produced data: %s', table)
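# The fixed time.sleep calls work, but an explicit wait is usually more robust
# against slow page loads. Here is a sketch of the same load step using
# Selenium's WebDriverWait; the 15-second timeout is an arbitrary choice, and
# '__nuxt' is the element id already used above.
from selenium.webdriver.common.by import By
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.support.ui import WebDriverWait

driver.get(RIVALRY_URL)
# block until the Nuxt root node is present in the DOM (or raise TimeoutException)
table_root = WebDriverWait(driver, 15).until(
    EC.presence_of_element_located((By.ID, '__nuxt'))
)
table = table_root.text.split('\n')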
# assumed context: a configured `driver`, plus `logger`, GGBET_URL,
# ENVIRONMENT, DB_CREDENTIALS and the parsing/DB helpers defined earlier
# in the module
import time

from bs4 import BeautifulSoup

# load website
driver.get(GGBET_URL)
time.sleep(5)  # give the webpage time to load the table
driver.execute_script("window.scrollTo(0, document.body.scrollHeight);")  # scroll down to load dynamic content
time.sleep(1)

# transcribe data table
table = driver.find_element_by_id('betting__container').text
soup = BeautifulSoup(table, 'html.parser')
table_text = remove_header(soup.text)
table_text = insert_row_breaks(table_text)
table_rows = table_text.split('_ROW_BREAK_')
formatted_data = transcribe_table_data(table_rows)[1:]  # skip leading non-data row
if len(formatted_data) == 1:
    logger.info('Finished processing %s row', len(formatted_data))
else:
    logger.info('Finished processing %s rows', len(formatted_data))

# insert to db
if ENVIRONMENT == 'PRODUCTION':
    if len(formatted_data) > 0:
        logger.info('Inserting %s rows into database', len(formatted_data))
        postgres_db_insert(formatted_data, DB_CREDENTIALS)
    else:
        logger.warning('GGBET data scrape produced 0 data points')
elif ENVIRONMENT == 'DEVELOPMENT':
    logger.info('Produced data: %s', formatted_data)
else:  # assumed fallback for any other environment value
    logger.warning('Unrecognized ENVIRONMENT %s; skipping database insert.', ENVIRONMENT)
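# As with the other scrapers, postgres_db_insert is defined elsewhere; below
# is a minimal sketch using psycopg2's execute_values, which sends all rows
# in a single statement. The `odds` table and its columns are placeholders,
# not the project's real schema.
import psycopg2
from psycopg2.extras import execute_values

def postgres_db_insert(rows, credentials):
    """Batch-insert scraped rows into Postgres in one round trip."""
    conn = psycopg2.connect(**credentials)
    try:
        with conn, conn.cursor() as cur:
            execute_values(
                cur,
                'INSERT INTO odds (team_one, team_two, odds_one, odds_two) VALUES %s',
                rows,  # iterable of 4-tuples matching the column order above
            )
    finally:
        conn.close()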