def get_main_stat(url):
    """Parse the main match statistics from a flashscore match page.

    Extracts country, championship, round, date, team names, final score and
    goal minutes. Returns a dict; on a parsing error the dict may be empty or
    partially filled (the error is logged with traceback).
    """
    html = get_page_source(url)
    soup = BeautifulSoup(html, 'lxml')
    app_logger.info(f'Start parsing MAIN stat for {url}\n')
    main_stat = {}
    try:
        # Header text, presumably like "COUNTRY: League Name - Round N"
        championate_info = soup.select('span.description__country')[0].text
        main_stat['country'] = championate_info.split(':')[0]
        # Split once instead of re-splitting the same string per field.
        league_part = championate_info.split(':')[1].split('-')
        main_stat['championate'] = league_part[0].strip()
        main_stat['round_num'] = league_part[1].strip()
        main_stat['date'] = soup.select('div#utime')[0].text.split(' ')[0]
        main_stat['home_command'] = soup.select(
            'div.team-text.tname-home a.participant-imglink')[0].text
        main_stat['away_command'] = soup.select(
            'div.team-text.tname-away a.participant-imglink')[0].text
        main_stat['result_score'] = soup.select(
            'div#event_detail_current_result')[0].text.strip()
        detail_info = soup.select('div.detailMS')[0]
        main_stat['goal_minutes'] = get_goal_minutes(
            detail_info.encode_contents())
    except Exception:
        app_logger.exception(f'Error receiving main stat elements {url}')
    return main_stat
def insert_stat(html):
    """Parse the 1x2 / Asian handicap / over-under odds tables from *html*
    and insert the combined per-minute stats into the ng_odds table.

    On any parsing error the exception is logged and nothing is inserted.
    """
    try:
        soup = BeautifulSoup(html, 'lxml')
        # NOTE(review): selector 'div#oddsDetai' looks truncated
        # ('oddsDetail'?) — confirm against the actual page markup.
        HDA_odds, AH_odds, OU_odds = soup.select('div#oddsDetai div')[0:3]
        HDA_stat = get_data_from_table(HDA_odds.select('tr'), '1x2')
        AH_stat = get_data_from_table(AH_odds.select('tr'))
        OU_stat = get_data_from_table(OU_odds.select('tr'))
        app_logger.debug('Received HDA, AH, OU statistics by minutes')
        summary_stats = {}
        summary_stats.update(select_pre_match_line(HDA_stat, '1x2'))
        summary_stats.update(select_pre_match_line(AH_stat, 'AH'))
        summary_stats.update(select_pre_match_line(OU_stat, 'OU'))
        app_logger.debug('Added prematch line move')
        # Plain loops instead of side-effect list comprehensions.
        for stat in select_stat_in_HT(HDA_stat, '1x2'):
            summary_stats.update(stat)
        for stat in select_stat_in_HT(AH_stat, 'AH'):
            summary_stats.update(stat)
        for stat in select_stat_in_HT(OU_stat, 'OU'):
            summary_stats.update(stat)
        summary_stats.update(get_match_info(soup))
        app_logger.info(
            f'Formed objects with stats cnt keys={len(summary_stats.keys())}')
    except Exception:
        app_logger.exception('\nError received stats from elements page')
        # Bug fix: the original fell through and hit a NameError on
        # summary_stats when parsing failed.
        return
    insert_into_ng_odds(summary_stats)
    app_logger.debug('Record values in table\n')
def get_html(url):
    """GET *url* with the module USER_AGENT; return the body text on a 2xx/3xx
    response, otherwise (or on a request error) return None.
    """
    try:
        r = requests.get(url, headers={'User-Agent': USER_AGENT})
        app_logger.info(f'Received html {url} STATUS {r.status_code}\n')
    except Exception:
        app_logger.exception(f'Error receive html {url}\n')
        # Bug fix: the original continued to `r.ok` with `r` unbound,
        # raising NameError instead of returning None.
        return None
    if r.ok:
        return r.text
def get_html(url):
    """GET *url* with a random Chrome user agent; return the body text on
    success, None on a non-ok status (logged).
    """
    user_agent = UserAgent().chrome
    r = requests.get(url, headers={'User-Agent': user_agent})
    if r.ok:
        app_logger.debug(f'Received html page {url} code = {r.status_code}')
        return r.text
    else:
        # Bug fix: logger.exception outside an except block logs a bogus
        # "NoneType: None" traceback — use error here. Leftover debug
        # print(r.ok) removed.
        app_logger.error(f'Error getting html page {url} {r.status_code}')
def run_parse(filename, url):
    """Collect live and past stats for *url* and append them to *filename*.

    Best-effort: if scraping fails, the url is remembered in failed_urls.txt
    and whatever was collected so far is still normalized and written.
    """
    collected = {}
    try:
        collected.update(get_live_stat(url))
        collected.update(get_past_stat(url))
    except Exception:
        write_text_file('stat_scraper/urls/failed_urls.txt', url)
        app_logger.exception(f'ERROR RUN PARSE ON URL {url}')
    normalized = normalize_data(collected)
    write_csv(filename, normalized, normalized.keys())
def get_page_source(url):
    """Load *url* in a selenium driver and return the rendered page source.

    Returns None on failure (logged). The driver is always quit.
    """
    html = None
    driver = None
    try:
        driver = get_driver()
        driver.get(url)
        time.sleep(0.5)  # give the page a moment to render dynamic content
        html = driver.page_source
        app_logger.info(f'Received html {url}\n')
    except Exception:
        app_logger.exception(f'Error receive html {url}\n')
    finally:
        # Bug fix: the original leaked the browser on failure and then hit a
        # NameError returning an unbound `html`.
        if driver is not None:
            driver.quit()
    return html
def run_parse(filename, url):
    """Collect live and past stats for *url*, timing the work, and append the
    elapsed seconds (rounded to 4 places) to the time-tracking file.

    Scraping errors are logged; the elapsed time is recorded either way.
    """
    summary_stat = {}
    # Bug fix: timestamps were taken inside the try, so a scraping failure
    # left ended_at unbound and crashed with NameError below.
    started_at = time.time()
    try:
        summary_stat.update(get_live_stat(url))
        summary_stat.update(get_past_stat(url))
    except Exception:
        app_logger.exception(f'ERROR RUN PARSE ON URL {url}')
    ended_at = time.time()
    processed_time = round(ended_at - started_at, 4)
    write_text_file('stat_scraper/logs/time_tracks/processed_1_url.txt',
                    f'{processed_time}\n')
def main(champ_urls):
    """For every championship url, scrape its results-page event urls and
    record them in the events_urls table, keeping a running total.

    Errors per championship are logged and the loop continues.
    """
    count_records = 0
    for champ_url in tqdm(champ_urls):
        time.sleep(1)  # be polite to the server between championships
        try:
            events_urls = normalize_list_urls(
                get_events_urls(champ_url + 'results/'))
            app_logger.info(f'Received {len(events_urls)} events urls')
            # Plain loop instead of a side-effect list comprehension.
            for event_url in events_urls:
                insert_into_events_urls(event_url)
            app_logger.info(f'Record in db {len(events_urls)} urls ')
            count_records += len(events_urls)
            app_logger.info(f'Total number of records = {count_records}\n')
        except Exception:
            app_logger.exception('\nreceive or record error')
def rows_filter(stat_rows, championate, limit=15):
    """Keep only the event rows that belong to *championate*, up to *limit*.

    For each row the match page is fetched to read its championship name.
    Rows whose page cannot be parsed are skipped (logged).
    """
    filtered_rows = []
    for stat_row in stat_rows:
        if len(filtered_rows) == limit:
            return filtered_rows
        try:
            # Row ids look like "<4-char prefix><event id>".
            event_id = stat_row['id'][4:]
            url = 'https://www.flashscore.com/match/' + event_id
            soup = BeautifulSoup(get_html(url), 'lxml')
            elem_champ = soup.select('span.description__country')[
                0].text.split(':')[1].split('-')[0].strip()
        except Exception:
            app_logger.exception('Error RECEIVNING INFO FOR ROWS FILTER!!!')
            # Bug fix: without this, elem_champ was unbound (first row) or
            # stale from the previous iteration, crashing or mis-filtering.
            continue
        if elem_champ == championate:
            filtered_rows.append(stat_row)
    return filtered_rows
def get_more_events(url, clicks=12):
    """Open *url*, click the "show more events" button up to *clicks* times
    and return the resulting page source.

    If the button disappears (no more events to load) clicking stops early
    and whatever has been loaded is returned.
    """
    driver = get_driver()
    driver.get(url)
    time.sleep(1)
    more_event_btn = driver.find_element_by_css_selector('a.event__more')
    more_event_btn.send_keys(Keys.END)  # scroll the button into view
    app_logger.info(f'Start CLICKING to show more btn on {url} page')
    for _ in range(clicks):
        try:
            time.sleep(1)  # let the newly loaded events render
            more_event_btn.click()
        except Exception:
            # Bug fix: the message was missing its f prefix, so "{url}" was
            # logged literally.
            app_logger.exception(f'Button show more events not found url {url}')
            # Same early-exit behavior as before, without duplicating the
            # page_source/quit/return tail.
            break
    html = driver.page_source
    driver.quit()
    return html
def get_summary_stat(stat_rows, command, championate, position, select_type='position'):
    """Build aggregated summary statistics for team *command* from past event rows.

    Rows are optionally narrowed to the team's table *position* (when
    select_type == 'position'), then filtered to *championate* via
    rows_filter. For each remaining row the goals for/against (full match and
    first half) are read from the row markup and per-half detail stats are
    fetched from the match-statistics pages. Rows that fail to parse are
    logged and skipped. Returns calculate_stat(...) over the collected
    per-event dicts.
    """
    app_logger.info(f'Start received SUMMARY stats for {command}\n')
    stat_rows = (find_position_events(stat_rows, command, position)
                 if select_type == 'position' else stat_rows)
    stat_rows = rows_filter(stat_rows, championate)
    app_logger.info(f'LEFT AFTER FILTER {len(stat_rows)} rows ')
    summary_stats = []
    for stat_row in stat_rows:
        event_stat = {}
        try:
            home_command = stat_row.select(
                'div.event__participant--home')[0].text.strip()
            away_command = stat_row.select(
                'div.event__participant--away')[0].text.strip()
            event_scores = stat_row.select('div.event__scores span')
            # First-half score rendered like "(1-0)" — strip parens, split.
            first_half_scores = stat_row.select(
                'div.event__part')[0].text.strip('(').strip(')').split('-')
            # 0 = команда plays at home, 1 = away.
            command_id = 0 if command in home_command else 1
            event_stat['goals_scored'] = event_scores[command_id].text
            # command_id - 1 picks the opponent: for 0 it wraps to -1 (last of
            # the two entries), for 1 it is 0.
            event_stat['goals_missed'] = event_scores[command_id - 1].text
            event_stat['1half_goals_scored'] = first_half_scores[command_id]
            event_stat['1half_goals_missed'] = first_half_scores[command_id - 1]
            # Row ids look like "<4-char prefix><event id>".
            event_id = stat_row['id'][4:]
            first_half_url = f'https://www.flashscore.com/match/{event_id}/#match-statistics;1'
            second_half_url = f'https://www.flashscore.com/match/{event_id}/#match-statistics;2'
            app_logger.info(
                f'DETAIL STAT {home_command} {event_scores} {away_command}')
            event_stat.update(
                get_half_stat(first_half_url, '1st_half', command_id))
            event_stat.update(
                get_half_stat(second_half_url, '2nd_half', command_id))
            summary_stats.append(event_stat)
        except Exception:
            app_logger.exception(
                f'\nError received data from stat row {command}')
        app_logger.debug(f'Formed event stats: \n{event_stat}\n')
    return (calculate_stat(summary_stats))
def make_file_champ_urls(country_urls, amount_seasons=4):
    """For each country url, open its archive page and write the urls of the
    most recent seasons' championships to the url file.

    Takes the first amount_seasons + 1 entries (presumably the current season
    plus *amount_seasons* archived ones — confirm against the page layout).
    """
    for url in tqdm(country_urls):
        archive_url = url + 'archive/'
        # Bug fix: the original did `driver = get_driver` (no call), assigning
        # the factory function itself, so driver.get crashed.
        driver = get_driver()
        try:
            driver.get(archive_url)
            time.sleep(1)
            champs_by_years = driver.find_elements_by_css_selector(
                'div.leagueTable__season div.leagueTable__seasonName')
            for i, champ in enumerate(champs_by_years[:amount_seasons + 1]):
                champ_text = champ.find_element_by_css_selector('a').text
                season = champ_text.split(' ')[1]
                country = driver.find_element_by_css_selector(
                    'h2.tournament').text.split('\n')[1]
                try:
                    champ_url = champ.find_element_by_css_selector(
                        'a').get_attribute('href')
                    app_logger.debug(
                        f'received url - {champ_url} by {country} {season}')
                    write_url_in_file(champ_url)
                except Exception:
                    app_logger.exception(
                        '\nError getting or writing in file element')
        finally:
            # Fix a resource leak: the original never quit the driver.
            driver.quit()
def get_data_from_table(trs, type_odds=None):
    """Extract per-minute odds rows from a table's <tr> elements.

    The first two rows (header rows) are skipped. For '1x2' tables the middle
    column is the draw odds; otherwise it is the handicap/total value. Rows
    whose cells cannot be read are logged and skipped. Returns a list of
    dicts with keys: min_match, score, home_odds, draw_odds|value, away_odds,
    status.
    """
    result = []
    for tr in trs[2:]:
        try:
            tds = tr.select('td')
            min_match = tds[0].text
            score = tds[1].text
            home_odds = tds[2].text
            draw_or_value = tds[3].text
            away_odds = tds[4].text
            # tds[5] is intentionally skipped, matching the original layout.
            status = tds[6].text
        except Exception:
            app_logger.exception('Error received html element')
            # Bug fix: the original still appended after a failure, using
            # unbound (first row) or stale (later rows) cell values.
            continue
        variable_name = 'draw_odds' if type_odds == '1x2' else 'value'
        result.append({
            'min_match': min_match,
            'score': score,
            'home_odds': home_odds,
            variable_name: draw_or_value,
            'away_odds': away_odds,
            'status': status,
        })
    return result