def get_main_stat(url):
    """Collect the headline statistics for a single match page.

    Returns a dict with country/championate/round, date, both team
    names, final score and goal minutes.  On any parsing failure the
    error is logged and the dict is returned as-is (possibly partial).
    """
    soup = BeautifulSoup(get_page_source(url), 'lxml')
    app_logger.info(f'Start parsing MAIN stat for {url}\n')
    main_stat = {}

    def first_text(selector):
        # Text of the first element matching the CSS selector.
        return soup.select(selector)[0].text

    try:
        championate_info = first_text('span.description__country')
        # The description reads "<country>: <championate> - <round>".
        champ_part = championate_info.split(':')[1]
        main_stat['country'] = championate_info.split(':')[0]
        main_stat['championate'] = champ_part.split('-')[0].strip()
        main_stat['round_num'] = champ_part.split('-')[1].strip()
        main_stat['date'] = first_text('div#utime').split(' ')[0]
        main_stat['home_command'] = first_text(
            'div.team-text.tname-home a.participant-imglink')
        main_stat['away_command'] = first_text(
            'div.team-text.tname-away a.participant-imglink')
        main_stat['result_score'] = first_text(
            'div#event_detail_current_result').strip()
        detail_block = soup.select('div.detailMS')[0]
        main_stat['goal_minutes'] = get_goal_minutes(
            detail_block.encode_contents())
    except Exception:
        app_logger.exception(f'Error receiving main stat elements {url}')
    return main_stat
def get_half_stat(url, half, command_id=None):
    """Parse the per-half statistic rows of a match page.

    Args:
        url: match statistics page URL.
        half: '1st_half' selects the first-half table, anything else
            selects the second-half table.
        command_id: 0 (home) or 1 (away).  When given, keys are suffixed
            _OWN/_ENEMY relative to that team; otherwise _home/_away.

    Returns a dict mapping '{half}_{stat_name}_{suffix}' to int values.
    """
    html = get_page_source(url)
    soup = BeautifulSoup(html, 'lxml')
    app_logger.info(f'Start parsing HALF {half} stat for {url}\n')
    half_table = ('div#tab-statistics-1-statistic'
                  if half == '1st_half' else 'div#tab-statistics-2-statistic')
    stat_rows = soup.select(f'{half_table} div.statRow')
    half_stats = {}
    for stat_row in stat_rows:
        # The extraction is identical for both keying schemes — the
        # original duplicated these nine lines in each branch.
        title_value = normalize_value(stat_row.select(
            'div.statText.statText--titleValue')[0].text)
        home_value = normalize_value(stat_row.select(
            'div.statText.statText--homeValue')[0].text)
        away_value = normalize_value(stat_row.select(
            'div.statText.statText--awayValue')[0].text)
        if command_id is not None:
            values = [home_value, away_value]
            # command_id - 1 wraps to the opponent's value (0 -> -1 -> away).
            half_stats[f'{half}_{title_value}_OWN'] = int(values[command_id])
            half_stats[f'{half}_{title_value}_ENEMY'] = int(
                values[command_id - 1])
        else:
            half_stats[f'{half}_{title_value}_home'] = int(home_value)
            half_stats[f'{half}_{title_value}_away'] = int(away_value)
    app_logger.debug(f'Received HALF stat:\n{half_stats}\n')
    return half_stats
def insert_stat(html):
    """Parse 1x2 / Asian-handicap / over-under odds tables from a match
    page and insert the combined stats into the ng_odds table.

    On any parsing error the function logs and returns without inserting
    (the original fell through and raised NameError on `summary_stats`).
    """
    summary_stats = {}
    try:
        soup = BeautifulSoup(html, 'lxml')
        # NOTE(review): selector 'div#oddsDetai' looks truncated
        # ('oddsDetail'?) but may match the live markup — confirm before
        # changing.
        HDA_odds, AH_odds, OU_odds = soup.select('div#oddsDetai div')[0:3]
        HDA_stat = get_data_from_table(HDA_odds.select('tr'), '1x2')
        AH_stat = get_data_from_table(AH_odds.select('tr'))
        OU_stat = get_data_from_table(OU_odds.select('tr'))
        app_logger.debug('Received HDA, AH, OU statistics by minutes')
        summary_stats.update(select_pre_match_line(HDA_stat, '1x2'))
        summary_stats.update(select_pre_match_line(AH_stat, 'AH'))
        summary_stats.update(select_pre_match_line(OU_stat, 'OU'))
        app_logger.debug('Added prematch line move')
        # Plain loops instead of side-effect list comprehensions.
        for stat in select_stat_in_HT(HDA_stat, '1x2'):
            summary_stats.update(stat)
        for stat in select_stat_in_HT(AH_stat, 'AH'):
            summary_stats.update(stat)
        for stat in select_stat_in_HT(OU_stat, 'OU'):
            summary_stats.update(stat)
        summary_stats.update(get_match_info(soup))
        app_logger.info(
            f'Formed objects with stats cnt keys={len(summary_stats.keys())}')
    except Exception:
        app_logger.exception('\nError received stats from elements page')
        return  # bug fix: do not insert partial/undefined stats on failure
    insert_into_ng_odds(summary_stats)
    app_logger.debug('Record values in table\n')
def get_past_stat(url):
    """Build past-performance statistics for both teams of the event
    found at *url* and return them merged into one dict, with home-team
    keys tagged 'HOME' and away-team keys tagged 'AWAY'.
    """
    main_stat = get_main_stat(url)
    soup = BeautifulSoup(get_page_source(url), 'lxml')
    app_logger.info(f'Start collect PAST stat data from {url}\n')
    template = 'https://www.flashscore.com{}/results/'
    # The team results path is embedded in each link's onclick attribute.
    home_link = soup.select(
        'div.team-text.tname-home a.participant-imglink')[0]
    away_link = soup.select(
        'div.team-text.tname-away a.participant-imglink')[0]
    home_command_url = template.format(
        home_link.attrs['onclick'].split("'")[1])
    away_command_url = template.format(
        away_link.attrs['onclick'].split("'")[1])
    home_prev_events = find_previous_events(home_command_url,
                                            main_stat['date'])
    away_prev_events = find_previous_events(away_command_url,
                                            main_stat['date'])
    home_past_stat = add_type_command(
        get_summary_stat(home_prev_events, main_stat['home_command'],
                         main_stat['championate'], 'home'), 'HOME')
    away_past_stat = add_type_command(
        get_summary_stat(away_prev_events, main_stat['away_command'],
                         main_stat['championate'], 'away'), 'AWAY')
    past_stat = {**home_past_stat, **away_past_stat}
    app_logger.debug('Formed PAST STAT')
    return past_stat
def get_html(url):
    """Fetch *url* with requests and return the body text on success.

    Returns None when the request raises or the response status is not
    OK.  (Bug fix: the original referenced `r.ok` after an exception,
    raising NameError when the request itself failed.)
    """
    try:
        r = requests.get(url, headers={'User-Agent': USER_AGENT})
        app_logger.info(f'Received html {url} STATUS {r.status_code}\n')
    except Exception:
        app_logger.exception(f'Error receive html {url}\n')
        return None
    if r.ok:
        return r.text
    return None
def get_page_source(url):
    """Load *url* in a Selenium driver and return the rendered page
    source, or None when the page could not be loaded.

    Bug fixes vs. original: `html` was undefined at the return when the
    driver failed (NameError), and the driver was never quit on error
    (browser process leak).
    """
    html = None
    driver = None
    try:
        driver = get_driver()
        driver.get(url)
        time.sleep(0.5)  # let dynamic content settle — TODO: explicit wait
        html = driver.page_source
        app_logger.info(f'Received html {url}\n')
    except Exception:
        app_logger.exception(f'Error receive html {url}\n')
    finally:
        if driver is not None:
            driver.quit()  # always release the browser, even on failure
    return html
def run_multi_parse(urls, n_proc):
    """Parse *urls* in parallel with *n_proc* worker processes, writing
    results to the per-server CSV file.

    NOTE(review): a second `run_multi_parse` defined later in this file
    shadows this one at import time — confirm which is intended.
    """
    app_logger.info(f'Start multiprocess function urls - {len(urls)} num processes - {n_proc}')
    filename = f'stat_scraper/scripts/fs_foot_stat_{SERVER_NAME}.csv'
    worker = partial(run_parse, filename)
    pool = Pool(n_proc)
    pool.map(worker, urls)
    pool.close()
    pool.join()
    summary = {
        'process_type': 'multiprocessing',
        'worker_amount': n_proc,
        'urls_count': len(urls),
    }
    return summary
def get_more_events(url, clicks=12):
    """Open *url*, click the "show more" button up to *clicks* times and
    return the resulting page source.

    If the button disappears or a click fails, the page source collected
    so far is returned immediately.
    """
    driver = get_driver()
    driver.get(url)
    time.sleep(1)
    more_event_btn = driver.find_element_by_css_selector('a.event__more')
    more_event_btn.send_keys(Keys.END)  # scroll so the button is reachable
    app_logger.info(f'Start CLICKING to show more btn on {url} page')
    for i in range(clicks):
        try:
            time.sleep(1)  # give the newly loaded rows time to render
            more_event_btn.click()
        except Exception:
            # Bug fix: the original message was missing the f-prefix and
            # logged the literal text '{url}'.
            app_logger.exception(f'Button show more events not found url {url}')
            html = driver.page_source
            driver.quit()
            return html
    html = driver.page_source
    driver.quit()
    return html
def run_multi_parse(urls, n_proc):
    """Benchmark variant: parse *urls* with *n_proc* processes, writing
    results to a test file and recording run metadata to a timing log.

    NOTE(review): this redefines an earlier `run_multi_parse` in this
    file; once the module is loaded only this one is callable — confirm
    the duplication is intentional.
    """
    start_msg = (
        f'Start multiprocess function urls - {len(urls)} num processes - {n_proc}'
    )
    app_logger.info(start_msg)
    print(start_msg)
    filename = f'tests/{n_proc}proc_result.txt'
    write_text_file('stat_scraper/logs/time_tracks/processed_1_url.txt',
                    f'amount processors{n_proc} amount url {len(urls)}\n')
    worker = partial(run_parse, filename)
    pool = Pool(n_proc)
    pool.map(worker, urls)
    pool.close()
    pool.join()
    return {
        'process_type': 'multiprocessing',
        'worker_amount': n_proc,
        'urls_count': len(urls),
    }
def calculate_stat(stats):
    """Aggregate a chronological list of per-event stat dicts into
    cumulative totals over the last 3/5/10/15/20 events.

    Args:
        stats: list of dicts whose values are int-convertible.

    Returns a flat dict keyed '{slice_size}_last_{stat_name}'.
    """
    app_logger.info('Start CALCULATING collected past stats')
    slices = {3, 5, 10, 15, 20}  # set: O(1) membership test in the loop
    sums = {}
    slice_stats = {}
    for processed, stat in enumerate(stats, start=1):
        for key, value in stat.items():
            sums[key] = sums.get(key, 0) + int(value)
        if processed in slices:
            # Snapshot the running totals for this slice size (plain
            # loop instead of a side-effect list comprehension).
            for key, value in sums.items():
                slice_stats[f'{processed}_last_{key}'] = value
    app_logger.debug(f'Formed slice stats: \n{slice_stats}\n')
    return slice_stats
def find_previous_events(url, event_date):
    """Return the event rows that occurred before *event_date* on the
    team's results page at *url*, or [] when no row matches the date.

    event_date: date string, presumably 'dd.mm.yyyy' or 'dd.mm.' as shown
    on the results page — TODO confirm against get_main_stat['date'].
    """
    html = get_more_events(url)
    soup = BeautifulSoup(html, 'lxml')
    app_logger.info(f'Start finding PREV events for {event_date}\n')
    stat_rows = soup.select('div.event__match')
    # First token of the last row's time cell, logged for debugging.
    last_date = stat_rows[-1].select('div.event__time')[0].text.split(' ')[0]
    app_logger.info(f'LAST date in received stat rows = {last_date}\n')
    for stat_row in stat_rows:
        # Time cell may be "dd.mm. hh:mm" (two tokens) or a bare date.
        date = stat_row.select('div.event__time')[0].text.split(' ')
        date_parts = date[0].split('.')
        # When the cell has a time component, compare on 'ddmm' only;
        # otherwise compare the full dotted date.
        date_format = date_parts[0] + \
            date_parts[1] if len(date) > 1 else '.'.join(date_parts)
        event_date_parts = event_date.split('.')
        # NOTE(review): this reuses len(date) — the ROW's token count —
        # to pick the EVENT date's format; looks intentional (both sides
        # must use the same format to compare) but verify.
        event_date_format = event_date_parts[0] + \
            event_date_parts[1] if len(date) > 1 else event_date
        if date_format == event_date_format:
            # Everything after the matching row is an earlier event.
            prev_events = stat_row.find_next_siblings(
                name='div', attrs={'class': 'event__match'})
            app_logger.debug(
                f'Found previous {len(prev_events)} events earlier {event_date}\n'
            )
            return prev_events
    return []
def main(champ_urls):
    """Collect event URLs from each championship results page and store
    them in the events_urls table, logging a running record count.

    Errors for one championship are logged and do not stop the others.
    """
    count_records = 0
    for champ_url in tqdm(champ_urls):
        time.sleep(1)  # be polite to the server between championships
        try:
            events_urls = normalize_list_urls(
                get_events_urls(champ_url + 'results/'))
            app_logger.info(f'Received {len(events_urls)} events urls')
            # Plain loop: the original used a list comprehension purely
            # for its insert side effect.
            for event_url in events_urls:
                insert_into_events_urls(event_url)
            app_logger.info(f'Record in db {len(events_urls)} urls ')
            count_records += len(events_urls)
            app_logger.info(f'Total number of records = {count_records}\n')
        except Exception:
            app_logger.exception('\nreceive or record error')
def get_summary_stat(stat_rows, command, championate, position,
                     select_type='position'):
    """Collect per-event stats for *command* from past event rows and
    return the cumulative slice aggregation (see calculate_stat).

    Args:
        stat_rows: event row elements from the team's results page.
        command: team name as displayed on the page.
        championate: championship name used to filter rows.
        position: 'home' or 'away'.
        select_type: 'position' filters rows to events where the team
            played at *position*; any other value keeps all rows.
    """
    app_logger.info(f'Start received SUMMARY stats for {command}\n')
    stat_rows = (find_position_events(stat_rows, command, position)
                 if select_type == 'position' else stat_rows)
    stat_rows = rows_filter(stat_rows, championate)
    app_logger.info(f'LEFT AFTER FILTER {len(stat_rows)} rows ')
    summary_stats = []
    # Bug fix: defined before the loop so the debug log below cannot
    # raise NameError when no rows survive the filter.
    event_stat = {}
    for stat_row in stat_rows:
        event_stat = {}
        try:
            home_command = stat_row.select(
                'div.event__participant--home')[0].text.strip()
            away_command = stat_row.select(
                'div.event__participant--away')[0].text.strip()
            event_scores = stat_row.select('div.event__scores span')
            first_half_scores = stat_row.select(
                'div.event__part')[0].text.strip('(').strip(')').split('-')
            # 0 = home, 1 = away; substring match — assumes *command* is
            # contained in the displayed home-team name (TODO confirm).
            command_id = 0 if command in home_command else 1
            # command_id - 1 wraps to the opponent's index (0 -> -1).
            event_stat['goals_scored'] = event_scores[command_id].text
            event_stat['goals_missed'] = event_scores[command_id - 1].text
            event_stat['1half_goals_scored'] = first_half_scores[command_id]
            event_stat['1half_goals_missed'] = first_half_scores[command_id -
                                                                 1]
            event_id = stat_row['id'][4:]
            first_half_url = f'https://www.flashscore.com/match/{event_id}/#match-statistics;1'
            second_half_url = f'https://www.flashscore.com/match/{event_id}/#match-statistics;2'
            app_logger.info(
                f'DETAIL STAT {home_command} {event_scores} {away_command}')
            event_stat.update(
                get_half_stat(first_half_url, '1st_half', command_id))
            event_stat.update(
                get_half_stat(second_half_url, '2nd_half', command_id))
            summary_stats.append(event_stat)
        except Exception:
            app_logger.exception(
                f'\nError received data from stat row {command}')
    app_logger.debug(f'Formed event stats: \n{event_stat}\n')
    return calculate_stat(summary_stats)