コード例 #1
0
def get_main_stat(url):
    """Parse the main match statistics from a flashscore match page.

    Args:
        url: match page URL.

    Returns:
        dict with keys 'country', 'championate', 'round_num', 'date',
        'home_command', 'away_command', 'result_score', 'goal_minutes'.
        May be partially filled (or empty) when page elements are missing:
        errors are logged, not raised.
    """
    html = get_page_source(url)
    soup = BeautifulSoup(html, 'lxml')
    app_logger.info(f'Start parsing MAIN stat for {url}\n')
    main_stat = {}
    try:
        championate_info = soup.select(
            'span.description__country')[0].text
        # Header text looks like '<country>: <championate> - <round>';
        # split once instead of re-splitting for every field.
        country_parts = championate_info.split(':')
        main_stat['country'] = country_parts[0]
        champ_round = country_parts[1].split('-')
        main_stat['championate'] = champ_round[0].strip()
        main_stat['round_num'] = champ_round[1].strip()
        # Only the date token; the time-of-day part is dropped.
        main_stat['date'] = soup.select('div#utime')[0].text.split(' ')[0]
        main_stat['home_command'] = soup.select(
            'div.team-text.tname-home a.participant-imglink')[0].text
        main_stat['away_command'] = soup.select(
            'div.team-text.tname-away a.participant-imglink')[0].text
        main_stat['result_score'] = soup.select(
            'div#event_detail_current_result')[0].text.strip()
        detail_info = soup.select('div.detailMS')[0]
        main_stat['goal_minutes'] = get_goal_minutes(
            detail_info.encode_contents())
    except Exception:
        # Best-effort: keep whatever was collected before the failure.
        app_logger.exception(f'Error receiving main stat elements {url}')
    return main_stat
コード例 #2
0
def get_half_stat(url, half, command_id=None):
    """Parse one half's statistics rows from a match-statistics page.

    Args:
        url: half-statistics page URL.
        half: '1st_half' selects the first-half tab, any other value the
            second-half tab; also used as the key prefix.
        command_id: 0 (home) or 1 (away). When given, values are keyed
            relative to that command ('..._OWN' / '..._ENEMY'); when None,
            keys use '..._home' / '..._away'.

    Returns:
        dict mapping '{half}_{stat-title}_{suffix}' to int values.
    """
    html = get_page_source(url)
    soup = BeautifulSoup(html, 'lxml')
    app_logger.info(f'Start parsing HALF {half} stat for {url}\n')
    half_table = ('div#tab-statistics-1-statistic'
                  if half == '1st_half' else
                  'div#tab-statistics-2-statistic')
    stat_rows = soup.select(f'{half_table} div.statRow')
    half_stats = {}
    for stat_row in stat_rows:
        # Extraction is identical for both key schemes; do it once
        # (previously duplicated in both branches).
        title_value = normalize_value(stat_row.select(
            'div.statText.statText--titleValue')[0].text)
        home_value = normalize_value(stat_row.select(
            'div.statText.statText--homeValue')[0].text)
        away_value = normalize_value(stat_row.select(
            'div.statText.statText--awayValue')[0].text)
        if command_id is not None:
            values = [home_value, away_value]
            # command_id - 1 flips 0 <-> -1, i.e. selects the opponent.
            half_stats[f'{half}_{title_value}_OWN'] = int(values[command_id])
            half_stats[f'{half}_{title_value}_ENEMY'] = int(
                values[command_id - 1])
        else:
            half_stats[f'{half}_{title_value}_home'] = int(home_value)
            half_stats[f'{half}_{title_value}_away'] = int(away_value)
    app_logger.debug(f'Received HALF stat:\n{half_stats}\n')
    return half_stats
コード例 #3
0
def insert_stat(html):
    """Parse the odds tables (1x2, Asian handicap, over/under) from *html*,
    merge them with match info into one flat dict and insert it into the
    ng_odds table.

    On any parsing error the exception is logged and nothing is inserted.
    """
    try:
        soup = BeautifulSoup(html, 'lxml')
        # NOTE(review): selector 'oddsDetai' looks truncated ('oddsDetail'?)
        # — confirm against the actual page markup before changing.
        HDA_odds, AH_odds, OU_odds = soup.select('div#oddsDetai div')[0:3]
        HDA_stat = get_data_from_table(HDA_odds.select('tr'), '1x2')
        AH_stat = get_data_from_table(AH_odds.select('tr'))
        OU_stat = get_data_from_table(OU_odds.select('tr'))
        app_logger.debug('Received HDA, AH, OU statistics by minutes')
        summary_stats = {}
        summary_stats.update(select_pre_match_line(HDA_stat, '1x2'))
        summary_stats.update(select_pre_match_line(AH_stat, 'AH'))
        summary_stats.update(select_pre_match_line(OU_stat, 'OU'))
        app_logger.debug('Added prematch line move')
        # Plain loops instead of side-effect list comprehensions.
        for table, kind in ((HDA_stat, '1x2'), (AH_stat, 'AH'),
                            (OU_stat, 'OU')):
            for stat in select_stat_in_HT(table, kind):
                summary_stats.update(stat)
        summary_stats.update(get_match_info(soup))
        app_logger.info(
            f'Formed objects with stats cnt keys={len(summary_stats.keys())}')
    except Exception:
        app_logger.exception('\nError received stats from elements page')
        # summary_stats may be unbound or partial here; do not insert.
        return
    insert_into_ng_odds(summary_stats)
    app_logger.debug('Record values in table\n')
コード例 #4
0
def get_past_stat(url):
    """Collect aggregated past-match statistics for both teams of a match.

    Args:
        url: match page URL.

    Returns:
        dict merging the home ('HOME'-typed) and away ('AWAY'-typed)
        summary stats produced by get_summary_stat/add_type_command.
    """
    main_stat = get_main_stat(url)
    html = get_page_source(url)
    soup = BeautifulSoup(html, 'lxml')
    app_logger.info(f'Start collect PAST stat data from {url}\n')
    template = 'https://www.flashscore.com{}/results/'
    past_stat = {}
    # Identical pipeline for both sides; previously duplicated verbatim.
    for side, type_tag in (('home', 'HOME'), ('away', 'AWAY')):
        # The team results path is embedded in the onclick handler,
        # quoted with single quotes.
        command_url = template.format(
            soup.select(f'div.team-text.tname-{side} a.participant-imglink')
            [0].attrs['onclick'].split("'")[1])
        prev_events = find_previous_events(command_url, main_stat['date'])
        side_stat = add_type_command(
            get_summary_stat(prev_events, main_stat[f'{side}_command'],
                             main_stat['championate'], side), type_tag)
        past_stat.update(side_stat)
    app_logger.debug('Formed PAST STAT')
    return past_stat
コード例 #5
0
def get_html(url):
    """Fetch *url* with requests and return the response body.

    Returns:
        The response text when the status is OK (``r.ok``); None when the
        status is not OK or the request itself failed (failure is logged).
    """
    try:
        r = requests.get(url, headers={'User-Agent': USER_AGENT})
        app_logger.info(f'Received html {url} STATUS {r.status_code}\n')
    except Exception:
        app_logger.exception(f'Error receive html {url}\n')
        # 'r' is unbound on this path; falling through would raise
        # NameError at 'r.ok'.
        return None
    if r.ok:
        return r.text
コード例 #6
0
def get_page_source(url):
    """Load *url* in a selenium driver and return the rendered page source.

    Returns:
        The page HTML, or None when loading failed (failure is logged;
        the original code raised NameError on the unbound 'html').
        The driver is always quit, even on failure.
    """
    html = None
    driver = None
    try:
        driver = get_driver()
        driver.get(url)
        time.sleep(0.5)  # give dynamic content a moment to render
        html = driver.page_source
        app_logger.info(f'Received html {url}\n')
    except Exception:
        app_logger.exception(f'Error receive html {url}\n')
    finally:
        # Quit unconditionally so a failed driver.get does not leak
        # the browser process.
        if driver is not None:
            driver.quit()
    return html
コード例 #7
0
def run_multi_parse(urls, n_proc):
    """Parse *urls* in parallel with *n_proc* worker processes.

    Results are appended to a per-server CSV file by run_parse.

    Returns:
        dict summarising the run (process type, worker count, url count).
    """
    app_logger.info(f'Start multiprocess function urls - {len(urls)} num processes - {n_proc}')
    filename = f'stat_scraper/scripts/fs_foot_stat_{SERVER_NAME}.csv'
    func = partial(run_parse, filename)
    # Context manager guarantees pool teardown even if map() raises.
    with Pool(n_proc) as pool:
        pool.map(func, urls)
        pool.close()
        pool.join()
    return {
        'process_type': 'multiprocessing',
        'worker_amount': n_proc,
        'urls_count': len(urls)
    }
コード例 #8
0
def get_more_events(url, clicks=12):
    """Open *url* and click the "show more" button up to *clicks* times,
    then return the resulting page source.

    Args:
        url: team results page URL.
        clicks: maximum number of "show more" clicks (default 12).

    Returns:
        The page HTML after as many clicks as succeeded; clicking stops
        early (without raising) once the button can no longer be clicked.
    """
    driver = get_driver()
    try:
        driver.get(url)
        time.sleep(1)
        more_event_btn = driver.find_element_by_css_selector('a.event__more')
        more_event_btn.send_keys(Keys.END)  # scroll the button into view
        app_logger.info(f'Start CLICKING to show more btn on {url} page')
        for _ in range(clicks):
            try:
                time.sleep(1)
                more_event_btn.click()
            except Exception:
                # f-prefix was missing originally, so {url} never
                # interpolated into the log message.
                app_logger.exception(
                    f'Button show more events not found url {url}')
                break
        return driver.page_source
    finally:
        # Single teardown path; also quits when setup itself raises.
        driver.quit()
コード例 #9
0
def run_multi_parse(urls, n_proc):
    """Parse *urls* in parallel with *n_proc* worker processes (test run).

    Writes results to a per-process-count file under tests/ and records
    the run parameters in a time-tracking log.

    Returns:
        dict summarising the run (process type, worker count, url count).
    """
    app_logger.info(
        f'Start multiprocess function urls - {len(urls)} num processes - {n_proc}'
    )
    print(
        f'Start multiprocess function urls - {len(urls)} num processes - {n_proc}'
    )
    filename = f'tests/{n_proc}proc_result.txt'
    write_text_file('stat_scraper/logs/time_tracks/processed_1_url.txt',
                    f'amount processors{n_proc} amount url {len(urls)}\n')
    func = partial(run_parse, filename)
    # Context manager guarantees pool teardown even if map() raises.
    with Pool(n_proc) as pool:
        pool.map(func, urls)
        pool.close()
        pool.join()
    return {
        'process_type': 'multiprocessing',
        'worker_amount': n_proc,
        'urls_count': len(urls)
    }
コード例 #10
0
def calculate_stat(stats, slices=(3, 5, 10, 15, 20)):
    """Accumulate per-event stats and snapshot the running sums at the
    given slice sizes.

    Args:
        stats: sequence of dicts mapping stat name -> int-castable value,
            one dict per past event.
        slices: event counts at which to record a snapshot of the running
            sums (defaults to the previously hard-coded 3/5/10/15/20).

    Returns:
        dict keyed '{n}_last_{stat}' holding the sum of the first n
        events' values for every stat seen up to that point.
    """
    app_logger.info('Start CALCULATING collected past stats')
    slice_set = set(slices)  # O(1) membership test inside the loop
    sums = {}
    slice_stats = {}
    for count, stat in enumerate(stats, start=1):
        for key, value in stat.items():
            sums[key] = sums.get(key, 0) + int(value)
        if count in slice_set:
            for key, value in sums.items():
                slice_stats[f'{count}_last_{key}'] = value
    app_logger.debug(f'Formed slice stats: \n{slice_stats}\n')
    return slice_stats
コード例 #11
0
def find_previous_events(url, event_date):
    """Find result rows older than *event_date* on a team results page.

    Args:
        url: team results page URL (passed to get_more_events, which
            expands the list by clicking "show more").
        event_date: dotted date string — presumably the 'date' value
            produced by get_main_stat; verify against the caller.

    Returns:
        list of sibling 'event__match' elements following the row whose
        date matches *event_date*, or [] when no row matches.
    """
    html = get_more_events(url)
    soup = BeautifulSoup(html, 'lxml')
    app_logger.info(f'Start finding PREV events for {event_date}\n')
    stat_rows = soup.select('div.event__match')
    last_date = stat_rows[-1].select('div.event__time')[0].text.split(' ')[0]
    app_logger.info(f'LAST date in received stat rows = {last_date}\n')
    for stat_row in stat_rows:
        # Row time text split on space: a second token means the row
        # carries a time-of-day component alongside the date.
        date = stat_row.select('div.event__time')[0].text.split(' ')
        date_parts = date[0].split('.')
        # With a time component, compare on day+month digits only;
        # otherwise rejoin the dotted parts unchanged.
        date_format = date_parts[0] + \
            date_parts[1] if len(date) > 1 else '.'.join(date_parts)
        event_date_parts = event_date.split('.')
        # NOTE(review): this branch keys on len(date) — the *row's*
        # format, not the event date's own — confirm that is intended.
        event_date_format = event_date_parts[0] + \
            event_date_parts[1] if len(date) > 1 else event_date
        if date_format == event_date_format:
            # Everything after the matching row is an older event.
            prev_events = stat_row.find_next_siblings(
                name='div', attrs={'class': 'event__match'})
            app_logger.debug(
                f'Found previous {len(prev_events)} events earlier {event_date}\n'
            )
            return prev_events
    return []
コード例 #12
0
def main(champ_urls):
    """Collect event URLs for every championship URL and record them in db.

    Args:
        champ_urls: iterable of championship base URLs; 'results/' is
            appended to each before fetching.

    Errors for one championship are logged and do not stop the run.
    """
    count_records = 0
    for champ_url in tqdm(champ_urls):
        time.sleep(1)  # throttle requests between championships
        try:
            events_urls = normalize_list_urls(
                get_events_urls(champ_url + 'results/'))
            app_logger.info(f'Received {len(events_urls)} events urls')
            # Plain loop instead of a side-effect list comprehension.
            for event_url in events_urls:
                insert_into_events_urls(event_url)
            app_logger.info(f'Record in db {len(events_urls)} urls ')
            count_records += len(events_urls)
            app_logger.info(f'Total number of records = {count_records}\n')
        except Exception:
            app_logger.exception('\nreceive or record error')
コード例 #13
0
def get_summary_stat(stat_rows,
                     command,
                     championate,
                     position,
                     select_type='position'):
    """Build aggregated past-match statistics for *command*.

    Args:
        stat_rows: event row elements (e.g. from find_previous_events).
        command: team name; matched as a substring of the row's home
            team text to decide which side the command played.
        championate: championship name passed to rows_filter.
        position: 'home' or 'away', used when select_type == 'position'.
        select_type: 'position' pre-filters rows via find_position_events;
            any other value uses stat_rows as-is.

    Returns:
        The slice-sums dict produced by calculate_stat over the
        per-event stats; rows that fail to parse are logged and skipped.
    """
    app_logger.info(f'Start received SUMMARY stats for {command}\n')
    stat_rows = (find_position_events(stat_rows, command, position)
                 if select_type == 'position' else stat_rows)
    stat_rows = rows_filter(stat_rows, championate)
    app_logger.info(f'LEFT AFTER FILTER {len(stat_rows)} rows ')
    summary_stats = []
    for stat_row in stat_rows:
        event_stat = {}
        try:
            home_command = stat_row.select(
                'div.event__participant--home')[0].text.strip()
            away_command = stat_row.select(
                'div.event__participant--away')[0].text.strip()
            event_scores = stat_row.select('div.event__scores span')
            # First-half score: strip surrounding parens, split on '-'
            # into [home, away].
            first_half_scores = stat_row.select(
                'div.event__part')[0].text.strip('(').strip(')').split('-')
            # 0 = command played home, 1 = away. Substring match, so any
            # non-match defaults to away — TODO confirm intended.
            command_id = 0 if command in home_command else 1
            # command_id - 1 flips 0 <-> -1, selecting the opponent.
            event_stat['goals_scored'] = event_scores[command_id].text
            event_stat['goals_missed'] = event_scores[command_id - 1].text
            event_stat['1half_goals_scored'] = first_half_scores[command_id]
            event_stat['1half_goals_missed'] = first_half_scores[command_id -
                                                                 1]
            # Drop the 4-char prefix of the row id to get the event id
            # — presumably 'g_1_'; verify against page markup.
            event_id = stat_row['id'][4:]
            first_half_url = f'https://www.flashscore.com/match/{event_id}/#match-statistics;1'
            second_half_url = f'https://www.flashscore.com/match/{event_id}/#match-statistics;2'
            app_logger.info(
                f'DETAIL STAT {home_command} {event_scores} {away_command}')
            event_stat.update(
                get_half_stat(first_half_url, '1st_half', command_id))
            event_stat.update(
                get_half_stat(second_half_url, '2nd_half', command_id))
            summary_stats.append(event_stat)
        except Exception:
            app_logger.exception(
                f'\nError received data from stat row {command}')
        app_logger.debug(f'Formed event stats: \n{event_stat}\n')
    return (calculate_stat(summary_stats))