Ejemplo n.º 1
0
def load_fixture_from_fs(fl, forecast_days):
    """Scrape upcoming fixtures from flashscore.com for the next
    ``forecast_days`` days and hand each day's page source to
    scrape_fixture_from_fs for parsing/persisting.

    Args:
        fl: locator object exposing the chrome driver path, data path and
            fixture file name.
        forecast_days: number of days (starting today) to collect.
    """
    log.debug("Enter")
    chrome_path = fl.get_chrome_driver()
    driver = webdriver.Chrome(chrome_path)
    driver.get('https://www.flashscore.com/')
    ## random sleep durations to slow scraping down and vary the pattern
    random1 = random.randint(1, 21)
    random2 = random.randint(1, 11)
    random3 = random.randint(random1, random2 +
                             5) if random2 >= random1 else random.randint(
                                 random2, random1 + 5)
    fixture_file = os.path.join(fl.get_data_path(), fl.get_fixture_file())

    try:

        tabs = WebDriverWait(
            driver,
            60).until(lambda d: d.find_elements_by_class_name('tabs__text'))
        ## BUG FIX: 'link' was left unbound (NameError) when no 'Odds' tab
        ## was present; now we fail fast with a clear message instead.
        link = None
        for tab in tabs:
            if (tab.text == 'Odds'):
                link = tab
                break
        if (link is None):
            log.error("'Odds' tab not found on flashscore page")
            driver.close()
            return

        link.click()
        time.sleep(random1)
        calendar_nav = driver.find_elements_by_class_name(
            'calendar__direction')
        today = datetime.date.today()
        ## we will get the data for the next `forecast_days` days
        if (os.path.isfile(fixture_file)):
            os.remove(fixture_file)

        for day in range(forecast_days):
            for nav in calendar_nav:
                class_values = nav.get_attribute('class').split(' ')
                if (class_values[1].strip() == 'calendar__direction--tomorrow'
                    ):
                    nav.click()
                    time.sleep(random2)
                    tabs = WebDriverWait(
                        driver,
                        53).until(lambda d: d.find_element_by_class_name(
                            'event__header'))
                    time.sleep(random3)
                    html_source = driver.page_source
                    ## NOTE(review): 'tomorrow' is clicked before the first
                    ## scrape yet the label is today + day -- verify there is
                    ## no off-by-one in the assigned match date
                    match_date = today + datetime.timedelta(days=day)
                    scrape_fixture_from_fs(fl, html_source, match_date,
                                           fixture_file)

    except Exception:
        log.exception("Exception occurred while loading flashscore page")
        driver.close()
        exit()

    driver.close()
    log.debug("Exit")
Ejemplo n.º 2
0
def load_latest_matches_from_op(fl):
    """Load the most recent results page from oddsportal.com for every
    configured league and delegate parsing to scrape_latest_matches_from_op.

    Args:
        fl: locator object exposing the chrome driver path.
    """
    log.debug("Enter")
    chrome_path = fl.get_chrome_driver()
    driver = webdriver.Chrome(chrome_path)
    base_url = 'https://www.oddsportal.com/soccer/'
    driver.implicitly_wait(60)

    try:

        for key, div in nccprops.fs_country_league_to_div_map.items():
            ## skip leagues not flagged for loading from oddsportal
            if (nccprops.op_country_league_to_div_map_load.get(key, -1) == -1):
                continue
            ## retry states: 0 = first attempt, 1 = retry, 2 = done
            retry = 0
            while (retry in (0, 1)):
                try:
                    op_country, op_league = nccprops.op_country_league_to_div_map[key]
                    log.info('Loading [%s] [%s] -- Retry == %s' % (op_country, op_league, retry))
                    url_to_load = base_url + op_country + "/" + op_league + "/" + 'results/'
                    ## random sleeps to slow things down during scraping
                    random1 = random.randint(10, 21)
                    random2 = random.randint(10, 21)
                    random3 = random.randint(random1, random2 + 5) if random2 >= random1 else random.randint(random2, random1 + 5)
                    driver.get(url_to_load)
                    time.sleep(random1)
                    html_source = driver.page_source
                    season = nccprops.get_current_season(key[0])
                    scrape_latest_matches_from_op(fl, html_source, key[0], key[1].replace('/', '-'), season, div)
                    time.sleep(random3)
                    retry = 2

                except Exception:
                    ## BUG FIX: failures were swallowed silently; log them so
                    ## a league that never loads is visible in the logs.
                    log.exception('Failed loading [%s] -- Retry == %s' % (str(key), retry))
                    if (retry == 0):
                        retry = 1
                    else:
                        retry = 2
                    continue
    except Exception:
        ## BUG FIX: message previously said "flashscore" for an oddsportal page
        log.exception("Exception occurred while loading oddsportal page")
        driver.close()

    log.debug("Exit")
Ejemplo n.º 3
0
def scrape_latest_matches_from_op(fl, html_source, country, league, season, div):
    """Parse an oddsportal results page and write the matches found to
    <oddsportal_dir>/<COUNTRY>/<season>/<div>/<league>/latest.csv.

    Args:
        fl: locator object exposing the oddsportal output directory.
        html_source: raw HTML of the results page.
        country, league, season, div: identifiers stored with each match row.
    """
    log.debug("Enter")
    match_list = list()
    odds_portal_file_path = fl.get_oddsportal_dir()

    try:
        soup = bs(html_source, 'lxml')

        tournament_table = soup.find('table', class_='table-main')
        body_ = tournament_table.find('tbody')
        all_matches = body_.find_all('tr')

        match_date = ""
        for _class_ in all_matches:
            ## 'center' rows are date headers applying to the match rows
            ## that follow them
            if (_class_['class'][0] == 'center'):
                match_date = formatterutil.format_date(_class_.text.split('1X2')[0], 'op')

            if (match_date != ''):
                if ('deactivate' in _class_['class']):
                    odds_count = 0
                    FTHG, FTAG = 0, 0
                    B365H, B365D, B365A = 0, 0, 0
                    FTR = 'NA'
                    ## BUG FIX: these were left unbound when a cell failed to
                    ## parse, raising NameError while building the row dict
                    match_time = ''
                    HomeTeam, AwayTeam = None, None
                    for child in _class_.children:
                        if ('table-time' in child['class']):
                            match_time = formatterutil.format_time(child.text.strip(), 'op')
                        elif ('table-participant' in child['class']):
                            try:
                                split_team = child.text.split(' - ')
                                HomeTeam = split_team[0].strip()
                                AwayTeam = split_team[1].strip()
                            except Exception:
                                break
                        elif ('odds-nowrp' in child['class']):
                            ## odds cells appear in Home/Draw/Away order
                            try:
                                if (odds_count == 0):
                                    odds_count += 1
                                    B365H = round(float(child.text), 2)
                                elif (odds_count == 1):
                                    odds_count += 1
                                    B365D = round(float(child.text), 2)
                                else:
                                    odds_count += 1
                                    B365A = round(float(child.text), 2)
                            except Exception:
                                continue

                        elif ('table-score' in child['class']):
                            try:
                                split_score = child.text.split(':')
                                FTHG = int(split_score[0])
                                FTAG = int(split_score[1])
                                FTR = 'H' if FTHG > FTAG else ('D' if FTHG == FTAG else 'A')
                            except Exception:
                                break

                    ## BUG FIX: skip rows whose team names could not be parsed
                    ## instead of crashing out of the whole page
                    if (HomeTeam is None or AwayTeam is None):
                        continue

                    match_dic = {'Country': country, 'League': league, 'Div': div, 'Season': season,
                                   'Date': match_date, 'Time': match_time, 'HomeTeam': HomeTeam,  'AwayTeam': AwayTeam,
                                   'FTHG': FTHG, 'FTAG': FTAG, 'FTR': FTR,
                                   'B365H': B365H, 'B365D': B365D, 'B365A': B365A}

                    match_list.append(match_dic)

        latest_matches_df = pd.DataFrame(match_list)

        file_path = os.path.join(odds_portal_file_path, country.upper(), str(season), str(div), str(league))
        if not (os.path.isdir(file_path)):
            os.makedirs(file_path)
        latest_matches_df.to_csv(os.path.join(file_path, 'latest.csv'), mode='w+')

    except Exception:
        log.exception("Exception occurred while scraping matches from odds portal")

    log.debug("Exit")
Ejemplo n.º 4
0
def load_historical_matches_from_op(fl):
    """Scrape historical results for every configured league and season
    from oddsportal.com, paging through each season's result pages and
    writing rows into a per-league history.csv via
    scrape_historical_matches_from_op.

    Args:
        fl: locator object exposing the oddsportal output directory and
            the chrome driver path.
    """
    log.debug("Enter")
    odds_portal_file_path = fl.get_oddsportal_dir()
    chrome_path = fl.get_chrome_driver()
    driver = webdriver.Chrome(chrome_path)
    base_url = 'https://www.oddsportal.com/soccer/'
    driver.implicitly_wait(60)
    season_list = nccprops.season_list
    try:
        for key, div in nccprops.fs_country_league_to_div_map.items():
            ## skip leagues not flagged for loading from oddsportal
            if (nccprops.op_country_league_to_div_map_load.get(key, -1) == -1):
                continue
            retry = 0
            country = key[0].upper()
            league = key[1].replace('/', '-')
            op_country, op_league = nccprops.op_country_league_to_div_map[key]
            url_to_load = base_url + op_country + "/" + op_league + "/" + 'results/'
            ## these random sleep times slow things down and introduce random
            ## behavior during the scraping process
            random1 = random.randint(11, 19)
            random2 = random.randint(7, 17)
            random3 = random.randint(random1, random2 + 5) if random2 >= random1 else random.randint(random2, random1 + 5)
            driver.get(url_to_load)
            seasons = driver.find_elements_by_class_name('main-filter')[1]
            ## find out which season-label format the page uses:
            ## 'slash' -> e.g. "2019/2020", 'st' -> e.g. "2020"
            st_date = str(season_list[0])
            slash_date = str(int(st_date) - 1) + "/" + st_date
            try:
                date_format = 'NA'
                if (seasons.find_element_by_link_text(slash_date)):
                    date_format = 'slash'
                else:
                    date_format = 'NA'
            except Exception:
                try:
                    if (seasons.find_element_by_link_text(st_date)):
                        date_format = 'st'
                    else:
                        date_format = 'NA'
                except Exception:
                    log.error('Failed identifying date format for [%s] [%s]' % (op_country, op_league))
                    continue
            log.info('Loading [%s] [%s] Date Format is [%s] -- Retry == %s' % (op_country, op_league, date_format, retry))
            for season in season_list:
                file_path = os.path.join(odds_portal_file_path,country,str(season),str(div),league)
                if (date_format == 'slash'):
                    ## BUG FIX: coerce season to str so the concatenation
                    ## also works when season_list holds ints
                    page_season = str(int(season) - 1) + "/" + str(season)
                elif (date_format == 'st'):
                    page_season = str(season)
                elif (date_format == 'NA'):
                    log.error('Invalid date format for [%s] [%s]' % (op_country, op_league))
                    break
                try:
                    seasons = driver.find_elements_by_class_name('main-filter')[1]
                    time.sleep(random.randint(7,11))
                    season_link = seasons.find_element_by_link_text(page_season)
                    season_link.click()

                except Exception:
                    ## season not offered on this page; try the next one
                    continue

                ## create directories if required
                if not (os.path.isdir(file_path)):
                    os.makedirs(file_path)
                file_path = os.path.join(file_path,'history.csv')
                if (os.path.isfile(file_path)):
                    ## remove the file so that we can start all over
                    os.remove(file_path)

                count = 1
                ## oddsportal paginates results; walk up to 20 pages
                while count < 20:
                    log.info("-->Scraping Season [%s] Page [%s]" % (season, count))
                    time.sleep(random.randint(13,23))
                    html_source = driver.page_source
                    scrape_historical_matches_from_op(fl, html_source, file_path, country, league, div, season, count)
                    count += 1
                    try:
                        pages = driver.find_element_by_id('pagination')
                        time.sleep(random.randint(5,10))
                        page = pages.find_element_by_link_text(str(count))
                        time.sleep(random.randint(5,9))
                        page.click()
                        time.sleep(7)
                    except Exception:
                        ## no further pages
                        break
                time.sleep(random3)

        ## BUG FIX: removed the stray exit() that used to follow here -- it
        ## raised SystemExit (not caught by `except Exception`), killed the
        ## process even on success and made log.debug("Exit") unreachable.
        driver.close()
    except Exception:
        ## BUG FIX: message previously said "flashscore" for an oddsportal page
        log.exception("Exception occurred while loading oddsportal page")
        driver.close()
    log.debug("Exit")
Ejemplo n.º 5
0
def scrape_historical_matches_from_op(fl, html_source, file_path, country, league, div, season, count):
    """Parse one page of oddsportal historical results and write the rows
    to ``file_path`` (page 1 truncates the file; later pages append
    without a header).

    Args:
        fl: locator object (unused here; kept for interface parity).
        html_source: raw HTML of the results page.
        file_path: target history.csv path.
        country, league, div, season: identifiers stored with each row.
        count: 1-based page number.
    """
    log.debug("Enter")
    match_list = list()
    try:
        soup = bs(html_source, 'lxml')
        tournament_table = soup.find('table', class_='table-main')
        body_ = tournament_table.find('tbody')
        all_matches = body_.find_all('tr')
        match_date = ""
        for _class_ in all_matches:
            ## 'center' rows are date headers applying to the match rows
            ## that follow them
            if (_class_['class'][0] == 'center'):
                match_date = formatterutil.format_date(_class_.text.split('1X2')[0], 'op')

            if (match_date != ''):
                if ('deactivate' in _class_['class']):
                    odds_count = 0
                    FTHG, FTAG = 0, 0
                    B365H, B365D, B365A = 0, 0, 0
                    FTR = 'NA'
                    ## BUG FIX: these were left unbound when a cell failed to
                    ## parse, raising NameError while building the row dict
                    match_time = ''
                    HomeTeam, AwayTeam = None, None
                    for child in _class_.children:
                        if ('table-time' in child['class']):
                            match_time = formatterutil.format_time(child.text.strip(), 'op')
                        elif ('table-participant' in child['class']):
                            try:
                                split_team = child.text.split(' - ')
                                HomeTeam = split_team[0].strip()
                                AwayTeam = split_team[1].strip()
                            except Exception:
                                break
                        elif ('odds-nowrp' in child['class']):
                            ## odds cells appear in Home/Draw/Away order
                            try:
                                if (odds_count == 0):
                                    odds_count += 1
                                    B365H = round(float(child.text), 2)
                                elif (odds_count == 1):
                                    odds_count += 1
                                    B365D = round(float(child.text), 2)
                                else:
                                    odds_count += 1
                                    B365A = round(float(child.text), 2)
                            except Exception:
                                B365H = 0
                                B365D = 0
                                B365A = 0

                        elif ('table-score' in child['class']):
                            try:
                                split_score = child.text.split(':')
                                FTHG = int(split_score[0])
                                FTAG = int(split_score[1])
                                FTR = 'H' if FTHG > FTAG else ('D' if FTHG == FTAG else 'A')
                            except Exception:
                                break

                    ## BUG FIX: skip rows whose team names could not be parsed
                    ## instead of crashing out of the whole page
                    if (HomeTeam is None or AwayTeam is None):
                        continue

                    match_dic = {'Country': country, 'League': league, 'Div': div, 'Season': season,
                                   'Date': match_date, 'Time': match_time, 'HomeTeam': HomeTeam,  'AwayTeam': AwayTeam,
                                   'FTHG': FTHG, 'FTAG': FTAG, 'FTR': FTR,
                                   'B365H': B365H, 'B365D': B365D, 'B365A': B365A}

                    match_list.append(match_dic)

        history_matches_df = pd.DataFrame(match_list)
        ## first page overwrites; later pages append without the header row
        if (count == 1):
            history_matches_df.to_csv(file_path, mode='w+')
        else:
            history_matches_df.to_csv(file_path, mode='a+', header = False)

    except Exception:
        log.exception("Exception occurred while loading matches from odds portal for country %s and league %s" %(country, league))
        ## NOTE(review): exit() aborts the whole process on one bad page --
        ## consider returning instead; kept to preserve existing behavior
        exit()

    log.debug("Exit")
Ejemplo n.º 6
0
def load_league_standing_from_fs(fl, number_of_seasons):
    """Scrape league standings from flashscore.com for every configured
    (country, league), going back ``number_of_seasons`` seasons via the
    archive tab; each season page is handed to
    scrape_league_standing_from_fs.

    Args:
        fl: locator object exposing the chrome driver path.
        number_of_seasons: how many past seasons to collect per league.
    """
    log.debug("Enter")
    chrome_path = fl.get_chrome_driver()
    driver = webdriver.Chrome(chrome_path)
    driver.implicitly_wait(120)

    fs_country_leagues = nccprops.fs_country_league_to_div_map.keys()
    ## BUG FIX: removed unused local `count = 0`
    try:
        for country, league in fs_country_leagues:
            ## random sleep durations to slow scraping down
            random1 = random.randint(1, 21)
            random2 = random.randint(1, 11)
            random3 = random.randint(
                random1, random2 +
                5) if random2 >= random1 else random.randint(
                    random2, random1 + 5)

            time.sleep(random1)
            league = league.replace('/', '-')

            ## build the flashscore URL slug from the country/league names
            fs_league = league.replace('.', '-')
            fs_league = fs_league.replace(' -', '-').replace('- ', '-')
            fs_league = fs_league.replace(' ', '-').lower()
            fs_country = country.strip().replace(' ', '-')
            country_league_url = 'https://www.flashscore.com/football/' + fs_country.lower(
            ) + "/" + fs_league + "/"
            driver.get(country_league_url)
            seasons_count = 0
            archive_clicked = False
            log.debug("Loading standing for Country %s League %s - URL %s" %
                      (country, league, country_league_url))
            ## iterate twice the requested count since the archive list
            ## interleaves non-season elements between season links
            for i in range(number_of_seasons * 2):

                if (seasons_count > number_of_seasons):
                    break

                if not archive_clicked:
                    ## (re)open the archive tab after each navigation
                    archive_clicked = True
                    time.sleep(random.randint(1, 5))
                    archive = driver.find_element_by_id('li4')
                    archive.click()
                    seasons = driver.find_elements_by_class_name(
                        'leagueTable__seasonName')
                    time.sleep(random2)

                try:
                    if i == 0:
                        seasons[i].click()
                        seasons_count += 1
                        archive_clicked = False
                    else:
                        parent_element = seasons[i].find_element_by_xpath('..')
                        parent_name = parent_element.get_attribute(
                            'class').split(' ')
                        if (parent_name[0] == 'leagueTable__season'):
                            ## only follow links whose parent marks a season
                            ## row; other elements share the same class name
                            seasons[i].click()
                            seasons_count += 1
                            archive_clicked = False
                        else:
                            continue

                except Exception:
                    break

                time.sleep(random3)
                html_source = driver.page_source
                scrape_league_standing_from_fs(fl, html_source, country,
                                               league)

    except Exception:
        log.exception("Exception occurred. Exiting...")
        driver.close()
        exit()
    driver.close()

    log.debug("Exit")
Ejemplo n.º 7
0
def scrape_league_standing_from_fs(fl, html_source, country, league):
    """Parse a flashscore league-standing page and write one CSV per
    (country, league, season) into the league-standing directory.

    Args:
        fl: locator object exposing the league-standing directory.
        html_source: raw HTML of the standings page.
        country: country name as used in the nccprops maps.
        league: league name as used in the nccprops maps (may contain '/').
    """
    log.debug("Enter")
    league_standing_dir = fl.get_league_standing_dir()

    try:
        league_list = list()

        soup = bs(html_source, 'lxml')

        season_element = soup.find('div', class_='teamHeader__text')
        season = season_element.text
        ## normalise the season label (drop all whitespace)
        season = season.replace(" ", "").strip()
        div = nccprops.fs_country_league_to_div_map[(country, league)]
        if (nccprops.SEASON_MAP.get(season, -1) == -1):
            ## no mapping present for this season label; proceed with the
            ## raw label and assume it is already in the desired format
            pass
        else:
            season = nccprops.SEASON_MAP[season]

        ## NOTE(review): 'row___S6WkQ8-' looks like a build-generated class
        ## name that will break when flashscore redeploys -- confirm
        fs_league_table = soup.find_all('div', class_='row___S6WkQ8-')
        log.debug(
            "Scraping standing for Country %s League (formatted) %s Div %s Season %s"
            % (country, league, div, season))
        for team in fs_league_table:
            try:
                team_stats = list()
                ## cells are consumed positionally: 0 = rank, 1 = team name,
                ## 6 = "scored:conceded" goals, everything else verbatim
                for i, stat in enumerate(team.children):
                    try:
                        ## this is for the scored and conceded goals
                        if (i == 6):
                            goal_stat = stat.text.split(':')
                            team_stats.append(goal_stat[0])
                            team_stats.append(goal_stat[1])
                        elif (i == 0):
                            rank = stat.text.split('.')
                            team_stats.append(rank[0])
                        elif (i == 1):
                            ## NOTE(review): this rebinds the outer loop
                            ## variable 'team' to the team-name string --
                            ## works because the row element is no longer
                            ## needed, but is confusing; consider renaming
                            team = stat.text.strip()
                            team_alias = nccprops.get_fs_team_alias(
                                country.upper(), team)
                            team_stats.append(team)
                            team_stats.append(team_alias)
                        else:
                            team_stats.append(stat.text.strip())
                    except Exception:
                        continue

                league_list.append(team_stats)

            except Exception:
                continue

        log.debug("Loading league %s" % league_list)
        league = league.replace('/', '-')
        league_df = pd.DataFrame(league_list,
                                 columns=[
                                     'Rank', 'Team', 'Team_Alias',
                                     'MatchesPlayed', 'Win', 'Draw', 'Loss',
                                     'GoalsScored', 'GoalsConceded', 'Points',
                                     'Form'
                                 ])
        league_df['Country'] = country
        league_df['League'] = league
        league_df['Div'] = div
        league_df['Season'] = season
        ## note that the PK was created with league originally.
        formatterutil.remove_special_chars_from_df(league_df)
        league_df['PK'] = league_df['Country'].astype(str) + "__" + league_df[
            'Div'].astype(str) + "__" + league_df['Season'].astype(
                str) + "__" + league_df['Team_Alias'].astype(str)
        ## use the composite PK as the index
        league_df.set_index('PK', drop=True, inplace=True)
        try:
            ## write to file: <country>_<league>_<season><ext>.csv
            league_standing_file = os.path.join(
                league_standing_dir, country + "_" + league + "_" + season +
                nccprops.fileloc.LEGUE_STADNING_FILE_EXT + ".csv")
            league_df.to_csv(league_standing_file, mode='w+')

        except Exception:
            log.exception("Exception occurred while writing standings to db")
            exit()

    except Exception:
        log.exception(
            "Exception occurred while scrapping data from flash score")

    log.debug("Exit")
Ejemplo n.º 8
0
def scrape_fixture_from_fs(fl, html_source, match_date, fixture_file):
    """Parse the flashscore live-table page and append the fixtures found
    for ``match_date`` to ``fixture_file`` (CSV).

    Args:
        fl: locator object (unused here; kept for interface parity).
        html_source: raw HTML of the flashscore page.
        match_date: date assigned to every fixture scraped from this page.
        fixture_file: CSV path; appended to when it already exists.
    """
    log.debug("Enter")

    try:
        fixture_df = pd.DataFrame(columns=nccprops.Mandatory_Match_Features)
        fixture_list = []

        soup = bs(html_source, 'lxml')

        fs_live_table = soup.find('div', id='live-table')

        fs_soccer_matches = fs_live_table.find('div', class_='sportName')

        ## only proceed when the live table really contains soccer matches
        if (fs_soccer_matches['class'][1] == 'soccer'):
            Country = ''
            League = ''
            for child in fs_soccer_matches.children:

                ## header rows set Country/League for the match rows below them
                if (child['class'][0] == 'event__header'):
                    Country = child.find(
                        'span',
                        class_='event__title--type').text.strip().upper()
                    League = child.find(
                        'span', class_='event__title--name').text.strip()

                if (child['class'][0] == 'event__match'):
                    teams = child.find_all('div', class_='event__participant')
                    odds = child.find_all('div', class_='odds__odd')
                    match_time = child.find('div', class_='event__time')

                    if (match_time is None):
                        ## this match doesn't have a time, so it is possibly
                        ## in progress/completed/postponed etc. -- skip it
                        continue
                    else:
                        match_time = formatterutil.format_time(
                            match_time.contents[0].strip(), 'fs')

                    ## odds default to 0 whenever absent or unparseable
                    if (len(odds) > 0):
                        if (odds[0].text.strip()
                                not in ('', 'null', '-', None)):
                            try:
                                B365H = float(odds[0].text.strip())
                                B365D = float(odds[1].text.strip())
                                B365A = float(odds[2].text.strip())
                            except:
                                B365H = 0
                                B365D = 0
                                B365A = 0
                        else:
                            B365H = 0
                            B365D = 0
                            B365A = 0
                    else:
                        B365H = 0
                        B365D = 0
                        B365A = 0

                    if (len(teams) > 0):
                        HomeTeam = formatterutil.encode_string_to_utf(
                            teams[0].text)
                        AwayTeam = formatterutil.encode_string_to_utf(
                            teams[1].text)
                        HomeTeam_Alias = nccprops.get_fs_team_alias(
                            Country, HomeTeam.strip())
                        AwayTeam_Alias = nccprops.get_fs_team_alias(
                            Country, AwayTeam.strip())

                        ## feature from ncprops.py
                        ## Mandatory_Match_Features = ['Div', 'Date', 'Time', 'HomeTeam', 'AwayTeam', 'FTHG', 'FTAG', 'FTR', 'B365H', 'B365D', 'B365A']

                        ## now do a mapping for the division; fall back to the
                        ## league name itself when no mapping exists
                        try:
                            League = League.replace('/', '-')
                            League = formatterutil.encode_string_to_utf(League)
                            Division = nccprops.fs_country_league_to_div_map[(
                                Country, League)]
                        except Exception:
                            log.debug(
                                "Mapping information not found in fs_country_league_to_div_map for league %s from country %s"
                                % (League, Country))
                            Division = League

                        Season = str(nccprops.get_current_season(Country))

                        ## fixtures have no result yet: FTHG/FTAG are 0 and
                        ## FTR is 'Not Available' by construction
                        fixture_dic = {'Country':Country, 'League':League, 'Div':Division, 'Season':Season, 'Date':match_date, 'Time':match_time, 'HomeTeam':HomeTeam, \
                            'HomeTeam_Alias':HomeTeam_Alias, 'AwayTeam':AwayTeam, 'AwayTeam_Alias':AwayTeam_Alias, 'FTHG':0, 'FTAG':0, 'FTR':'Not Available', 'B365H':B365H,\
                                       'B365D':B365D, 'B365A':B365A, 'FS_League':League}
                        fixture_list.append(fixture_dic)

            ## NOTE(review): if fixture_list is empty this DataFrame has no
            ## columns and the 'Country' access below raises KeyError, which
            ## is absorbed by the outer except -- confirm this is intended
            fixture_df = pd.DataFrame(fixture_list)
            ## normalise and encode to utf
            fixture_df['Country'] = formatterutil.encode_numpy_to_utf(
                fixture_df['Country'])
            fixture_df['StatSource'] = 'fs_fixture'
            ## flag these rows as fixtures (vs played matches)
            fixture_df['fixture'] = 1

            try:
                ## append to the fixture file, writing the header only when
                ## the file does not exist yet
                if (os.path.isfile(fixture_file)):
                    fixture_df.to_csv(fixture_file, mode='a', header=False)
                else:
                    fixture_df.to_csv(fixture_file)

            except PermissionError:
                log.warning("PermissionError: Couldnt write to csv file.")
                return
            except Exception:
                log.warning(
                    "Error occurred while writing to fixture csv file.")
                return

    except Exception:
        log.exception(
            "Exception occurred while scrapping data from flash score")

    log.debug("Exit")