def load_fixture_from_fs(fl, forecast_days):
    """Scrape upcoming fixtures (Odds tab) from flashscore.com for the next
    ``forecast_days`` days and append each day's matches to the fixture file.

    fl            -- locator object supplying the chrome driver path, data
                     directory and fixture file name
    forecast_days -- number of consecutive days to scrape

    On any scraping error the exception is logged, the browser is closed and
    the process exits (original behavior, preserved).
    """
    log.debug("Enter")
    chrome_path = fl.get_chrome_driver()
    driver = webdriver.Chrome(chrome_path)
    try:
        # BUGFIX: driver.get() was outside the original try block, so a
        # navigation failure leaked the Chrome process. Everything now runs
        # under try/finally so the driver is always closed.
        driver.get('https://www.flashscore.com/')
        # Random sleep intervals mimic human behavior during scraping.
        random1 = random.randint(1, 21)
        random2 = random.randint(1, 11)
        random3 = random.randint(random1, random2 + 5) if random2 >= random1 \
            else random.randint(random2, random1 + 5)
        fixture_file = os.path.join(fl.get_data_path(), fl.get_fixture_file())
        tabs = WebDriverWait(driver, 60).until(
            lambda d: d.find_elements_by_class_name('tabs__text'))
        # Find and open the 'Odds' tab (NameError here, i.e. no such tab,
        # is handled by the except below).
        for tab in tabs:
            if tab.text == 'Odds':
                link = tab
        link.click()
        time.sleep(random1)
        calendar_nav = driver.find_elements_by_class_name('calendar__direction')
        today = datetime.date.today()
        ## we will get the data for the next 7 days
        # Start fresh: remove any previous fixture file before appending.
        if os.path.isfile(fixture_file):
            os.remove(fixture_file)
        for day in range(forecast_days):
            # Advance the calendar by one day via the 'tomorrow' arrow.
            for nav in calendar_nav:
                class_values = nav.get_attribute('class').split(' ')
                if class_values[1].strip() == 'calendar__direction--tomorrow':
                    nav.click()
            time.sleep(random2)
            # Wait until at least one league header is rendered.
            tabs = WebDriverWait(driver, 53).until(
                lambda d: d.find_element_by_class_name('event__header'))
            time.sleep(random3)
            html_source = driver.page_source
            # NOTE(review): the page has already been advanced to tomorrow
            # before day == 0 is scraped, yet the label is today + day.
            # Preserved from the original -- confirm intended offset.
            match_date = today + datetime.timedelta(days=day)
            scrape_fixture_from_fs(fl, html_source, match_date, fixture_file)
    except Exception:
        log.exception("Exception occurred while loading flashscore page")
        exit()
    finally:
        # Runs on success, on exception and on the SystemExit from exit().
        driver.close()
    log.debug("Exit")
def load_latest_matches_from_op(fl):
    """Scrape the latest results page from oddsportal.com for every league
    flagged for loading and hand the HTML to scrape_latest_matches_from_op.

    fl -- locator object supplying the chrome driver path and data dirs.
    Each league is attempted at most twice (one retry) before moving on.
    """
    log.debug("Enter")
    chrome_path = fl.get_chrome_driver()
    driver = webdriver.Chrome(chrome_path)
    base_url = 'https://www.oddsportal.com/soccer/'
    driver.implicitly_wait(60)
    try:
        for key, div in nccprops.fs_country_league_to_div_map.items():
            # Skip leagues not flagged for loading from oddsportal.
            if nccprops.op_country_league_to_div_map_load.get(key, -1) == -1:
                continue
            retry = 0
            # retry: 0 = first attempt, 1 = retrying once, 2 = done/give up.
            while retry in (0, 1):
                try:
                    op_country, op_league = nccprops.op_country_league_to_div_map[key]
                    log.info('Loading [%s] [%s] -- Retry == %s'
                             % (op_country, op_league, retry))
                    url_to_load = base_url + op_country + "/" + op_league + "/" + 'results/'
                    # Random sleep intervals mimic human behavior during scraping.
                    random1 = random.randint(10, 21)
                    random2 = random.randint(10, 21)
                    random3 = random.randint(random1, random2 + 5) if random2 >= random1 \
                        else random.randint(random2, random1 + 5)
                    driver.get(url_to_load)
                    time.sleep(random1)
                    html_source = driver.page_source
                    season = nccprops.get_current_season(key[0])
                    scrape_latest_matches_from_op(
                        fl, html_source, key[0], key[1].replace('/', '-'),
                        season, div)
                    time.sleep(random3)
                    retry = 2  # success: leave the retry loop
                except Exception:
                    # First failure -> retry once; second failure -> give up.
                    retry = 1 if retry == 0 else 2
                    continue
    except Exception:
        log.exception("Exception occurred while loading flashscore page")
    finally:
        # BUGFIX: close the browser on every path, not only on failure.
        driver.close()
    log.debug("Exit")
def scrape_latest_matches_from_op(fl, html_source, country, league, season, div):
    """Parse an oddsportal 'results' page and write finished matches to
    <oddsportal_dir>/<COUNTRY>/<season>/<div>/<league>/latest.csv.

    html_source -- raw HTML of the results page
    country, league, season, div -- identifiers stamped on every output row
    """
    log.debug("Enter")
    match_list = list()
    odds_portal_file_path = fl.get_oddsportal_dir()
    try:
        soup = bs(html_source, 'lxml')
        tournament_table = soup.find('table', class_='table-main')
        body_ = tournament_table.find('tbody')
        all_matches = body_.find_all('tr')
        match_date = ""
        for row in all_matches:
            # A 'center' row is a date header applying to the rows below it.
            if row['class'][0] == 'center':
                match_date = formatterutil.format_date(row.text.split('1X2')[0], 'op')
            if match_date != '':
                # 'deactivate' marks a finished match row.
                if 'deactivate' in row['class']:
                    odds_count = 0
                    FTHG, FTAG = 0, 0
                    B365H, B365D, B365A = 0, 0, 0
                    FTR = 'NA'
                    # BUGFIX: these were never reset per row, so a row whose
                    # parse broke early reused (or crashed on) values left
                    # over from the previous row.
                    match_time = ''
                    HomeTeam, AwayTeam = None, None
                    row_ok = True
                    for child in row.children:
                        if 'table-time' in child['class']:
                            match_time = formatterutil.format_time(child.text.strip(), 'op')
                        elif 'table-participant' in child['class']:
                            try:
                                split_team = child.text.split(' - ')
                                HomeTeam = split_team[0].strip()
                                AwayTeam = split_team[1].strip()
                            except Exception:
                                row_ok = False
                                break
                        elif 'odds-nowrp' in child['class']:
                            # The three odds cells arrive in H / D / A order.
                            try:
                                if odds_count == 0:
                                    odds_count += 1
                                    B365H = round(float(child.text), 2)
                                elif odds_count == 1:
                                    odds_count += 1
                                    B365D = round(float(child.text), 2)
                                else:
                                    odds_count += 1
                                    B365A = round(float(child.text), 2)
                            except Exception:
                                continue
                        elif 'table-score' in child['class']:
                            try:
                                split_score = child.text.split(':')
                                FTHG = int(split_score[0])
                                FTAG = int(split_score[1])
                                FTR = 'H' if FTHG > FTAG else ('D' if FTHG == FTAG else 'A')
                            except Exception:
                                row_ok = False
                                break
                    # BUGFIX: only record rows that parsed completely; the
                    # original appended after a break with stale/unbound names.
                    if not row_ok or HomeTeam is None:
                        continue
                    match_dic = {'Country': country, 'League': league,
                                 'Div': div, 'Season': season,
                                 'Date': match_date, 'Time': match_time,
                                 'HomeTeam': HomeTeam, 'AwayTeam': AwayTeam,
                                 'FTHG': FTHG, 'FTAG': FTAG, 'FTR': FTR,
                                 'B365H': B365H, 'B365D': B365D, 'B365A': B365A}
                    match_list.append(match_dic)
        latest_matches_df = pd.DataFrame(match_list)
        file_path = os.path.join(odds_portal_file_path, country.upper(),
                                 str(season), str(div), str(league))
        if not os.path.isdir(file_path):
            os.makedirs(file_path)
        latest_matches_df.to_csv(os.path.join(file_path, 'latest.csv'), mode='w+')
    except Exception:
        log.exception("Exception occurred while matches from odds portal")
    log.debug("Exit")
def load_historical_matches_from_op(fl):
    """Scrape historical results from oddsportal.com for every configured
    league and season, writing each season to a fresh history.csv.

    fl -- locator object supplying the chrome driver path and data dirs.

    NOTE(review): the trailing driver.close()/exit() runs unconditionally
    after the league loop, killing the process even on success -- presumably
    intentional for a batch job; confirm before reuse as a library call.
    """
    log.debug("Enter")
    odds_portal_file_path = fl.get_oddsportal_dir()
    chrome_path = fl.get_chrome_driver()
    driver = webdriver.Chrome(chrome_path)
    base_url = 'https://www.oddsportal.com/soccer/'
    driver.implicitly_wait(60)
    season_list = nccprops.season_list
    try:
        for key, div in nccprops.fs_country_league_to_div_map.items():
            # Skip leagues not flagged for loading from oddsportal.
            if (nccprops.op_country_league_to_div_map_load.get(key, -1) == -1):
                continue
            retry = 0
            country = key[0].upper()
            league = key[1].replace('/', '-')
            op_country, op_league = nccprops.op_country_league_to_div_map[key]
            url_to_load = base_url + op_country + "/" + op_league + "/" + 'results/'
            ## these random sleep time is to slow things down and introduce random behavior during scraping process
            random1 = random.randint(11, 19)
            random2 = random.randint(7, 17)
            random3 = random.randint(random1, random2 + 5) if random2 >= random1 \
                else random.randint(random2, random1 + 5)
            driver.get(url_to_load)
            # Second 'main-filter' element holds the season links.
            seasons = driver.find_elements_by_class_name('main-filter')[1]
            ## findout what date format we have
            # Probe whether this league labels seasons "2019/2020" (slash)
            # or "2020" (st) by looking for the first configured season.
            st_date = str(season_list[0])
            slash_date = str(int(st_date) - 1) + "/" + st_date
            try:
                date_format = 'NA'
                if (seasons.find_element_by_link_text(slash_date)):
                    date_format = 'slash'
                else:
                    date_format = 'NA'
            except:
                try:
                    if (seasons.find_element_by_link_text(st_date)):
                        date_format = 'st'
                    else:
                        date_format = 'NA'
                except:
                    log.error('Failed identifying date format for [%s] [%s]'
                              % (op_country, op_league))
                    continue
            log.info('Loading [%s] [%s] Date Format is [%s] -- Retry == %s'
                     % (op_country, op_league, date_format, retry))
            for season in season_list:
                file_path = os.path.join(odds_portal_file_path, country,
                                         str(season), str(div), league)
                # Build the season label in this league's detected format.
                if (date_format == 'slash'):
                    page_season = str(int(season) - 1) + "/" + season
                elif (date_format == 'st'):
                    page_season = str(season)
                elif (date_format == 'NA'):
                    log.error('Invalid date format for [%s] [%s]'
                              % (op_country, op_league))
                    break
                try:
                    # Re-fetch the filter: the DOM is rebuilt after navigation.
                    seasons = driver.find_elements_by_class_name('main-filter')[1]
                    time.sleep(random.randint(7, 11))
                    season_link = seasons.find_element_by_link_text(page_season)
                    season_link.click()
                except:
                    continue
                ## create directories if required
                if not (os.path.isdir(file_path)):
                    os.makedirs(file_path)
                file_path = os.path.join(file_path, 'history.csv')
                if (os.path.isfile(file_path)):
                    ## remove the file so that we can start all over
                    os.remove(file_path)
                # Walk result pages 1..19, appending each to history.csv;
                # stop early when the next pagination link is missing.
                count = 1
                while count < 20:
                    log.info("-->Scraping Season [%s] Page [%s]" % (season, count))
                    time.sleep(random.randint(13, 23))
                    html_source = driver.page_source
                    scrape_historical_matches_from_op(
                        fl, html_source, file_path, country, league, div,
                        season, count)
                    count += 1
                    try:
                        pages = driver.find_element_by_id('pagination')
                        time.sleep(random.randint(5, 10))
                        page = pages.find_element_by_link_text(str(count))
                        time.sleep(random.randint(5, 9))
                        page.click()
                        time.sleep(7)
                    except Exception:
                        # No link for the next page number: season finished.
                        break
                time.sleep(random3)
        driver.close()
        exit()
    except Exception:
        log.exception("Exception occurred while loading flashscore page")
    driver.close()
    log.debug("Exit")
def scrape_historical_matches_from_op(fl, html_source, file_path, country,
                                      league, div, season, count):
    """Parse one page of oddsportal historical results and append the rows
    to ``file_path`` (history.csv).

    count -- 1-based page number; page 1 writes the CSV header, later pages
             append without a header.
    On parse failure the exception is logged and the process exits
    (original behavior, preserved).
    """
    log.debug("Enter")
    match_list = list()
    try:
        soup = bs(html_source, 'lxml')
        tournament_table = soup.find('table', class_='table-main')
        body_ = tournament_table.find('tbody')
        all_matches = body_.find_all('tr')
        match_date = ""
        for row in all_matches:
            # A 'center' row is a date header applying to the rows below it.
            if row['class'][0] == 'center':
                match_date = formatterutil.format_date(row.text.split('1X2')[0], 'op')
            if match_date != '':
                # 'deactivate' marks a finished match row.
                if 'deactivate' in row['class']:
                    odds_count = 0
                    FTHG, FTAG = 0, 0
                    B365H, B365D, B365A = 0, 0, 0
                    FTR = 'NA'
                    # BUGFIX: reset per row so a malformed row can no longer
                    # reuse (or crash on) values from the previous row.
                    match_time = ''
                    HomeTeam, AwayTeam = None, None
                    row_ok = True
                    for child in row.children:
                        if 'table-time' in child['class']:
                            match_time = formatterutil.format_time(child.text.strip(), 'op')
                        elif 'table-participant' in child['class']:
                            try:
                                split_team = child.text.split(' - ')
                                HomeTeam = split_team[0].strip()
                                AwayTeam = split_team[1].strip()
                            except Exception:
                                row_ok = False
                                break
                        elif 'odds-nowrp' in child['class']:
                            # The three odds cells arrive in H / D / A order;
                            # any parse failure zeroes all three.
                            try:
                                if odds_count == 0:
                                    odds_count += 1
                                    B365H = round(float(child.text), 2)
                                elif odds_count == 1:
                                    odds_count += 1
                                    B365D = round(float(child.text), 2)
                                else:
                                    odds_count += 1
                                    B365A = round(float(child.text), 2)
                            except Exception:
                                B365H = 0
                                B365D = 0
                                B365A = 0
                        elif 'table-score' in child['class']:
                            try:
                                split_score = child.text.split(':')
                                FTHG = int(split_score[0])
                                FTAG = int(split_score[1])
                                FTR = 'H' if FTHG > FTAG else ('D' if FTHG == FTAG else 'A')
                            except Exception:
                                row_ok = False
                                break
                    # BUGFIX: only record fully-parsed rows; the original
                    # appended after a break with stale/unbound names.
                    if not row_ok or HomeTeam is None:
                        continue
                    match_dic = {'Country': country, 'League': league,
                                 'Div': div, 'Season': season,
                                 'Date': match_date, 'Time': match_time,
                                 'HomeTeam': HomeTeam, 'AwayTeam': AwayTeam,
                                 'FTHG': FTHG, 'FTAG': FTAG, 'FTR': FTR,
                                 'B365H': B365H, 'B365D': B365D, 'B365A': B365A}
                    match_list.append(match_dic)
        history_matches_df = pd.DataFrame(match_list)
        if count == 1:
            history_matches_df.to_csv(file_path, mode='w+')
        else:
            history_matches_df.to_csv(file_path, mode='a+', header=False)
    except Exception:
        log.exception("Exception occurred while loading matches from odds portal for country %s and league %s" % (country, league))
        exit()
    log.debug("Exit")
def load_league_standing_from_fs(fl, number_of_seasons):
    """Scrape league-standing tables from flashscore.com for every configured
    league, covering up to ``number_of_seasons`` seasons via the archive page.

    fl                -- locator object supplying the chrome driver path
    number_of_seasons -- how many past seasons to collect per league

    On any unhandled error the exception is logged, the browser is closed
    and the process exits.
    """
    log.debug("Enter")
    chrome_path = fl.get_chrome_driver()
    driver = webdriver.Chrome(chrome_path)
    driver.implicitly_wait(120)
    fs_country_leagues = nccprops.fs_country_league_to_div_map.keys()
    count = 0
    try:
        for country, league in fs_country_leagues:
            # Random sleep intervals mimic human behavior during scraping.
            random1 = random.randint(1, 21)
            random2 = random.randint(1, 11)
            random3 = random.randint(random1, random2 + 5) if random2 >= random1 \
                else random.randint(random2, random1 + 5)
            time.sleep(random1)
            league = league.replace('/', '-')
            ## create the fs url
            # Normalise the league name into flashscore's URL slug form.
            fs_league = league.replace('.', '-')
            fs_league = fs_league.replace(' -', '-').replace('- ', '-')
            fs_league = fs_league.replace(' ', '-').lower()
            fs_country = country.strip().replace(' ', '-')
            country_league_url = 'https://www.flashscore.com/football/' \
                + fs_country.lower() + "/" + fs_league + "/"
            driver.get(country_league_url)
            seasons_count = 0
            archive_clicked = False
            log.debug("Loading standing for Country %s League %s - URL %s"
                      % (country, league, country_league_url))
            # Iterate twice as many elements as seasons wanted: season-name
            # elements appear in pairs and only every second one is a link.
            for i in range(number_of_seasons * 2):
                if (seasons_count > number_of_seasons):
                    break
                # Re-open the archive tab after each navigation away from it.
                if not archive_clicked:
                    archive_clicked = True
                    time.sleep(random.randint(1, 5))
                    archive = driver.find_element_by_id('li4')
                    archive.click()
                seasons = driver.find_elements_by_class_name(
                    'leagueTable__seasonName')
                time.sleep(random2)
                try:
                    if i == 0:
                        seasons[i].click()
                        seasons_count += 1
                        archive_clicked = False
                    else:
                        # Only elements whose parent is 'leagueTable__season'
                        # are clickable season links.
                        parent_element = seasons[i].find_element_by_xpath('..')
                        parent_name = parent_element.get_attribute(
                            'class').split(' ')
                        if (parent_name[0] == 'leagueTable__season'):
                            ## incrementing by one since there are two elements by same class and we want every second one
                            seasons[i].click()
                            seasons_count += 1
                            archive_clicked = False
                        else:
                            continue
                except Exception:
                    # Fewer seasons available than requested: stop this league.
                    break
                time.sleep(random3)
                html_source = driver.page_source
                #print (html_source)
                scrape_league_standing_from_fs(fl, html_source, country, league)
    except Exception:
        log.exception("Exception occurred. Exiting...")
        driver.close()
        exit()
    driver.close()
    log.debug("Exit")
def scrape_league_standing_from_fs(fl, html_source, country, league):
    """Parse a flashscore league-table page and write the standings to
    <league_standing_dir>/<country>_<league>_<season><EXT>.csv, indexed by
    a PK of Country__Div__Season__Team_Alias.

    html_source -- raw HTML of the league standings page
    country, league -- identifiers used for div/alias lookups and the PK
    """
    log.debug("Enter")
    league_standing_dir = fl.get_league_standing_dir()
    try:
        league_list = list()
        soup = bs(html_source, 'lxml')
        season_element = soup.find('div', class_='teamHeader__text')
        season = season_element.text
        ## format this season
        season = season.replace(" ", "").strip()
        div = nccprops.fs_country_league_to_div_map[(country, league)]
        if (nccprops.SEASON_MAP.get(season, -1) == -1):
            ## no need for mapping or this format is not present. for the
            ## timebeing let proceed as if this is a good one
            pass
        else:
            season = nccprops.SEASON_MAP[season]
        fs_league_table = soup.find_all('div', class_='row___S6WkQ8-')
        log.debug(
            "Scraping standing for Country %s League (formatted) %s Div %s Season %s"
            % (country, league, div, season))
        for team_row in fs_league_table:
            try:
                team_stats = list()
                for i, stat in enumerate(team_row.children):
                    try:
                        ## this is for the scored and conceded goals
                        if (i == 6):
                            # Column 6 holds "scored:conceded".
                            goal_stat = stat.text.split(':')
                            team_stats.append(goal_stat[0])
                            team_stats.append(goal_stat[1])
                        elif (i == 0):
                            # Column 0 is the rank, e.g. "1."
                            rank = stat.text.split('.')
                            team_stats.append(rank[0])
                        elif (i == 1):
                            # BUGFIX: dedicated name instead of shadowing the
                            # row loop variable ('team' in the original).
                            team_name = stat.text.strip()
                            team_alias = nccprops.get_fs_team_alias(
                                country.upper(), team_name)
                            team_stats.append(team_name)
                            team_stats.append(team_alias)
                        else:
                            team_stats.append(stat.text.strip())
                    except Exception:
                        continue
                league_list.append(team_stats)
            except Exception:
                continue
        log.debug("Loading league %s" % league_list)
        league = league.replace('/', '-')
        league_df = pd.DataFrame(league_list, columns=[
            'Rank', 'Team', 'Team_Alias', 'MatchesPlayed', 'Win', 'Draw',
            'Loss', 'GoalsScored', 'GoalsConceded', 'Points', 'Form'
        ])
        league_df['Country'] = country
        league_df['League'] = league
        league_df['Div'] = div
        league_df['Season'] = season
        ## note that the PK was created with league originally.
        formatterutil.remove_special_chars_from_df(league_df)
        league_df['PK'] = league_df['Country'].astype(str) + "__" \
            + league_df['Div'].astype(str) + "__" \
            + league_df['Season'].astype(str) + "__" \
            + league_df['Team_Alias'].astype(str)
        ## reset the index
        league_df.set_index('PK', drop=True, inplace=True)
        try:
            ## write to file
            league_standing_file = os.path.join(
                league_standing_dir,
                country + "_" + league + "_" + season
                + nccprops.fileloc.LEGUE_STADNING_FILE_EXT + ".csv")
            league_df.to_csv(league_standing_file, mode='w+')
        except Exception:
            log.exception("Exception occurred while writing standings to db")
            exit()
    except Exception:
        log.exception("Exception occurred while scrapping data from flash score")
    log.debug("Exit")
def scrape_fixture_from_fs(fl, html_source, match_date, fixture_file):
    """Parse a flashscore odds/fixtures page and append the fixtures dated
    ``match_date`` to ``fixture_file`` (CSV; appended to if it exists).

    fl           -- locator object (kept for signature parity with callers)
    html_source  -- raw HTML of the flashscore page
    match_date   -- date stamped onto every parsed fixture row
    fixture_file -- path to the fixtures CSV
    """
    log.debug("Enter")
    try:
        fixture_list = []
        soup = bs(html_source, 'lxml')
        fs_live_table = soup.find('div', id='live-table')
        fs_soccer_matches = fs_live_table.find('div', class_='sportName')
        if fs_soccer_matches['class'][1] == 'soccer':
            Country = ''
            League = ''
            for child in fs_soccer_matches.children:
                # Header rows carry the country/league for the match rows
                # that follow them.
                if child['class'][0] == 'event__header':
                    Country = child.find(
                        'span', class_='event__title--type').text.strip().upper()
                    League = child.find(
                        'span', class_='event__title--name').text.strip()
                if child['class'][0] == 'event__match':
                    teams = child.find_all('div', class_='event__participant')
                    odds = child.find_all('div', class_='odds__odd')
                    match_time = child.find('div', class_='event__time')
                    if match_time is None:
                        ## this match dont have a time. So this is possibly in
                        ## progress/completed/postponed etc. skip this match
                        continue
                    match_time = formatterutil.format_time(
                        match_time.contents[0].strip(), 'fs')
                    # Odds default to 0 when missing or unparsable.
                    B365H = B365D = B365A = 0
                    if len(odds) > 0 and odds[0].text.strip() not in ('', 'null', '-', None):
                        try:
                            B365H = float(odds[0].text.strip())
                            B365D = float(odds[1].text.strip())
                            B365A = float(odds[2].text.strip())
                        except Exception:
                            B365H = B365D = B365A = 0
                    if len(teams) > 0:
                        HomeTeam = formatterutil.encode_string_to_utf(teams[0].text)
                        AwayTeam = formatterutil.encode_string_to_utf(teams[1].text)
                        HomeTeam_Alias = nccprops.get_fs_team_alias(
                            Country, HomeTeam.strip())
                        AwayTeam_Alias = nccprops.get_fs_team_alias(
                            Country, AwayTeam.strip())
                        ## feature from ncprops.py
                        ## Mandatory_Match_Features = ['Div', 'Date', 'Time', 'HomeTeam', 'AwayTeam', 'FTHG', 'FTAG', 'FTR', 'B365H', 'B365D', 'B365A']
                        ## now do a mapping for the division
                        try:
                            League = League.replace('/', '-')
                            League = formatterutil.encode_string_to_utf(League)
                            Division = nccprops.fs_country_league_to_div_map[(Country, League)]
                        except Exception:
                            log.debug(
                                "Mapping information not found in fs_country_league_to_div_map for league %s from country %s"
                                % (League, Country))
                            # Fall back to the league name as the division.
                            Division = League
                        Season = str(nccprops.get_current_season(Country))
                        fixture_dic = {
                            'Country': Country, 'League': League,
                            'Div': Division, 'Season': Season,
                            'Date': match_date, 'Time': match_time,
                            'HomeTeam': HomeTeam,
                            'HomeTeam_Alias': HomeTeam_Alias,
                            'AwayTeam': AwayTeam,
                            'AwayTeam_Alias': AwayTeam_Alias,
                            'FTHG': 0, 'FTAG': 0, 'FTR': 'Not Available',
                            'B365H': B365H, 'B365D': B365D, 'B365A': B365A,
                            'FS_League': League}
                        fixture_list.append(fixture_dic)
        # BUGFIX: with no parsed fixtures the original built an empty
        # DataFrame and then raised KeyError('Country') below, aborting via
        # the outer except. Bail out cleanly instead.
        if not fixture_list:
            log.debug("No fixtures parsed for %s" % match_date)
            log.debug("Exit")
            return
        fixture_df = pd.DataFrame(fixture_list)
        ## normalise and encode to utf
        fixture_df['Country'] = formatterutil.encode_numpy_to_utf(
            fixture_df['Country'])
        fixture_df['StatSource'] = 'fs_fixture'
        ## now add an extra column
        fixture_df['fixture'] = 1
        try:
            ## write to fixture file
            if os.path.isfile(fixture_file):
                fixture_df.to_csv(fixture_file, mode='a', header=False)
            else:
                fixture_df.to_csv(fixture_file)
        except PermissionError:
            log.warning("PermissionError: Couldnt write to csv file.")
            return
        except Exception:
            log.warning("Error occurred while writing to fixture csv file.")
            return
    except Exception:
        log.exception("Exception occurred while scrapping data from flash score")
    log.debug("Exit")