def find_players(self):
    """Build the full player list for start_year..end_year.

    For each recruiting class year: pull 247Sports rankings, then enrich
    with SportsReference/FootballDB NCAA career data and
    ProFootballReference draft info. Each year's result is checkpointed to
    Players_temp.csv (fresh file for the first year, appended afterwards).
    Finally assigns sequential string ids and position side/group lookups.

    Returns:
        list: Player objects for all requested years combined.
    """
    sports247 = Sports247()
    footballdatabase = FootballDatabase(self.max_eligible_years)
    sportsreference = SportsReference(self.max_eligible_years)
    profootballreference = ProFootballReference()
    players_list = []
    for index, year in enumerate(range(self.start_year, self.end_year + 1)):
        Logger().log("PlayerController: starting player query for " +
                     str(year))
        players = sports247.get_player_rankings(year, year,
                                                self.ranking_cutoff)
        players = sportsreference.get_ncaaf_career(players)
        players = footballdatabase.get_ncaaf_career(players)
        players = profootballreference.get_draft_info(players)
        players_df = self.get_players_df(players)
        if index == 0:
            # First year starts a fresh checkpoint file; later years append
            # without repeating the header.
            players_df.to_csv('Players_temp.csv', index=False)
        else:
            players_df.to_csv('Players_temp.csv',
                              mode='a',
                              header=False,
                              index=False)
        players_list.append(players)
        Logger().log("PlayerController: finished player query for " +
                     str(year))
    players = list(itertools.chain.from_iterable(players_list))
    # Hoisted: one reference Player for the lookup tables instead of two
    # throwaway instances per player.
    position_lookup = Player()
    for index, player in enumerate(players):
        player.id = str(index)
        try:
            player.position_side = position_lookup.position_sides[
                player.position]
            player.position_group = position_lookup.position_groups[
                player.position]
        except KeyError:
            # Fixed: was a bare `except:` — only an unknown position key
            # should be treated as "no side/group".
            player.position_side = ''
            player.position_group = ''
    return players
def get_player_rankings(self, start_year, end_year, cutoff):
    """Scrape 247Sports composite recruit rankings for a span of class years.

    Fires all ranking-page requests concurrently, parses the responses in a
    process pool, and returns a flat list of Player objects. Query duration
    is logged.
    """
    started = time.perf_counter()
    base = 'https://247sports.com/Season/'
    page_part = '-Football/CompositeRecruitRankings/?ViewPath' \
        '=~%2FViews%2FSkyNet%2FPlayerSportRanking%2F_SimpleSetForSeason.ascx&InstitutionGroup' \
        '=HighSchool&Page='
    tail = '&_=1576263470372'
    responses = self.get_request_responses(start_year, end_year, cutoff,
                                           base, page_part, tail)
    response_chunks, num_chunks = ChunkGenerator().generate_response_chunks(
        responses)
    workers = Pool(processes=num_chunks)
    parsed_chunks = workers.map(self.get_player_chunk, response_chunks)
    workers.close()
    # Flatten the per-chunk lists into one player list.
    players = [player for chunk in parsed_chunks for player in chunk]
    finished = time.perf_counter()
    Logger().log_query_time(
        "Sports247: player rankings query completed in ", started, finished)
    return players
def get_request_responses(self, start_year, end_year, cutoff, core_url,
                          page_url, suffix_url):
    """Concurrently fetch every 247Sports ranking page for the year span.

    Builds one URL per (year, page) pair — pages hold 50 entries each, so
    ceil(cutoff / 50) pages per year — fires them all via grequests, and
    drops failed (None) responses.

    Returns:
        list: successful response objects; empty list when the batch
        request itself fails.
    """
    years = list(range(start_year, end_year + 1))
    pages = list(range(1, math.ceil(cutoff / 50) + 1))
    request_urls = []
    for year in years:
        for page in pages:
            request_urls.append(core_url + str(year) + page_url +
                                str(page) + suffix_url)
    try:
        responses = grequests.map((grequests.get(
            u, headers=self.request_header, timeout=self.request_timeout)
            for u in request_urls),
            size=len(request_urls))
    except Exception:
        # Fixed: previously fell through after logging and hit a NameError
        # on the undefined `responses`; also narrowed the bare `except:`.
        Logger().log("Sports247: Unable to query some/all urls for " +
                     ', '.join(map(str, pages)) + " | " +
                     ', '.join(map(str, years)))
        return []
    return [response for response in responses if response is not None]
def get_coaches(self, coaches_df=None):
    """Materialize Coach objects, either by scraping or from a DataFrame.

    Args:
        coaches_df: optional pandas DataFrame with one row per coach id.
            When None, coaches are scraped fresh via find_coaches().

    Returns:
        list: Coach objects. Rows whose name resolves to "" (ambiguous id
        with more than one row) are skipped with a log entry.
    """
    if coaches_df is None:
        return self.find_coaches()
    coaches_df = coaches_df.reset_index(drop=True)
    coaches = []
    # Attributes copied verbatim from the row once the name check passes.
    fields = ('team', 'year', 'three_star', 'four_star', 'five_star',
              'avg', 'points', 'nat_rank', 'drafted')
    for coach_id in coaches_df['id'].to_list():
        # Hoisted: one row lookup per coach instead of one O(n) scan per
        # attribute (the original repeated the .loc filter ten times).
        row = coaches_df.loc[coaches_df['id'] == coach_id]
        coach = Coach()
        coach.id = str(coach_id)
        coach.name = str(self.get_attribute_value(row['name']))
        if coach.name == "":
            Logger().log(
                "Coach: more than 1 row found for player id. skipping coach"
            )
            continue
        for field in fields:
            setattr(coach, field, str(self.get_attribute_value(row[field])))
        coaches.append(coach)
    return coaches
def get_teams(self, teams_df=None):
    """Materialize Team objects, either by scraping or from a DataFrame.

    Args:
        teams_df: optional pandas DataFrame with one row per team id.
            When None, teams are scraped fresh via find_teams().

    Returns:
        list: Team objects. Rows whose name resolves to "" (ambiguous id
        with more than one row) are skipped with a log entry.
    """
    if teams_df is None:
        return self.find_teams()
    teams_df = teams_df.reset_index(drop=True)
    # Fixed: result list was misleadingly named `players` (copy-paste from
    # the player controller).
    teams = []
    # Attributes copied verbatim from the row once the name check passes.
    fields = ('conference', 'commits', 'year', 'nat_rank', 'five_star',
              'four_star', 'three_star', 'avg', 'points')
    for team_id in teams_df['id'].to_list():
        # Hoisted: one row lookup per team instead of one O(n) scan per
        # attribute (the original repeated the .loc filter ten times).
        row = teams_df.loc[teams_df['id'] == team_id]
        team = Team()
        team.id = str(team_id)
        team.name = str(self.get_attribute_value(row['name']))
        if team.name == "":
            # Fixed: log message previously said "Player ... skipping
            # player" — another copy-paste defect.
            Logger().log(
                "Team: more than 1 row found for team id. skipping team"
            )
            continue
        for field in fields:
            setattr(team, field, str(self.get_attribute_value(row[field])))
        teams.append(team)
    return teams
def update_players(self, players):
    """Re-run the career and draft scrapers over an existing player list.

    Players are processed in batches of 500 through SportsReference,
    FootballDB and ProFootballReference in turn; the refreshed batches are
    concatenated and returned.
    """
    Logger().log("PlayerController: starting update player query")
    footballdatabase = FootballDatabase(self.max_eligible_years)
    sportsreference = SportsReference(self.max_eligible_years)
    profootballreference = ProFootballReference()
    batch_size = 500
    refreshed = []
    for start in range(0, len(players), batch_size):
        batch = players[start:start + batch_size]
        batch = sportsreference.get_ncaaf_career(batch)
        batch = footballdatabase.get_ncaaf_career(batch)
        batch = profootballreference.get_draft_info(batch)
        refreshed.extend(batch)
    Logger().log("PlayerController: finished update player query")
    return refreshed
def get_ncaaf_career(self, players):
    """Enrich players with FootballDB NCAA career data, in parallel.

    Splits the list into chunks, processes each chunk in a worker process,
    and returns the flattened result. Query duration is logged.
    """
    started = time.perf_counter()
    player_chunks, num_chunks = ChunkGenerator().generate_data_chunks(
        players)
    workers = Pool(processes=num_chunks)
    chunk_results = workers.map(self.get_data_chunks, player_chunks)
    workers.close()
    # Flatten the per-chunk lists back into a single player list.
    merged = [player for chunk in chunk_results for player in chunk]
    finished = time.perf_counter()
    Logger().log_query_time("FootballDB: career query completed in ",
                            started, finished)
    return merged
def find_teams(self):
    """Build the team list for start_year..end_year from 247Sports.

    Scrapes each year's team rankings, checkpoints every year to
    Teams_temp.csv, tags teams with their conference, and assigns
    sequential string ids.

    Returns:
        list: Team objects for all requested years combined.
    """
    sports247 = Sports247()
    teams_list = []
    for index, year in enumerate(range(self.start_year, self.end_year + 1)):
        Logger().log("TeamController: starting team query for " + str(year))
        teams = sports247.get_team_rankings(year, year, self.ranking_cutoff)
        teams_df = self.get_teams_df(teams)
        if index == 0:
            # Fixed: first write used mode='a', appending to a stale
            # Teams_temp.csv left over from a previous run. Overwrite on
            # the first year, consistent with find_players.
            teams_df.to_csv('Teams_temp.csv', index=False)
        else:
            teams_df.to_csv('Teams_temp.csv',
                            mode='a',
                            header=False,
                            index=False)
        teams_list.append(teams)
        Logger().log("TeamController: finished team query for " + str(year))
    teams = list(itertools.chain.from_iterable(teams_list))
    teams = sports247.add_conferences(teams)
    for index, team in enumerate(teams):
        team.id = str(index)
    return teams
def get_team_rankings(self, start_year, end_year, cutoff):
    """Scrape 247Sports composite team rankings for a span of class years.

    Fires all ranking-page requests concurrently, parses the responses in a
    process pool, and returns a flat list of Team objects. Query duration
    is logged.
    """
    started = time.perf_counter()
    base = 'https://247sports.com/Season/'
    page_part = '-Football/CompositeTeamRankings/?page='
    tail = ''
    responses = self.get_request_responses(start_year, end_year, cutoff,
                                           base, page_part, tail)
    response_chunks, num_chunks = ChunkGenerator().generate_response_chunks(
        responses)
    workers = Pool(processes=num_chunks)
    parsed_chunks = workers.map(self.get_team_chunk, response_chunks)
    workers.close()
    # Flatten the per-chunk lists into one team list.
    teams = [team for chunk in parsed_chunks for team in chunk]
    finished = time.perf_counter()
    Logger().log_query_time("Sports247: team rankings query completed in ",
                            started, finished)
    return teams
def find_player_url(self, player, team, enrolled_year):
    """Search footballdb.com team rosters for the player's profile URL.

    Queries the roster page for every year the player could have been
    eligible (enrolled_year .. enrolled_year + max_eligible_years - 1,
    capped at the current calendar year) and scans each roster for a
    trimmed-name match.

    Args:
        player: project Player object; only .name is read here.
        team: footballdb team slug used in the roster URL.
        enrolled_year (int): first year of eligibility.

    Returns:
        str: absolute player profile URL, or '' when no match is found or
        the batch request fails.
    """
    curr_year = int(date.today().year)
    eligible_years = list(
        range(enrolled_year, enrolled_year + self.max_eligible_years))
    eligible_years = [x for x in eligible_years if x <= curr_year]
    request_urls = []
    for eligible_year in eligible_years:
        request_url = 'https://www.footballdb.com/college-football/teams/fbs/' \
            + str(team) \
            + '/roster/' \
            + str(eligible_year)
        request_urls.append(request_url)
    try:
        # Fire all roster-page requests concurrently.
        responses = grequests.map((grequests.get(
            u, headers=self.request_header, timeout=self.request_timeout)
            for u in request_urls),
            size=len(request_urls))
    except:
        Logger().log("FootballDB: Unable to query some/all urls for " +
                     player.name + " | " + team + " | " +
                     ', '.join(map(str, eligible_years)) +
                     ', '.join(map(str, request_urls)))
        player_url = ''
        return player_url
    # Drop failed requests (grequests yields None for them).
    responses = [
        response for response in responses if response is not None
    ]
    for response in responses:
        page_html = response.text
        page_soup = soup(page_html, "lxml")
        if page_soup is None:
            Logger().log("FootballDB: " + "no html page returned")
            continue
        # Roster table -> anchors that link to player profile pages.
        roster_list = page_soup.find(
            "div", {"class": "divtable divtable-striped divtable-mobile"})
        roster_list = roster_list.findAll(
            "a", href=re.compile(
                r"/college-football/players/")) if roster_list else []
        for roster_item in roster_list:
            roster_player = roster_item.get_text() if roster_item else ""
            # Roster names read "Last, First"; flip to "First Last" before
            # the trimmed comparison.
            roster_player = " ".join(roster_player.split(",")[::-1])
            roster_player = Player().trim_name(roster_player)
            player_of_interest = Player().trim_name(player.name)
            if player_of_interest == roster_player:
                player_url = 'https://www.footballdb.com' + roster_item[
                    'href']
                return player_url
    player_url = ''
    return player_url
def find_player_url(self, player, team, enrolled_year):
    """Search sports-reference.com team rosters for the player's profile URL.

    Queries the roster page for every year the player could have been
    eligible (enrolled_year .. enrolled_year + max_eligible_years - 1,
    capped at the current calendar year) and scans each roster for a
    trimmed-name match.

    Args:
        player: project Player object; only .name is read here.
        team: sports-reference school slug used in the roster URL.
        enrolled_year (int): first year of eligibility.

    Returns:
        str: absolute player profile URL, or '' when no match is found or
        the batch request fails.
    """
    # Search all years the player could have been eligible.
    curr_year = int(date.today().year)
    eligible_years = list(
        range(enrolled_year, enrolled_year + self.max_eligible_years))
    eligible_years = [x for x in eligible_years if x <= curr_year]
    request_urls = []
    for eligible_year in eligible_years:
        request_url = 'https://www.sports-reference.com/cfb/schools/' \
            + str(team) \
            + '/' \
            + str(eligible_year) \
            + '-roster.html'
        request_urls.append(request_url)
    try:
        # Fire all roster-page requests concurrently.
        responses = grequests.map((grequests.get(
            u, headers=self.request_header, timeout=self.request_timeout)
            for u in request_urls),
            size=len(request_urls))
    except:
        Logger().log(
            "SportsReference: Unable to query some/all urls for " +
            player.name + " | " + team + " | " +
            ', '.join(map(str, eligible_years)) +
            ', '.join(map(str, request_urls)))
        player_url = ''
        return player_url
    # Drop failed requests (grequests yields None for them).
    responses = [
        response for response in responses if response is not None
    ]
    for response in responses:
        page_html = response.text
        page_soup = soup(page_html, "lxml")
        if page_soup is None:
            Logger().log("SportsReference: " +
                         "No html page returned for " + team)
            continue
        # Roster table body -> anchors that link to player profile pages.
        roster_list = page_soup.find("div", {"id": "div_roster"})
        roster_list = roster_list.find("tbody") if roster_list else []
        roster_list = roster_list.findAll(
            "a", href=re.compile(r"/cfb/players/")) if roster_list else []
        for roster_item in roster_list:
            roster_player = Player().trim_name(roster_item.get_text())
            player_of_interest = Player().trim_name(player.name)
            if player_of_interest == roster_player:
                player_url = 'https://www.sports-reference.com' + roster_item[
                    'href']
                return player_url
    player_url = ''
    return player_url
def add_stats(self, player, player_url):
    """Populate NCAA career fields on `player` from their footballdb page.

    Sets player.transferred (final team, only when the team differs across
    seasons), player.ncaaf_years (season count, as a string) and
    player.ncaaf_status ("exhausted" when the last listed season precedes
    the current year, otherwise "active"). Returns the player unchanged on
    any fetch or parse failure.
    """
    if not player_url:
        Logger().log("FootballDB: " + "Unable to find " + player.name +
                     " | " + player.team + " | " + player.enrolled + " | " +
                     player_url)
        return player
    try:
        response = requests.get(url=player_url,
                                headers=self.request_header,
                                timeout=self.request_timeout)
    except:
        Logger().log("FootballDB: Unable to get web query for " +
                     player.name + " | " + player_url)
        return player
    if response is None:
        Logger().log("FootballDB: " + "No query response for " +
                     player.name + " | " + player_url)
        return player
    page_html = response.text
    page_soup = soup(page_html, "lxml")
    if page_soup is None:
        Logger().log("FootballDB: " + "No html page returned for " +
                     player.name + " | " + player_url)
        return player
    # Career statistics table: one matching <tr> per season played.
    stats_list = page_soup.find("table",
                                {"class": "statistics scrollable"})
    stats_list = stats_list.find("tbody") if stats_list else []
    stats_list = stats_list.findAll(
        "tr", re.compile(r"row")) if stats_list else []
    if not stats_list:
        Logger().log("FootballDB: " + "No stats found for " + player.name +
                     " | " + player_url)
        return player
    enrolled_years = []
    teams = []
    for stats_item in stats_list:
        playing_year = stats_item.find("td", {"class": "center"})
        playing_year = playing_year.get_text() if playing_year else ""
        # Prefer the FBS team link; reduce its href to the team slug and
        # normalize to the project-wide team name.
        team = stats_item.find(
            "a", href=re.compile(r"/college-football/teams/fbs/"))
        team = team['href'] if team else ""
        team = team.replace("/college-football/teams/fbs/", "")
        team = Team().get_global_name(team, 'FootballDB')
        if not team:
            # Fallback: plain-text team label, with "(...)" and " - ..."
            # suffixes stripped off.
            team = stats_item.find("span", {"class": "hidden-xs"})
            team = team.get_text() if team else ""
            team = re.sub("\(.*\)|\s-\s.*", "", team).strip()
        enrolled_years.append(playing_year)
        teams.append(team)
    # Any season at a different team marks a transfer; record the last team.
    if not all(x == teams[0] for x in teams):
        player.transferred = self.trim(teams[-1])
    player.ncaaf_years = self.trim(str(len(enrolled_years)))
    # NOTE(review): int() raises if the last season's year cell is empty —
    # assumes every stats row carries a year; confirm against live pages.
    last_enrolled_year = int(enrolled_years[-1])
    curr_year = int(date.today().year)
    if last_enrolled_year < curr_year:
        player.ncaaf_status = "exhausted"
    else:
        player.ncaaf_status = "active"
    return player
def add_stats(self, player, player_url):
    """Populate NCAA career fields on `player` from sports-reference.com.

    Mirrors the FootballDB variant: sets player.transferred (final team,
    only when the team differs across seasons), player.ncaaf_years (season
    count, as a string) and player.ncaaf_status ("exhausted" when the last
    listed season precedes the current year, otherwise "active"). Returns
    the player unchanged on any fetch or parse failure.
    """
    if not player_url:
        Logger().log("SportsReference: " + "Unable to find " + player.name +
                     " | " + player.team + " | " + player.enrolled + " | " +
                     player_url)
        return player
    try:
        response = requests.get(url=player_url,
                                headers=self.request_header,
                                timeout=self.request_timeout)
    except:
        Logger().log("SportsReference: Unable to get web query for " +
                     player.name + " | " + player_url)
        return player
    if response is None:
        Logger().log("SportsReference: " + "No query response for " +
                     player.name + " | " + player_url)
        return player
    page_html = response.text
    page_soup = soup(page_html, "lxml")
    if page_soup is None:
        Logger().log("SportsReference: " + "No html page returned for " +
                     player.name + " | " + player_url)
        return player
    # First stats table body inside the page content: one <tr> per season.
    stats_list = page_soup.find("div", {"id": "content"})
    stats_list = stats_list.find("tbody") if stats_list else []
    stats_list = stats_list.findAll("tr") if stats_list else []
    if not stats_list:
        Logger().log("SportsReference: " + "No stats found for " +
                     player.name + " | " + player_url)
        return player
    enrolled_years = []
    teams = []
    for stats_item in stats_list:
        # Each season row links to the year page and the school page; the
        # school slug is the 4th path segment of the href.
        playing_year = stats_item.find("a", href=re.compile(r"/cfb/years"))
        playing_year = playing_year.get_text() if playing_year else ""
        team = stats_item.find("a", href=re.compile(r"/cfb/schools/"))
        team = team['href'].split('/')[3] if team else ""
        team = Team().get_global_name(team, 'SportsReference')
        enrolled_years.append(playing_year)
        teams.append(team)
    # Any season at a different school marks a transfer; record the last team.
    if not all(x == teams[0] for x in teams):
        player.transferred = self.trim(teams[-1])
    player.ncaaf_years = self.trim(str(len(enrolled_years)))
    # NOTE(review): int() raises if the last season's year text is empty —
    # assumes every season row carries a year link; confirm on live pages.
    last_enrolled_year = int(enrolled_years[-1])
    curr_year = int(date.today().year)
    if last_enrolled_year < curr_year:
        player.ncaaf_status = "exhausted"
    else:
        player.ncaaf_status = "active"
    return player
def add_conferences(self, teams):
    """Tag each Team with its conference, scraped from 247Sports.

    Collects the conference filter links from the 2020 composite team
    rankings page, then loads each conference's own rankings page and
    matches team names against its entries. Teams are modified in place and
    the same list is returned. Best-effort: any request/parse failure is
    logged and skipped.
    """
    teams_url = 'https://247sports.com/Season/2020-Football/CompositeTeamRankings/'
    try:
        response = requests.get(url=teams_url,
                                headers=self.request_header,
                                timeout=self.request_timeout)
    except:
        Logger().log("Sports247: Unable to get web query for teams")
        return teams
    page_html = response.text
    page_soup = soup(page_html, "lxml")
    if page_soup is None:
        Logger().log("Sports247: " + "No html page returned")
        return teams
    # Sidebar conference links: display text -> conference rankings URL.
    conferences_list = page_soup.find(
        "ul", {"class": "rankings-page__conference-list"})
    conferences_list = conferences_list.findAll(
        "a", href=re.compile(r"Conference")) if conferences_list else []
    conference_dictionary = {}
    for conferences_item in conferences_list:
        conference = conferences_item.get_text()
        conference_url = 'https://247sports.com' + conferences_item['href']
        conference_dictionary[conference] = conference_url
    for conference, conference_url in conference_dictionary.items():
        try:
            response = requests.get(url=conference_url,
                                    headers=self.request_header,
                                    timeout=self.request_timeout)
        except:
            Logger().log(
                "Sports247: Unable to get web query for conference")
            continue
        page_html = response.text
        page_soup = soup(page_html, "lxml")
        if page_soup is None:
            Logger().log("Sports247: " + "No html page returned")
            continue
        rankings_list = page_soup.findAll(
            "li", {"class": "rankings-page__list-item"})
        # A team appearing in this conference's ranking list belongs to it.
        for team in teams:
            for rankings_entry in rankings_list:
                conference_team = rankings_entry.div.find(
                    "div", {"class": "team"})
                conference_team = self.trim(
                    conference_team.get_text()) if conference_team else ""
                if team.name == conference_team:
                    team.conference = self.trim(conference)
                    break
    return teams
def get_team_chunk(self, responses_chunk):
    """Parse a chunk of 247Sports team-ranking pages into Team objects.

    Runs inside a worker process (see get_team_rankings). Every ranking
    list item becomes one Team with name, national rank, average rating,
    points, commit count, per-star commit counts and class year. Missing
    elements become "".
    """
    teams = []
    for response in responses_chunk:
        page_html = response.text
        page_soup = soup(page_html, "lxml")
        if page_soup is None:
            Logger().log("Sports247: " + "No html page returned")
            continue
        # The <title>'s first whitespace token is taken as the class year.
        year = page_soup.find("title",
                              text=re.compile("Football Team Rankings"))
        year = year.get_text().strip().split(' ')[0] if year else ""
        rankings_list = page_soup.findAll(
            "li", {"class": "rankings-page__list-item"})
        for rankings_entry in rankings_list:
            team = Team()
            team.name = rankings_entry.div.find("div", {"class": "team"})
            team.name = self.trim(
                team.name.get_text()) if team.name else ""
            team.nat_rank = rankings_entry.div.find(
                "div", {"class": "primary"})
            team.nat_rank = self.trim(
                team.nat_rank.get_text()) if team.nat_rank else ""
            team.avg = rankings_entry.div.find("div", {"class": "avg"})
            team.avg = self.trim(team.avg.get_text()) if team.avg else ""
            team.points = rankings_entry.div.find("div",
                                                  {"class": "points"})
            team.points = self.trim(
                team.points.get_text()) if team.points else ""
            # Commit count: first whitespace-separated token of the cell.
            team.commits = rankings_entry.div.find("div",
                                                   {"class": "total"})
            team.commits = team.commits.get_text() if team.commits else ""
            team.commits = self.trim(
                team.commits.strip().split(' ')[0]) if team.commits else ""
            # Per-star commit buckets (3/4/5 star) from the star list.
            stars_list = rankings_entry.div.find(
                "ul", {"class": "star-commits-list"})
            stars_list = stars_list.findAll("li") if stars_list else []
            for stars_entry in stars_list:
                if stars_entry.find("h2", string="3-Star"):
                    team.three_star = stars_entry.find("div")
                    team.three_star = self.trim(team.three_star.get_text(
                    )) if team.three_star else ""
                if stars_entry.find("h2", string="4-Star"):
                    team.four_star = stars_entry.find("div")
                    team.four_star = self.trim(team.four_star.get_text()
                                               ) if team.four_star else ""
                if stars_entry.find("h2", string="5-Star"):
                    team.five_star = stars_entry.find("div")
                    team.five_star = self.trim(team.five_star.get_text()
                                               ) if team.five_star else ""
            team.year = self.trim(year)
            teams.append(team)
    return teams
def add_stats(self, player, team):
    """Fill in NFL draft fields for `player` from pro-football-reference.

    Scans the school's drafted-players table for a row whose trimmed name
    matches the player and whose draft year is not before their enrollment
    year, then sets player.drafted / round / pick / nfl_position. Returns
    the player unchanged when the page or a matching row is unavailable.
    """
    team_url = 'https://www.pro-football-reference.com/schools/' + team + '/drafted.htm'
    try:
        response = requests.get(url=team_url,
                                headers=self.request_header,
                                timeout=self.request_timeout)
    except:
        Logger().log("ProFootballReference: Unable to query " + team +
                     " for " + player.name + " | " + team_url)
        return player
    if response is None:
        Logger().log("ProFootballReference: No query response for " +
                     team + " for " + player.name + " | " + team_url)
        return player
    page_html = response.text
    page_soup = soup(page_html, "lxml")
    if page_soup is None:
        Logger().log("ProFootballReference: " +
                     "No html page returned for " + team_url)
        return player
    # First table body on the page: one <tr> per drafted player.
    drafted_list = page_soup.find("tbody")
    drafted_list = drafted_list.findAll("tr") if drafted_list else []
    if not drafted_list:
        Logger().log("ProFootballReference: " + "No draft list for " +
                     player.team)
        return player
    for drafted_item in drafted_list:
        drafted_player = drafted_item.find("td", {
            "data-stat": "player"
        }).find("a")
        drafted_player = drafted_player.get_text(
        ) if drafted_player else ""
        drafted_player = Player().trim_name(drafted_player)
        drafted_year = drafted_item.find("td", {
            "data-stat": "year_id"
        }).find("a")
        drafted_year = drafted_year.get_text() if drafted_year else ""
        # NOTE(review): int() raises on an empty year cell — assumes every
        # row carries a year link; confirm against live pages.
        drafted_year = int(drafted_year)
        player_of_interest = Player().trim_name(player.name)
        enrolled_year = int(float(player.enrolled))
        # The year guard avoids matching an older, same-named draftee.
        if (player_of_interest == drafted_player) and (drafted_year >=
                                                       enrolled_year):
            stats = []
            stat_list = drafted_item.findAll("td")
            stat_list = stat_list if stat_list else []
            for stat in stat_list:
                stats.append(stat.get_text())
            stats = stats[:-1]
            # Positional cells: 0=year, 2=round, 3=pick, 6=position —
            # presumably matches the drafted.htm column layout; verify if
            # the site markup changes.
            player.drafted = self.trim(stats[0])
            player.round = self.trim(stats[2])
            player.pick = self.trim(stats[3])
            player.nfl_position = self.trim(stats[6])
    return player
def get_player_chunk(self, responses_chunk):
    """Parse a chunk of 247Sports recruit-ranking pages into Player objects.

    Runs inside a worker process (see get_player_rankings). Every ranking
    list item becomes one Player with name, position, metrics, ranks, star
    count, score, committed team and enrollment year. Missing elements
    become "".
    """
    players = []
    for response in responses_chunk:
        page_html = response.text
        page_soup = soup(page_html, "lxml")
        if page_soup is None:
            Logger().log("Sports247: " + "No html page returned")
            continue
        # The <title>'s first whitespace token is taken as the class year.
        enrolled = page_soup.find("title",
                                  text=re.compile("Top Football Recruits"))
        enrolled = enrolled.get_text().strip().split(
            ' ')[0] if enrolled else ""
        # Exact class-list match so <li> elements carrying extra classes
        # are excluded.
        rankings_list = page_soup.findAll(
            lambda tag: tag.name == 'li' and tag.get(
                'class') == ['rankings-page__list-item'])
        for rankings_entry in rankings_list:
            player = Player()
            player.name = rankings_entry.find(
                "a", {"class": "rankings-page__name-link"})
            player.name = self.trim(
                player.name.get_text()) if player.name else ""
            player.position = rankings_entry.find("div",
                                                  {"class": "position"})
            player.position = self.trim(
                player.position.get_text()) if player.position else ""
            player.metrics = rankings_entry.find("div",
                                                 {"class": "metrics"})
            player.metrics = self.trim(
                player.metrics.get_text()) if player.metrics else ""
            player.overall_rank = rankings_entry.find(
                "div", {"class": "primary"})
            player.overall_rank = self.trim(player.overall_rank.get_text(
            )) if player.overall_rank else ""
            player.pos_rank = rankings_entry.find("a",
                                                  {"class": "posrank"})
            player.pos_rank = self.trim(
                player.pos_rank.get_text()) if player.pos_rank else ""
            player.state_rank = rankings_entry.find(
                "a", {"class": "sttrank"})
            player.state_rank = self.trim(
                player.state_rank.get_text()) if player.state_rank else ""
            player.nat_rank = rankings_entry.find("a",
                                                  {"class": "natrank"})
            player.nat_rank = self.trim(
                player.nat_rank.get_text()) if player.nat_rank else ""
            # Star rating = count of filled star icons.
            player.stars = rankings_entry.findAll(
                "span", {"class": "icon-starsolid yellow"})
            player.stars = self.trim(str(len(
                player.stars))) if player.stars else ""
            player.score = rankings_entry.find("span", {"class": "score"})
            player.score = self.trim(
                player.score.get_text()) if player.score else ""
            # Committed team comes from the status badge's img title,
            # normalized to the project-wide team name.
            player.team = rankings_entry.find("div", {
                "class": "status"
            }).find("img", title=True)
            player.team = Team().get_global_name(
                player.team['title'], 'Sports247') if player.team else ""
            player.team = self.trim(player.team)
            player.enrolled = self.trim(enrolled)
            players.append(player)
    return players