def dfFromGameLogURL(url): """ Takes a url of a player's game log for a given year, returns a DataFrame """ glsoup = getSoupFromURL(url) reg_season_table = glsoup.findAll('table', attrs={'id': 'pgl_basic'}) # id for reg season table playoff_table = glsoup.findAll('table', attrs={'id': 'pgl_basic_playoffs'}) # id for playoff table # parse the table header. we'll use this for the creation of the DataFrame header = [] for th in reg_season_table[0].findAll('th'): if not th.getText() in header: header.append(th.getText()) # add in headers for home/away and w/l columns. a must to get the DataFrame to parse correctly header[5] = u'HomeAway' header.insert(7, u'WinLoss') reg = soupTableToDF(reg_season_table, header) playoff = soupTableToDF(playoff_table, header) if reg is None: return playoff elif playoff is None: return reg else: return pd.concat([reg, playoff])
def dfFromGameLogURL(url): """ Takes a url of a player's game log for a given year, returns a DataFrame """ glsoup = getSoupFromURL(url) reg_season_table = glsoup.findAll('table', attrs={'id': 'pgl_basic' }) # id for reg season table playoff_table = glsoup.findAll('table', attrs={'id': 'pgl_basic_playoffs' }) # id for playoff table # parse the table header. we'll use this for the creation of the DataFrame header = [] for th in reg_season_table[0].findAll('th'): if not th.getText() in header: header.append(th.getText()) # add in headers for home/away and w/l columns. a must to get the DataFrame to parse correctly header[5] = u'HomeAway' header.insert(7, u'WinLoss') reg = soupTableToDF(reg_season_table, header) playoff = soupTableToDF(playoff_table, header) if reg is None: return playoff elif playoff is None: return reg else: return pd.concat([reg, playoff])
def getAllPlayers(suppressOutput=True, min_year_active=2004):
    players = dict()
    for letter in string.ascii_lowercase:
        letter_page = getSoupFromURL('https://www.basketball-reference.com/players/{}/'.format(letter), suppressOutput)
        if letter_page is None:
            continue
        all_rows = letter_page.find("table", id="players").find("tbody").find_all("tr")
        for row in all_rows:
            player = row.find("th", attrs={"data-stat": "player", "scope": "row"})
            if player is None:
                continue
            player = player.find("a")
            name = player.get_text()
            last_year_active_soup = row.find("td", attrs={"data-stat": "year_max"})
            last_year_active = int(last_year_active_soup.get_text())
            try:
                if last_year_active >= min_year_active:
                    players[name] = Player(name, 'https://www.basketball-reference.com' + player.attrs['href'])
            except Exception as e:
                print("ERROR:", e)
        sleep(1)  # sleeping to be kind for requests
    return players
def playoff_dfFromGameLogURL(url):
    """ Takes a url of a player's game log, returns a DataFrame of the playoff games only """
    glsoup = getSoupFromURL(url)
    playoff_table = find_playoff_table(glsoup)
    header = []
    if len(playoff_table) > 0 and playoff_table[0] is not None:
        table_header = playoff_table[0].find("thead")
    else:
        print("Playoff game log table not found")
        return None
    for th in table_header.find_all('th'):
        header.append(th.getText())
    # add in headers for home/away and w/l columns; drop the rank column and the two blank header cells
    header.insert(5, 'HomeAway')
    header.insert(8, 'WinLoss')
    header.pop(0)
    header.remove(u'\xa0')
    header.remove(u'\xa0')
    playoff = soupTableToDF(playoff_table, header)
    if playoff is None:
        print("Playoff game log table not found")
        return None
    return playoff
def pbp_dfFromGameLogURL(url):
    """ Takes a url of a player's game log, returns a DataFrame of the play-by-play table """
    glsoup = getSoupFromURL(url)
    pbp_table = find_pbp_table(glsoup)
    header = []
    x = 0
    if len(pbp_table) > 0 and pbp_table[0] is not None:
        table_header = pbp_table[0].find("thead")
    else:
        print("Play-by-play table not found")
        return None
    # skip the first eight header cells, dropping the pbp table's over-header row
    for th in table_header.find_all('th'):
        if x > 7:
            header.append(th.getText())
        x += 1
    pbp = soupTableToDF(pbp_table, header)
    if pbp is None:
        print("Play-by-play table not found")
        return None
    return pbp
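# soupTableToDF is another helper defined elsewhere in the project. The calls
# above suggest it takes the list of matched table elements plus a header list
# and returns a DataFrame (or None when the table is missing). A rough sketch
# under those assumptions (hypothetical; the real helper's details will differ):


def soupTableToDF(table_soup, header):
    """ Parse the first table in table_soup into a DataFrame, or return None """
    if not table_soup:
        return None
    rows = table_soup[0].find("tbody").find_all("tr")
    parsed = []
    for row in rows:
        if "thead" in (row.get("class") or []):
            continue  # skip the repeated mid-table header rows
        cells = [cell.get_text() for cell in row.find_all(["th", "td"])]
        if cells:
            parsed.append(cells)
    if not parsed:
        return None
    df = pd.DataFrame(parsed)
    if df.shape[1] == len(header):
        df.columns = header
    return df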
def getCurrentTeams(suppressOutput=True):
    teams = dict()
    glsoup = getSoupFromURL('https://www.basketball-reference.com/teams/', suppressOutput)
    active_teams_table = glsoup.find('table', id='teams_active')  # id for the active franchises table
    all_rows = active_teams_table.find_all("th", attrs={"data-stat": "franch_name"})
    active_teams = list()
    for row in all_rows:
        team = row.find("a")
        if team is None:
            continue
        active_teams.append(team)
    for team in active_teams:
        name = team.get_text()
        try:
            teams[name] = Team(name, 'https://www.basketball-reference.com' + team.attrs['href'])
        except Exception as e:
            print("ERROR:", e)
    sleep(1)  # sleeping to be kind for requests
    return teams
def getAllPlayerNamesAndURLS(suppressOutput=True):
    names = []
    for letter in string.ascii_lowercase:
        letter_page = getSoupFromURL('https://www.basketball-reference.com/players/{}/'.format(letter), suppressOutput)
        if letter_page is None:
            continue
        all_rows = letter_page.find("table", id="players").find("tbody").find_all("tr")
        for row in all_rows:
            player = row.find("th", attrs={"data-stat": "player", "scope": "row"})
            if player is None:
                continue
            player = player.find("a")
            name = player.get_text()
            try:
                names.append((name, 'https://www.basketball-reference.com' + player.attrs['href']))
            except Exception as e:
                print("ERROR:", e)
        sleep(1)  # sleeping to be kind for requests
    return dict(names)
def getoverView(url_tup):
    # url_tup is a (name, url) tuple for a single player
    print("in get overViews")
    glsoup = getSoupFromURL(url_tup[1])
    id_lst = [
        "all_per_game", "all_totals", "all_per_minute", "all_per_poss",
        "all_advanced", "all_shooting", "all_pbp", "all_playoffs_per_game",
        "all_playoffs_totals", "all_playoffs_per_minute", "all_playoffs_per_poss",
        "all_playoffs_advanced", "all_playoffs_shooting", "all_playoffs_pbp",
        "all_all_salaries"
    ]
    final_dict = {}
    for curr_id in id_lst:
        curr_div = glsoup.find("div", {"id": curr_id})
        if curr_div is not None:
            div = curr_div.find("div", {"class": "overthrow table_container"})
            table_header_lst = div.find("thead")
            th_lst = table_header_lst.find_all("tr")
            final_th_header = th_lst[-1]  # the last header row holds the actual column names
            header_lst = []
            th_stuff = final_th_header.find_all("th")
            for th_thing in th_stuff:
                curr_val = th_thing.get_text()
                header_lst.append(curr_val)
            curr_table = getovHelper(div)
            final_table = curr_table
            final_table.insert(0, header_lst)
            final_dict[curr_id] = final_table
        sleep(2)
    return (url_tup[0], final_dict)
def getCurrentPlayerNamesAndURLS(suppressOutput=True):
    names = []
    for letter in string.ascii_lowercase:
        letter_page = getSoupFromURL('http://www.basketball-reference.com/players/%s/' % (letter), suppressOutput)
        # we know that all the currently active players have <strong> tags, so we'll limit our names to those
        current_names = letter_page.findAll('strong')
        for n in current_names:
            name_data = next(n.children)  # first child of the <strong> tag is the player's <a> link
            names.append((name_data.contents[0], 'http://www.basketball-reference.com' + name_data.attrs['href']))
        time.sleep(1)  # sleeping to be kind for requests
    return dict(names)
def dfFromGameLogURL(url): """ Takes a url of a player's game log for a given year, returns a DataFrame """ sleep(1) glsoup = getSoupFromURL(url) reg_season_table = glsoup.find_all( 'table', id="pgl_basic") # id for reg season table playoff_table = find_playoff_table(glsoup) # parse the table header. we'll use this for the creation of the DataFrame header = [] if len(reg_season_table) > 0 and reg_season_table[0] is not None: table_header = reg_season_table[0].find("thead") else: print("Error retrieving game log from:") print(url) exit(1) for th in table_header.find_all('th'): # if not th.getText() in header: header.append(th.getText()) # add in headers for home/away and w/l columns. a must to get the DataFrame to parse correctly header.insert(5, 'HomeAway') header.insert(8, 'WinLoss') header.pop(0) header.remove('\xa0') header.remove('\xa0') reg = soupTableToDF(reg_season_table, header) playoff = soupTableToDF(playoff_table, header) if reg is None: return playoff elif playoff is None: return reg else: try: return pd.concat([reg, playoff]) except Exception as e: print("ERROR - Couldn't merge dataframes:", e) print(reg) print(playoff) return None
def dfFromGameLogURL(url): """ Takes a url of a player's game log for a given year, returns a DataFrame """ glsoup = getSoupFromURL(url) reg_season_table = glsoup.findAll('table', attrs={'id': 'pgl_basic' }) # id for reg season table playoff_table = glsoup.find_all( string=lambda text: isinstance(text, Comment)) try: playoff_table = BeautifulSoup( filter(lambda x: 'pgl_basic_playoffs' in x, playoff_table)[0]) playoff_table = playoff_table.findAll( 'table', attrs={'id': 'pgl_basic_playoffs'}) # id for playoff table except: playoff_table = [] # parse the table header. we'll use this for the creation of the DataFrame header = [] for th in reg_season_table[0].findAll('th'): if not th.getText() in header: try: int(th.getText()) except: header.append(th.getText()) # add in headers for home/away and w/l columns. a must to get the DataFrame to parse correctly header[5] = u'HomeAway' header.insert(7, u'WinLoss') reg = soupTableToDF(reg_season_table, header) playoff = soupTableToDF(playoff_table, header) if reg is None: return playoff elif playoff is None: return reg else: return pd.concat([reg, playoff])
def getAllCoaches(suppressOutput=True, min_year_active=2004):
    coaches = dict()
    glsoup = getSoupFromURL('https://www.basketball-reference.com/coaches/', suppressOutput)
    all_rows = glsoup.find("table", id="coaches").find("tbody").find_all("tr")
    for row in all_rows:
        coach = row.find("th", attrs={"data-stat": "coach", "scope": "row"})
        if coach is None:
            continue
        coach = coach.find("a")
        name = coach.get_text()
        last_year_active_soup = row.find("td", attrs={"data-stat": "year_max"})
        last_year_active = int(last_year_active_soup.get_text())
        try:
            if last_year_active >= min_year_active:
                coaches[name] = Coach(name, 'https://www.basketball-reference.com' + coach.attrs['href'])
        except Exception as e:
            print("ERROR:", e)
    sleep(1)  # sleeping to be kind for requests
    return coaches
def getCurrentPlayerNamesAndURLS(suppressOutput=True):
    names = []
    for letter in string.ascii_lowercase:
        letter_page = getSoupFromURL('https://www.basketball-reference.com/players/%s/' % (letter), suppressOutput)
        if letter_page is None:
            continue
        # we know that all the currently active players have <strong> tags, so we'll limit our names to those
        current_names = letter_page.findAll('strong')
        for n in current_names:
            name_data = next(n.children)  # first child of the <strong> tag is the player's <a> link
            try:
                names.append((name_data.contents[0], 'https://www.basketball-reference.com' + name_data.attrs['href']))
            except Exception:
                pass
        sleep(1)  # sleeping to be kind for requests
    return dict(names)
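# A minimal usage sketch tying the helpers together. It assumes player
# overview URLs end in ".html" and that game log URLs follow the pattern
# https://www.basketball-reference.com/players/<letter>/<player_id>/gamelog/<year>;
# the player name looked up below is only an illustration.
if __name__ == "__main__":
    player_urls = getCurrentPlayerNamesAndURLS()
    overview_url = player_urls.get("LeBron James")  # e.g. .../players/j/jamesle01.html
    if overview_url is not None:
        gamelog_url = overview_url.replace(".html", "/gamelog/2015")
        season_df = dfFromGameLogURL(gamelog_url)
        if season_df is not None:
            print(season_df.head())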