def dfFromGameLogURL(url):
    """
    Takes a url of a player's game log for a given year, returns a DataFrame
    """
    glsoup = getSoupFromURL(url)

    reg_season_table = glsoup.findAll('table', attrs={'id': 'pgl_basic'})  # id for reg season table
    playoff_table = glsoup.findAll('table', attrs={'id': 'pgl_basic_playoffs'}) # id for playoff table

    # parse the table header.  we'll use this for the creation of the DataFrame
    header = []
    for th in reg_season_table[0].findAll('th'):
        if th.getText() not in header:
            header.append(th.getText())

    # add in headers for home/away and w/l columns. a must to get the DataFrame to parse correctly

    header[5] = u'HomeAway'
    header.insert(7, u'WinLoss')

    reg = soupTableToDF(reg_season_table, header)
    playoff = soupTableToDF(playoff_table, header)

    if reg is None:
        return playoff
    elif playoff is None:
        return reg
    else:
        return pd.concat([reg, playoff])
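
Every example on this page calls a getSoupFromURL helper that is not shown here. A minimal sketch of what it presumably does, assuming it simply fetches the page with requests and parses it with BeautifulSoup (the real helper may differ):

import requests
from bs4 import BeautifulSoup

def getSoupFromURL(url, suppressOutput=True):
    # Hypothetical sketch, not the original helper: fetch the page and return a
    # parsed BeautifulSoup tree, or None if the request fails.
    if not suppressOutput:
        print(url)
    try:
        response = requests.get(url)
        response.raise_for_status()
    except requests.RequestException:
        return None
    return BeautifulSoup(response.text, 'html.parser')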
Example #2
def dfFromGameLogURL(url):
    """
    Takes a url of a player's game log for a given year, returns a DataFrame
    """
    glsoup = getSoupFromURL(url)

    reg_season_table = glsoup.findAll('table',
                                      attrs={'id': 'pgl_basic'
                                             })  # id for reg season table
    playoff_table = glsoup.findAll('table', attrs={'id': 'pgl_basic_playoffs'
                                                   })  # id for playoff table

    # parse the table header.  we'll use this for the creation of the DataFrame
    header = []
    for th in reg_season_table[0].findAll('th'):
        if th.getText() not in header:
            header.append(th.getText())

    # add in headers for home/away and w/l columns. a must to get the DataFrame to parse correctly

    header[5] = u'HomeAway'
    header.insert(7, u'WinLoss')

    reg = soupTableToDF(reg_season_table, header)
    playoff = soupTableToDF(playoff_table, header)

    if reg is None:
        return playoff
    elif playoff is None:
        return reg
    else:
        return pd.concat([reg, playoff])
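
The soupTableToDF helper these examples hand the table list and header to is also not shown. A rough sketch under the assumption that it flattens the first matched <table> into a DataFrame and returns None when no table is present (the original may handle the header cleanup differently):

import pandas as pd

def soupTableToDF(table_soup, header):
    # Hypothetical sketch, not the original helper: turn the first parsed <table>
    # into a DataFrame with the supplied header, skipping rows (such as repeated
    # in-table header rows) whose cell count does not match the header.
    if not table_soup:
        return None
    rows = table_soup[0].find('tbody').find_all('tr')
    parsed = [[td.get_text() for td in row.find_all('td')] for row in rows]
    parsed = [row for row in parsed if len(row) == len(header)]
    if not parsed:
        return None
    return pd.DataFrame(parsed, columns=header)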
Example #3
def getAllPlayers(suppressOutput=True, min_year_active=2004):

    players = dict()

    for letter in string.ascii_lowercase:
        letter_page = getSoupFromURL('https://www.basketball-reference.com/players/{}/'.format(letter), suppressOutput)
        if letter_page is None:
            continue
        all_rows = letter_page.find("table", id="players").find("tbody").find_all("tr")
        for row in all_rows:
            player = row.find("th", attrs={"data-stat": "player", "scope": "row"})
            if player is None:
                continue
            player = player.find("a")
            name = player.get_text()
            last_year_active_soup = row.find("td", attrs={"data-stat": "year_max"})
            last_year_active = int(last_year_active_soup.get_text())
            try:
                if last_year_active >= min_year_active:
                    players[name] = Player(name, 'https://www.basketball-reference.com' + player.attrs['href'])
            except Exception as e:
                print("ERROR:", e)
        sleep(1) # sleeping to be kind for requests

    return players
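
Assuming Player is a small wrapper around a name and a basketball-reference profile URL (its definition is not part of these examples), usage would look roughly like:

# Hypothetical usage: collect everyone active since 2010 and look one player up.
players = getAllPlayers(suppressOutput=True, min_year_active=2010)
print(len(players))
lebron = players.get('LeBron James')  # a Player instance, or None if absent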
Example #4
def playoff_dfFromGameLogURL(url):
    glsoup = getSoupFromURL(url)
    playoff_table = find_playoff_table(glsoup)

    header = []
    if len(playoff_table) > 0 and playoff_table[0] is not None:
        table_header = playoff_table[0].find("thead")
    else:
        print("Playoff table not found")
        return None
    for th in table_header.find_all('th'):
        # if not th.getText() in header:
        header.append(th.getText())

    header.insert(5, 'HomeAway')
    header.insert(8, 'WinLoss')
    header.pop(0)
    header.remove(u'\xa0')
    header.remove(u'\xa0')

    playoff = soupTableToDF(playoff_table, header)

    if playoff is None:
        print("Playoff table not found")
        return None
    return playoff
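
find_playoff_table is another helper that is not defined on this page, but a later example below performs the same lookup inline: on basketball-reference the playoff game-log table is wrapped in an HTML comment, so it has to be pulled out of the comments and re-parsed. A sketch along those lines (an assumption, not the original helper):

from bs4 import BeautifulSoup, Comment

def find_playoff_table(glsoup):
    # Hypothetical sketch: extract the HTML comment that hides the playoff
    # game-log table, re-parse it, and return the list of matching <table> tags
    # (an empty list if the player has no playoff games that season).
    comments = glsoup.find_all(string=lambda text: isinstance(text, Comment))
    for comment in comments:
        if 'pgl_basic_playoffs' in comment:
            comment_soup = BeautifulSoup(comment, 'html.parser')
            return comment_soup.find_all('table', id='pgl_basic_playoffs')
    return []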
Example #5
def pbp_dfFromGameLogURL(url):
    glsoup = getSoupFromURL(url)
    pbp_table = find_pbp_table(glsoup)
    header = []
    if len(pbp_table) > 0 and pbp_table[0] is not None:
        table_header = pbp_table[0].find("thead")
    else:
        print("pbp table not found")
        return None
    # skip the first 8 header cells; they belong to the pbp table's over-header row
    for th in table_header.find_all('th')[8:]:
        header.append(th.getText())

    pbp = soupTableToDF(pbp_table, header)

    if pbp is None:
        print("pbp table not found")
        return None
    return pbp
Example #6
def getCurrentTeams(suppressOutput=True):

    teams = dict()
    glsoup = getSoupFromURL('https://www.basketball-reference.com/teams/',
                            suppressOutput)

    active_teams_table = glsoup.find(
        'table', id='teams_active')  # id for reg season table
    all_rows = active_teams_table.find_all("th",
                                           attrs={"data-stat": "franch_name"})
    active_teams = list()
    for row in all_rows:
        team = row.find("a")
        if team is None:
            continue
        active_teams.append(team)
    for team in active_teams:
        name = team.get_text()
        try:
            teams[name] = Team(
                name,
                'https://www.basketball-reference.com' + team.attrs['href'])
        except Exception as e:
            print("ERROR:", e)
    sleep(1)  # sleeping to be kind for requests

    return teams
Example #7
def getAllPlayerNamesAndURLS(suppressOutput=True):

    names = []

    for letter in string.ascii_lowercase:
        letter_page = getSoupFromURL(
            'https://www.basketball-reference.com/players/{}/'.format(letter),
            suppressOutput)
        if letter_page is None:
            continue
        all_rows = letter_page.find("table",
                                    id="players").find("tbody").find_all("tr")
        for row in all_rows:
            player = row.find("th",
                              attrs={
                                  "data-stat": "player",
                                  "scope": "row"
                              })
            if player is None:
                continue
            player = player.find("a")
            name = player.get_text()
            try:
                names.append((name, 'https://www.basketball-reference.com' +
                              player.attrs['href']))
            except Exception as e:
                print("ERROR:", e)
        sleep(1)  # sleeping to be kind for requests

    return dict(names)
Example #8
def getoverView(url_tup):
    print("in get overViews")

    glsoup = getSoupFromURL(url_tup[1])

    id_lst = [
        "all_per_game", "all_totals", "all_per_minute", "all_per_poss",
        "all_advanced", "all_shooting", "all_pbp", "all_playoffs_per_game",
        "all_playoffs_totals", "all_playoffs_per_minute",
        "all_playoffs_per_poss", "all_playoffs_advanced",
        "all_playoffs_shooting", "all_playoffs_pbp", "all_all_salaries"
    ]
    final_dict = {}
    for curr_id in id_lst:
        curr_div = glsoup.find("div", {"id": curr_id})
        if curr_div is not None:
            div = curr_div.find("div", {"class": "overthrow table_container"})
            table_header_lst = div.find("thead")
            th_lst = table_header_lst.find_all("tr")
            final_th_header = th_lst[-1]
            header_lst = []
            th_stuff = final_th_header.find_all("th")
            for th_thing in th_stuff:
                curr_val = th_thing.get_text()
                header_lst.append(curr_val)
            final_table = getovHelper(div)
            final_table.insert(0, header_lst)
            final_dict[curr_id] = final_table
    sleep(2)

    return (url_tup[0], final_dict)
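
getovHelper, used above to scrape the body of each overview table, is not defined on this page either. A guess at its shape, assuming it simply returns the table body as a list of rows of cell strings:

def getovHelper(div):
    # Hypothetical sketch, not the original helper: collect every row in the
    # table body of the given container div as a list of cell strings.
    body = div.find('tbody')
    if body is None:
        return []
    return [[cell.get_text() for cell in row.find_all(['th', 'td'])]
            for row in body.find_all('tr')]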
def getCurrentPlayerNamesAndURLS(suppressOutput=True):

    names = []

    for letter in string.ascii_lowercase:
        letter_page = getSoupFromURL('http://www.basketball-reference.com/players/%s/' % (letter), suppressOutput)

        # we know that all the currently active players have <strong> tags, so we'll limit our names to those
        current_names = letter_page.findAll('strong')
        for n in current_names:
            name_data = next(n.children)
            names.append((name_data.contents[0], 'http://www.basketball-reference.com' + name_data.attrs['href']))
        time.sleep(1) # sleeping to be kind for requests

    return dict(names)
Example #10
def dfFromGameLogURL(url):
    """
    Takes a url of a player's game log for a given year, returns a DataFrame
    """
    sleep(1)
    glsoup = getSoupFromURL(url)

    reg_season_table = glsoup.find_all(
        'table', id="pgl_basic")  # id for reg season table
    playoff_table = find_playoff_table(glsoup)

    # parse the table header.  we'll use this for the creation of the DataFrame
    header = []
    if len(reg_season_table) > 0 and reg_season_table[0] is not None:
        table_header = reg_season_table[0].find("thead")
    else:
        print("Error retrieving game log from:")
        print(url)
        exit(1)
    for th in table_header.find_all('th'):
        # if not th.getText() in header:
        header.append(th.getText())

    # add in headers for home/away and w/l columns. a must to get the DataFrame to parse correctly

    header.insert(5, 'HomeAway')
    header.insert(8, 'WinLoss')
    header.pop(0)
    header.remove('\xa0')
    header.remove('\xa0')

    reg = soupTableToDF(reg_season_table, header)
    playoff = soupTableToDF(playoff_table, header)

    if reg is None:
        return playoff
    elif playoff is None:
        return reg
    else:
        try:
            return pd.concat([reg, playoff])
        except Exception as e:
            print("ERROR - Couldn't merge dataframes:", e)
            print(reg)
            print(playoff)
            return None
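
A usage sketch for the version above (the player id and season in the URL are only illustrative; the path follows basketball-reference's /players/<letter>/<id>/gamelog/<year> pattern):

# Hypothetical usage: pull one season's game log, playoffs included if present.
url = 'https://www.basketball-reference.com/players/j/jamesle01/gamelog/2016'
df = dfFromGameLogURL(url)
if df is not None:
    print(df.shape)
    print(df.columns.tolist())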
def dfFromGameLogURL(url):
    """
    Takes a url of a player's game log for a given year, returns a DataFrame
    """
    glsoup = getSoupFromURL(url)

    reg_season_table = glsoup.findAll('table',
                                      attrs={'id': 'pgl_basic'
                                             })  # id for reg season table
    # the playoff table is hidden inside an HTML comment, so extract and re-parse it
    comments = glsoup.find_all(string=lambda text: isinstance(text, Comment))
    try:
        playoff_comment = [c for c in comments if 'pgl_basic_playoffs' in c][0]
        playoff_table = BeautifulSoup(playoff_comment, 'html.parser').findAll(
            'table', attrs={'id': 'pgl_basic_playoffs'})  # id for playoff table
    except IndexError:
        playoff_table = []

    # parse the table header.  we'll use this for the creation of the DataFrame
    header = []
    for th in reg_season_table[0].findAll('th'):
        if th.getText() not in header:
            try:
                int(th.getText())
            except:
                header.append(th.getText())

    # add in headers for home/away and w/l columns. a must to get the DataFrame to parse correctly

    header[5] = u'HomeAway'
    header.insert(7, u'WinLoss')

    reg = soupTableToDF(reg_season_table, header)
    playoff = soupTableToDF(playoff_table, header)

    if reg is None:
        return playoff
    elif playoff is None:
        return reg
    else:
        return pd.concat([reg, playoff])
Example #12
def getAllCoaches(suppressOutput=True, min_year_active=2004):

    coaches = dict()
    glsoup = getSoupFromURL('https://www.basketball-reference.com/coaches/', suppressOutput)
    all_rows = glsoup.find("table", id="coaches").find("tbody").find_all("tr")
    for row in all_rows:
        coach = row.find("th", attrs={"data-stat": "coach", "scope": "row"})
        if coach is None:
            continue
        coach = coach.find("a")
        name = coach.get_text()
        last_year_active_soup = row.find("td", attrs={"data-stat": "year_max"})
        last_year_active = int(last_year_active_soup.get_text())
        try:
            if last_year_active >= min_year_active:
                coaches[name] = Coach(name, 'https://www.basketball-reference.com' + coach.attrs['href'])
        except Exception as e:
            print("ERROR:", e)
    sleep(1) # sleeping to be kind for requests
    return coaches
Example #13
def getCurrentPlayerNamesAndURLS(suppressOutput=True):

    names = []

    for letter in string.ascii_lowercase:
        letter_page = getSoupFromURL('https://www.basketball-reference.com/players/%s/' % (letter), suppressOutput)
        if letter_page is None:
            continue

        # we know that all the currently active players have <strong> tags, so we'll limit our names to those
        current_names = letter_page.findAll('strong')
        for n in current_names:
            name_data = next(n.children)
            try:
                names.append((name_data.contents[0], 'https://www.basketball-reference.com' + name_data.attrs['href']))
            except Exception as e:
                pass
        sleep(1)  # sleeping to be kind for requests

    return dict(names)