Example #1
import copper
import pandas as pd
import requests
from bs4 import BeautifulSoup

copper.project.path = '../../'

url = 'http://espn.go.com/nba/teams'
r = requests.get(url)

soup = BeautifulSoup(r.text, 'html.parser')
tables = soup.find_all('ul', class_='medium-logos')

teams = []
prefix_1 = []
prefix_2 = []
for table in tables:
    lis = table.find_all('li')
    for li in lis:
        info = li.h5.a
        teams.append(info.text)
        url = info['href']
        prefix_1.append(url.split('/')[-2])
        prefix_2.append(url.split('/')[-1])

dic = {'prefix_2': prefix_2, 'prefix_1': prefix_1}
teams = pd.DataFrame(dic, index=teams)
teams.index.name = 'name'
print(teams)
copper.save(teams, 'teams')
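For reference, the same table can be built without copper; a minimal sketch using only requests, BeautifulSoup, and pandas (the to_csv output path is an assumption, since copper.save manages storage itself):

import pandas as pd
import requests
from bs4 import BeautifulSoup

r = requests.get('http://espn.go.com/nba/teams', timeout=10)
soup = BeautifulSoup(r.text, 'html.parser')

rows = []
for ul in soup.find_all('ul', class_='medium-logos'):
    for li in ul.find_all('li'):
        a = li.h5.a
        parts = a['href'].split('/')
        rows.append({'name': a.text, 'prefix_1': parts[-2], 'prefix_2': parts[-1]})

pd.DataFrame(rows).set_index('name').to_csv('teams.csv')  # assumed output path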
Example #2
                else:
                    home_team_score.append(_score[0])
                    visit_team_score.append(_score[1])

            # Extra stats
            # r = requests.get(BASE_GAME_URL.format(_id))
            # table = BeautifulSoup(r.text).find('table', class_='mod-data')
            # heads = table.find_all('thead')
            # bodies = table.find_all('tbody')
            # # print(heads)
            # headers = heads[2].tr.find_all('th')[2:]
            # headers = [th.text for th in headers]
            # headers[3] = headers[3].split('\n')[0]
            # del headers[-2]
            # visit_stats = bodies[2].tr.find_all('td')[1:]
            # visit_stats = [td.text for td in visit_stats]
            # del visit_stats[-2]
            # print(headers)
            # print(visit_stats)

        except Exception as e:
            pass  # Not every row is a game, so skipping is fine
            # print(e)

dic = {'id': game_id, 'date': dates, 'home_team': home_team, 'visit_team': visit_team, 
        'home_team_score': home_team_score, 'visit_team_score': visit_team_score}
        
games = pd.DataFrame(dic).drop_duplicates(subset='id').set_index('id')
print(games)
copper.save(games, 'games.csv')
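The bare except above swallows real bugs along with the expected non-game rows; a narrower guard is safer. A sketch (the 'home-visit' score format is an assumption about the truncated parsing code):

import re

SCORE_RE = re.compile(r'^\s*(\d+)\s*-\s*(\d+)\s*$')  # hypothetical score cell format

def parse_score(text):
    """Return (home, visit) as ints, or None for rows that are not games."""
    m = SCORE_RE.match(text)
    return (int(m.group(1)), int(m.group(2))) if m else None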
Example #3
                array[i, j] = cols[j].text

    frame = pd.DataFrame(columns=columns)
    for x in array:
        line = np.concatenate(([index, team_name], x)).reshape(1, len(columns))
        new = pd.DataFrame(line, columns=frame.columns)
        frame = pd.concat([frame, new])
    return frame


for index, row in games.iterrows():
    print(index)
    request = requests.get(BASE_URL.format(index))
    table = BeautifulSoup(request.text, "html.parser").find("table", class_="mod-data")
    heads = table.find_all("thead")
    bodies = table.find_all("tbody")

    team_1 = heads[0].th.text
    team_1_players = bodies[0].find_all("tr") + bodies[1].find_all("tr")
    team_1_players = get_players(team_1_players, team_1)
    players = pd.concat([players, team_1_players])

    team_2 = heads[3].th.text
    team_2_players = bodies[3].find_all("tr") + bodies[4].find_all("tr")
    team_2_players = get_players(team_2_players, team_2)
    players = pd.concat([players, team_2_players])

players = players.set_index("id")
print(players)
copper.save(players, "players")
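Concatenating one small frame at a time copies the whole result on every pass; the usual pandas pattern is to collect the pieces in a list and call pd.concat once. A self-contained toy sketch of the pattern (the data is a stand-in, not the scraped stats):

import pandas as pd

frames = []
for i in range(3):  # stand-in for the games loop above
    frames.append(pd.DataFrame({'id': [i], 'pts': [10 * i]}))

players = pd.concat(frames).set_index('id')  # one concat instead of many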
Example #4
import copper
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

copper.project.path = '../'
train = copper.Dataset()
train.load('raw/train.csv')
test = copper.Dataset()
test.load('raw/test.csv')

train.role['depend'] = train.TARGET
copper.save(train, 'train')

test.role['depend'] = test.TARGET
copper.save(test, 'test')

# Fill missing values using mean
train.fillna(method='mean')
copper.save(train, 'train_mean')

test.fillna(method='mean')
copper.save(test, 'test_mean')

# print(train.corr())

# Histograms - Log transforms
# ans = train.histogram('x18', legend=False)
# print(train['x26'])
train['x18'] = train['x18'].map(np.log)
train['x19'] = train['x19'].map(np.log)
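np.log is undefined at zero and below, so the transform above assumes x18 and x19 are strictly positive; np.log1p is the usual fallback when zeros can occur. A toy sketch with a stand-in column:

import numpy as np
import pandas as pd

df = pd.DataFrame({'x18': [0.0, 1.0, 9.0]})  # stand-in for the train set
df['x18'] = np.log1p(df['x18'])              # log(1 + x) is defined at zero
print(df)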
Example #5
import copper
import pandas as pd
import requests
from bs4 import BeautifulSoup

copper.project.path = '../../'

url = 'http://espn.go.com/nba/teams'
r = requests.get(url)

soup = BeautifulSoup(r.text, 'html.parser')
tables = soup.find_all('ul', class_='medium-logos')

teams = []
prefix_1 = []
prefix_2 = []
teams_urls = []
for table in tables:
    lis = table.find_all('li')
    for li in lis:
        info = li.h5.a
        teams.append(info.text)
        url = info['href']
        teams_urls.append(url)
        prefix_1.append(url.split('/')[-2])
        prefix_2.append(url.split('/')[-1])


dic = {'url': teams_urls, 'prefix_2': prefix_2, 'prefix_1': prefix_1}
teams = pd.DataFrame(dic, index=teams)
teams.index.name = 'team'
print(teams)
copper.save(teams, 'teams')
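The hrefs are stored verbatim, which only works while ESPN serves absolute URLs; a small sketch that normalizes either form with the standard library (the sample path is hypothetical):

from urllib.parse import urljoin

base = 'http://espn.go.com'
print(urljoin(base, '/nba/team/_/name/bos'))          # hypothetical relative href
print(urljoin(base, 'http://espn.go.com/nba/teams'))  # absolute href passes through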
Example #6
import copper
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

copper.project.path = "../"
train = copper.Dataset()
train.load("exported/train_imp.csv")
test = copper.Dataset()
test.load("exported/test_imp.csv")

train.role["Unnamed: 0"] = train.REJECTED
train.role["Unnamed: 0.1"] = train.REJECTED
train.role["depend"] = train.TARGET
copper.save(train, "train_imp")
print(train)

test.role["Unnamed: 0"] = test.REJECTED
test.role["Unnamed: 0.1"] = test.REJECTED
test.role["depend"] = test.TARGET
copper.save(test, "test_imp")
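The "Unnamed: 0" columns rejected above are the usual artifact of writing a frame's index to CSV and then reading it back as data. With plain pandas the round trip can avoid them; a sketch (whether copper's Dataset.load exposes similar options is left as an assumption):

import pandas as pd

df = pd.DataFrame({'a': [1, 2]})
df.to_csv('demo.csv')                        # index is written as an unnamed column
back = pd.read_csv('demo.csv', index_col=0)  # read it back as the index, not data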
Example #7
def save(self, filename):
    # Convenience wrapper: delegate to copper.save
    copper.save(self, filename)
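At the call site the method reads as below, assuming it lives on copper's Dataset as the other examples suggest:

train = copper.Dataset()
train.load('raw/train.csv')
train.save('train')  # equivalent to copper.save(train, 'train')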
Example #8
import copper
import pandas as pd
import requests
from bs4 import BeautifulSoup

# copper.project.path = '../../'

url = "http://espn.go.com/college-football/teams"
r = requests.get(url)

soup = BeautifulSoup(r.text, "html.parser")
tables = soup.find_all("ul", class_="medium-logos")

teams = []
prefix_1 = []
prefix_2 = []
teams_urls = []
for table in tables:
    lis = table.find_all("li")
    for li in lis:
        info = li.h5.a
        teams.append(info.text)
        url = info["href"]
        teams_urls.append(url)
        prefix_1.append(url.split("/")[-2])
        prefix_2.append(url.split("/")[-1])


dic = {"url": teams_urls, "prefix_2": prefix_2, "prefix_1": prefix_1}
teams = pd.DataFrame(dic, index=teams)
teams.index.name = "team"
print(teams)
copper.save(teams, "teams")
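espn.go.com has since been folded into espn.com, so this scrape now relies on a redirect and on markup that may have changed; a sketch that makes the failure modes explicit (requests follows redirects by default):

import requests

r = requests.get('http://espn.go.com/college-football/teams', timeout=10)
r.raise_for_status()  # fail loudly instead of parsing an error page
print(r.url)          # final URL after any redirect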
Example #9
    frame = pd.DataFrame(columns=columns)
    for x in array:
        line = np.concatenate(([index, team_name], x)).reshape(1, len(columns))
        new = pd.DataFrame(line, columns=frame.columns)
        frame = pd.concat([frame, new])
    return frame


for index, row in games[:3].iterrows():
    # for index, row in games.iterrows():
    print(index)
    request = requests.get(BASE_URL.format(index))
    table = BeautifulSoup(request.text, 'html.parser').find('table', class_='mod-data')
    heads = table.find_all('thead')
    bodies = table.find_all('tbody')

    team_1 = heads[0].th.text
    team_1_players = bodies[0].find_all('tr') + bodies[1].find_all('tr')
    team_1_players = get_players(team_1_players, team_1)
    players = pd.concat([players, team_1_players])

    team_2 = heads[3].th.text
    team_2_players = bodies[3].find_all('tr') + bodies[4].find_all('tr')
    team_2_players = get_players(team_2_players, team_2)
    players = pd.concat([players, team_2_players])

players = players.set_index('id')
print(players)
copper.save(players, 'players')
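Fetching one page per game hits the server in a tight loop; a courtesy delay is a common addition. A minimal helper (the one-second default is an arbitrary assumption):

import time
import requests

def fetch(url, delay=1.0):
    """GET with a courtesy pause before each request."""
    time.sleep(delay)  # assumed polite delay; tune or remove as needed
    return requests.get(url, timeout=10)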