"""Scrape the ESPN NBA teams index page and persist a teams table.

For every team on the page, record the two trailing path segments of its
ESPN URL (used elsewhere to build per-team URLs) and save the resulting
DataFrame through copper under the name 'teams'.
"""
import copper
import pandas as pd
import requests
from bs4 import BeautifulSoup

copper.project.path = '../../'

INDEX_URL = 'http://espn.go.com/nba/teams'
r = requests.get(INDEX_URL)
# FIX: pass an explicit parser — without it BeautifulSoup picks whatever
# backend happens to be installed, making the scrape environment-dependent
# (and emitting a warning on modern bs4 versions).
soup = BeautifulSoup(r.text, 'html.parser')
tables = soup.find_all('ul', class_='medium-logos')

# FIX: the original reused `teams` (list, then DataFrame) and `url`
# (index URL, then each team href); distinct names avoid the shadowing.
names = []
prefix_1 = []
prefix_2 = []
for table in tables:
    for li in table.find_all('li'):
        info = li.h5.a
        names.append(info.text)
        # e.g. href = .../nba/team/_/name/bos/boston-celtics
        team_url = info['href']
        prefix_1.append(team_url.split('/')[-2])
        prefix_2.append(team_url.split('/')[-1])

dic = {'prefix_2': prefix_2, 'prefix_1': prefix_1}
teams = pd.DataFrame(dic, index=names)
teams.index.name = 'name'
print(teams)
copper.save(teams, 'teams')
# NOTE(review): fragment begins mid-loop (leading `else:` belongs to an
# enclosing try/if not visible here) — left byte-identical.
# NOTE(review): `drop_duplicates(cols='id')` uses the long-removed `cols=`
# keyword; modern pandas requires `subset='id'` — confirm target pandas version.
# NOTE(review): the bare `except Exception: pass` silently swallows all errors;
# the trailing comment says non-game rows are expected, but real failures hide too.
else: home_team_score.append(_score[0]) visit_team_score.append(_score[1]) # Extra stats # r = requests.get(BASE_GAME_URL.format(_id)) # table = BeautifulSoup(r.text).find('table', class_='mod-data') # heads = table.find_all('thead') # bodies = table.find_all('tbody') # # print(heads) # headers = heads[2].tr.find_all('th')[2:] # headers = [th.text for th in headers] # headers[3] = headers[3].split('\n')[0] # del headers[-2] # visit_stats = bodies[2].tr.find_all('td')[1:] # visit_stats = [td.text for td in visit_stats] # del visit_stats[-2] # print(headers) # print(visit_stats) except Exception as e: pass # Not all columns row are a game, is OK # print(e) dic = {'id': game_id, 'date': dates, 'home_team': home_team, 'visit_team': visit_team, 'home_team_score': home_team_score, 'visit_team_score': visit_team_score} games = pd.DataFrame(dic).drop_duplicates(cols='id').set_index('id') print(games) copper.save(games, 'games.csv')
# NOTE(review): fragment begins mid-`get_players` (the function header and the
# loop filling `array` are outside this view) — left byte-identical.
# NOTE(review): `frame.append(new)` / `players.append(...)` rely on
# DataFrame.append, removed in pandas 2.0 (use pd.concat) — confirm pandas version.
# NOTE(review): `heads[0]`/`heads[3]` and `bodies[0..4]` presumably map to the
# two teams' box-score sections on the ESPN page — verify against the markup.
array[i, j] = cols[j].text frame = pd.DataFrame(columns=columns) for x in array: line = np.concatenate(([index, team_name], x)).reshape(1, len(columns)) new = pd.DataFrame(line, columns=frame.columns) frame = frame.append(new) return frame for index, row in games.iterrows(): print(index) request = requests.get(BASE_URL.format(index)) table = BeautifulSoup(request.text).find("table", class_="mod-data") heads = table.find_all("thead") bodies = table.find_all("tbody") team_1 = heads[0].th.text team_1_players = bodies[0].find_all("tr") + bodies[1].find_all("tr") team_1_players = get_players(team_1_players, team_1) players = players.append(team_1_players) team_2 = heads[3].th.text team_2_players = bodies[3].find_all("tr") + bodies[4].find_all("tr") team_2_players = get_players(team_2_players, team_2) players = players.append(team_2_players) players = players.set_index("id") print(players) copper.save(players, "players")
"""Load the raw train/test splits, impute missing values, and persist
each stage through copper ('train'/'test', then 'train_mean'/'test_mean')."""
import copper
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

copper.project.path = '../'

# Load the raw splits and mark the dependent variable on each.
train = copper.Dataset()
train.load('raw/train.csv')
test = copper.Dataset()
test.load('raw/test.csv')

train.role['depend'] = train.TARGET
copper.save(train, 'train')
test.role['depend'] = test.TARGET
copper.save(test, 'test')

# Fill missing values using the column mean, saved as a separate stage.
train.fillna(method='mean')
copper.save(train, 'train_mean')
test.fillna(method='mean')
copper.save(test, 'test_mean')

# print train.corr()
# Histograms - Log transforms
# ans = train.histogram('x18', legend=False)
# print train['x26']
# Log-transform the skewed columns in place.
for skewed_col in ('x18', 'x19'):
    train[skewed_col] = train[skewed_col].map(np.log)
import requests
from bs4 import BeautifulSoup

copper.project.path = '../../'

# Fetch the ESPN NBA teams index page.
url = 'http://espn.go.com/nba/teams'
r = requests.get(url)
soup = BeautifulSoup(r.text)
logo_lists = soup.find_all('ul', class_='medium-logos')

teams = []
prefix_1 = []
prefix_2 = []
teams_urls = []
# Each <ul class="medium-logos"> holds one division; every <li>'s
# <h5><a> anchor carries the team name and its ESPN URL.
anchors = [li.h5.a for ul in logo_lists for li in ul.find_all('li')]
for anchor in anchors:
    teams.append(anchor.text)
    href = anchor['href']
    teams_urls.append(href)
    segments = href.split('/')
    prefix_1.append(segments[-2])
    prefix_2.append(segments[-1])

# Persist the teams table, indexed by team name, via copper.
dic = {'url': teams_urls, 'prefix_2': prefix_2, 'prefix_1': prefix_1}
teams = pd.DataFrame(dic, index=teams)
teams.index.name = 'team'
print(teams)
copper.save(teams, 'teams')
"""Load the imputed train/test exports, tag roles, and re-save via copper."""
import copper
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

copper.project.path = "../"

train = copper.Dataset()
train.load("exported/train_imp.csv")
test = copper.Dataset()
test.load("exported/test_imp.csv")

# Reject the index columns pandas wrote out on export, re-declare the target.
train.role["Unnamed: 0"] = train.REJECTED
train.role["Unnamed: 0.1"] = train.REJECTED
train.role["depend"] = train.TARGET
copper.save(train, "train_imp")
# FIX: original had `print train` — Python 2 statement syntax, a
# SyntaxError under Python 3, which the rest of the project uses
# (print() calls everywhere else).
print(train)

test.role["Unnamed: 0"] = test.REJECTED
test.role["Unnamed: 0.1"] = test.REJECTED
test.role["depend"] = test.TARGET
copper.save(test, "test_imp")
def save(self, filename):
    """Persist this object under *filename* using copper's project-aware save."""
    copper.save(self, filename)
from bs4 import BeautifulSoup

# copper.project.path = '../../'

# Fetch the ESPN college-football teams index page.
url = "http://espn.go.com/college-football/teams"
response = requests.get(url)
soup = BeautifulSoup(response.text)

teams = []
prefix_1 = []
prefix_2 = []
teams_urls = []
# Walk every division list; each <li>'s <h5><a> anchor holds the team
# name plus its ESPN URL, whose last two path segments we keep.
for logo_list in soup.find_all("ul", class_="medium-logos"):
    for item in logo_list.find_all("li"):
        link = item.h5.a
        teams.append(link.text)
        href = link["href"]
        teams_urls.append(href)
        segments = href.split("/")
        prefix_1.append(segments[-2])
        prefix_2.append(segments[-1])

# Persist the teams table, indexed by team name, via copper.
dic = {"url": teams_urls, "prefix_2": prefix_2, "prefix_1": prefix_1}
teams = pd.DataFrame(dic, index=teams)
teams.index.name = "team"
print(teams)
copper.save(teams, "teams")
# NOTE(review): fragment begins mid-`get_players` (function header not in view)
# — left byte-identical. `games[:3]` limits the crawl to three games, with the
# full `games.iterrows()` loop kept commented as the production variant.
# NOTE(review): `frame.append` / `players.append` use DataFrame.append,
# removed in pandas 2.0 (use pd.concat) — confirm pandas version.
frame = pd.DataFrame(columns=columns) for x in array: line = np.concatenate(([index, team_name], x)).reshape(1, len(columns)) new = pd.DataFrame(line, columns=frame.columns) frame = frame.append(new) return frame for index, row in games[:3].iterrows(): # for index, row in games.iterrows(): print(index) request = requests.get(BASE_URL.format(index)) table = BeautifulSoup(request.text).find('table', class_='mod-data') heads = table.find_all('thead') bodies = table.find_all('tbody') team_1 = heads[0].th.text team_1_players = bodies[0].find_all('tr') + bodies[1].find_all('tr') team_1_players = get_players(team_1_players, team_1) players = players.append(team_1_players) team_2 = heads[3].th.text team_2_players = bodies[3].find_all('tr') + bodies[4].find_all('tr') team_2_players = get_players(team_2_players, team_2) players = players.append(team_2_players) players = players.set_index('id') print(players) copper.save(players, 'players')
# NOTE(review): fragment begins mid-`get_players` (the header and the loop
# filling `array` are outside this view) — left byte-identical. This is a
# near-duplicate of the other player-scrape tails; `games[:3]` caps the crawl
# at three games for testing, the full loop remains commented out.
# NOTE(review): `frame.append` / `players.append` use DataFrame.append,
# removed in pandas 2.0 (use pd.concat) — confirm pandas version.
array[i, j] = cols[j].text frame = pd.DataFrame(columns=columns) for x in array: line = np.concatenate(([index, team_name], x)).reshape(1,len(columns)) new = pd.DataFrame(line, columns=frame.columns) frame = frame.append(new) return frame for index, row in games[:3].iterrows(): # for index, row in games.iterrows(): print(index) request = requests.get(BASE_URL.format(index)) table = BeautifulSoup(request.text).find('table', class_='mod-data') heads = table.find_all('thead') bodies = table.find_all('tbody') team_1 = heads[0].th.text team_1_players = bodies[0].find_all('tr') + bodies[1].find_all('tr') team_1_players = get_players(team_1_players, team_1) players = players.append(team_1_players) team_2 = heads[3].th.text team_2_players = bodies[3].find_all('tr') + bodies[4].find_all('tr') team_2_players = get_players(team_2_players, team_2) players = players.append(team_2_players) players = players.set_index('id') print(players) copper.save(players, 'players')