def test_1(self):
    '''Tests inputs2ml with numeric and categorical inputs, and
    target2ml with a numeric and with a categorical target.
    '''
    self.setUpData()
    ds = copper.Dataset('transforms/ml/data.csv')
    # Test: Inputs: Numeric and categorical
    ds.type['Num.as.Cat'] = ds.CATEGORY
    ds.role['Target.Num'] = ds.REJECTED
    ds.role['Target.Cat'] = ds.REJECTED
    # Expected inputs = the ml fixture minus both target columns
    sol = copper.read_csv('transforms/ml/ml.csv')
    del sol['Target.Num']
    del sol['Target.Cat']
    self.assertEqual(copper.transform.inputs2ml(ds), sol)
    # Tests: Numeric target
    sol = copper.read_csv('transforms/ml/ml.csv')['Target.Num']
    ds.role['Target.Num'] = ds.TARGET
    ds.role['Target.Cat'] = ds.REJECTED
    self.assertEqual(copper.transform.target2ml(ds), sol)
    # Tests: Categorical target
    sol = copper.read_csv('transforms/ml/ml.csv')['Target.Cat']
    ds.role['Target.Num'] = ds.REJECTED
    ds.role['Target.Cat'] = ds.TARGET
    self.assertEqual(copper.transform.target2ml(ds), sol)
def test_to_number(self):
    """Each Number.* column converted with transform.to_number must match
    the corresponding column of the transformed fixture."""
    self.setUpData()
    raw = copper.read_csv('transforms/1/data.csv')
    expected = copper.read_csv('transforms/1/transformed.csv')
    for column in ('Number.1', 'Number.2'):
        converted = raw[column].apply(copper.transform.to_number)
        self.assertEqual(converted, expected[column])
def test_strptime(self):
    """transform.strptime parses the three date columns (each stored in a
    different format) to identical datetimes; 12 rows parse successfully.

    Fix: ``Series.apply(..., args=...)`` unpacks ``args`` with ``*args``, so
    a bare format string such as ``'%Y-%m-%d'`` would be exploded into one
    positional argument per character (note that ``('%m/%d/%y')`` is just a
    parenthesized string, not a tuple). The formats are now wrapped in
    1-tuples so strptime receives the whole format as a single argument.
    Also removed the unused ``sol`` local.
    """
    self.setUpData()
    data = copper.read_csv('transforms/1/data.csv')
    dates1 = data['Date.1'].apply(copper.transform.strptime, args=('%Y-%m-%d',))
    dates2 = data['Date.2'].apply(copper.transform.strptime, args=('%Y/%m/%d',))
    dates3 = data['Date.3'].apply(copper.transform.strptime, args=('%m/%d/%y',))
    # Drop unparseable rows before comparing
    dates1, dates2, dates3 = dates1.dropna(), dates2.dropna(), dates3.dropna()
    self.assertEqual(len(dates1), 12)
    self.assertEqual(dates1.values, dates2.values)
    self.assertEqual(dates2.values, dates3.values)
    self.assertEqual(dates1.values, dates3.values)
def test_join(self):
    """Joining four column-wise quarters of a dataset reproduces the whole
    dataset, and a metadata change on one part survives a re-join."""
    self.setUpData()
    ds_all = copper.Dataset("dataset/1/data.csv")
    df = copper.read_csv("dataset/1/data.csv")
    ncols = len(df.columns)
    # Quarter boundaries: 0, n/4, 2n/4, 3n/4, 4n/4 (int-truncated as before)
    bounds = [int(i * (ncols / 4)) for i in range(5)]
    parts = [copper.Dataset(df.ix[:, bounds[i]:bounds[i + 1]]) for i in range(4)]
    ds1, ds2, ds3, ds4 = parts
    ds = copper.join(ds1, ds2, others=[ds3, ds4])
    self.assertEqual(ds, ds_all)
    # 2. Change a section's metadata; the change must show up in the join.
    # Note: growing data.csv may require switching ds2 to ds1 below.
    ds2.type["Cat.1"] = ds.NUMBER
    ds2.update()
    ds_all.type["Cat.1"] = ds.NUMBER
    ds_all.update()
    ds = copper.join(ds1, ds2, others=[ds3, ds4])
    self.assertEqual(ds, ds_all)
def test_costs(self):
    '''Checks profit, opportunity-cost and cost-without-ML values against
    their csv fixtures.'''
    self.setup()
    self.ml.costs = [[0, 4], [12, 16]]
    expected_profit = copper.read_csv('ml/1/profit.csv').set_index('Model')
    self.assertEqual(self.ml.profit(), expected_profit)
    # NOTE: the fixture column really is spelled 'Oportuniy cost'
    expected_oc = copper.read_csv('ml/1/oportunity_cost.csv').set_index('Model')
    self.assertEqual(self.ml.oportunity_cost(), expected_oc['Oportuniy cost'])
    expected_no_ml = copper.read_csv('ml/1/cost_no_ml.csv').set_index('Model')
    self.assertEqual(self.ml.cost_no_ml(), expected_no_ml['Costs of not using ML'])
def test_pandas(self):
    """Dataset mirrors basic pandas behaviour:
    1. column get/set, 2. head/tail, 3. correlation matrix."""
    self.setUpData()
    ds = copper.Dataset("dataset/1/data.csv")
    df = copper.read_csv("dataset/1/data.csv")
    # 1.1 Column access agrees for every column
    for name in df.columns:
        self.assertEqual(ds[name], df[name])
    # 1.2 Column assignment — already existing columns only
    ds["Number.1"] = ds["Number.1"] - 10
    df["Number.1"] = df["Number.1"] - 10
    self.assertEqual(df, ds.frame)

    def encode(value):
        # 12*(2007 - first4) - next2 + 2; presumably a YYYYMM-encoded
        # date turned into a month count — TODO confirm
        text = str(value)
        return 12 * (2007 - int(text[0:4])) - int(text[4:6]) + 2

    ds["Date.Encoded"] = ds["Date.Encoded"].apply(encode)
    df["Date.Encoded"] = df["Date.Encoded"].apply(encode)
    self.assertEqual(df, ds.frame)
    # 2. head/tail with default and explicit sizes
    self.assertEqual(ds.head(), df.head())
    self.assertEqual(ds.head(13), df.head(13))
    self.assertEqual(ds.tail(), df.tail())
    self.assertEqual(ds.tail(9), df.tail(9))
    # 3. Correlation matrix
    self.assertEqual(ds.corr(), df.corr())
def test_cat2num(self):
    """Automatic Category-to-Number transformation (more coverage lives in
    the transformation tests)."""
    self.setUpData()
    ds = copper.Dataset("dataset/1/data.csv")
    expected = copper.read_csv("dataset/1/transformed.csv")
    # Imported metadata must match the declared column kinds
    declared = (
        ("Number.1", ds.NUMBER),
        ("Number.2", ds.NUMBER),
        ("Cat.1", ds.CATEGORY),
        ("Cat.2", ds.CATEGORY),
        ("Num.as.Cat", ds.CATEGORY),
        ("Money", ds.CATEGORY),
    )
    for column, kind in declared:
        self.assertEqual(ds.type[column], kind)
    # Flip each numeric-looking category to NUMBER and verify the values
    for column in ("Num.as.Cat", "Money"):
        ds.type[column] = ds.NUMBER
        self.assertEqual(ds.type[column], ds.NUMBER)
        ds.update()
        self.assertEqual(ds[column], expected[column])
def test_1_cat_2_num(self):
    '''Checks the imported metadata and the automatic category-to-number
    transformation: convertible values become numbers, the rest NaN.'''
    self.setUpData()
    ds = copper.Dataset('dataset/1/data.csv')
    expected = copper.read_csv('dataset/1/transform_filled.csv')
    for column, kind in (('Number.1', ds.NUMBER),
                         ('Number.2', ds.NUMBER),
                         ('Cat.1', ds.CATEGORY),
                         ('Cat.2', ds.CATEGORY),
                         ('Num.as.Cat', ds.CATEGORY)):
        self.assertEqual(ds.type[column], kind)
    ds.type['Num.as.Cat'] = ds.NUMBER
    ds.update()
    # Metadata changed...
    self.assertEqual(ds.type['Num.as.Cat'], ds.NUMBER)
    # ...and so did the values
    self.assertEqual(ds['Num.as.Cat'], expected['Num.as.Cat'])
def test_fillna(self):
    """Per-column fillna: mean for NUMBER columns, mode for CATEGORY columns.

    Fix: removed the unused ``prev`` local (unlike test_1_fillna, which
    compares the untouched column against its previous values, this test
    compares every column against the fixture).
    """
    self.setUpData()
    ds = copper.Dataset("dataset/1/data.csv")
    sol = copper.read_csv("dataset/1/transform_filled.csv")
    # Number.1 does not have missing values — fill is a no-op
    ds.fillna(cols="Number.1", method="mean")
    self.assertEqual(ds["Number.1"], sol["Number.1"])
    # Number.2 does have missing values
    ds.fillna(cols="Number.2", method="mean")
    self.assertEqual(ds["Number.2"], sol["Number.2"])
    # Cat.1 does have missing values
    ds.fillna(cols="Cat.1", method="mode")
    self.assertEqual(ds["Cat.1"], sol["Cat.1"])
    # Cat.2 does NOT have missing values — fill is a no-op
    ds.fillna(cols="Cat.2", method="mode")
    self.assertEqual(ds["Cat.2"], sol["Cat.2"])
def test_predict(self):
    '''Predictions and prediction probabilities: calling with no dataset
    defaults to the stored test set, and other datasets can be predicted
    explicitly.'''
    self.setup()
    expected_train = copper.read_csv('ml/1/predict_train.csv').set_index('Model')
    expected_test = copper.read_csv('ml/1/predict_test.csv').set_index('Model')
    expected_proba_train = copper.read_csv('ml/1/predict_proba_train.csv').set_index('index')
    expected_proba_test = copper.read_csv('ml/1/predict_proba_test.csv').set_index('index')
    # predict(): default == explicit test set; train set works too
    self.assertEqual(self.mc.predict(), expected_test)
    self.assertEqual(self.mc.predict(ds=self.test), expected_test)
    self.assertEqual(self.mc.predict(self.train), expected_train)
    # predict_proba(): same contract. The trailing 1 is presumably a
    # digits/tolerance argument of a custom assertEqual — TODO confirm.
    self.assertEqual(self.mc.predict_proba(), expected_proba_test, 1)
    self.assertEqual(self.mc.predict_proba(self.test), expected_proba_test, 1)
    self.assertEqual(self.mc.predict_proba(ds=self.train), expected_proba_train, 1)
def test_date2number(self):
    '''date_to_number yields the same day count for three equivalent date
    columns, for the default start date and for a custom one.

    Requires: transforms.strptime.
    Fixes: the first assertion compared nums1 to nums3 twice and never to
    nums2; it now mirrors the 1-2 / 2-3 / 1-3 pattern used in the
    custom-start-date block. Also removed the unused ``sol`` local.
    '''
    self.setUpData()
    data = copper.read_csv('transforms/1/data.csv')
    # Default startdate
    dates1 = data['Date.1'].apply(copper.transform.strptime, args='%Y-%m-%d')
    dates2 = data['Date.2'].apply(copper.transform.strptime, args='%Y/%m/%d')
    dates3 = data['Date.3'].apply(copper.transform.strptime, args='%m/%d/%y')
    nums1 = dates1.apply(copper.transform.date_to_number)
    nums2 = dates2.apply(copper.transform.date_to_number)
    nums3 = dates3.apply(copper.transform.date_to_number)
    self.assertEqual(nums1.values, nums2.values)
    self.assertEqual(nums2.values, nums3.values)
    self.assertEqual(nums1.values, nums3.values)
    ans_1 = 13879
    self.assertEqual(nums1[0], ans_1)
    self.assertEqual(nums2[0], ans_1)
    self.assertEqual(nums3[0], ans_1)
    # Custom startdate
    from datetime import datetime
    copper.transform.start_date = datetime(2000, 1, 1)
    nums1_2 = dates1.apply(copper.transform.date_to_number)
    nums2_2 = dates2.apply(copper.transform.date_to_number)
    nums3_2 = dates3.apply(copper.transform.date_to_number)
    self.assertEqual(nums1_2.values, nums2_2.values)
    self.assertEqual(nums2_2.values, nums3_2.values)
    self.assertEqual(nums1_2.values, nums3_2.values)
    ans_1_2 = 2922
    self.assertEqual(nums1_2[0], ans_1_2)
    self.assertEqual(nums2_2[0], ans_1_2)
    self.assertEqual(nums3_2[0], ans_1_2)
    # Changing the start date must change the numbers
    self.assertNotEqual(nums1[0], nums1_2[0])
    self.assertNotEqual(nums2[0], nums1_2[0])
    self.assertNotEqual(nums3[0], nums1_2[0])
def test_create(self):
    """A Dataset built from a file path, from a DataFrame, or via load()
    after construction must compare equal."""
    self.setUpData()
    from_path = copper.Dataset("dataset/1/data.csv")
    frame = copper.read_csv("dataset/1/data.csv")
    from_frame = copper.Dataset(frame)
    loaded = copper.Dataset()
    loaded.load("dataset/1/data.csv")
    self.assertEqual(from_path, from_frame)
    self.assertEqual(from_frame, loaded)
def test_1_filter(self):
    '''Tests Dataset.filter under every role/type combination.

    Walks through: no filter, a rejected column, filtering by role
    (INPUT/TARGET), by type (NUMBER/CATEGORY), by both together, and by
    lists of roles/types. Role assignments are stateful — each step
    builds on the previous one.
    '''
    self.setUpData()
    ds = copper.Dataset('dataset/1/data.csv')
    df = copper.read_csv('dataset/1/data.csv')
    # 1. Initial frame
    self.assertEqual(ds.frame, df)
    # 2. No filters - Return everything
    self.assertEqual(ds.filter(), df)
    # 2.1 Reject a column but still no filters — frame is unaffected
    ds.role['Number.2'] = ds.REJECTED
    self.assertEqual(ds.filter(), df)
    # 3. Filter by inputs — rejected column disappears
    ds.role['Number.2'] = ds.REJECTED
    self.assertEqual(ds.filter(role=ds.INPUT), df[['Number.1', 'Cat.1', 'Cat.2', 'Num.as.Cat']])
    # 3.1 Put the column back
    ds.role['Number.2'] = ds.INPUT
    self.assertEqual(ds.filter(role=ds.INPUT), df)
    # 4. Filter by Target - Inputs changed
    ds.role['Cat.1'] = ds.TARGET
    self.assertEqual(ds.filter(role=ds.TARGET), df[['Cat.1']])
    self.assertEqual(ds.filter(role=ds.INPUT), df[['Number.1', 'Number.2', 'Cat.2', 'Num.as.Cat']])
    # 5. Filter by type
    self.assertEqual(ds.filter(type=ds.NUMBER), df[['Number.1', 'Number.2']])
    self.assertEqual(ds.filter(type=ds.CATEGORY), df[['Cat.1', 'Cat.2', 'Num.as.Cat']])
    # 6. Filter by role and type (Cat.1 is already TARGET; this is a no-op)
    ds.role['Cat.1'] = ds.TARGET
    self.assertEqual(ds.filter(role=ds.INPUT, type=ds.NUMBER), df[['Number.1', 'Number.2']])
    self.assertEqual(ds.filter(role=ds.INPUT, type=ds.CATEGORY), df[['Cat.2', 'Num.as.Cat']])
    self.assertEqual(ds.filter(role=ds.TARGET, type=ds.CATEGORY), df[['Cat.1']])
    # Multiple roles
    self.assertEqual(ds.filter(role=[ds.INPUT, ds.TARGET]), df)
    # Multiple types
    self.assertEqual(ds.filter(type=[ds.NUMBER, ds.CATEGORY]), df)
    # Multiple roles and types
    self.assertEqual(ds.filter(role=[ds.INPUT, ds.TARGET], type=[ds.NUMBER, ds.CATEGORY]), df)
def test_fillna_2(self):
    """fillna with no ``cols`` argument fills every column at once.

    (The original docstring mentioned a REJECTED column being skipped,
    but no role is rejected in this test — presumably a leftover note.)
    """
    self.setUpData()
    ds = copper.Dataset("dataset/1/data.csv")
    expected = copper.read_csv("dataset/1/transform_filled.csv")
    # Turn the numeric-looking category columns into numbers first
    for column in ("Num.as.Cat", "Money"):
        ds.type[column] = ds.NUMBER
    ds.update()
    ds.fillna(method="mean")
    self.assertEqual(ds.frame, expected)
def test_create(self):
    '''Tests the different ways of creating a Dataset: from a csv path,
    from a DataFrame, via load(), and by assigning ``frame`` directly.

    Fix: the fixture csv is written to a ``tempfile.mkstemp`` path and
    deleted afterwards, instead of the non-portable, never-cleaned-up
    hard-coded '/tmp/temp.csv'.
    '''
    import os
    import tempfile

    frame = pd.DataFrame(np.random.rand(5, 5), index=np.arange(5))
    frame.index.name = 'index'
    fd, path = tempfile.mkstemp(suffix='.csv')
    os.close(fd)  # mkstemp hands back an open fd; we only need the path
    try:
        frame.to_csv(path)
        ds1 = copper.Dataset(path)
        df = copper.read_csv(path)
        ds2 = copper.Dataset(df)
        ds3 = copper.Dataset()
        ds3.load(path)
        ds4 = copper.Dataset()
        ds4.frame = df
        self.assertEqual(ds1, ds2)
        self.assertEqual(ds2, ds3)
        self.assertEqual(ds3, ds4)
    finally:
        os.remove(path)
def test_1_fillna(self):
    '''Per-column fillna: mean for NUMBER columns, mode for CATEGORY
    columns.'''
    self.setUpData()
    ds = copper.Dataset('dataset/1/data.csv')
    expected = copper.read_csv('dataset/1/transform_filled.csv')
    # Number.1 has no missing values, so filling must leave it untouched
    before = ds['Number.1']
    ds.fillna(cols='Number.1', method='mean')
    self.assertEqual(ds['Number.1'], before)
    # Number.2 does have missing values
    ds.fillna(cols='Number.2', method='mean')
    self.assertEqual(ds['Number.2'], expected['Number.2'])
    # Cat.1 does have missing values
    ds.fillna(cols='Cat.1', method='mode')
    self.assertEqual(ds['Cat.1'], expected['Cat.1'])
def test_cm(self):
    '''Tests the values of the confusion matrices, both via the private
    ``_cm`` dict and the public ``cm`` / ``cm_table`` accessors.
    '''
    self.setup()
    # Raw confusion matrices keyed by model name
    cms = self.ml._cm()
    self.assertEqual(cms['GNB'], np.array([[1196, 236], [ 406, 162]]))
    self.assertEqual(cms['DT'], np.array([[1365, 67], [ 506, 62]]))
    self.assertEqual(cms['SVM'], np.array([[1362, 70], [ 531, 37]]))
    self.assertEqual(cms['GB'], np.array([[1387, 45], [ 515, 53]]))
    # Public per-model accessor returns the same numbers
    self.assertEqual(self.ml.cm('GNB').values, np.array([[1196, 236], [ 406, 162]]))
    self.assertEqual(self.ml.cm('DT').values, np.array([[1365, 67], [ 506, 62]]))
    self.assertEqual(self.ml.cm('SVM').values, np.array([[1362, 70], [ 531, 37]]))
    self.assertEqual(self.ml.cm('GB').values, np.array([[1387, 45], [ 515, 53]]))
    # Tabular summary against the fixture
    sol = copper.read_csv('ml/1/cm.csv').set_index('Model')
    sol.index.name = None
    self.assertEqual(self.ml.cm_table(), sol)
    # NOTE(review): DataFrame.sort was removed in pandas 0.20 (use
    # sort_values on modern pandas); left as-is for the old pandas this
    # test suite targets.
    cm_0 = sol.ix[:, sol.columns[0:3]].sort(['Rate 0\'s'], ascending=False)
    cm_1 = sol.ix[:, sol.columns[3:]].sort(['Rate 1\'s'], ascending=False)
    self.assertEqual(self.ml.cm_table(0), cm_0)
    self.assertEqual(self.ml.cm_table(1), cm_1)
# Scrapes per-player box-score stats for one NBA game from ESPN.
import copper
import numpy as np
import pandas as pd
import requests
from bs4 import BeautifulSoup
from datetime import datetime, date

copper.project.path = '../..'

games = copper.read_csv('games.csv').set_index('id')

BASE_URL = 'http://espn.go.com/nba/boxscore?gameId={0}'
request = requests.get(BASE_URL.format(games.index[0]))
table = BeautifulSoup(request.text).find('table', class_='mod-data')
# Header row of the box-score table gives the stat column names
heads = table.find_all('thead')
headers = heads[0].find_all('tr')[1].find_all('th')[1:]
headers = [th.text for th in headers]
columns = ['id', 'team', 'player'] + headers
players = pd.DataFrame(columns=columns)

def get_players(players, team_name):
    # One row per player: [name, stat1, ...]; cells stay NaN for DNP players.
    # NOTE(review): body appears truncated here (no return, team_name unused
    # in the visible portion) — presumably continues elsewhere in the file.
    array = np.zeros((len(players), len(headers)+1), dtype=object)
    array[:] = np.nan
    for i, player in enumerate(players):
        cols = player.find_all('td')
        array[i, 0] = cols[0].text.split(',')[0]  # player name, before the comma
        for j in range(1, len(headers) + 1):
            # 'DNP' in the first stat cell marks a player who did not play
            if not cols[1].text.startswith('DNP'):
                array[i, j] = cols[j].text
# get each NBA player stats from ESPN # credit: http://danielfrg.com/blog/2013/04/01/nba-scraping-data/ import copper import numpy as np import pandas as pd import requests from bs4 import BeautifulSoup from datetime import datetime, date copper.project.path = "../.." games = copper.read_csv("games.csv").set_index("id") BASE_URL = "http://espn.go.com/nba/boxscore?gameId={0}" request = requests.get(BASE_URL.format(games.index[0])) table = BeautifulSoup(request.text).find("table", class_="mod-data") heads = table.find_all("thead") headers = heads[0].find_all("tr")[1].find_all("th")[1:] headers = [th.text for th in headers] columns = ["id", "team", "player"] + headers players = pd.DataFrame(columns=columns) def get_players(players, team_name): array = np.zeros((len(players), len(headers) + 1), dtype=object) array[:] = np.nan for i, player in enumerate(players): cols = player.find_all("td")
## Is this useful? def important_features(self, clf_name): clf = self._clfs[clf_name] importances = clf.feature_importances_ indices = np.argsort(importances)[::-1] plt.title("Feature importances") plt.bar(range(len(importances)), importances[indices], color="r", align="center") plt.xticks(range(len(importances)), indices) if __name__ == '__main__': # ''' DONORS copper.config.path = '../tests' ds = copper.read_csv('donors/data.csv') ds.role['TARGET_D'] = ds.REJECTED ds.role['TARGET_B'] = ds.TARGET ds.type['ID'] = ds.CATEGORY ds.fillna('DemAge', 'mean') ds.fillna('GiftAvgCard36', 'mean') ml = copper.MachineLearning() ml.dataset = ds ml.sample(trainSize=0.5) from sklearn import tree tree_clf = tree.DecisionTreeClassifier(compute_importances=True, max_depth=10) # ml.add_clf(tree_clf, 'DT')
def test_1_role_ml(self):
    '''
    Depends on: test_1_fillna and test_1_cat_2_num
    Tests:
        1. Initial roles are Input
        2. Modify roles and returned frames are correct
            * Inputs are correct for machine learning
        3. Target is correct

    Role changes are stateful; the column-index slices of ml_df below
    track which columns remain after each rejection (categorical columns
    expand to more than one ml column, hence the jumps 2->4->6).
    '''
    self.setUpData()
    ds = copper.Dataset('dataset/1/data.csv')
    df = copper.read_csv('dataset/1/data.csv')
    ml_df = copper.read_csv('dataset/1/ml.csv')
    # 1. Initial role — everything starts as INPUT
    self.assertEqual(ds.role['Number.1'], ds.INPUT)
    self.assertEqual(ds.role['Number.2'], ds.INPUT)
    self.assertEqual(ds.role['Cat.1'], ds.INPUT)
    self.assertEqual(ds.role['Cat.2'], ds.INPUT)
    self.assertEqual(ds.role['Num.as.Cat'], ds.INPUT)
    self.assertEqual(ds.frame, df)
    # Correct data: numeric conversion + mean fill before ml comparison
    ds.type['Num.as.Cat'] = ds.NUMBER
    ds.update()
    ds.fillna(method='mean')
    # 2. Inputs values are correct
    self.assertEqual(ds.inputs, ml_df)
    # 2.1 Modify roles — reject Number.1 (drops 1 ml column)
    ds.role['Number.1'] = ds.REJECTED
    self.assertEqual(ds.role['Number.1'], ds.REJECTED)
    self.assertEqual(ds.inputs, ml_df[ml_df.columns[1:]])
    # 2.2 Modify roles — reject Number.2 (drops 1 more)
    ds.role['Number.2'] = ds.REJECTED
    self.assertEqual(ds.role['Number.2'], ds.REJECTED)
    self.assertEqual(ds.inputs, ml_df[ml_df.columns[2:]])
    # 2.3 Modify roles — reject Cat.1 (drops 2 ml columns)
    ds.role['Cat.1'] = ds.REJECTED
    self.assertEqual(ds.role['Cat.1'], ds.REJECTED)
    self.assertEqual(ds.inputs, ml_df[ml_df.columns[4:]])
    # 2.4 Modify roles — reject Cat.2 (drops 2 more); only Num.as.Cat left
    ds.role['Cat.2'] = ds.REJECTED
    self.assertEqual(ds.role['Cat.2'], ds.REJECTED)
    self.assertEqual(ds.inputs, ml_df[ml_df.columns[6:]])
    self.assertEqual(ds.inputs, ml_df[['Num.as.Cat']])
    # 3. Target
    # Back to normal
    ds.role['Number.1'] = ds.INPUT
    ds.role['Number.2'] = ds.INPUT
    ds.role['Cat.1'] = ds.INPUT
    ds.role['Cat.2'] = ds.INPUT
    self.assertEqual(ds.inputs, ml_df)
    # Set Target — target leaves the inputs and is renamed 'Target'
    ds.role['Num.as.Cat'] = ds.TARGET
    self.assertEqual(ds.inputs, ml_df[ml_df.columns[:-1]])
    target = ml_df[ml_df.columns[-1]]
    target.name = 'Target'
    self.assertEqual(ds.target, target)
# Scrapes every team's season schedule from ESPN to collect game ids,
# dates and home/visitor scores.
import copper
import numpy as np
import pandas as pd
import requests
from bs4 import BeautifulSoup
from datetime import datetime, date

copper.project.path = '../../'
year = 2013

teams = copper.read_csv('teams.csv')

BASE_URL = 'http://espn.go.com/nba/team/schedule/_/name/{0}/year/{1}/{2}'
BASE_GAME_URL = 'http://espn.go.com/nba/boxscore?gameId={0}'

# Accumulators, one entry per scraped game row
game_id = []
dates = []
home_team = []
home_team_score = []
visit_team = []
visit_team_score = []

for index, row in teams.iterrows():
    # for index, row in teams[:1].iterrows():  (debug: single team)
    _team = row['name']
    print(_team)
    r = requests.get(BASE_URL.format(row['prefix_1'], year, row['prefix_2']))
    table = BeautifulSoup(r.text).table
    for row in table.find_all('tr')[1:]:
        # for row in table.find_all('tr')[1:3]:  (debug: few rows)
        columns = row.find_all('td')
        # NOTE(review): the try block is truncated here (no except visible)
        # — presumably continues elsewhere in the file.
        try:
            _id = columns[2].a['href'].split('?id=')[1]
            # 'vs' in the schedule row means this team hosted the game
            _home = True if columns[1].li.text == 'vs' else False
# Scrapes per-player box-score stats for one NBA game from ESPN.
# NOTE(review): near-duplicate of the other boxscore scraper chunk in this
# file — consider consolidating.
import copper
import numpy as np
import pandas as pd
import requests
from bs4 import BeautifulSoup
from datetime import datetime, date

copper.project.path = '../..'

games = copper.read_csv('games.csv').set_index('id')

BASE_URL = 'http://espn.go.com/nba/boxscore?gameId={0}'
request = requests.get(BASE_URL.format(games.index[0]))
table = BeautifulSoup(request.text).find('table', class_='mod-data')
# Header row of the box-score table gives the stat column names
heads = table.find_all('thead')
headers = heads[0].find_all('tr')[1].find_all('th')[1:]
headers = [th.text for th in headers]
columns = ['id', 'team', 'player'] + headers
players = pd.DataFrame(columns=columns)


def get_players(players, team_name):
    # One row per player: [name, stat1, ...]; cells stay NaN for DNP players.
    # NOTE(review): body appears truncated here (no return, team_name unused
    # in the visible portion) — presumably continues elsewhere in the file.
    array = np.zeros((len(players), len(headers) + 1), dtype=object)
    array[:] = np.nan
    for i, player in enumerate(players):
        cols = player.find_all('td')
        array[i, 0] = cols[0].text.split(',')[0]  # player name, before the comma
        for j in range(1, len(headers) + 1):
            # 'DNP' in the first stat cell marks a player who did not play
            if not cols[1].text.startswith('DNP'):
                array[i, j] = cols[j].text