def test_iris_accuracy():
    """Fit a DBN on the iris data and compare its training accuracy to a fixed value."""
    model = DBN(
        [25],
        pretrain_epochs=0,
        finetune_epochs=10,
        finetune_batch_size=10,
        random_state=1,
    )
    features, labels = get_iris()
    model.fit(features, labels)

    # Accuracy is measured on the same samples the model was fitted on.
    hits = model.predict(features) == labels
    accuracy = hits.mean()
    eq_(accuracy, 0.95333, 5)
def test_set_column():
    """Assigning to a Dataset column replaces the values read back from it."""
    frame = pd.DataFrame(np.random.rand(5, 10))
    dataset = copper.Dataset(frame)
    replacement = np.random.rand(5, 1)

    # Before the assignment the column still mirrors the source frame.
    eq_(dataset[3].values, frame[3].values)

    dataset[3] = replacement
    # A list selector is used so the result is compared against the
    # (5, 1) replacement array.
    eq_(dataset[[3]].values, replacement)
def test_update_cat_to_num_float():
    """Check that update() turns 'a(<float>)' strings into numbers.

    The column is created from strings, its type is switched to NUMBER and
    ``update()`` is called; the column values must then equal the floats the
    strings were formatted from.
    """
    # Divide by a float: under Python 2 integer division ``/ 100`` would
    # truncate every expected value to 0 and make the test trivial.
    expected = np.arange(100) / 100.0
    labelled = np.array(['a(%f)' % value for value in expected])
    ds = copper.Dataset(pd.DataFrame(labelled))
    ds.type[0] = ds.NUMBER
    ds.update()
    eq_(expected, ds[0].values)
def test_transform_float():
    """Check that copper.t.to_float recovers the float from 'STRING(<float>)'."""
    # Divide by a float so the expected values stay fractional even under
    # Python 2 integer division.
    values = np.arange(10) / 10.0
    labelled = pd.Series(["STRING(%f)" % value for value in values])
    expected = pd.Series(values)
    eq_(labelled.apply(copper.t.to_float), expected)
def test_set_frame_different_cols_fail():
    """Compare metadata before and after swapping in a frame with other columns.

    The final comparison is intended to fail: a failure shows that the
    metadata was recreated for the new frame.
    NOTE(review): whatever expects that failure (e.g. a decorator) is not
    visible here -- confirm.
    """
    original = pd.DataFrame(np.random.rand(5, 5))
    ds = copper.Dataset(original)
    meta_before = ds.metadata.copy()

    replacement = pd.DataFrame(np.random.rand(10, 10))
    ds.frame = replacement

    eq_(ds.metadata, meta_before)
def get_iris_ds_string():
    """Return the iris Dataset with 'Target' relabelled as strings.

    The target values are stringified and '0', '1', '2' are renamed to
    'Iris-A', 'Iris-B', 'Iris-C'; any other value keeps its string form.
    Asserts that the metadata reports an object dtype for 'Target'.
    """
    labels = {'0': 'Iris-A', '1': 'Iris-B', '2': 'Iris-C'}
    ds = get_iris_ds()
    ds.type['Target'] = ds.CATEGORY
    # Relabel through one column assignment. Chained indexing such as
    # ds['Target'][mask] = value can write to a temporary copy in pandas.
    ds['Target'] = ds['Target'].apply(lambda x: labels.get(str(x), str(x)))
    eq_(ds.metadata['dtype']['Target'], object)
    return ds
def test_ml_target_number():
    """A numeric TARGET column is returned as-is by ml_target, with no encoder."""
    df = pd.DataFrame(np.random.rand(8, 6))
    ds = copper.Dataset(df)
    # randrange always yields an int column label; math.floor returns a
    # float on Python 2, which is a fragile key for an integer column index.
    target_col = random.randrange(6)
    ds.role[target_col] = ds.TARGET

    le, target = copper.t.ml_target(ds)
    eq_(target, ds[target_col].values)
    eq_(le, None)
def test_set_frame_different_length_same_cols_fail():
    """Compare metadata with the defaults after a same-column frame swap.

    The final comparison is intended to fail: a failure shows the default
    metadata was not put back in place when the new frame was set.
    NOTE(review): whatever expects that failure is not visible here -- confirm.
    """
    short_frame = pd.DataFrame(np.random.rand(5, 5))
    ds = copper.Dataset(short_frame.copy())
    defaults = ds.metadata.copy()

    # Customise roles and types so the metadata differs from the defaults.
    ds.role[[2, 4]] = ds.TARGET
    ds.type[[1, 2]] = ds.CATEGORY

    long_frame = pd.DataFrame(np.random.rand(10, 5))
    ds.frame = long_frame

    eq_(ds.metadata, defaults)
def test_set_frame_different_length_same_cols():
    """Metadata is maintained when the new frame has the same columns."""
    short_frame = pd.DataFrame(np.random.rand(5, 5))
    ds = copper.Dataset(short_frame.copy())

    ds.role[[2, 4]] = ds.TARGET
    ds.type[[1, 2]] = ds.CATEGORY
    # Snapshot taken after customising, so it is what must survive the swap.
    customised = ds.metadata.copy()

    long_frame = pd.DataFrame(np.random.rand(10, 5))
    ds.frame = long_frame

    eq_(ds.metadata, customised)
def test_ml_target_string():
    """A string TARGET column comes back as integer codes plus an encoder."""
    frame = pd.DataFrame(np.random.rand(6, 6))
    frame['T'] = ['z', 'h', 'z', 'c', 'h', 'c']
    # Expected codes follow the class order asserted below: c, h, z.
    expected_codes = [2, 1, 2, 0, 1, 0]

    ds = copper.Dataset(frame)
    ds.role['T'] = ds.TARGET

    encoder, codes = copper.t.ml_target(ds)
    eq_(codes, np.array(expected_codes))
    eq_(encoder.classes_.tolist(), ['c', 'h', 'z'])
def test_default_type():
    """String columns default to CATEGORY and all other columns to NUMBER."""
    df = pd.DataFrame(np.random.rand(5, 20))
    # randrange yields int column labels (math.floor gives floats on Python 2).
    rand_col = random.randrange(20)
    rand_col2 = random.randrange(20)

    # Stringify each chosen column from its OWN values. The two picks may
    # coincide, in which case the column is simply stringified twice.
    df[rand_col] = df[rand_col].apply(str)
    df[rand_col2] = df[rand_col2].apply(str)

    ds = copper.Dataset(df)
    eq_(ds.type[rand_col], ds.CATEGORY)
    eq_(ds.type[rand_col2], ds.CATEGORY)
    for col in ds.columns:
        if col not in (rand_col, rand_col2):
            eq_(ds.type[col], ds.NUMBER)
def test_copy_metadata():
    """copy_metadata makes a second Dataset's metadata equal to the source's."""
    column_names = list('abcdefghij')

    source = copper.Dataset(
        pd.DataFrame(np.random.rand(5, 10), columns=column_names))
    source.role[['c', 'd', 'h', 'i']] = source.TARGET
    source.type[['b', 'c', 'g', 'i']] = source.CATEGORY

    # Same column names, fresh random data.
    other = copper.Dataset(
        pd.DataFrame(np.random.rand(5, 10), columns=column_names))
    other.copy_metadata(source.metadata)

    eq_(other.metadata, source.metadata)
def test_ml_target_more_than_one():
    """With two TARGET columns, ml_target returns column 3 and no encoder."""
    import warnings

    frame = pd.DataFrame(np.random.rand(8, 6))
    ds = copper.Dataset(frame)
    for col in (3, 5):
        ds.role[col] = ds.TARGET

    # Any warning emitted by ml_target is silenced for this call.
    with warnings.catch_warnings():
        warnings.simplefilter("ignore")
        encoder, target = copper.t.ml_target(ds)

    eq_(encoder, None)
    eq_(target, ds[3].values)
def test_create_empty():
    """A Dataset created without arguments has empty role, type, frame and metadata."""
    ds = copper.Dataset()

    for per_column in (ds.role, ds.type):
        eq_(per_column, pd.Series())

    for table in (ds.frame, ds.metadata):
        eq_(table.empty, True)
def test_save_load_weights():
    """Weights saved by one DBN and loaded into another give equal results.

    Compares coefficients, per-layer ``W`` matrices, predictions and
    predicted probabilities between the fitted model and the model that
    loaded its saved weights.
    """
    import shutil
    import tempfile

    # A private directory avoids clashing with other runs that use the same
    # file name, and lets the saved file be removed afterwards.
    weights_dir = tempfile.mkdtemp()
    weights_path = os.path.join(weights_dir, 'w.json')
    try:
        dbn1 = DBN([5], random_state=1234)
        X, y = get_iris()
        dbn1.fit(X, y)
        pred1 = dbn1.predict(X)
        prob1 = dbn1.predict_proba(X)
        dbn1.save(weights_path)

        dbn2 = DBN([5])
        dbn2.load(weights_path)
        pred2 = dbn2.predict(X)
        prob2 = dbn2.predict_proba(X)
    finally:
        shutil.rmtree(weights_dir, ignore_errors=True)

    eq_(dbn1.coef_, dbn2.coef_)
    for i, layer in enumerate(dbn1.layers):
        eq_(layer.W, dbn2.layers[i].W)
    eq_(pred1, pred2)
    eq_(prob1, prob2)
def test_fbeta_score(mc=None):
    """Check fbeta_score for the 'SVM' and 'LR' models against fixed values.

    ``mc`` is the model comparison to score; when omitted, ``get_mc()``
    supplies one.
    """
    if mc is None:
        mc = get_mc()

    # Explicit beta first, then the default beta.
    low_beta = mc.fbeta_score(beta=0.1)
    eq_(low_beta['SVM'], 0.976249, 6)
    eq_(low_beta['LR'], 0.914067, 6)

    default_beta = mc.fbeta_score()
    eq_(default_beta['SVM'], 0.973952, 6)
    eq_(default_beta['LR'], 0.870540, 6)
def test_filter_role():
    """filter(role=...) returns exactly the columns carrying the given role(s)."""
    frame = pd.DataFrame(np.random.rand(5, 10))
    ds = copper.Dataset(frame)

    # Ignore half of the columns; the remaining ones are still INPUT.
    ds.role[[0, 2, 4, 5, 9]] = ds.IGNORE
    inputs = ds.filter(role=ds.INPUT)
    eq_(inputs, ds[[1, 3, 6, 7, 8]])

    # Ignore everything, then re-enable a different set of inputs.
    ds.role[:] = ds.IGNORE
    ds.role[[1, 3, 4, 6, 8]] = ds.INPUT
    inputs = ds.filter(role=ds.INPUT)
    eq_(inputs, ds[[1, 3, 4, 6, 8]])

    ds.role[[2, 9]] = ds.TARGET
    targets = ds.filter(role=ds.TARGET)
    eq_(targets, ds[[2, 9]])

    # A list of roles selects the union of the matching columns.
    inputs_and_targets = ds.filter(role=[ds.INPUT, ds.TARGET])
    eq_(inputs_and_targets, ds[[1, 2, 3, 4, 6, 8, 9]])

    # No criteria: the result is compared with the whole source frame.
    eq_(ds.filter(), frame)
def test_coef_eq_layers_1():
    """coef_ matches layer 0's b and W, then layer 1's b and W, in slices.

    The slices checked are [:5], [5:25], [25:28] and [28:].
    """
    model = DBN([5], pretrain_epochs=0, finetune_epochs=1, random_state=1234)
    X, y = get_iris()
    model.fit(X, y)

    first_layer = model.layers[0]
    second_layer = model.layers[1]

    eq_(model.coef_[:5], first_layer.b)
    eq_(model.coef_[5:25], first_layer.W.reshape(-1))
    eq_(model.coef_[25:28], second_layer.b)
    eq_(model.coef_[28:], second_layer.W.reshape(-1))
def get_mc_string():
    """Return a fitted ModelComparison trained on string-labelled iris data.

    The 'Target' column is stringified and '0', '1', '2' are renamed to the
    iris species names; any other value keeps its string form. The
    comparison holds a LogisticRegression ('LR') and an SVC with
    probability estimates ('SVM').
    """
    from sklearn.linear_model import LogisticRegression
    from sklearn.svm import SVC

    species = {
        '0': 'Iris-setosa',
        '1': 'Iris-versicolor',
        '2': 'Iris-virginica',
    }
    ds = get_iris_ds()
    ds.type['Target'] = ds.CATEGORY
    # Relabel through one column assignment. Chained indexing such as
    # ds['Target'][mask] = value can write to a temporary copy in pandas.
    ds['Target'] = ds['Target'].apply(lambda x: species.get(str(x), str(x)))
    eq_(ds.metadata['dtype']['Target'], object)

    mc = copper.ModelComparison()
    mc.train_test_split(ds, random_state=0)
    mc['LR'] = LogisticRegression()
    mc['SVM'] = SVC(probability=True)
    mc.fit()
    return mc
def test_ml_inputs_big():
    """ml_inputs expands each categorical column into its encoded block.

    Ten of the 100 columns are overwritten with the same random letters and
    marked CATEGORY; the output must have 26 columns for each of them and
    the slice for each must equal ``cat_encode`` of the letters.
    """
    alphabet = 'abcdefghijklmnopqrstuvwxyz'
    n_rows, n_categorical = 1000, 10

    letter_index = np.floor(np.random.rand(n_rows) * 26)
    letters = np.array([alphabet[int(idx)] for idx in letter_index])

    frame = pd.DataFrame(np.random.rand(n_rows, 100))
    categorical_cols = np.arange(n_categorical) * 10
    for col in categorical_cols:
        frame[col] = letters

    ds = copper.Dataset(frame)
    ds.type[categorical_cols.tolist()] = ds.CATEGORY

    inputs = copper.t.ml_inputs(ds)
    expected_width = 100 - n_categorical + 26 * n_categorical
    eq_(inputs.shape, (n_rows, expected_width))

    encoded_letters = copper.t.cat_encode(letters)
    for position, col in enumerate(categorical_cols):
        # Each earlier categorical column widened the output by 25 columns
        # (26 encoded columns replacing 1), shifting this block's start.
        start = col + 25 * position
        stop = col + 25 * position + 26
        eq_(inputs[:, start:stop], encoded_letters)
def test_filter_role_and_type():
    """filter(role=..., type=...) returns columns matching both criteria."""
    frame = pd.DataFrame(np.random.rand(5, 5))
    ds = copper.Dataset(frame)

    # Start with everything ignored, then enable column 2 as a category input.
    ds.role[:] = ds.IGNORE
    ds.role[2] = ds.INPUT
    ds.type[2] = ds.CATEGORY
    category_inputs = ds.filter(role=ds.INPUT, type=ds.CATEGORY)
    eq_(category_inputs, frame[[2]])

    # Add column 4 as a second category input.
    ds.role[4] = ds.INPUT
    ds.type[4] = ds.CATEGORY
    category_inputs = ds.filter(role=ds.INPUT, type=ds.CATEGORY)
    eq_(category_inputs, frame[[2, 4]])

    ignored_numbers = ds.filter(role=ds.IGNORE, type=ds.NUMBER)
    eq_(ignored_numbers, frame[[0, 1, 3]])

    # Ignoring column 4 again drops it from the category inputs.
    ds.role[4] = ds.IGNORE
    category_inputs = ds.filter(role=ds.INPUT, type=ds.CATEGORY)
    eq_(category_inputs, frame[[2]])

    eq_(ds.filter(), frame)
def test_filter_type():
    """filter(type=...) returns exactly the columns carrying the given type(s)."""
    frame = pd.DataFrame(np.random.rand(5, 10))
    ds = copper.Dataset(frame)

    ds.type[[0, 2, 4, 5, 9]] = ds.CATEGORY
    categories = ds.filter(type=ds.CATEGORY)
    eq_(categories, ds[[0, 2, 4, 5, 9]])

    # Make everything a category, then switch a subset back to NUMBER.
    ds.type[:] = ds.CATEGORY
    ds.type[[1, 3, 6, 7, 9]] = ds.NUMBER
    numbers = ds.filter(type=ds.NUMBER)
    eq_(numbers, ds[[1, 3, 6, 7, 9]])

    # Both types together, and no criteria at all, are compared with the
    # whole source frame.
    both_types = ds.filter(type=[ds.NUMBER, ds.CATEGORY])
    eq_(both_types, frame)
    eq_(ds.filter(), frame)
def test_set_frame_different_cols():
    """Default metadata is put in place when the new frame has other columns."""
    small = pd.DataFrame(np.random.rand(5, 5))
    ds = copper.Dataset(small)

    # Customise some roles and types on the original frame.
    ds.role[[2, 4]] = ds.TARGET
    ds.type[[1, 2]] = ds.CATEGORY

    large = pd.DataFrame(np.random.rand(10, 10))
    ds.frame = large

    # The customised columns must read INPUT / NUMBER again.
    for col in (2, 4):
        eq_(ds.role[col], ds.INPUT)
    for col in (1, 2):
        eq_(ds.type[col], ds.NUMBER)
def test_cat_encode_big():
    """cat_encode of 100000 random letters yields rows that each sum to one."""
    alphabet = 'abcdefghijklmnopqrstuvwxyz'
    n_samples = 100000

    letter_index = np.floor(np.random.rand(n_samples) * 26)
    letters = np.array([alphabet[int(idx)] for idx in letter_index])

    encoded = copper.t.cat_encode(letters)

    eq_(len(encoded), n_samples)
    row_totals = encoded.sum(axis=1)
    eq_(row_totals, np.ones(n_samples))
    eq_(encoded.sum(), n_samples)
def test_get_set_algorithms():
    """Algorithms stored by name on a ModelComparison are read back and counted."""
    mc = copper.ModelComparison()

    default_lr = LogisticRegression()
    mc['LR'] = default_lr
    eq_(mc['LR'], default_lr)

    l1_lr = LogisticRegression(penalty='l1')
    mc['LR l1'] = l1_lr
    eq_(mc['LR l1'], l1_lr)

    eq_(len(mc), 2)
def test_save_load_metadata():
    """Metadata written to CSV and read back restores the roles and types.

    The loaded table is indexed by its 'Columns' column and assigned to a
    second Dataset built from the same frame.
    """
    import shutil

    # A private directory avoids clashing with other runs that use the same
    # file name, and lets the CSV be removed afterwards.
    meta_dir = tempfile.mkdtemp()
    meta_path = os.path.join(meta_dir, 'metadata.csv')
    try:
        # Save
        df = pd.DataFrame(np.random.rand(5, 10))
        ds = copper.Dataset(df)
        ds.role[2] = ds.TARGET
        ds.role[7] = ds.IGNORE
        ds.type[1] = ds.CATEGORY
        ds.type[5] = ds.CATEGORY
        ds.metadata.to_csv(meta_path)

        # Load
        ds2 = copper.Dataset(df)
        loaded_meta = pd.read_csv(meta_path)
    finally:
        shutil.rmtree(meta_dir, ignore_errors=True)

    loaded_meta = loaded_meta.set_index('Columns')
    ds2.metadata = loaded_meta

    eq_(ds2.role[2], ds.TARGET)
    eq_(ds2.role[7], ds.IGNORE)
    eq_(ds2.type[1], ds.CATEGORY)
    eq_(ds2.type[5], ds.CATEGORY)
def test_deleted_algorithm():
    """Deleting one algorithm leaves the other one in place.

    The closing lookup of the deleted key is left bare on purpose.
    NOTE(review): presumably it is expected to raise and something outside
    this function (e.g. a decorator) checks for that -- confirm.
    """
    mc = copper.ModelComparison()

    default_lr = LogisticRegression()
    mc['LR'] = default_lr
    eq_(mc['LR'], default_lr)

    l1_lr = LogisticRegression(penalty='l1')
    mc['LR l1'] = l1_lr
    eq_(mc['LR l1'], l1_lr)

    del mc['LR']
    # The other entry survives the deletion.
    eq_(mc['LR l1'], l1_lr)
    # Look up the entry that was just deleted.
    mc['LR']
def test_reproducible():
    """Two DBNs built with the same random_state give identical results."""
    X, y = get_iris()

    def fit_and_predict():
        # Build, fit and query a fresh model with the fixed seed.
        model = DBN([5], random_state=123)
        model.fit(X, y)
        predictions = model.predict(X)
        probabilities = model.predict_proba(X)
        return model, predictions, probabilities

    first, first_pred, first_prob = fit_and_predict()
    second, second_pred, second_prob = fit_and_predict()

    eq_(first.coef_, second.coef_)
    eq_(first_pred, second_pred)
    eq_(first_prob, second_prob)
def test_hinge_loss(mc=None):
    """Check hinge_loss for the 'SVM' and 'LR' models against fixed values.

    ``mc`` is the model comparison to score; when omitted, ``get_mc()``
    supplies one.
    """
    if mc is None:
        mc = get_mc()

    losses = mc.hinge_loss()
    eq_(losses['SVM'], 1.921052, 4)
    eq_(losses['LR'], 2.026315, 4)
def test_create_empty_and_set():
    """An empty Dataset picks up sizes, metadata and labels once a frame is set."""
    source = pd.DataFrame(np.random.rand(10, 5))
    ds = copper.Dataset()

    # Nothing is populated before a frame is assigned.
    eq_(ds.role, pd.Series())
    eq_(ds.type, pd.Series())
    eq_(ds.metadata.empty, True)
    eq_(ds.frame.empty, True)

    ds.frame = source.copy()
    eq_(ds.frame, source)

    # Length follows the rows of the frame.
    eq_(len(ds), 10)
    eq_(len(ds), len(source))

    # Role, type and metadata have one entry per column.
    eq_(len(ds.role), 5)
    eq_(len(ds.type), 5)
    eq_(len(ds.metadata), 5)

    # The role/type accessors match the metadata columns.
    eq_(ds.metadata['Role'], ds.role)
    eq_(ds.metadata['Type'], ds.type)

    eq_(ds.index, source.index)
    eq_(ds.columns, source.columns)

    # String forms match those of the metadata.
    eq_(str(ds), str(ds.metadata))
    eq_(unicode(ds), unicode(ds.metadata))
def test_precision_score(mc=None):
    """Check precision_score for the 'SVM' and 'LR' models against fixed values.

    ``mc`` is the model comparison to score; when omitted, ``get_mc()``
    supplies one.
    """
    if mc is None:
        mc = get_mc()

    precisions = mc.precision_score()
    eq_(precisions['SVM'], 0.976316, 6)
    eq_(precisions['LR'], 0.915414, 6)
def test_tail():
    """Dataset.tail(n) matches DataFrame.tail(n) for a random row count.

    NOTE(review): assumes Dataset exposes ``tail`` alongside ``head`` --
    confirm against the Dataset class.
    """
    df = pd.DataFrame(np.random.rand(5, 10))
    ds = copper.Dataset(df.copy())
    # randrange yields an int row count (math.floor gives a float on Python 2).
    n_rows = random.randrange(10)
    # Exercise tail(), as the test name says, rather than head().
    eq_(ds.tail(n_rows), df.tail(n_rows))