def test_data_indexing():
    """test data indexing"""
    d = micro_data()

    # indexing by era selects exactly the listed rows
    msg = 'error indexing data by era'
    era_cases = (('era1', [0]),
                 ('era2', [1, 2]),
                 ('era3', [3, 4, 5]),
                 ('era4', [6]),
                 ('eraX', [7, 8, 9]))
    for era, rows in era_cases:
        ade(d[era], micro_data(rows), msg)

    # indexing by region selects exactly the listed rows
    msg = 'error indexing data by region'
    region_cases = (('train', [0, 1, 2]),
                    ('validation', [3, 4, 5, 6]),
                    ('test', [7, 8]),
                    ('live', [9]))
    for region, rows in region_cases:
        ade(d[region], micro_data(rows), msg)

    msg = 'error indexing data by array'
    # TODO
    # a = d.y['kazutsugi']
    # b = d[a]
    # c = b == 0
    # ade(d[d.y['kazutsugi'] == 0], micro_data([0, 2, 4, 6, 8, 9]), msg)
    # ade(d[d.era == 'era4'], micro_data([6]), msg)

    # unrecognized string keys and None must raise
    for bad_key in ('era', 'wtf', None):
        assert_raises(IndexError, d.__getitem__, bad_key)
def test_data_loc():
    "test data.loc"
    d = micro_data()
    msg = 'data.loc indexing error'
    # each id list should select the matching rows, preserving order
    cases = ((['index1'], [1]),
             (['index4'], [4]),
             (['index4', 'index0'], [4, 0]),
             (['index4', 'index0', 'index2'], [4, 0, 2]))
    for ids, rows in cases:
        ade(d.loc[ids], micro_data(rows), msg)
def test_data_methods():
    "test data methods"
    data = micro_data()
    # basic container protocol and equality checks
    checks = ((len(data) == 10, "wrong length"),
              (data.size == 120, "wrong size"),
              (data.shape == (10, 12), "wrong shape"),
              (data == data, "not equal"))
    for passed, message in checks:
        ok_(passed, message)
def test_data_y_indexing():
    "test data y indexing"
    d = micro_data()
    msg = 'y arrays not equal'
    # (tournament number, tournament name, expected y column)
    expected = ((1, 'bernie', [0, 1, 0, 1, 0, 1, 0, 1, 0, 0]),
                (2, 'elizabeth', [0, 1, 1, 1, 0, 1, 1, 1, 0, 1]),
                (3, 'jordan', [1, 1, 1, 0, 0, 1, 0, 1, 0, 0]),
                (4, 'ken', [1, 1, 1, 1, 1, 1, 1, 1, 0, 0]),
                (5, 'charles', [0, 0, 1, 0, 0, 0, 1, 1, 0, 1]))
    # number index and name index must return the same column
    for number, name, column in expected:
        assert_array_equal(d.y[number], column, msg)
        assert_array_equal(d.y[name], column, msg)
    # full slice stacks all tournaments as columns
    y = np.vstack([[column] for _, _, column in expected]).T
    assert_array_equal(d.y[:], y, msg)
    # invalid indexers raise
    for bad in (0, 'era', 'wtf', None, slice(1)):
        assert_raises(IndexError, d.y.__getitem__, bad)
def test_data_roundtrip():
    "save/load roundtrip shouldn't change data"
    d = micro_data()
    path = None
    try:
        path = testing.create_tempfile('numerox.h5')
        # plain save
        d.save(path)
        ade(d, nx.load_data(path), "data corrupted during roundtrip")
        # compressed save
        d.save(path, compress=True)
        ade(d, nx.load_data(path), "data corrupted during roundtrip")
        # save of a region slice
        d = d['live']
        d.save(path)
        ade(d, nx.load_data(path), "data corrupted during roundtrip")
    finally:
        testing.delete_tempfile(path)
def test_data_balance():
    "test data.balance"
    tournament = 1
    d = micro_data()

    def _assert_balanced(b, era):
        # a balanced era has as many 0 labels as 1 labels
        y = b[era].y[tournament]
        ok_((y == 0).sum() == (y == 1).sum(), "y is not balanced")

    # check balance
    b = d.balance(tournament, train_only=False)
    for era in b.unique_era():
        if era != 'eraX':
            _assert_balanced(b, era)

    # check balance
    b = d.balance(tournament, train_only=True)
    for era in np.unique(b.era[b.region == 'train']):
        _assert_balanced(b, era)

    # balance already balanced data (regression test)
    d.balance(tournament).balance(tournament)
def test_metrics_per_era():
    "make sure calc_metrics runs"
    d = micro_data()
    p = micro_prediction()
    # default join, then each supported join mode
    metrics_per_era(d, p)
    for join in ('yhat', 'inner'):
        metrics_per_era(d, p, join)
    # unsupported join mode raises
    assert_raises(ValueError, metrics_per_era, d, p, 'outer')
def test_metrics_per_name():
    "make sure metrics_per_name runs"
    data = testing.micro_data()
    pred = testing.micro_prediction()
    # exercise default, join, and columns keyword paths
    metrics_per_name(data, pred, 1)
    metrics_per_name(data, pred, 2, join='yhat')
    metrics_per_name(data, pred, 3, columns=['sharpe'])
    # unknown column name raises
    assert_raises(ValueError, metrics_per_name, data, pred, 4, 'data', ['wtf'])
def test_prediction_roundtrip():
    "save/load roundtrip shouldn't change prediction"
    data = micro_data()
    model = nx.logistic()
    pred = nx.production(model, data, verbosity=0)
    with tempfile.NamedTemporaryFile() as temp:
        pred.save(temp.name)
        reloaded = nx.load_prediction(temp.name)
        ade(pred, reloaded, "prediction corrupted during roundtrip")
def test_data_era_isnotin():
    "test data.era_isnotin"
    d = micro_data()
    eras = ['era3', 'eraX']
    # isnotin and isin must partition the data; concatenating both and
    # reordering by the original ids should reproduce the original data
    outside = d.era_isnotin(eras)
    inside = d.era_isin(eras)
    combined = nx.concat_data([outside, inside]).loc[d.ids]
    ade(combined, d, "all rows not selected")
def test_metrics_per_era():
    "make sure metrics_per_era runs"
    data = testing.micro_data()
    pred = testing.micro_prediction()
    # default join plus each supported join keyword
    metrics_per_era(data, pred, 1)
    metrics_per_era(data, pred, 2, join='yhat')
    metrics_per_era(data, pred, 3, join='inner')
    # unsupported join raises
    assert_raises(ValueError, metrics_per_era, data, pred, 4, 'outer')
    with testing.HiddenPrints():
        metrics_per_era(data, pred, tournament=5, era_as_str=True)
def test_data_roundtrip():
    "Saving and then loading data shouldn't change data"
    d = micro_data()
    with tempfile.NamedTemporaryFile() as temp:
        # roundtrip both uncompressed and compressed saves
        for kwargs in ({}, {'compress': True}):
            d.save(temp.name, **kwargs)
            d2 = load_data(temp.name)
            ade(d, d2, "data corrupted during roundtrip")
def test_backtest_production():
    "Make sure backtest and production run"
    d = testing.micro_data()
    model = fifty()
    with testing.HiddenPrints():
        # smoke-test every verbosity level
        for verbosity in range(4):
            nx.backtest(model, d, kfold=2, verbosity=verbosity)
            nx.production(model, d, verbosity=verbosity)
            if verbosity == 3:
                nx.production(model, d, name='test', verbosity=verbosity)
def test_data_properties():
    """data properties should not be corrupted"""
    d = micro_data()
    ok_((d.ids == d.df.index).all(), "ids is corrupted")
    ok_((d.era_float == d.df.era).all(), "era is corrupted")
    ok_((d.region_float == d.df.region).all(), "region is corrupted")
    # compare y only where it is not missing
    idx = ~pd.isnull(d.y[:])
    y = d.df[['kazutsugi']].values
    # bug fix: the mask and expected values above were computed but never
    # asserted (dead code); the sibling multi-tournament version of this
    # test performs exactly this check
    ok_((d.y[:][idx] == y[idx]).all(), "y is corrupted")
def test_prediction_performance():
    "make sure prediction.performance runs"
    d = testing.micro_data()
    p = testing.micro_prediction()
    df = p.performance(d)
    ok_(isinstance(df, pd.DataFrame), 'expecting a dataframe')
    # every supported sort column should run without error
    for metric in ('auc', 'acc', 'ystd', 'sharpe', 'consis'):
        p.performance(d, sort_by=metric)
def test_data_properties():
    "prediction properties should not be corrupted"
    d = testing.micro_data()
    p = nx.Prediction()
    # merge the same y under two model names
    for model_name in ('model1', 'model2'):
        p = p.merge_arrays(d.ids, d.y, model_name)
    ok_((p.ids == p.df.index).all(), "ids is corrupted")
    ok_((p.ids == d.df.index).all(), "ids is corrupted")
    # both columns should reproduce the original y
    for col in (0, 1):
        ok_((p.y[:, col] == d.df.y).all(), "y is corrupted")
def test_data_copies():
    "data properties should be copies"
    d = micro_data()
    ok_(shares_memory(d, d), "looks like shares_memory failed")
    # bug fix: the original used `~shares_memory(...)`, but bitwise `~` on a
    # Python bool yields -2 or -1 — both truthy — so these assertions could
    # never fail; logical `not` is required (the views/copies variant of this
    # test already uses `not`)
    ok_(not shares_memory(d, d.copy()), "should be a copy")
    ok_(not shares_memory(d, d.ids), "d.ids should be a copy")
    ok_(not shares_memory(d, d.era), "d.era should be a copy")
    ok_(not shares_memory(d, d.region), "d.region should be a copy")
    ok_(not shares_memory(d, d.x), "d.x should be a copy")
    ok_(not shares_memory(d, d.y), "d.y should be a copy")
def test_splitter_overlap():
    "prediction data should not overlap"
    d = micro_data()
    splitters = [tournament_splitter(d),
                 validation_splitter(d),
                 cheat_splitter(d),
                 cv_splitter(d, kfold=2),
                 split_splitter(d, fit_fraction=0.5)]
    for splitter in splitters:
        # collect every predict-side id; duplicates mean folds overlap
        seen = []
        for dfit, dpredict in splitter:
            seen += dpredict.ids.tolist()
        ok_(len(seen) == len(set(seen)), "ids overlap")
def test_data_era_iter():
    "test data.era_iter"
    d = micro_data()
    for as_str in (True, False):
        seen = []
        for era, idx in d.era_iter(as_str=as_str):
            seen.append(era)
            # each yielded index must cover exactly one era
            ok_(np.unique(d[idx].era).size == 1, "expecting a single era")
        # the iterator must visit every unique era exactly once
        expected = sorted(d.unique_era(as_str=as_str).tolist())
        ok_(sorted(seen) == expected, "era difference found")
def test_data_region_iter():
    "test data.region_iter"
    d = micro_data()
    for as_str in (True, False):
        seen = []
        for region, idx in d.region_iter(as_str=as_str):
            seen.append(region)
            # each yielded index must cover exactly one region
            ok_(np.unique(d[idx].region).size == 1,
                "expecting a single region")
        # the iterator must visit every unique region exactly once
        expected = sorted(d.unique_region(as_str=as_str).tolist())
        ok_(sorted(seen) == expected, "region difference found")
def test_data_properties():
    "data properties should not be corrupted"
    d = micro_data()
    ok_((d.ids == d.df.index).all(), "ids is corrupted")
    ok_((d.era == d.df.era).all(), "era is corrupted")
    ok_((d.region == d.df.region).all(), "region is corrupted")
    # compare y only where it is not NaN
    mask = ~np.isnan(d.df.y)
    ok_((d.y[mask] == d.df.y[mask]).all(), "y is corrupted")
    # every x column must match the underlying dataframe column
    x = d.x
    for i, name in enumerate(d._x_names()):
        ok_((x[:, i] == d.df[name]).all(), "%s is corrupted" % name)
def test_data_properties():
    "data properties should not be corrupted"
    d = micro_data()
    ok_((d.ids == d.df.index).all(), "ids is corrupted")
    ok_((d.era_float == d.df.era).all(), "era is corrupted")
    ok_((d.region_float == d.df.region).all(), "region is corrupted")
    # compare y across all tournaments, skipping NaNs
    mask = ~np.isnan(d.y[:])
    names = ['bernie', 'elizabeth', 'jordan', 'ken', 'charles']
    y = d.df[names].values
    ok_((d.y[:][mask] == y[mask]).all(), "y is corrupted")
    # every x column must match the underlying dataframe column
    x = d.x
    for i, name in enumerate(d.column_list(x_only=True)):
        ok_((x[:, i] == d.df[name]).all(), "%s is corrupted" % name)
def test_empty_data():
    "test empty data"
    d = micro_data()
    d['eraXXX']
    d['eraYYY'].__repr__()
    # bug fix: np.bool was deprecated in NumPy 1.20 and removed in 1.24;
    # the builtin bool is the correct dtype here
    idx = np.zeros(len(d), dtype=bool)
    d0 = d[idx]
    # an all-False mask must yield a fully empty Data object
    ok_(len(d0) == 0, "empty data should have length 0")
    ok_(d0.size == 0, "empty data should have size 0")
    ok_(d0.shape[0] == 0, "empty data should have d.shape[0] == 0")
    ok_(d0.era.size == 0, "empty data should have d.era.size == 0")
    ok_(d0.region.size == 0, "empty data should have d.region.size == 0")
    ok_(d0.x.size == 0, "empty data should have d.x.size == 0")
    ok_(d0.y[:].size == 0, "empty data should have d.y.size == 0")
    # adding two empty selections stays empty
    d2 = d['era0'] + d[idx]
    ok_(len(d2) == 0, "empty data should have length 0")
def test_data_copies():
    """data properties should be copies or views"""
    d = micro_data()
    ok_(shares_memory(d, d), "looks like shares_memory failed")
    # these properties must be independent copies
    copy_cases = ((d.copy(), "should be a copy"),
                  (d.era, "d.era should be a copy"),
                  (d.region, "d.region should be a copy"),
                  (d.ids, "d.ids should be a copy"))
    for prop, message in copy_cases:
        ok_(not shares_memory(d, prop), message)
    # these properties are views into the underlying data
    ok_(shares_memory(d, d.era_float), "d.era_float should be a view")
    ok_(shares_memory(d, d.region_float), "d.region_float should be a view")
def test_data_roundtrip():
    "save/load roundtrip shouldn't change data"
    d = micro_data()
    with tempfile.NamedTemporaryFile() as temp:
        # uncompressed save
        d.save(temp.name)
        ade(d, nx.load_data(temp.name), "data corrupted during roundtrip")
        # compressed save
        d.save(temp.name, compress=True)
        ade(d, nx.load_data(temp.name), "data corrupted during roundtrip")
        # save of a region slice
        d = d['live']
        d.save(temp.name)
        ade(d, nx.load_data(temp.name), "data corrupted during roundtrip")
def test_report_performance_df():
    "make sure report.performance_df runs"
    d = micro_data()
    d = d['train'] + d['validation']
    p = Prediction()
    p.append(d.ids, d.y)
    r = Report()
    # register the same prediction under three model names
    for model in ('model1', 'model2', 'model3'):
        r.append_prediction(p, model)
    df = r.performance_df(d)
    ok_(isinstance(df, pd.DataFrame), 'expecting a dataframe')
def test_prediction_add():
    "add two predictions together"
    d = micro_data()
    p1 = Prediction()
    p2 = Prediction()
    d1 = d['train']
    d2 = d['tournament']
    rs = np.random.RandomState(0)
    # random yhats centered on 0.5 with a small spread
    y_train = 0.2 * (rs.rand(len(d1)) - 0.5) + 0.5
    y_tourn = 0.2 * (rs.rand(len(d2)) - 0.5) + 0.5
    p1.append(d1.ids, y_train)
    p2.append(d2.ids, y_tourn)
    p = p1 + p2  # just make sure that it runs
    # adding predictions with overlapping ids must raise
    assert_raises(IndexError, p.__add__, p1)
    assert_raises(IndexError, p1.__add__, p1)
def test_data_y_indexing():
    """test data y indexing (only checking kazutsugi)"""
    d = micro_data()
    msg = 'y arrays not equal'
    expected = [0, 1, 0, 1, 0, 1, 0, 0, 0, 0]
    # number index and name index must return the same column
    assert_array_equal(d.y[8], expected, msg)
    assert_array_equal(d.y['kazutsugi'], expected, msg)
    # invalid indexers raise
    for bad in (0, 'era', 'wtf', None, slice(1)):
        assert_raises(IndexError, d.y.__getitem__, bad)
def test_prediction_add():
    "add two predictions together"
    d = testing.micro_data()
    p1 = nx.Prediction()
    p2 = nx.Prediction()
    d1 = d['train']
    d2 = d['tournament']
    rs = np.random.RandomState(0)
    # random yhats centered on 0.5 with a small spread
    y_train = 0.2 * (rs.rand(len(d1)) - 0.5) + 0.5
    y_tourn = 0.2 * (rs.rand(len(d2)) - 0.5) + 0.5
    p1 = p1.merge_arrays(d1.ids, y_train, 'model1')
    p2 = p2.merge_arrays(d2.ids, y_tourn, 'model1')
    p = p1 + p2  # just make sure that it runs
    # adding predictions with overlapping ids/names must raise
    assert_raises(ValueError, p.__add__, p1)
    assert_raises(ValueError, p1.__add__, p1)
def test_backtest_production():
    "Make sure backtest and production run"
    d = testing.micro_data()
    model = nx.fifty()
    with testing.HiddenPrints():
        # default runs cover all tournaments
        p = nx.production(model, d)
        ok_(p.shape[1] == 5, 'wrong number of tournaments')
        ok_(p.tournaments() == nx.tournament_all(), 'wrong tournaments')
        p = nx.backtest(model, d, kfold=2)
        ok_(p.shape[1] == 5, 'wrong number of tournaments')
        ok_(p.tournaments() == nx.tournament_all(), 'wrong tournaments')
        # smoke-test tournament selection by number, name, and None at
        # every verbosity level
        for verbosity in range(4):
            nx.backtest(model, d, tournament=3, kfold=2,
                        verbosity=verbosity)
            nx.production(model, d, tournament='ken', verbosity=verbosity)
            nx.production(model, d, tournament=4, verbosity=verbosity)
            nx.production(model, d, tournament=None, verbosity=verbosity)
            if verbosity == 3:
                nx.production(model, d, tournament=5, verbosity=verbosity)
                nx.production(model, d, tournament='charles',
                              verbosity=verbosity)