def test_binary_classification(self):
    """Fit, backtest and predict a binary MLP classifier and verify the layout of the resulting frames."""
    """given"""
    data = pd.read_csv(TEST_FILE, index_col='Date')
    data["sma"] = SMA(data["spy_Close"])
    data["is_above"] = (data["spy_Close"] / data["sma"]) > 1

    classifier = MLPClassifier(activation='tanh', hidden_layer_sizes=(60, 50), random_state=42)
    features_and_labels = pdu.FeaturesAndLabels(
        features=['vix_Close'],
        labels=["is_above"],
        targets=lambda frame: frame["sma"],
        gross_loss=lambda frame: frame["spy_Close"] - frame["sma"],
    )
    model = pdu.SkModel(classifier, features_and_labels)

    """when"""
    fit = data.fit(model, test_size=0.4, test_validate_split_seed=42)
    training_frame = fit.training_summary.df
    backtest_frame = data.backtest(fit.model).df
    prediction_frame = data.predict(fit.model, tail=1)

    """then"""
    expected_training_columns = [
        (PREDICTION_COLUMN_NAME, 'is_above'),
        (LABEL_COLUMN_NAME, 'is_above'),
        (GROSS_LOSS_COLUMN_NAME, GROSS_LOSS_COLUMN_NAME),
        (TARGET_COLUMN_NAME, 'sma'),
    ]
    expected_prediction_columns = [
        (PREDICTION_COLUMN_NAME, 'is_above'),
        (TARGET_COLUMN_NAME, 'sma'),
    ]

    self.assertListEqual(training_frame.columns.tolist(), expected_training_columns)
    self.assertEqual(len(training_frame), 4023)
    self.assertEqual(backtest_frame.shape, (6706, 15))
    self.assertListEqual(prediction_frame.columns.tolist(), expected_prediction_columns)
def test_binary_classification_kfold(self):
    """Verify that a user supplied cross validation splitter is invoked while fitting."""
    """given"""
    data = pd.read_csv(TEST_FILE, index_col='Date')
    data['label'] = data["spy_Close"] > data["spy_Open"]

    """and cross validation"""
    splitter_invoked = False
    k_fold = KFold(2)

    def split(x, y=None, group=None):
        # record the invocation, then delegate to the real KFold splitter
        nonlocal splitter_invoked
        splitter_invoked = True
        return k_fold.split(x, y, group)

    """and a model"""
    classifier = MLPClassifier(activation='tanh', hidden_layer_sizes=(60, 50), alpha=0.001, random_state=42)
    features_and_labels = pdu.FeaturesAndLabels(
        features=['vix_Close'],
        labels=['label'],
        gross_loss=lambda df: df["spy_Close"] - df["spy_Open"],
    )
    model = pdu.SkModel(classifier, features_and_labels)

    """when"""
    data.fit(
        model,
        test_size=0.4,
        cross_validation=(1, split),
        test_validate_split_seed=42,
    )

    """then"""
    # no exception thrown
    self.assertTrue(splitter_invoked)
def test__inderval_index_encoder(self):
    """Fit a classifier on IntervalIndex encoded labels and check the last predicted value."""
    """given features and labels"""
    # NOTE(review): `df` is not defined in this method; presumably a module-level
    # fixture that gets two extra columns assigned here - confirm
    df["sma_ratio"] = df["Close"].ta_future_pct_of_mean(0, 20)
    df["forward_sma_ratio"] = df["Close"].ta_future_pct_of_mean(3, 20)

    """and an IntervalIndex"""
    buckets = pd.IntervalIndex.from_breaks(
        [-float("inf"), -0.05, 0.0, 0.05, float("inf")])

    """and a model"""
    model = pmu.SkModel(
        MLPClassifier(activation='tanh', hidden_layer_sizes=(60, 50), random_state=42),
        pmu.FeaturesAndLabels(
            features=['sma_ratio'],
            labels=IntervalIndexEncoder("forward_sma_ratio", buckets)))

    """when"""
    fit = df.fit(
        model,
        test_size=0.4,
        test_validate_split_seed=42,
    )
    predicted = df.predict(fit.model)

    """then"""
    print(predicted)
    # assert on the frame that was already predicted instead of predicting a second time
    self.assertTrue(predicted.values[-1, -1] < 0.1)
def test_extractor(self):
    """Check the frames exposed by the features and label extractor of a model with a pre processor."""
    """given"""
    data = pd.read_csv(TEST_FILE, index_col='Date')

    classifier = MLPClassifier(activation='tanh', hidden_layer_sizes=(60, 50), alpha=0.001, random_state=42)
    features_and_labels = pdu.FeaturesAndLabels(
        features=['feature'],
        labels=['label'],
        gross_loss=lambda df: df["spy_Close"] - df["spy_Open"],
        targets=lambda df: df["spy_Close"],
        # feature and label columns are declared lazily on top of the source frame
        pre_processor=lambda _df: pdu.LazyDataFrame(
            _df,
            feature=lambda f: f["vix_Close"].rolling(2).mean(),
            label=lambda f: (f["spy_Close"].shift(1) > f["spy_Open"]).shift(-1)),
    )
    model = pdu.SkModel(classifier, features_and_labels)

    """when"""
    extractor = data.features_and_label_extractor(model)
    fnl = extractor.features_labels_weights_df

    """then"""
    expected_source_columns = [*data.columns.tolist(), 'feature', 'label']

    self.assertListEqual(extractor.features_df.columns.tolist(), ['feature'])
    self.assertListEqual(extractor.labels_df.columns.tolist(), ['label'])
    self.assertListEqual(
        extractor.source_df[SOURCE_COLUMN_NAME].columns.tolist(),
        expected_source_columns)
    self.assertListEqual(
        extractor.target_df.columns.tolist(),
        [(TARGET_COLUMN_NAME, "spy_Close")])
    self.assertListEqual(
        extractor.gross_loss_df.columns.tolist(),
        [(GROSS_LOSS_COLUMN_NAME, GROSS_LOSS_COLUMN_NAME)])

    expected_shape = (6704, 1)
    self.assertEqual(fnl[0].shape, expected_shape)
    self.assertEqual(fnl[1].shape, expected_shape)
    self.assertIsNone(fnl[2])
def test_pre_processor(self):
    """Fit, backtest and predict with a pre processor that derives the feature and label columns."""
    """given"""
    data = pd.read_csv(TEST_FILE, index_col='Date')

    classifier = MLPClassifier(activation='tanh', hidden_layer_sizes=(60, 50), alpha=0.001, random_state=42)
    features_and_labels = pdu.FeaturesAndLabels(
        features=['feature'],
        labels=['label'],
        label_type=int,
        gross_loss=lambda df: df["spy_Close"] - df["spy_Open"],
        pre_processor=lambda _df: pdu.LazyDataFrame(
            _df,
            feature=lambda f: f["vix_Close"].rolling(2).mean(),
            label=lambda f: (f["spy_Close"].shift(1) > f["spy_Open"]).shift(-1)).to_dataframe(),
    )
    model = pdu.SkModel(classifier, features_and_labels)

    """when"""
    fit = data.fit(model, test_size=0.4, test_validate_split_seed=42)
    backtest = data.backtest(fit.model)
    prediction = data.predict(fit.model, 2)

    """then fit"""
    summary_columns = [
        (PREDICTION_COLUMN_NAME, 'label'),
        (LABEL_COLUMN_NAME, 'label'),
        (GROSS_LOSS_COLUMN_NAME, GROSS_LOSS_COLUMN_NAME),
    ]
    self.assertListEqual(fit.test_summary.df.columns.tolist(), summary_columns)

    """ and backtest"""
    source_columns = [(SOURCE_COLUMN_NAME, c) for c in data.columns]
    derived_columns = [(SOURCE_COLUMN_NAME, "feature"), (SOURCE_COLUMN_NAME, "label")]
    self.assertListEqual(
        backtest.df.columns.tolist(),
        summary_columns + source_columns + derived_columns)
    self.assertEqual(backtest.df.index[-1], "2019-09-13")

    """ and prediction"""
    self.assertListEqual(prediction.columns.tolist(), [(PREDICTION_COLUMN_NAME, 'label')])
    self.assertEqual(prediction.index[-1], "2019-09-16")
    self.assertEqual(len(prediction), 2)
def test_hyper_parameter(self):
    """Fit with a hyper parameter space and verify which alpha ends up on the fitted model."""
    from hyperopt import hp

    """given"""
    data = pd.read_csv(TEST_FILE, index_col='Date')
    data['label'] = data["spy_Close"] > data["spy_Open"]

    classifier = MLPClassifier(activation='tanh', hidden_layer_sizes=(60, 50), random_state=42)
    features_and_labels = pdu.FeaturesAndLabels(
        features=['vix_Close'],
        labels=['label'],
        target_columns=["vix_Open"],
        loss_column="spy_Volume",
    )
    model = pdu.SkModel(classifier, features_and_labels)

    search_space = {
        'alpha': hp.choice('alpha', [0.0001, 10]),
        'early_stopping': True,
        'max_iter': 50,
        '__max_evals': 4,
        '__rstate': np.random.RandomState(42),
    }

    """when fit with find hyper parameter"""
    fit = data.fit(
        model,
        test_size=0.4,
        test_validate_split_seed=42,
        hyper_parameter_space=search_space,
    )

    """then test best parameter"""
    fitted_parameters = fit.model.skit_model.get_params()
    self.assertEqual(fitted_parameters['alpha'], 0.0001)
def test_target_classification(self):
    """Fit one model on two named label sets and verify the column layout per label set."""
    """given"""
    data = pd.read_csv(TEST_FILE, index_col='Date')
    data["sma"] = SMA(data["spy_Close"])
    data["is_above_1.0"] = (data["spy_Close"] / data["sma"]) > 1
    data["is_above_1.2"] = (data["spy_Close"] / data["sma"]) > 1.2

    classifier = MLPClassifier(activation='tanh', hidden_layer_sizes=(60, 50), random_state=42)
    features_and_labels = pdu.FeaturesAndLabels(
        features=['vix_Close'],
        labels={"a": ["is_above_1.0"], "b": ["is_above_1.2"]},
    )
    model = pdu.SkModel(classifier, features_and_labels)

    """when"""
    fit = data.fit(model, test_size=0.4, test_validate_split_seed=42)
    training_frame = fit.training_summary.df
    backtest_frame = data.backtest(fit.model).df
    prediction_frame = data.predict(fit.model, tail=1)

    """then"""
    label_per_set = [('a', 'is_above_1.0'), ('b', 'is_above_1.2')]
    prediction_columns = [(name, PREDICTION_COLUMN_NAME, label) for name, label in label_per_set]
    label_columns = [(name, LABEL_COLUMN_NAME, label) for name, label in label_per_set]

    self.assertListEqual(training_frame.columns.tolist(), prediction_columns + label_columns)
    self.assertListEqual(prediction_frame.columns.tolist(), prediction_columns)
    self.assertEqual(backtest_frame.shape, (6706, 16))
def test_invalid_multi_model(self):
    """Wrapping a MultiModel into another MultiModel is expected to raise a ValueError."""
    """expect"""
    with self.assertRaises(ValueError):
        pdu.MultiModel(
            pdu.MultiModel(
                pdu.SkModel(MLPClassifier(), pdu.FeaturesAndLabels([], {}))
            )
        )
def test_save_load_models(self):
    """Save and reload several fitted models and verify that the restored ones behave identically.

    The models are written into a temporary directory instead of a hard coded
    /tmp path, so the test also works where /tmp does not exist, does not
    collide with concurrent runs and leaves no files behind.
    """
    import os
    import tempfile

    """given"""
    features_and_labels = pmu.FeaturesAndLabels(["a"], ["b"])

    def keras_model_provider(optimizer='adam'):
        model = Sequential()
        model.add(Dense(1, input_dim=1, activation='sigmoid'))
        model.compile(optimizer, loss='mse')
        return model

    providers = [
        pmu.SkModel(
            MLPClassifier(activation='tanh', hidden_layer_sizes=(1, 1), alpha=0.001, random_state=42),
            features_and_labels,
            foo='bar'),
        pmu.SkModel(LogisticRegression(), features_and_labels),
        pmu.SkModel(LinearSVC(), features_and_labels),
        pmu.SkModel(RandomForestClassifier(), features_and_labels),
        pmu.KerasModel(keras_model_provider, features_and_labels),
        pmu.MultiModel(
            pmu.SkModel(LogisticRegression(), pmu.FeaturesAndLabels(["a"], {"b": ["b"]}))),
    ]

    """when"""
    # NOTE(review): `df` is not defined in this method; presumably a module-level fixture - confirm
    fits = [df.fit(mp) for mp in providers]

    # everything that touches the saved files stays inside the context manager,
    # in case a restored model still reads from disk while predicting
    with tempfile.TemporaryDirectory() as tmp_dir:
        models = []
        for i, f in enumerate(fits):
            path = os.path.join(tmp_dir, f'pandas-ml-utils-unittest-test_model_{i}')
            f.save_model(path)
            models.append((f.model, pmu.Model.load(path)))

        """then"""
        for i, (fitted_model, restored_model) in enumerate(models):
            print(f"test model ==> {i}")
            pd.testing.assert_frame_equal(
                df.predict(fitted_model),
                df.predict(restored_model))
            pd.testing.assert_frame_equal(
                df.backtest(fitted_model).df,
                df.backtest(restored_model).df)
def test_binary_classification_summary(self):
    """Fit a binary classifier with a BinaryClassificationSummary and verify its numbers and plots."""
    """given"""
    data = pd.read_csv(TEST_FILE, index_col='Date')
    data['label'] = data["spy_Close"] > data["spy_Open"]

    """when"""
    classifier = MLPClassifier(activation='tanh', hidden_layer_sizes=(60, 50), alpha=0.001, random_state=42)
    features_and_labels = pdu.FeaturesAndLabels(
        features=['vix_Close'],
        labels=['label'],
        gross_loss=lambda df: df["spy_Close"] - df["spy_Open"],
    )
    model = pdu.SkModel(classifier, features_and_labels, BinaryClassificationSummary)
    fit = data.fit(model, test_size=0.4, test_validate_split_seed=42)

    """then confusion matrix"""
    self.assertEqual(fit.model.features_and_labels.min_required_samples, 1)

    training_confusion = fit.training_summary.get_confusion_matrix()
    np.testing.assert_array_equal(training_confusion, np.array([[1067, 872], [1002, 1082]]))

    test_confusion = fit.test_summary.get_confusion_matrix()
    np.testing.assert_array_equal(test_confusion, np.array([[744, 586], [655, 698]]))

    """ and confusion loss"""
    confusion_loss = fit.test_summary.get_confusion_loss()
    np.testing.assert_array_almost_equal(
        confusion_loss,
        np.array([[374.90, -234.83], [561.48, -650.63]]),
        2)

    """ and ratios"""
    ratios = np.array(fit.test_summary.get_ratios())
    np.testing.assert_array_almost_equal(ratios, np.array((0.78, 0.88)), 2)

    """ and metrics"""
    metrics = np.array(list(fit.test_summary.get_metrics().values()))
    np.testing.assert_array_almost_equal(metrics, np.array([0.78, 0.88, 0.54]), 2)

    """ and plot_classification"""
    figure_reprs = {
        k: repr(v)
        for k, v in fit.test_summary.plot_classification().items()
    }
    self.assertDictEqual(figure_reprs, {None: '<Figure size 1600x900 with 2 Axes>'})

    if os.environ.get('USER') == 'kic':
        # FIXME nowadays this test fails on github while it still passes locally. we need a better assertion
        png_sizes = {
            k: len(fig_to_png_base64(v))
            for k, v in fit.test_summary.plot_classification().items()
        }
        self.assertDictEqual(png_sizes, {None: 141863})

        """ and _repr_html_"""
        # NOTE(review): presumably this length depends on the rendered figure as well,
        # which is why it sits under the USER guard - confirm the intended scope of the guard
        self.assertEqual(len(fit.test_summary._repr_html_()), 145167)
def test_multi_model_binary_classifications(self):
    """Fit a MultiModel on two binary label sets and verify the column layout per label set."""
    """given"""
    data = pd.read_csv(TEST_FILE, index_col='Date')
    data["sma"] = SMA(data["spy_Close"])
    data["is_above_1.0"] = (data["spy_Close"] / data["sma"]) > 1
    data["is_above_1.2"] = (data["spy_Close"] / data["sma"]) > 1.2

    classifier = MLPClassifier(activation='tanh', hidden_layer_sizes=(60, 50), random_state=42)
    features_and_labels = pdu.FeaturesAndLabels(
        features=['vix_Close'],
        labels={
            "a": ["is_above_1.0"],
            "b": ["is_above_1.2"],
        },
        targets=lambda frame, t: frame["sma"].rename(f"sma {t}"),
        gross_loss=lambda frame: frame["spy_Close"] - frame["sma"],
    )
    model = pdu.MultiModel(pdu.SkModel(classifier, features_and_labels))

    """when"""
    fit = data.fit(model, test_size=0.4, test_validate_split_seed=42)
    training_frame = fit.training_summary.df
    backtest_frame = data.backtest(fit.model).df
    prediction_frame = data.predict(fit.model, tail=1)

    """then"""
    label_per_set = [('a', 'is_above_1.0'), ('b', 'is_above_1.2')]
    prediction_columns = [(name, PREDICTION_COLUMN_NAME, label) for name, label in label_per_set]
    label_columns = [(name, LABEL_COLUMN_NAME, label) for name, label in label_per_set]
    loss_columns = [(name, GROSS_LOSS_COLUMN_NAME, name) for name, _ in label_per_set]
    target_columns = [(name, TARGET_COLUMN_NAME, f'sma {name}') for name, _ in label_per_set]

    self.assertListEqual(
        training_frame.columns.tolist(),
        prediction_columns + label_columns + loss_columns + target_columns)
    self.assertListEqual(
        prediction_frame.columns.tolist(),
        prediction_columns + target_columns)
    self.assertEqual(backtest_frame.shape, (6706, 20))
def test_model_with_LazyDataFrame_copy(self):
    """Save and reload a model carrying a LazyDataFrame in its kwargs and verify that it survives.

    The model is written into a temporary directory instead of a hard coded
    /tmp path, so the test also works where /tmp does not exist and leaves
    no files behind.
    """
    import os
    import tempfile

    """given"""
    model = pmu.SkModel(
        MLPClassifier(activation='tanh', hidden_layer_sizes=(1, 1), alpha=0.001, random_state=42),
        pmu.FeaturesAndLabels([], []),
        foo='bar',
        ldf=LazyDataFrame(None, foo=lambda _f: 'bar'))

    """when"""
    # assertions stay inside the context manager in case the restored model
    # still needs the file on disk
    with tempfile.TemporaryDirectory() as tmp_dir:
        path = os.path.join(tmp_dir, 'pandas-ml-utils-unittest-test_model_LDF')
        model.save(path)
        model2 = pmu.Model.load(path)

        """then"""
        self.assertEqual(model.kwargs["ldf"], model2.kwargs["ldf"])
        self.assertEqual(model.kwargs["ldf"].kwargs['foo'](None), 'bar')
        self.assertEqual(model2.kwargs["ldf"].kwargs['foo'](None), 'bar')
def test_fit_and_co(self):
    """Call fit, backtest and predict directly on a LazyDataFrame and verify the resulting lengths."""
    """given"""
    rows = 100
    source = pd.read_csv(TEST_FILE, index_col='Date').tail(rows)
    lazy_frame = pdu.LazyDataFrame(
        source,
        sma=lambda f: f["vix_Close"].rolling(2).mean(),
        label=lambda f: f["spy_Close"] > f["spy_Open"])

    classifier = MLPClassifier(activation='tanh', hidden_layer_sizes=(60, 50), random_state=42)
    model = pdu.SkModel(classifier, pdu.FeaturesAndLabels(["sma"], ["label"]))

    """when"""
    fit = lazy_frame.fit(model)
    backtest = lazy_frame.backtest(fit.model)
    prediction = lazy_frame.predict(fit.model)

    """then"""
    self.assertEqual(len(fit.test_summary.df), 40)
    self.assertEqual(len(backtest.df), rows - 1)
    self.assertEqual(len(prediction), rows - 1)
def test_discrete_encoded_classes(self):
    """Fit on a one hot encoded discrete label and verify the names of the predicted columns."""
    """given"""
    data = pd.read_csv(TEST_FILE, index_col='Date')
    data["sma"] = SMA(data["spy_Close"])

    # label is 0, 1 or 2: below -2%, within +/-2% or above +2% relative to the sma
    deviation = data["spy_Close"] / data["sma"] - 1
    above = (deviation > 0.02).astype(int)
    below = (deviation < -0.02).astype(int)
    data["label"] = (above - below) + 1

    classifier = MLPClassifier(activation='tanh', hidden_layer_sizes=(60, 50), random_state=42)
    features_and_labels = pdu.FeaturesAndLabels(
        features=['vix_Close'],
        labels=OneHotEncodedDiscrete("label", 3),
    )
    model = pdu.SkModel(classifier, features_and_labels)

    """when"""
    fit = data.fit(model, test_size=0.4, test_validate_split_seed=42)
    prediction_frame = data.predict(fit.model, tail=1)

    """then"""
    expected_columns = [(PREDICTION_COLUMN_NAME, f'label_{i}') for i in range(3)]
    self.assertListEqual(prediction_frame.columns.tolist(), expected_columns)
def test_fit_regressor_mutiple_target(self):
    """Fit a regressor on two named label sets with per set targets and verify the predicted columns."""
    """given"""
    data = pd.read_csv(TEST_FILE, index_col='Date') / 50.

    regressor = MLPRegressor(
        activation='tanh',
        hidden_layer_sizes=(4, 3, 2, 1, 2, 3, 4),
        random_state=42)
    features_and_labels = pdu.FeaturesAndLabels(
        features=['spy_Open', 'spy_High', 'spy_Low', 'spy_Close'],
        labels={
            "a": ['vix_Open'],
            "b": ['vix_High', 'vix_Low', 'vix_Close'],
        },
        targets=lambda frame, t: frame[['vix_High', 'vix_Low']].add_prefix(f"{t}_"),
    )

    """when"""
    fit = data.fit(
        pdu.SkModel(regressor, features_and_labels),
        test_size=0.4,
        test_validate_split_seed=42)
    fitted_model = fit.model

    # backtest
    backtest_regression = data.backtest(fitted_model)
    self.assertIsNotNone(backtest_regression)

    # regressed
    regressed = data.predict(fitted_model)

    """then"""
    labels_per_set = (('a', ['vix_Open']), ('b', ['vix_High', 'vix_Low', 'vix_Close']))
    prediction_columns = [
        (name, 'prediction', column)
        for name, columns in labels_per_set
        for column in columns
    ]
    target_columns = [
        (name, 'target', f'{name}_{column}')
        for name in ('a', 'b')
        for column in ('vix_High', 'vix_Low')
    ]

    self.assertListEqual(regressed.columns.tolist(), prediction_columns + target_columns)
    self.assertEqual(len(regressed), 6706)
def test_multi_class_classification(self):
    """Fit on one hot encoded target buckets and verify the bucket and target column layout."""
    """given"""
    data = pd.read_csv(TEST_FILE, index_col='Date')
    data["sma"] = SMA(data["spy_Close"])
    data["label"] = data["spy_Close"] / data["sma"] - 1

    def make_targets(frame):
        # one target column per offset, each being the row's sma minus that offset
        space = np.array([-1, -0.05, 0.5, 1])
        res = frame.apply(lambda x: x["sma"] - space, axis=1, result_type='expand')
        res.columns = ["close <0.1", "close <0.05", "close >0", "close >0.05"]
        return res

    classifier = MLPClassifier(activation='tanh', hidden_layer_sizes=(60, 50), random_state=42)
    features_and_labels = pdu.FeaturesAndLabels(
        features=['vix_Close'],
        labels=OneHotEncodedTargets("label", np.linspace(-0.1, 0.1, 5, endpoint=True)),
        targets=make_targets,
    )
    model = pdu.SkModel(classifier, features_and_labels)

    """when"""
    fit = data.fit(model, test_size=0.4, test_validate_split_seed=42)
    training_frame = fit.training_summary.df
    backtest_frame = data.backtest(fit.model).df
    prediction_frame = data.predict(fit.model, tail=1)

    """then"""
    bucket_names = [
        '(-inf, -0.05]',
        '(-0.05, 0.0]',
        '(0.0, 0.05000000000000002]',
        '(0.05000000000000002, inf]',
    ]
    target_names = ['close <0.1', 'close <0.05', 'close >0', 'close >0.05']

    prediction_columns = [(PREDICTION_COLUMN_NAME, bucket) for bucket in bucket_names]
    label_columns = [(LABEL_COLUMN_NAME, bucket) for bucket in bucket_names]
    target_columns = [(TARGET_COLUMN_NAME, target) for target in target_names]

    self.assertEqual(len(training_frame), 4023)
    self.assertListEqual(
        training_frame.columns.tolist(),
        prediction_columns + label_columns + target_columns)
    self.assertListEqual(
        prediction_frame.columns.tolist(),
        prediction_columns + target_columns)
    self.assertEqual(backtest_frame.shape, (6706, 23))
def test_multi_model_multi_class_classifications(self):
    """Fit a MultiModel on two one hot encoded label sets and verify the column layout per label set."""
    """given"""
    data = pd.read_csv(TEST_FILE, index_col='Date')
    data["sma"] = SMA(data["spy_Close"])
    data["is_above_1.0"] = (data["spy_Close"] / data["sma"]) + 1
    data["is_above_1.2"] = (data["spy_Close"] / data["sma"]) + 2

    classifier = MLPClassifier(activation='tanh', hidden_layer_sizes=(60, 50), random_state=42)
    features_and_labels = pdu.FeaturesAndLabels(
        features=['vix_Close'],
        labels={
            "1": OneHotEncodedTargets(
                "is_above_1.0",
                np.linspace(-0.1, 0.1, 5, endpoint=True) + 1),
            "2": OneHotEncodedTargets(
                "is_above_1.2",
                np.linspace(-0.1, 0.1, 5, endpoint=True) + 2),
        },
        targets=lambda frame, t: (frame["sma"] + int(t)).rename(f"sma {t}"),
        gross_loss=lambda frame: frame["spy_Close"] - frame["sma"],
    )
    model = pdu.MultiModel(pdu.SkModel(classifier, features_and_labels))

    """when"""
    fit = data.fit(model, test_size=0.4, test_validate_split_seed=42)
    training_frame = fit.training_summary.df
    backtest_frame = data.backtest(fit.model).df
    prediction_frame = data.predict(fit.model, tail=1)

    """then"""
    buckets_per_set = [
        ('1', ['(-inf, 0.95]', '(0.95, 1.0]', '(1.0, 1.05]', '(1.05, inf]']),
        ('2', ['(-inf, 1.95]', '(1.95, 2.0]', '(2.0, 2.05]', '(2.05, inf]']),
    ]
    prediction_columns = [
        (name, 'prediction', bucket)
        for name, buckets in buckets_per_set
        for bucket in buckets
    ]
    label_columns = [
        (name, 'label', bucket)
        for name, buckets in buckets_per_set
        for bucket in buckets
    ]
    loss_columns = [(name, GROSS_LOSS_COLUMN_NAME, name) for name, _ in buckets_per_set]
    target_columns = [(name, TARGET_COLUMN_NAME, f'sma {name}') for name, _ in buckets_per_set]

    print(training_frame.columns.tolist())
    self.assertListEqual(
        training_frame.columns.tolist(),
        prediction_columns + label_columns + loss_columns + target_columns)
    self.assertListEqual(
        prediction_frame.columns.tolist(),
        prediction_columns + target_columns)
    self.assertEqual(backtest_frame.shape, (6706, 32))