def test_partial_fit_regression(self):
    """Compare a batched MLP fit against a plain MLP fit on the same data.

    One model is trained with ``max_iter=1`` together with ``batch_size=10``
    and ``fold_epochs=10``; the other with ``max_iter=10`` and default
    fitting parameters.  Both regressors share ``random_state=42``.  The
    test asserts that the last column of the first predicted row agrees to
    four decimal places.
    """
    def labelled_features():
        # a fresh definition per model so the two fits share no state
        return FeaturesAndLabels(features=[0, 1], labels=['label'])

    features, target = make_regression(100, 2, 1)
    df = pd.DataFrame(features)
    df["label"] = target

    # fit with max_iter=1 and explicit batch/fold-epoch settings
    with df.model() as m:
        batched_model = SkModel(
            MLPRegressor(max_iter=1, random_state=42),
            labelled_features(),
        )
        batched_params = FittingParameter(
            naive_splitter(0.3),
            batch_size=10,
            fold_epochs=10,
        )
        fit_partial = m.fit(batched_model, batched_params)

    # fit with max_iter=10 and default fitting parameters
    with df.model() as m:
        plain_model = SkModel(
            MLPRegressor(max_iter=10, random_state=42),
            labelled_features(),
        )
        fit = m.fit(plain_model, FittingParameter(naive_splitter(0.3)))

    plain_value = df.model.predict(fit.model).iloc[0, -1]
    batched_value = df.model.predict(fit_partial.model).iloc[0, -1]
    self.assertAlmostEqual(plain_value, batched_value, 4)
def test_regressor(self):
    """given some toy regression data"""
    df = pd.DataFrame({
        "a": [-1.0, 0.0, 1.0, 2.0, 3.0, 4.0],
        "b": [-2.0, 1.0, 4.0, 7.0, 10.0, 13.0]
    })

    """and a model"""
    model = self.provide_regression_model(
        FeaturesAndLabels(features=["a"], labels=["b"]))

    """when we fit the model"""
    batch_size, epochs = self.provide_batch_size_and_epoch()
    with df.model() as m:
        fit = m.fit(
            model,
            FittingParameter(splitter=naive_splitter(0.3),
                             batch_size=batch_size,
                             epochs=epochs),
            verbose=0)

    print(fit.training_summary.df)
    # 6 rows in total: 4 are expected in the training summary, 2 in the test summary
    self.assertEqual(4, len(fit.training_summary.df))
    self.assertEqual(2, len(fit.test_summary.df))

    """then we can predict"""
    prediction = df.model.predict(fit.model)
    np.testing.assert_array_almost_equal(prediction.iloc[:, 0].values,
                                         df["b"].values, 1)

    """and save and load the model"""
    temp = os.path.join(tempfile.gettempdir(), str(uuid.uuid4()))
    try:
        fit.model.save(temp)
        copy = Model.load(temp)
        # NOTE(review): `check_less_precise` is deprecated in newer pandas
        # releases -- confirm against the pinned pandas version.
        pd.testing.assert_frame_equal(df.model.predict(fit.model),
                                      df.model.predict(copy),
                                      check_less_precise=True)
    finally:
        # If `save` failed before creating the file, an unconditional
        # `os.remove` would raise FileNotFoundError and hide the real error.
        if os.path.exists(temp):
            os.remove(temp)
def test_linear_model(self):
    """Fit a Lasso on a copy of DF_NOTES and check the backtested coefficient.

    Uses one raw feature ("variance") and one engineered feature
    (skewness / kurtosis, renamed to "engineered") with the "authentic"
    label, then asserts that the first coefficient of the backtested
    sklearn model is below 1e-5.
    """
    df = DF_NOTES.copy()

    # features are given as callables evaluated against the frame
    feature_extractors = [
        lambda df: df["variance"],
        lambda df: (df["skewness"] / df["kurtosis"]).rename("engineered"),
    ]

    with df.model() as m:
        lasso_model = SkModel(
            Lasso(),
            FeaturesAndLabels(features=feature_extractors, labels=['authentic']),
        )
        fit = m.fit(lasso_model, FittingParameter(naive_splitter()))

    print(fit)

    prediction = df.model.predict(fit.model)
    print(prediction)

    backtest = df.model.backtest(fit.model)
    first_coefficient = backtest.model.sk_model.coef_[0]
    self.assertLess(first_coefficient, 1e-5)
def test_simple_sample_split_multiindex_row(self):
    """Check that a multi-index row split keeps both top-level groups.

    With ``partition_row_multi_index=True`` and a 0.5 split, both the
    training sample and the test data handed to the ``after_fold`` callback
    must contain the index entries "A" and "B".
    """
    expected_groups = ("A", "B")

    def check_test(test_data):
        # invoked by the sampler via `after_fold`
        for group in expected_groups:
            self.assertIn(group, test_data[0].x.index)

    splitter = naive_splitter(0.5, partition_row_multi_index=True)
    sampler = Sampler(
        XYWeight(TEST_MUTLI_INDEX_ROW_DF),
        splitter=splitter,
        after_fold=check_test,
        epochs=1,
    )

    samples = list(sampler.sample_for_training())

    # one epoch yields exactly one training sample with 4 rows
    self.assertEqual(1, len(samples))
    self.assertEqual(4, len(samples[0].x))
    for group in expected_groups:
        self.assertIn(group, samples[0].x.index)
def test_mult_epoch_cross_validation(self):
    """Smoke-test a PyTorch fit with several epochs and fold epochs.

    Trains a tiny two-layer network with ``epochs=2``, ``fold_epochs=10``
    and ``batch_size=2``.  There are no assertions; the test passes when
    fitting completes, and the fit is printed.
    """
    class NN(PytorchNN):
        """Minimal 1 -> 2 -> 1 feed-forward network."""

        def __init__(self, *args, **kwargs):
            super().__init__(*args, **kwargs)
            layers = [
                nn.Linear(1, 2),
                nn.ReLU(),
                nn.Linear(2, 1),
            ]
            self.nn = nn.Sequential(*layers)

        def forward_training(self, x):
            return self.nn(x)

    df = pd.DataFrame({
        "a": [1, 0, 1, 0, 1, 0, 1, 0],
        "b": [0, 1, 0, 1, 1, 0, 1, 0],
    })

    with df.model() as m:
        torch_model = PytorchModel(
            NN,
            FeaturesAndLabels(["a"], ["b"]),
            nn.MSELoss,
            Adam,
        )
        fitting_parameter = FittingParameter(
            splitter=naive_splitter(0.5),
            epochs=2,
            fold_epochs=10,
            batch_size=2,
        )
        fit = m.fit(torch_model, fitting_parameter)

    print(fit)
def test_no_test_data(self):
    """given some toy regression data"""
    feature_values = [-1.0, 0.0, 1.0, 2.0, 3.0, 4.0]
    label_values = [-2.0, 1.0, 4.0, 7.0, 10.0, 13.0]
    df = pd.DataFrame({"a": feature_values, "b": label_values})

    """and a model"""
    features_and_labels = FeaturesAndLabels(features=["a"], labels=["b"])
    model = self.provide_regression_model(features_and_labels)

    """when we fit the model"""
    batch_size, epochs = self.provide_batch_size_and_epoch()
    with df.model() as m:
        # a split ratio of 0 leaves nothing for the test set
        fitting_parameter = FittingParameter(
            splitter=naive_splitter(0),
            batch_size=batch_size,
            epochs=epochs,
        )
        fit = m.fit(model, fitting_parameter, verbose=0)

    print(fit.test_summary.df)

    """then we have an empty test data frame"""
    training_rows = len(fit.training_summary.df)
    test_rows = len(fit.test_summary.df)
    self.assertEqual(training_rows, len(df))
    self.assertEqual(test_rows, 0)
def test_multi_sample_regressor(self):
    """given some toy regression data"""
    feature_values = [-1.0, 0.0, 1.0, 2.0, 3.0, 4.0]
    label_values = [-2.0, 1.0, 4.0, 7.0, 10.0, 13.0]
    df = pd.DataFrame({"a": feature_values, "b": label_values})

    """and a model"""
    features_and_labels = FeaturesAndLabels(features=["a"], labels=["b"])
    model = self.provide_regression_model(features_and_labels)

    """when we fit the model"""
    batch_size, epochs = self.provide_batch_size_and_epoch()
    with df.model() as m:
        fitting_parameter = FittingParameter(
            splitter=naive_splitter(0.3),
            batch_size=batch_size,
            epochs=epochs,
        )
        fit = m.fit(model, fitting_parameter, verbose=0)

    print(fit.training_summary.df)

    """then we can predict"""
    prediction = df.model.predict(fit.model, samples=2)
    predicted_samples = prediction.iloc[:, 0]._.values

    # with samples=2 every row is expected to carry the label twice
    expected_samples = np.concatenate(
        [df[["b"]].values, df[["b"]].values],
        axis=1,
    )
    np.testing.assert_array_almost_equal(predicted_samples, expected_samples, 1)
def test_auto_encoder(self):
    """given the implementation can handle auto encoders"""
    model = self.provide_auto_encoder_model(
        FeaturesAndLabels(features=["a", "b"],
                          labels=["a", "b"],
                          latent=["x"]))

    # implementations without auto encoder support return None here
    if model is None:
        return

    """and some toy classification data"""
    df = pd.DataFrame({
        "a": [1, 0] * 10,
        "b": [0, 1] * 10,
    })

    """when we fit the model"""
    batch_size, epochs = self.provide_batch_size_and_epoch()
    with df.model() as m:
        fit = m.fit(
            model,
            FittingParameter(splitter=naive_splitter(0.49),
                             batch_size=batch_size,
                             epochs=epochs),
            verbose=0)

    print(fit.training_summary.df)

    """then we can predict Autoencoded"""
    auto_encoded_prediction = df.model.predict(fit.model)
    self.assertEqual((20, 2), auto_encoded_prediction["prediction"].shape)

    """and we can encode"""
    encoded_prediction = df.model.predict(fit.model.as_encoder())
    print(encoded_prediction)
    self.assertEqual((20, 1), encoded_prediction["prediction"].shape)

    """and we can decode"""
    decoded_prediction = encoded_prediction["prediction"].model.predict(
        fit.model.as_decoder())
    print(decoded_prediction)
    np.testing.assert_array_almost_equal(
        decoded_prediction["prediction"].values > 0.5,
        df[["a", "b"]].values)

    """and we can encode and decode after save and load"""
    temp = os.path.join(tempfile.gettempdir(), str(uuid.uuid4()))
    try:
        fit.model.save(temp)
        copy = Model.load(temp)

        # NOTE(review): `check_less_precise` is deprecated in newer pandas
        # releases -- confirm against the pinned pandas version.
        pd.testing.assert_frame_equal(
            df.model.predict(fit.model.as_encoder()),
            df.model.predict(copy.as_encoder()),
            check_less_precise=True)
        pd.testing.assert_frame_equal(
            encoded_prediction.model.predict(fit.model.as_decoder()),
            encoded_prediction.model.predict(copy.as_decoder()),
            check_less_precise=True)
    finally:
        # If `save` failed before creating the file, an unconditional
        # `os.remove` would raise FileNotFoundError and hide the real error.
        if os.path.exists(temp):
            os.remove(temp)

    # try to save only as encoder model (smoke test: must not raise)
    try:
        fit.model.as_encoder().save(temp)
        copy = Model.load(temp)
    finally:
        if os.path.exists(temp):
            os.remove(temp)
def test_classifier(self):
    """given some toy classification data"""
    df = pd.DataFrame({
        "a": [1, 0, 1, 0, 1, 0, 1, 0],
        "b": [0, 0, 1, 1, 0, 0, 1, 1],
        "c": [1, 0, 0, 1, 1, 0, 0, 1]
    })

    """and a model"""
    model = self.provide_classification_model(
        FeaturesAndLabels(features=["a", "b"], labels=["c"], label_type=int))
    temp = os.path.join(tempfile.gettempdir(), str(uuid.uuid4()))

    # The model is loaded from `temp` further down without an explicit save,
    # so the `df.model(temp)` context presumably persists it there.  The
    # whole remainder is therefore guarded: a failing assertion must not
    # leave the file behind in the temp directory.
    try:
        """when we fit the model"""
        batch_size, epochs = self.provide_batch_size_and_epoch()
        with df.model(temp) as m:
            fit = m.fit(
                model,
                FittingParameter(splitter=naive_splitter(0.49),
                                 batch_size=batch_size,
                                 epochs=epochs),
                verbose=0)

        print(fit.training_summary.df)

        """then we get a html summary and can predict"""
        self.assertIn('<style>', fit.training_summary._repr_html_())

        prediction = df.model.predict(fit.model)
        binary_prediction = prediction.iloc[:, 0] >= 0.5
        np.testing.assert_array_equal(
            binary_prediction,
            np.array([True, False, False, True, True, False, False, True]))

        """and load the model"""
        copy = Model.load(temp)
        # NOTE(review): `check_less_precise` is deprecated in newer pandas
        # releases -- confirm against the pinned pandas version.
        pd.testing.assert_frame_equal(df.model.predict(fit.model),
                                      df.model.predict(copy),
                                      check_less_precise=True)

        # test using context manager and ForecastProvider
        pd.testing.assert_frame_equal(
            df.model(temp).predict(forecast_provider=Forecast).df,
            df.model.predict(copy),
            check_less_precise=True)
    finally:
        # the file may not exist if fitting failed before anything was written
        if os.path.exists(temp):
            os.remove(temp)
def test_probabilistic_model_with_callback(self):
    """Fit a two-component Gaussian mixture network with a confidence callback.

    The test needs the optional ``pandas_ml_quant`` package and returns
    early (after printing a notice) when it cannot be imported.  It trains
    on random returns and prints the test summary scores; there are no
    assertions, so it passes when fitting completes.
    """
    try:
        # `df.ta` is used below; presumably this import is what makes it
        # available -- verify against pandas_ml_quant.
        importlib.import_module("pandas_ml_quant")
        from pandas_ml_quant import PricePredictionSummary  # noqa: F401 (availability check)
        from pandas_ml_quant.model.summary.price_prediction_summary import PriceSampledSummary
    except ImportError:
        # Only a missing optional dependency skips the test.  A bare
        # `except:` would also swallow KeyboardInterrupt and real bugs.
        print("pandas_ml_quant not found, skipping!")
        return

    df = pd.DataFrame({
        "Returns": np.random.normal(-0.02, 0.03, 500) + np.random.normal(0.03, 0.02, 500)
    })

    fl = PostProcessedFeaturesAndLabels(
        features=["Returns"],
        feature_post_processor=lambda df: df.ta.rnn(20),
        labels=[
            lambda df: df["Returns"].shift(-1).rename("Future_Returns")
        ],
        targets=lambda df: (1 + df["Returns"]).cumprod().rename("Close"))

    # The network emits 6 values which the splitter turns into
    # (mixture probabilities, scales, locations) for two components.
    model_factory = PytorchNNFactory.create(
        nn.Sequential(
            nn.Linear(20, 10),
            nn.Tanh(),
            nn.Linear(10, 6),
            LambdaSplitter(
                lambda x: T.softmax(x[..., :2], dim=1),
                lambda x: T.exp(x[..., 2:4]),
                # enforce one mean positive and the other negative
                lambda x: T.cat([T.exp(x[..., 4:5]), -T.exp(x[..., 5:6])], dim=1),
            )),
        predictor=lambda n, i: T.cat(n(i), dim=1),
        trainer=lambda n, i: n(i))

    def dist(probs, scales, locs):
        return MixtureSameFamily(Categorical(probs=probs),
                                 Normal(loc=locs, scale=scales))

    def loss(y_pred):
        # training output arrives as the 3-tuple produced by the splitter
        probs, scales, locs = y_pred
        return dist(probs, scales, locs)

    def cdf_cb(arg):
        # prediction output is the concatenated tensor, so slice it apart
        probs, scales, locs = arg[..., :2], arg[..., 2:4], arg[..., 4:6]
        return dist(probs, scales, locs)

    summary_provider = PriceSampledSummary.with_reconstructor(
        sampler=wrap_applyable(
            lambda params, samples: cdf_cb(params).sample([int(samples.item())]),
            nr_args=2),
        samples=100,
        confidence=0.8)

    model = PytorchModel(
        module_provider=model_factory,
        features_and_labels=fl,
        criterion_provider=lambda: DistributionNLL(
            loss, penalize_toal_variance_lambda=1.1),
        optimizer_provider=Adam,
        summary_provider=summary_provider)

    fit = df.model.fit(
        model,
        FittingParameter(epochs=10, batch_size=6, splitter=naive_splitter(0.25)),
        # verbose=1,
        callbacks=[
            TestConfidenceInterval(
                TestConfidenceInterval.CdfConfidenceInterval(
                    wrap_applyable(
                        lambda params, val: cdf_cb(params).cdf(val),
                        nr_args=2),
                    interval=0.8),
                wrap_applyable(lambda params: cdf_cb(params).variance),
                early_stopping=True)
        ])

    print(fit.test_summary.calc_scores())
def feature_selection(
        self,
        features_and_labels: FeaturesAndLabels,
        training_data_splitter: Callable = naive_splitter(0.2),
        correlated_features_th: float = 0.75,
        rfecv_splits: int = 4,
        forest_splits: int = 7,
        min_features_to_select: int = 1,
        is_time_series: bool = False,
        **kwargs):
    """Select features with a randomized-search RFECV over a random forest.

    Highly correlated features are dropped first (optional); then a
    ``RandomizedSearchCV`` wrapping an ``RFECV`` wrapping a random forest is
    fitted, and the fit is returned with a ``FeatureSelectionSummary``.

    :param features_and_labels: definition of features and labels; its
        ``label_type`` must be one of ``'regression'``, ``'classification'``,
        ``int``, ``float`` or ``bool``
    :param training_data_splitter: splitter handed to ``FittingParameter``
    :param correlated_features_th: features from correlation pairs above this
        threshold are dropped before fitting; a value ``<= 0`` skips this step
    :param rfecv_splits: number of splits for the cross validation of ``RFECV``
    :param forest_splits: number of splits for the cross validation of
        ``RandomizedSearchCV``
    :param min_features_to_select: forwarded to ``RFECV``
    :param is_time_series: use ``TimeSeriesSplit`` instead of (stratified)
        k-fold cross validation
    :param kwargs: accepted for call compatibility, currently unused
    :return: the fit with the loss plot hidden
    :raises AssertionError: if ``label_type`` is none of the supported values
    """
    assert features_and_labels.label_type in ('regression', 'classification', int, float, bool), \
        "label_type need to be specified: 'regression' | 'classification' !"

    # find best parameters
    with self() as m:
        # extract features and labels
        ext = m.extract(features_and_labels)

        # first perform a correlation analysis and remove correlating features !!!
        if correlated_features_th > 0:
            _, pairs = get_correlation_pairs(ext.features)
            redundant_correlated_features = {
                i[0]: p for i, p in pairs.items() if p > correlated_features_th
            }
            _log.warning(f"drop redundant features: {redundant_correlated_features}")

            features_and_labels = PostProcessedFeaturesAndLabels.from_features_and_labels(
                features_and_labels,
                feature_post_processor=lambda df: df.drop(
                    redundant_correlated_features.keys(), axis=1))

        # estimate model type and sample properties
        # 'regression' is an accepted label_type but contains no 'float'
        # substring, so it must be excluded explicitly; otherwise a regression
        # task would silently be fitted with a classifier.
        label_type = features_and_labels.label_type
        is_classification = label_type != 'regression' and 'float' not in str(label_type)
        nr_samples = len(self.df)

        if is_classification:
            nr_classes = len(ext.labels.value_counts())
        else:
            # NOTE(review): with `max` the ratio below never exceeds 3 for
            # regression -- confirm `max` (and not `min`) is intended.
            nr_classes = max(nr_samples / 3, 100)

        # `sp_randint(low, high)` needs an integral `high` strictly above
        # `low`; the plain float ratio is neither for small or oddly sized
        # frames.  Truncation keeps the set of drawn values unchanged for
        # inputs that were valid before.
        min_samples_upper = max(3, int(nr_samples / nr_classes))

        # estimate grid search parameters
        grid = {
            "estimator__n_estimators": sp_randint(10, 500),
            "estimator__max_depth": [2, None],
            "estimator__min_samples_split": sp_randint(2, min_samples_upper),
            "estimator__min_samples_leaf": sp_randint(2, min_samples_upper),
            "estimator__bootstrap": [True, False],
            # NOTE(review): newer scikit-learn releases renamed "mse"/"mae" --
            # confirm against the pinned scikit-learn version.
            "estimator__criterion": ["gini", "entropy"] if is_classification else ["mse", "mae"]
        }

        # build model
        if is_time_series:
            cross_validation = TimeSeriesSplit
        elif is_classification:
            cross_validation = StratifiedKFold
        else:
            cross_validation = KFold

        estimator = RandomForestClassifier() if is_classification else RandomForestRegressor()
        selector = RFECV(
            estimator,
            step=1,
            cv=cross_validation(rfecv_splits),
            min_features_to_select=min_features_to_select)
        skm = RandomizedSearchCV(
            selector,
            param_distributions=grid,
            cv=cross_validation(forest_splits),
            n_jobs=-1)

        # fit model
        fit = m.fit(
            SkModel(skm,
                    features_and_labels=features_and_labels,
                    summary_provider=FeatureSelectionSummary),
            FittingParameter(splitter=training_data_splitter))

    # we hide the loss plot from this summary
    return fit.with_hidden_loss_plot()