Beispiel #1
0
    def test_partial_fit_regression(self):
        """Compare a batched multi-epoch fit with a single ten-iteration fit.

        The first model uses ``MLPRegressor(max_iter=1)`` and is fitted with
        ``batch_size=10`` and ``fold_epochs=10``; the second one uses
        ``MLPRegressor(max_iter=10)`` and default fitting parameters. Both
        share ``random_state=42`` and the same data. The last prediction
        column of the first row has to agree to 4 decimal places.
        """
        # Pass the arguments by keyword: recent scikit-learn releases accept
        # only ``n_samples`` and ``n_features`` positionally, so the former
        # ``make_regression(100, 2, 1)`` fails there with a TypeError.
        data = make_regression(n_samples=100, n_features=2, n_informative=1)
        df = pd.DataFrame(data[0])
        df["label"] = data[1]

        # fit in batches of 10 rows, one solver iteration per call
        with df.model() as m:
            fit_partial = m.fit(
                SkModel(
                    MLPRegressor(max_iter=1, random_state=42),
                    FeaturesAndLabels(features=[0, 1], labels=['label'])
                ),
                FittingParameter(
                    naive_splitter(0.3),
                    batch_size=10,
                    fold_epochs=10
                )
            )

        # reference: one fit call with ten solver iterations
        with df.model() as m:
            fit = m.fit(
                SkModel(
                    MLPRegressor(max_iter=10, random_state=42),
                    FeaturesAndLabels(features=[0, 1], labels=['label'])
                ),
                FittingParameter(naive_splitter(0.3))
            )

        self.assertAlmostEqual(df.model.predict(fit.model).iloc[0,-1], df.model.predict(fit_partial.model).iloc[0,-1], 4)
Beispiel #2
0
    def test_regressor(self):
        """Fit, predict, save and reload a regression model on toy data.

        given some toy regression data
        """
        df = pd.DataFrame({
            "a": [-1.0, 0.0, 1.0, 2.0, 3.0, 4.0],
            "b": [-2.0, 1.0, 4.0, 7.0, 10.0, 13.0]
        })
        """and a model"""
        model = self.provide_regression_model(
            FeaturesAndLabels(features=["a"], labels=["b"]))
        """when we fit the model"""
        batch_size, epochs = self.provide_batch_size_and_epoch()
        with df.model() as m:
            fit = m.fit(model,
                        FittingParameter(splitter=naive_splitter(0.3),
                                         batch_size=batch_size,
                                         epochs=epochs),
                        verbose=0)

        print(fit.training_summary.df)
        # the 6 rows are expected to end up as 4 training and 2 test rows
        self.assertEqual(4, len(fit.training_summary.df))
        self.assertEqual(2, len(fit.test_summary.df))
        """then we can predict"""
        prediction = df.model.predict(fit.model)
        # the first prediction column has to match "b" to one decimal
        np.testing.assert_array_almost_equal(prediction.iloc[:, 0].values,
                                             df["b"].values, 1)
        """and save and load the model"""
        temp = os.path.join(tempfile.gettempdir(), str(uuid.uuid4()))
        try:
            fit.model.save(temp)
            copy = Model.load(temp)
            pd.testing.assert_frame_equal(df.model.predict(fit.model),
                                          df.model.predict(copy),
                                          check_less_precise=True)
        finally:
            # Remove only what was actually written: if saving failed before
            # the file existed, an unconditional os.remove would raise a
            # FileNotFoundError on top of the real failure.
            if os.path.exists(temp):
                os.remove(temp)
Beispiel #3
0
    def test_linear_model(self):
        """Fit a Lasso model on two callable features, then predict and backtest.

        The fit and the prediction are only printed; the single assertion is
        that the first coefficient of the backtested sklearn model is below
        ``1e-5``.
        """
        df = DF_NOTES.copy()

        with df.model() as m:
            # features are given as callables which receive the data frame
            features_and_labels = FeaturesAndLabels(
                features=[
                    lambda df: df["variance"],
                    lambda df: (df["skewness"] / df["kurtosis"]).rename("engineered")
                ],
                labels=['authentic']
            )
            model = SkModel(Lasso(), features_and_labels)
            fitting_parameter = FittingParameter(naive_splitter())
            fit = m.fit(model, fitting_parameter)

        print(fit)

        prediction = df.model.predict(fit.model)
        print(prediction)

        backtest = df.model.backtest(fit.model)
        first_coefficient = backtest.model.sk_model.coef_[0]
        self.assertLess(first_coefficient, 1e-5)
Beispiel #4
0
    def test_simple_sample_split_multiindex_row(self):
        """Sample a row multi-index frame with ``partition_row_multi_index=True``.

        Expects exactly one training sample holding 4 rows, with both "A" and
        "B" present in the index of the training features and of the data
        handed to the ``after_fold`` callback.
        """
        expected_keys = ("A", "B")

        def check_test(test_data):
            # passed to the sampler as ``after_fold`` callback
            for key in expected_keys:
                self.assertIn(key, test_data[0].x.index)

        frames = XYWeight(TEST_MUTLI_INDEX_ROW_DF)
        splitter = naive_splitter(0.5, partition_row_multi_index=True)
        sampler = Sampler(frames,
                          splitter=splitter,
                          after_fold=check_test,
                          epochs=1)

        samples = list(sampler.sample_for_training())
        self.assertEqual(1, len(samples))

        self.assertEqual(4, len(samples[0].x))
        for key in expected_keys:
            self.assertIn(key, samples[0].x.index)
    def test_mult_epoch_cross_validation(self):
        """Fit a tiny PyTorch network with ``epochs=2`` and ``fold_epochs=10``.

        This is a smoke test: the fit result is printed, nothing is asserted.
        """
        df = pd.DataFrame({
            "a": [1, 0, 1, 0, 1, 0, 1, 0],
            "b": [0, 1, 0, 1, 1, 0, 1, 0],
        })

        with df.model() as m:

            class NN(PytorchNN):
                # Linear(1, 2) -> ReLU -> Linear(2, 1)
                def __init__(self, *args, **kwargs):
                    super().__init__(*args, **kwargs)
                    layers = [nn.Linear(1, 2), nn.ReLU(), nn.Linear(2, 1)]
                    self.nn = nn.Sequential(*layers)

                def forward_training(self, x):
                    return self.nn(x)

            model = PytorchModel(
                NN, FeaturesAndLabels(["a"], ["b"]), nn.MSELoss, Adam)
            fitting_parameter = FittingParameter(
                splitter=naive_splitter(0.5),
                epochs=2,
                fold_epochs=10,
                batch_size=2)
            fit = m.fit(model, fitting_parameter)

        print(fit)
Beispiel #6
0
    def test_no_test_data(self):
        """given some toy regression data"""
        frame = pd.DataFrame(
            dict(a=[-1.0, 0.0, 1.0, 2.0, 3.0, 4.0],
                 b=[-2.0, 1.0, 4.0, 7.0, 10.0, 13.0]))
        """and a model"""
        features_and_labels = FeaturesAndLabels(features=["a"], labels=["b"])
        model = self.provide_regression_model(features_and_labels)
        """when we fit the model"""
        batch_size, epochs = self.provide_batch_size_and_epoch()
        with frame.model() as m:
            # a test share of 0 is passed to the splitter
            fitting_parameter = FittingParameter(
                splitter=naive_splitter(0),
                batch_size=batch_size,
                epochs=epochs)
            fit = m.fit(model, fitting_parameter, verbose=0)

        print(fit.test_summary.df)
        """then we have an empty test data frame"""
        # every row has to show up in the training summary ...
        self.assertEqual(len(fit.training_summary.df), len(frame))
        # ... and none in the test summary
        self.assertEqual(len(fit.test_summary.df), 0)
Beispiel #7
0
    def test_multi_sample_regressor(self):
        """given some toy regression data"""
        frame = pd.DataFrame(
            dict(a=[-1.0, 0.0, 1.0, 2.0, 3.0, 4.0],
                 b=[-2.0, 1.0, 4.0, 7.0, 10.0, 13.0]))
        """and a model"""
        features_and_labels = FeaturesAndLabels(features=["a"], labels=["b"])
        model = self.provide_regression_model(features_and_labels)
        """when we fit the model"""
        batch_size, epochs = self.provide_batch_size_and_epoch()
        with frame.model() as m:
            fitting_parameter = FittingParameter(
                splitter=naive_splitter(0.3),
                batch_size=batch_size,
                epochs=epochs)
            fit = m.fit(model, fitting_parameter, verbose=0)

        print(fit.training_summary.df)
        """then we can predict"""
        prediction = frame.model.predict(fit.model, samples=2)
        actual = prediction.iloc[:, 0]._.values
        # with two samples the expectation is column "b" twice, side by side
        expected = np.concatenate(
            [frame[["b"]].values, frame[["b"]].values], axis=1)
        np.testing.assert_array_almost_equal(actual, expected, 1)
Beispiel #8
0
    def test_auto_encoder(self):
        """Fit an auto encoder and use it as encoder and decoder.

        given the implementation can handle auto encoders

        The test returns early when ``provide_auto_encoder_model`` yields
        ``None``.
        """
        model = self.provide_auto_encoder_model(
            FeaturesAndLabels(features=["a", "b"],
                              labels=["a", "b"],
                              latent=["x"]))

        if model is None:
            return
        """and some toy classification data"""
        df = pd.DataFrame({
            "a": [1, 0] * 10,
            "b": [0, 1] * 10,
        })
        """when we fit the model"""
        batch_size, epochs = self.provide_batch_size_and_epoch()
        with df.model() as m:
            fit = m.fit(model,
                        FittingParameter(splitter=naive_splitter(0.49),
                                         batch_size=batch_size,
                                         epochs=epochs),
                        verbose=0)

        print(fit.training_summary.df)
        """then we can predict Autoencoded"""
        auto_encoded_prediction = df.model.predict(fit.model)
        self.assertEqual((20, 2), auto_encoded_prediction["prediction"].shape)
        """and we can encode"""
        encoded_prediction = df.model.predict(fit.model.as_encoder())
        print(encoded_prediction)
        self.assertEqual((20, 1), encoded_prediction["prediction"].shape)
        """and we can decode"""
        decoded_prediction = encoded_prediction["prediction"].model.predict(
            fit.model.as_decoder())
        print(decoded_prediction)
        # thresholded at 0.5 the decoded values have to reproduce the input
        np.testing.assert_array_almost_equal(
            decoded_prediction["prediction"].values > 0.5, df[["a",
                                                               "b"]].values)
        """and we can encoder and decode after safe and load"""
        temp = os.path.join(tempfile.gettempdir(), str(uuid.uuid4()))
        try:
            fit.model.save(temp)
            copy = Model.load(temp)

            pd.testing.assert_frame_equal(df.model.predict(
                fit.model.as_encoder()),
                                          df.model.predict(copy.as_encoder()),
                                          check_less_precise=True)

            pd.testing.assert_frame_equal(
                encoded_prediction.model.predict(fit.model.as_decoder()),
                encoded_prediction.model.predict(copy.as_decoder()),
                check_less_precise=True)
        finally:
            # Remove only what was actually written: an unconditional
            # os.remove would raise a FileNotFoundError on top of the real
            # failure if saving never created the file.
            if os.path.exists(temp):
                os.remove(temp)

        # try to save only as encoder model
        try:
            fit.model.as_encoder().save(temp)
            # only the round trip matters here, the loaded model is not used
            Model.load(temp)
        finally:
            if os.path.exists(temp):
                os.remove(temp)
Beispiel #9
0
    def test_classifier(self):
        """Fit, predict and reload a classifier on toy data.

        given some toy classification data
        """
        df = pd.DataFrame({
            "a": [1, 0, 1, 0, 1, 0, 1, 0],
            "b": [0, 0, 1, 1, 0, 0, 1, 1],
            "c": [1, 0, 0, 1, 1, 0, 0, 1],
        })
        """and a model"""
        model = self.provide_classification_model(
            FeaturesAndLabels(features=["a", "b"],
                              labels=["c"],
                              label_type=int))
        temp = os.path.join(tempfile.gettempdir(), str(uuid.uuid4()))
        # Everything after choosing the file name runs inside try/finally so
        # that a failing assertion does not leave the model file behind in the
        # temp directory.
        try:
            """when we fit the model"""
            batch_size, epochs = self.provide_batch_size_and_epoch()
            # NOTE(review): the model is presumably written to `temp` by this
            # context manager, as it is loaded from there below - confirm
            with df.model(temp) as m:
                fit = m.fit(model,
                            FittingParameter(splitter=naive_splitter(0.49),
                                             batch_size=batch_size,
                                             epochs=epochs),
                            verbose=0)

            print(fit.training_summary.df)
            """then we get a html summary and can predict"""
            self.assertIn('<style>', fit.training_summary._repr_html_())

            prediction = df.model.predict(fit.model)
            binary_prediction = prediction.iloc[:, 0] >= 0.5
            # thresholded at 0.5 the prediction has to reproduce column "c"
            np.testing.assert_array_equal(
                binary_prediction,
                np.array(
                    [True, False, False, True, True, False, False, True]))
            """and load the model"""
            copy = Model.load(temp)
            pd.testing.assert_frame_equal(df.model.predict(fit.model),
                                          df.model.predict(copy),
                                          check_less_precise=True)

            # test using context manager and ForecastProvider
            pd.testing.assert_frame_equal(
                df.model(temp).predict(forecast_provider=Forecast).df,
                df.model.predict(copy),
                check_less_precise=True)
        finally:
            # the file may not exist if fitting failed before it was written
            if os.path.exists(temp):
                os.remove(temp)
    def test_probabilistic_model_with_callback(self):
        """Fit a two-component normal mixture model with a confidence callback.

        The test is skipped (prints a message and returns) when the optional
        ``pandas_ml_quant`` package cannot be imported. Otherwise a PyTorch
        model emitting mixture probabilities, scales and locations is fitted
        on random returns with a ``TestConfidenceInterval`` callback, and the
        scores of the test summary are printed. Nothing is asserted.
        """
        try:
            # NOTE(review): `df.ta` used below is presumably registered by
            # importing this package - confirm
            importlib.import_module("pandas_ml_quant")
            from pandas_ml_quant import PricePredictionSummary
            from pandas_ml_quant.model.summary.price_prediction_summary import PriceSampledSummary
        except ImportError:
            # Only a missing optional dependency is a reason to skip; a bare
            # ``except`` would also hide KeyboardInterrupt and real errors.
            print("pandas_ml_quant not found, skipping!")
            return

        # sum of two normally distributed series, 500 rows
        df = pd.DataFrame({
            "Returns":
            np.random.normal(-0.02, 0.03, 500) +
            np.random.normal(0.03, 0.02, 500)
        })

        fl = PostProcessedFeaturesAndLabels(
            features=["Returns"],
            feature_post_processor=lambda df: df.ta.rnn(20),
            labels=[
                lambda df: df["Returns"].shift(-1).rename("Future_Returns")
            ],
            targets=lambda df: (1 + df["Returns"]).cumprod().rename("Close"))

        # the 6 network outputs are split into 2 probabilities, 2 scales and
        # 2 locations
        model_factory = PytorchNNFactory.create(
            nn.Sequential(
                nn.Linear(20, 10),
                nn.Tanh(),
                nn.Linear(10, 6),
                LambdaSplitter(
                    lambda x: T.softmax(x[..., :2], dim=1),
                    lambda x: T.exp(x[..., 2:4]),
                    # enforce one positive and one negative mean
                    lambda x: T.cat([T.exp(x[..., 4:5]), -T.exp(x[..., 5:6])],
                                    dim=1),
                )),
            predictor=lambda n, i: T.cat(n(i), dim=1),
            trainer=lambda n, i: n(i))

        def dist(probs, scales, locs):
            # mixture of normals weighted by the categorical probabilities
            return MixtureSameFamily(Categorical(probs=probs),
                                     Normal(loc=locs, scale=scales))

        def loss(y_pred):
            # y_pred is the (probs, scales, locs) triple
            probs, scales, locs = y_pred
            return dist(probs, scales, locs)

        def cdf_cb(arg):
            # same distribution, built from the parameters concatenated along
            # the last axis
            probs, scales, locs = arg[..., :2], arg[..., 2:4], arg[..., 4:6]
            return dist(probs, scales, locs)

        summary_provider = PriceSampledSummary.with_reconstructor(
            sampler=wrap_applyable(lambda params, samples: cdf_cb(params).
                                   sample([int(samples.item())]),
                                   nr_args=2),
            samples=100,
            confidence=0.8)

        model = PytorchModel(module_provider=model_factory,
                             features_and_labels=fl,
                             criterion_provider=lambda: DistributionNLL(
                                 loss, penalize_toal_variance_lambda=1.1),
                             optimizer_provider=Adam,
                             summary_provider=summary_provider)

        fit = df.model.fit(
            model,
            FittingParameter(epochs=10,
                             batch_size=6,
                             splitter=naive_splitter(0.25)),
            #verbose=1,
            callbacks=[
                TestConfidenceInterval(
                    TestConfidenceInterval.CdfConfidenceInterval(
                        wrap_applyable(
                            lambda params, val: cdf_cb(params).cdf(val),
                            nr_args=2),
                        interval=0.8),
                    wrap_applyable(lambda params: cdf_cb(params).variance),
                    early_stopping=True)
            ])

        print(fit.test_summary.calc_scores())
Beispiel #11
0
    def feature_selection(
            self,
            features_and_labels: FeaturesAndLabels,
            training_data_splitter: Callable = naive_splitter(0.2),
            correlated_features_th: float = 0.75,
            rfecv_splits: int = 4,
            forest_splits: int = 7,
            min_features_to_select: int = 1,
            is_time_series: bool = False,
            **kwargs):
        """Select features via recursive feature elimination on a random forest.

        Highly correlated features are dropped first, then an ``RFECV`` around
        a random forest is tuned with ``RandomizedSearchCV`` and fitted.

        :param features_and_labels: features and labels definition; its
            ``label_type`` has to be one of 'regression', 'classification',
            ``int``, ``float`` or ``bool``
        :param training_data_splitter: splitter passed to ``FittingParameter``
        :param correlated_features_th: features whose correlation exceeds this
            threshold are dropped; a value <= 0 disables this step
        :param rfecv_splits: number of cross validation splits used by RFECV
        :param forest_splits: number of cross validation splits used by the
            randomized search
        :param min_features_to_select: forwarded to ``RFECV``
        :param is_time_series: use ``TimeSeriesSplit`` for cross validation
        :param kwargs: accepted but not used by this method
        :return: the fit with its loss plot hidden
        """
        assert features_and_labels.label_type in ('regression', 'classification', int, float, bool), \
            "label_type need to be specified: 'regression' | 'classification' !"

        # find best parameters
        with self() as m:
            # extract features and labels
            ext = m.extract(features_and_labels)

            # first perform a correlation analysis and remove correlating features !!!
            if correlated_features_th > 0:
                _, pairs = get_correlation_pairs(ext.features)
                # NOTE(review): presumably `pairs` maps a pair of feature
                # names to their correlation; the first name is dropped
                redundant_correlated_features = {
                    i[0]: p
                    for i, p in pairs.items() if p > correlated_features_th
                }
                _log.warning(
                    f"drop redundant features: {redundant_correlated_features}"
                )

                features_and_labels = PostProcessedFeaturesAndLabels.from_features_and_labels(
                    features_and_labels,
                    feature_post_processor=lambda df: df.drop(
                        redundant_correlated_features.keys(), axis=1))

            # estimate model type and sample properties
            # The label type 'regression' is accepted by the assertion above
            # but does not contain the word 'float', so it has to be checked
            # explicitly - otherwise it would be treated as classification.
            label_type_name = str(features_and_labels.label_type)
            is_classification = (label_type_name != 'regression'
                                 and 'float' not in label_type_name)
            nr_samples = len(self.df)

            if is_classification:
                nr_classes = len(ext.labels.value_counts())
            else:
                nr_classes = max(len(self.df) / 3, 100)

            # scipy's randint needs integer bounds with high > low, the plain
            # division yields a float which may be non integral or <= 2
            max_samples_per_node = max(int(nr_samples / nr_classes), 3)

            # estimate grid search parameters
            grid = {
                "estimator__n_estimators":
                sp_randint(10, 500),
                "estimator__max_depth": [2, None],
                "estimator__min_samples_split":
                sp_randint(2, max_samples_per_node),
                "estimator__min_samples_leaf":
                sp_randint(2, max_samples_per_node),
                "estimator__bootstrap": [True, False],
                "estimator__criterion":
                ["gini", "entropy"] if is_classification else ["mse", "mae"]
            }

            # build model
            if is_time_series:
                cross_validation = TimeSeriesSplit
            elif is_classification:
                cross_validation = StratifiedKFold
            else:
                cross_validation = KFold

            if is_classification:
                estimator = RandomForestClassifier()
            else:
                estimator = RandomForestRegressor()

            selector = RFECV(estimator,
                             step=1,
                             cv=cross_validation(rfecv_splits),
                             min_features_to_select=min_features_to_select)
            skm = RandomizedSearchCV(selector,
                                     param_distributions=grid,
                                     cv=cross_validation(forest_splits),
                                     n_jobs=-1)

            # fit model
            fit = m.fit(
                SkModel(skm,
                        features_and_labels=features_and_labels,
                        summary_provider=FeatureSelectionSummary),
                FittingParameter(splitter=training_data_splitter))

        # we hide the loss plot from this summary
        return fit.with_hidden_loss_plot()