Ejemplo n.º 1
0
    def test_intersection_of_index(self):
        """Only the index labels present in every frame are part of the intersection."""
        frames = [
            pd.DataFrame({}, index=labels)
            for labels in ([1, 2, 3, 4], [2, 3, 4], [1, 3, 4])
        ]

        common = intersection_of_index(*frames)

        self.assertListEqual([3, 4], common.tolist())
Ejemplo n.º 2
0
    def test_intersection_of_tuples(self):
        """Frames wrapped in a MultiFrameDecorator take part in the index intersection."""
        full = pd.DataFrame({}, index=[1, 2, 3, 4])
        without_first = pd.DataFrame({}, index=[2, 3, 4])
        without_second = pd.DataFrame({}, index=[1, 3, 4])

        # the decorated pair is passed once as trailing and once as leading argument
        trailing = intersection_of_index(
            full, MultiFrameDecorator([without_first, without_second], True))
        leading = intersection_of_index(
            MultiFrameDecorator([full, without_first], True), without_second)

        for common in (trailing, leading):
            self.assertListEqual([3, 4], common.tolist())
Ejemplo n.º 3
0
    def test_similar_columns_multi_index(self):
        """A full column product passes the check while a ragged set of column tuples fails it."""
        rows = [1, 2, 3, 4]
        product_columns = pd.MultiIndex.from_product([["a", "b"], range(3)])
        ragged_columns = pd.MultiIndex.from_tuples(
            [("a", 1), ("a", 2), ("b", 1), ("b", 3)])

        regular = pd.DataFrame({}, index=rows, columns=product_columns)
        ragged = pd.DataFrame({}, index=rows, columns=ragged_columns)

        self.assertTrue(same_columns_after_level(regular))
        self.assertFalse(same_columns_after_level(ragged))
Ejemplo n.º 4
0
    def trails(self):
        """
        Expose the results of a hyper parameter optimization as a data frame.

        The wrapped object is the trials container as used by `Hyperopt <https://github.com/hyperopt/hyperopt/wiki/FMin>`_.

        :return: a data frame with one row per entry of ``results`` where the nested "parameter" entry is expanded
            into columns of its own, or None if no trials object is available
        """
        if self._trails is None:
            return None

        results = self._trails.results
        outcomes = pd.DataFrame(results).drop("parameter", axis=1)
        parameters = pd.DataFrame([result['parameter'] for result in results])
        return outcomes.join(parameters)
Ejemplo n.º 5
0
def map_prediction_to_target(df, prediction, targets):
    """
    Flatten the predictions of a frame into one "prediction" column indexed by (row label, target).

    Depending on how many prediction columns exist relative to the number of target columns, the second
    index level is either the (rounded) target itself, the stringified pair of two consecutive targets,
    or a pair of consecutive targets where the outermost pairs are padded with -inf and +inf.

    :param df: the frame holding predictions and targets
    :param prediction: selector of the prediction columns, resolved via ``get_pandas_object``
    :param targets: selector of the target columns, resolved via ``get_pandas_object``
    :return: a data frame with a single "prediction" column and a two level row index
    :raises ValueError: if the number of predictions differs by more than one from the number of targets
    """
    def _rounded(value):
        # only floats are rounded (2 digits), everything else passes through untouched
        return round(value, 2) if isinstance(value, float) else value

    def _consecutive_pairs(bounds):
        # [a, b, c] -> [(a, b), (b, c)]
        return [(_rounded(lower), _rounded(upper)) for lower, upper in zip(bounds[:-1], bounds[1:])]

    nr_of_rows = len(df)

    prediction_frame = get_pandas_object(df, prediction)
    p = prediction_frame._.values.reshape((nr_of_rows, -1))

    target_frame = get_pandas_object(df, targets)
    t = target_frame._.values.reshape((nr_of_rows, -1))

    nr_of_predictions, nr_of_targets = p.shape[1], t.shape[1]

    if nr_of_predictions == nr_of_targets:
        # 1:1 mapping
        index = [(date, _rounded(target))
                 for date in df.index
                 for target in target_frame.loc[date].values]
    elif nr_of_predictions == nr_of_targets - 1:
        # one prediction per range between two neighbouring targets
        index = [(date, f"{target}")
                 for date in df.index
                 for target in _consecutive_pairs(target_frame.loc[date].tolist())]
    elif nr_of_predictions == nr_of_targets + 1:
        # the left and right extremes are mapped using +/- inf
        index = [(date, target)
                 for date in df.index
                 for target in _consecutive_pairs(
                     [-np.inf, *target_frame.loc[date]._.values.tolist(), np.inf])]
    else:
        raise ValueError(f"unable to match {p.shape[1]} predictions to {t.shape[1]} +/-1 targets")

    return pd.DataFrame({"prediction": p.reshape((-1,))},
                        index=pd.MultiIndex.from_tuples(index))
Ejemplo n.º 6
0
    def test_call_if_not_none(self):
        """None is passed through, while a real frame gets the named method applied without being altered."""
        frame_with_nan = pd.DataFrame({"a": [np.nan]})

        # nothing to call on None
        self.assertIsNone(call_if_not_none(None, 'dropna'))

        # dropna removes the only row from the result but the source frame keeps it
        cleaned = call_if_not_none(frame_with_nan, 'dropna')
        self.assertEqual(0, len(cleaned))
        self.assertEqual(1, len(frame_with_nan))
Ejemplo n.º 7
0
    def test_get_pandas_obj(self):
        """Exercise every selector kind of get_pandas_object on single and multi level columns."""
        df = pd.DataFrame({"hallo": [1, 2, 3]})
        dfmi = df.copy()
        dfmi.columns = pd.MultiIndex.from_product([["a"], dfmi.columns])
        float_handler = {float: lambda df, item, **kwargs: df["hallo"] * item}

        # --- single level columns ---
        self.assertIsNone(get_pandas_object(df, None))

        by_name = get_pandas_object(df, "hallo")
        self.assertListEqual([1, 2, 3], by_name.to_list())

        by_constant = get_pandas_object(df, Constant(9))
        self.assertListEqual([9, 9, 9], by_constant.to_list())

        by_type_handler = get_pandas_object(df, 2.0, float_handler)
        self.assertListEqual([2., 4., 6.], by_type_handler.to_list())

        by_series = get_pandas_object(df, df["hallo"] * 2)
        self.assertListEqual([2, 4, 6], by_series.to_list())

        by_callable = get_pandas_object(df, lambda df: df["hallo"] * 2)
        self.assertListEqual([2, 4, 6], by_callable.to_list())

        # an unknown name results in a frame without any columns
        self.assertEqual((3, 0), get_pandas_object(df, "allo").shape)

        # --- multi level columns: 2nd level name, 1st level name and a regex ---
        for selector in ("hallo", "a", ".*ll.*"):
            selected = get_pandas_object(dfmi, selector)
            self.assertListEqual([1, 2, 3], selected.iloc[:, 0].to_list())

        self.assertEqual((3, 0), get_pandas_object(dfmi, "allo").shape)
Ejemplo n.º 8
0
    def test_partial_fit_classification(self):
        """Ten fold epochs of single iteration batch fits shall predict like one fit using ten iterations."""
        samples = make_classification(100, 2, 1, 0, n_clusters_per_class=1)
        features, labels = samples[0], samples[1]
        df = pd.DataFrame(features)
        df["label"] = labels

        # partial fitting: one iteration per call, repeated over batches and fold epochs
        with df.model() as m:
            partial_model = SkModel(
                MLPClassifier(max_iter=1, random_state=42),
                FeaturesAndLabels(features=[0, 1], labels=['label']),
                classes=np.unique(labels),
            )
            partial_parameter = FittingParameter(
                stratified_random_splitter(0.3),
                batch_size=10,
                fold_epochs=10,
            )
            fit_partial = m.fit(partial_model, partial_parameter)

        # regular fitting: all iterations in one go
        with df.model() as m:
            regular_model = SkModel(
                MLPClassifier(max_iter=10, random_state=42),
                FeaturesAndLabels(features=[0, 1], labels=['label']),
            )
            fit = m.fit(regular_model, FittingParameter(stratified_random_splitter(0.3)))

        regular_prediction = df.model.predict(fit.model).iloc[0, -1]
        partial_prediction = df.model.predict(fit_partial.model).iloc[0, -1]
        self.assertAlmostEqual(regular_prediction, partial_prediction, 4)
Ejemplo n.º 9
0
    def test_multindex_row_multi_samples(self):
        """Fit on a frame with a two level row index and request two samples per prediction."""
        # given some toy regression data, repeated once per group of the first index level
        values_a = [-1.0, 0.0, 1.0, 2.0, 3.0, 4.0]
        values_b = [-2.0, 1.0, 4.0, 7.0, 10.0, 13.0]
        df = pd.DataFrame(
            {"a": values_a * 2, "b": values_b * 2},
            index=pd.MultiIndex.from_product([["A", "B"], range(6)]),
        )

        # and a model
        model = self.provide_regression_model(
            FeaturesAndLabels(features=["a"], labels=["b"]))

        # when we fit the model
        batch_size, epochs = self.provide_batch_size_and_epoch()
        with df.model() as m:
            splitter = random_splitter(0.3, partition_row_multi_index=True)
            fitting_parameter = FittingParameter(splitter=splitter,
                                                 batch_size=batch_size,
                                                 epochs=epochs)
            fit = m.fit(model, fitting_parameter, verbose=0)

        # then the rows are split into 8 training and 4 test rows
        self.assertEqual(8, len(fit.training_summary.df))
        self.assertEqual(4, len(fit.test_summary.df))

        # and the prediction has the expected sample dimension for each group
        prediction = df.model.predict(fit.model, samples=2)
        self.assertEqual(2, len(prediction.iloc[:, 0]._.values))
        for group in ("A", "B"):
            self.assertEqual((6, 2), prediction.loc[group].iloc[:, 0]._.values.shape)
Ejemplo n.º 10
0
    def test__item_multi_index(self):
        """Column access through the `_` accessor on a frame with two column levels."""
        columns = pd.MultiIndex.from_product([[a_x, b_x, c_x], [a_y, b_y, c_y]])
        df = pd.DataFrame({}, columns=columns)
        column_tuples = df.columns.tolist()

        # ordinary access using one full column tuple
        for column in column_tuples:
            self.assertEqual(column, df._[column].name)

        # ordinary access using a growing list of full column tuples
        for stop in range(1, len(column_tuples) + 1):
            selection = column_tuples[:stop]
            self.assertEqual(selection, df._[selection].columns.tolist())

        # access by a name of the 1st level
        first_level = df._[a_x].columns.tolist()
        self.assertListEqual([a_y, b_y, c_y], first_level)

        # access by a name of the 2nd level
        second_level = df._[b_y].columns.tolist()
        self.assertListEqual([(b_y, a_x), (b_y, b_x), (b_y, c_x)], second_level)

        # access by regular expression
        by_regex = df._["a_."].columns.tolist()
        self.assertListEqual(
            [(a_x, a_y), (a_x, b_y), (a_x, c_y), (b_x, a_y), (c_x, a_y)],
            by_regex)
Ejemplo n.º 11
0
 def test_regressor(self):
     """Fit a regression model on toy data, predict with it and round-trip it through save and load."""
     # given some toy regression data
     df = pd.DataFrame({
         "a": [-1.0, 0.0, 1.0, 2.0, 3.0, 4.0],
         "b": [-2.0, 1.0, 4.0, 7.0, 10.0, 13.0]
     })

     # and a model
     model = self.provide_regression_model(
         FeaturesAndLabels(features=["a"], labels=["b"]))

     # when we fit the model
     fit = df.model.fit(model, RandomSplits(0.3), verbose=0, epochs=500)
     print(fit.training_summary.df)

     # then we can predict
     prediction = df.model.predict(fit.model)
     np.testing.assert_array_almost_equal(prediction.iloc[:, 0].values,
                                          df["b"].values, 1)

     # and save and load the model
     temp = os.path.join(tempfile.gettempdir(), str(uuid.uuid4()))
     try:
         fit.model.save(temp)
         copy = Model.load(temp)
         # NOTE(review): `check_less_precise` is deprecated in newer pandas releases in favour of
         # rtol/atol; kept as is for the pandas version this project targets
         pd.testing.assert_frame_equal(df.model.predict(fit.model),
                                       df.model.predict(copy),
                                       check_less_precise=True)
     finally:
         # only remove what was actually written, otherwise a failing save would surface as a
         # misleading FileNotFoundError raised by the cleanup
         if os.path.exists(temp):
             os.remove(temp)
Ejemplo n.º 12
0
    def test_custom_objects(self):
        """A keras model compiled with a custom loss function survives a save and load round trip."""
        df = pd.DataFrame({
            "a": [-1.0, 0.0, 1.0, 2.0, 3.0, 4.0],
            "b": [-2.0, 1.0, 4.0, 7.0, 10.0, 13.0]
        })

        def model_provider():
            # the custom loss is returned alongside the model, presumably so that it can be
            # registered as custom object when the model gets loaded again
            model = Sequential([Dense(units=1, input_shape=(1, ))])

            model.compile(optimizer='sgd', loss=custom_loss_function)
            return model, custom_loss_function

        model = KerasModel(
            model_provider,
            FeaturesAndLabels(["a"], ["b"]),
        )

        fit = df.model.fit(model)
        temp = os.path.join(tempfile.gettempdir(), str(uuid.uuid4()))
        try:
            fit.model.save(temp)
            copy = Model.load(temp)
            # NOTE(review): `check_less_precise` is deprecated in newer pandas releases in favour of
            # rtol/atol; kept as is for the pandas version this project targets
            pd.testing.assert_frame_equal(df.model.predict(fit.model),
                                          df.model.predict(copy),
                                          check_less_precise=True)
        finally:
            # only remove what was actually written, otherwise a failing save would surface as a
            # misleading FileNotFoundError raised by the cleanup
            if os.path.exists(temp):
                os.remove(temp)
Ejemplo n.º 13
0
    def test_partial_fit_regression(self):
        """Ten fold epochs of single iteration batch fits shall predict like one fit using ten iterations."""
        samples = make_regression(100, 2, 1)
        df = pd.DataFrame(samples[0])
        df["label"] = samples[1]

        # partial fitting: one iteration per call, repeated over batches and fold epochs
        with df.model() as m:
            partial_model = SkModel(
                MLPRegressor(max_iter=1, random_state=42),
                FeaturesAndLabels(features=[0, 1], labels=['label']),
            )
            partial_parameter = FittingParameter(
                naive_splitter(0.3),
                batch_size=10,
                fold_epochs=10,
            )
            fit_partial = m.fit(partial_model, partial_parameter)

        # regular fitting: all iterations in one go
        with df.model() as m:
            regular_model = SkModel(
                MLPRegressor(max_iter=10, random_state=42),
                FeaturesAndLabels(features=[0, 1], labels=['label']),
            )
            fit = m.fit(regular_model, FittingParameter(naive_splitter(0.3)))

        regular_prediction = df.model.predict(fit.model).iloc[0, -1]
        partial_prediction = df.model.predict(fit_partial.model).iloc[0, -1]
        self.assertAlmostEqual(regular_prediction, partial_prediction, 4)
Ejemplo n.º 14
0
    def _gross_confusion(self):
        cm = np.empty(self.confusion_indices.shape)
        for i in np.ndindex(cm.shape):
            cm[i] = self.df_gross_loss["loss"].loc[
                self.confusion_indices[i]].clip(
                    upper=self.clip_profit_at).mean()

        return pd.DataFrame(cm)
Ejemplo n.º 15
0
    def test_multi_model(self):
        """Fit two copies of a classifier wrapped in a MultiModel and predict the last two rows."""
        # given some toy classification data
        df = pd.DataFrame({
            "a": [1, 0, 1, 0, 1, 0, 1, 0],
            "b": [0, 0, 1, 1, 0, 0, 1, 1],
            "c": [1, 0, 0, 1, 1, 0, 0, 1],
        })

        # and a multi model where the label of each sub model is named after the model index "i"
        classifier = MLPClassifier(activation='logistic',
                                   max_iter=1000,
                                   hidden_layer_sizes=(3, ),
                                   alpha=0.001,
                                   solver='lbfgs',
                                   random_state=42)
        features_and_labels = FeaturesAndLabels(
            features=["a", "b"],
            labels=[lambda df, i: df["c"].rename(f"c_{i}")],
            label_type=int)
        sub_model = SkModel(classifier,
                            features_and_labels,
                            summary_provider=ClassificationSummary)
        model = MultiModel(sub_model,
                           2,
                           model_index_variable="i",
                           summary_provider=MultiModelSummary)

        # when we fit the model the summary can be rendered as html
        fit = df.model.fit(model, NaiveSplitter(0.49), epochs=1500, verbose=True)
        print(fit.training_summary._repr_html_()[:100])

        # and we can predict the tail of the data
        pdf = df.model.predict(fit.model, tail=2)
        print(pdf)
Ejemplo n.º 16
0
    def _init(self) -> Tuple[np.ndarray, np.ndarray]:
        """
        Pick a random symbol, load its features and position the environment at a start index.

        The start index and the last usable index depend on ``self.mode``:

        * 'train': the last index is 80% of the samples, the start index is drawn at random
        * 'test': the last index is the number of samples, the start index is drawn from the remaining 20%
        * anything else: all samples are used starting at index 0 and labels, sample weights and gross loss
          are reset to empty frames

        The observation space is created on first use only.

        :return: whatever ``self._current_state()`` returns for the chosen start index
        """
        # this is reinforcement learning, we can only extract features (no labels)!
        # draw one symbol at random, `.item()` unwraps the one element array
        self._symbol = np.random.choice(self.symbols, 1).item()
        self._df = self.cache.get_data_or_fetch(self._symbol)
        self._features, self._features_index = self.cache.get_feature_frames_or_fetch(
            self._df, self._symbol, self.features_and_labels)
        # features are either one object or a tuple of objects, in the latter case the first one defines the length
        self._is_feature_tuple = isinstance(self._features, tuple)
        nr_of_samples = len(
            self._features[0]) if self._is_feature_tuple else len(
                self._features)

        if self.mode in ['train', 'test']:
            if self.mode == 'train':
                # the first 80% of the samples are used for training
                self._last_index = int(nr_of_samples * 0.8)
                # allow at least one step
                # (the upper bound of randint is exclusive)
                self._state_idx = np.random.randint(1, self._last_index - 1,
                                                    1).item()
            if self.mode == 'test':
                self._last_index = nr_of_samples
                # if min samples is infinity then we start from the first index of the test data
                test_start_idx = int(nr_of_samples * 0.8)
                test_end_idx = min(nr_of_samples - self.min_training_samples,
                                   test_start_idx + 1)
                # NOTE(review): test_end_idx never exceeds test_start_idx + 1 and the upper bound of randint is
                # exclusive, so the only possible draw is test_start_idx; randint raises a ValueError if
                # nr_of_samples - min_training_samples <= test_start_idx - confirm this is intended
                self._state_idx = np.random.randint(test_start_idx,
                                                    test_end_idx, 1).item()
        else:
            # any other mode walks over all samples from the very beginning
            self._last_index = nr_of_samples
            self._labels = pd.DataFrame({})
            self._sample_weights = pd.DataFrame({})
            self._gross_loss = pd.DataFrame({})
            self._state_idx = 0

        if self.observation_space is None:
            # an observation is a tuple of the feature space (the leading dimension of the features is dropped)
            # and a space shaped like the current state of the strategy
            self.observation_space = gym.spaces.Tuple(
                (SpaceUtils.unbounded_tuple_boxes(
                    *[f.shape[1:]
                      for f in self._features]) if self._is_feature_tuple else
                 SpaceUtils.unbounded_box(self._features.shape[1:]),
                 SpaceUtils.unbounded_box(
                     self.strategy.current_state().shape)))

        # remember where this run started and flag it as done if there is nothing left to step through
        self._start_idx = self._state_idx
        self.done = self._state_idx >= self._last_index
        return self._current_state()
Ejemplo n.º 17
0
    def test_auto_encoder(self):
        """Fit an auto encoder, use it as encoder and as decoder, also after a save and load round trip."""
        # given the implementation can handle auto encoders
        model = self.provide_auto_encoder_model(
            FeaturesAndLabels(["a", "b"], ["a", "b"]))
        if model is None:
            # implementations without auto encoder support have nothing to verify
            return

        # and some toy classification data
        df = pd.DataFrame({
            "a": [1, 0, 1, 0],
            "b": [0, 1, 0, 1],
        })

        # when we fit the model
        fit = df.model.fit(model, NaiveSplitter(0.49), verbose=0, epochs=500)
        print(fit.training_summary.df)

        # then we can encode
        encoded_prediction = df.model.predict(fit.model.as_encoder())
        print(encoded_prediction)

        # and we can decode, using the first encoded column as feature of the decoder
        decoder_features = encoded_prediction.columns.to_list()[0:1]
        decoded_prediction = encoded_prediction.model.predict(
            fit.model.as_decoder(decoder_features))
        print(decoded_prediction)
        np.testing.assert_array_almost_equal(
            decoded_prediction["prediction"].values > 0.5,
            df[["a", "b"]].values)

        # and we can encode and decode after save and load
        temp = os.path.join(tempfile.gettempdir(), str(uuid.uuid4()))
        try:
            fit.model.save(temp)
            copy = Model.load(temp)

            # NOTE(review): `check_less_precise` is deprecated in newer pandas releases in favour of
            # rtol/atol; kept as is for the pandas version this project targets
            pd.testing.assert_frame_equal(
                df.model.predict(fit.model.as_encoder()),
                df.model.predict(copy.as_encoder()),
                check_less_precise=True)

            pd.testing.assert_frame_equal(
                encoded_prediction.model.predict(
                    fit.model.as_decoder(decoder_features)),
                encoded_prediction.model.predict(
                    copy.as_decoder(decoder_features)),
                check_less_precise=True)
        finally:
            # only remove what was actually written, otherwise a failing save would surface as a
            # misleading FileNotFoundError raised by the cleanup
            if os.path.exists(temp):
                os.remove(temp)
Ejemplo n.º 18
0
 def test_make_training_data(self):
     """A random split with a test size of 0.5 divides five rows into two training and three test rows."""
     # given
     values = [1, 2, 3, 4, 5]
     df = pd.DataFrame({"featureA": values, "labelA": values})

     # when
     splitter = random_splitter(test_size=0.5)
     train_ix, test_ix = splitter(df.index)

     # then
     self.assertEqual(2, len(train_ix))
     self.assertEqual(3, len(test_ix))
 def test_multi_index_nested_values(self):
     """The `ml.values` of multi level columns stack the levels and the nested arrays into one array."""
     def nested_cells():
         # five rows where each cell holds its own (2, 4) array
         return [np.ones((2, 4)) for _ in range(5)]

     # given a frame with scalar columns (A, B) and columns holding nested arrays (C, D)
     df = pd.DataFrame(
         {
             ("A", "a"): [1, 2, 3, 4, 5],
             ("A", "b"): [3, 2, 1, 0, 0],
             ("A", "c"): [3, 2, 1, 0, 0],
             ("B", "a"): [1, 2, 3, 1, 2],
             ("B", "b"): [3, 2, 1, 0, 1],
             ("B", "c"): [3, 2, 1, 0, 1],
             ("C", "a"): nested_cells(),
             ("C", "b"): nested_cells(),
             ("C", "c"): nested_cells(),
             ("D", "a"): nested_cells(),
         },
         index=[1, 2, 3, 4, 5],
     )
     df.columns = pd.MultiIndex.from_tuples(df.columns.tolist())

     # when
     print(df)
     one_group = df[["A"]].ml.values
     two_groups = df[["A", "B"]].ml.values
     nested_group = df["C"].ml.values
     nested_single = df["D"].ml.values

     # then
     print(one_group.shape, two_groups.shape, nested_group.shape,
           nested_single.shape)
     self.assertEqual((5, 1, 3), one_group.shape)
     self.assertEqual((5, 2, 3), two_groups.shape)
     self.assertEqual((5, 3, 2, 4), nested_group.shape)
     self.assertEqual((5, 1, 2, 4), nested_single.shape)
Ejemplo n.º 20
0
 def test_naive_splitter(self):
     """The naive splitter keeps the order: the leading rows are for training, the trailing ones for testing."""
     # given
     values = [1, 2, 3, 4, 5]
     df = pd.DataFrame({"featureA": values, "labelA": values})

     # when
     splitter = naive_splitter(0.3)
     train_ix, test_ix = splitter(df.index)

     # then
     print(train_ix, test_ix)
     self.assertListEqual([0, 1, 2], train_ix.tolist())
     self.assertListEqual([3, 4], test_ix.tolist())
Ejemplo n.º 21
0
    def test_youngest_portion(self):
        """The youngest rows are always part of the test set of a random split."""
        # given
        values = [1, 2, 3, 4, 5, 6, 7, 8, 9, 10]
        df = pd.DataFrame({"featureA": values, "labelA": values})

        # when
        splitter = random_splitter(test_size=0.6, youngest_size=0.25)
        train_ix, test_ix = splitter(df.index)

        # then the test set has six rows and ends with the two youngest ones
        self.assertEqual(6, len(test_ix))
        youngest = test_ix[-2:]
        np.testing.assert_array_equal(youngest, np.array([8, 9]))
Ejemplo n.º 22
0
 def test_no_training_data(self):
     """A test size of zero and the dummy splitter both put every row into the training set."""
     # given
     values = [1, 2, 3, 4, 5]
     df = pd.DataFrame({"featureA": values, "labelA": values})

     # when
     train_ix, test_ix = random_splitter(0)(df.index)
     dummy_train_ix, _ = dummy_splitter(df.index)

     # then
     np.testing.assert_array_almost_equal(train_ix.values, df.index.values)
     np.testing.assert_array_almost_equal(train_ix.values, dummy_train_ix.values)
     self.assertEqual(0, len(test_ix))
Ejemplo n.º 23
0
 def test_stratified_random_splitter(self):
     """The rare classes 2 and 3 show up in the training set as well as in the test set."""
     # given
     df = pd.DataFrame({
         "featureA": [1, 2, 3, 4, 5, 6, 7, 8, 9, 10],
         "labelA": [1, 1, 1, 1, 1, 1, 2, 2, 3, 3]
     })

     # when
     splitter = stratified_random_splitter(test_size=0.5)
     train_ix, test_ix = splitter(df.index, y=df[["labelA"]])

     # then each rare class is represented in each of the train and test sets
     for partition in (train_ix, test_ix):
         for rare_class in (2, 3):
             self.assertIn(rare_class, df.loc[partition, "labelA"].to_list())
Ejemplo n.º 24
0
    def test_naive_splitter_multi_index_row(self):
        """With row partitioning enabled each group of the first index level is split on its own."""
        # given
        df = pd.DataFrame({"featureA": range(10), "labelA": range(10)})
        df.index = pd.MultiIndex.from_product([["A", "B"], range(5)])

        # when
        splitter = naive_splitter(0.3, partition_row_multi_index=True)
        train_ix, test_ix = splitter(df.index)

        # then the first three rows of each group are training rows, the last two are test rows
        print(train_ix.tolist(), test_ix.tolist())
        expected_train = [(group, row) for group in ("A", "B") for row in (0, 1, 2)]
        expected_test = [(group, row) for group in ("A", "B") for row in (3, 4)]
        self.assertListEqual(expected_train, train_ix.tolist())
        self.assertListEqual(expected_test, test_ix.tolist())
Ejemplo n.º 25
0
    def test_random_splitter_multi_index_row(self):
        """With row partitioning enabled the youngest row of every group ends up in the test set."""
        # given
        df = pd.DataFrame({"featureA": range(10), "labelA": range(10)})
        df.index = pd.MultiIndex.from_product([["A", "B"], range(5)])

        # when
        splitter = random_splitter(test_size=0.6,
                                   youngest_size=0.25,
                                   partition_row_multi_index=True)
        train_ix, test_ix = splitter(df.index)
        print(train_ix.tolist(), test_ix.tolist())

        # then
        self.assertEqual(8, len(test_ix))
        for group in ('A', 'B'):
            self.assertIn((group, 4), test_ix)
Ejemplo n.º 26
0
    def test_add_multi_index(self):
        """New index levels can be added in front, at a given level and at the end of the row index."""
        rows = [1, 2, 3, 4]
        df = pd.DataFrame({}, index=rows)

        with_a = add_multi_index(df, "A", axis=0)
        b_at_level_1 = add_multi_index(with_a, "B", axis=0, level=1)
        b_at_level_2 = add_multi_index(with_a, "B", axis=0, level=2)

        self.assertListEqual(with_a.index.to_list(),
                             [("A", row) for row in rows])
        self.assertListEqual(b_at_level_1.index.to_list(),
                             [("A", "B", row) for row in rows])
        self.assertListEqual(b_at_level_2.index.to_list(),
                             [(row, "A", "B") for row in rows])
Ejemplo n.º 27
0
    def test__item_normal_index(self):
        """Column access through the `_` accessor on a frame with a single column level."""
        df = pd.DataFrame({}, columns=[a_x, b_y, c_y])
        names = df.columns.tolist()

        # ordinary access using one column name
        for name in names:
            self.assertEqual(name, df._[name].name)

        # ordinary access using a growing list of column names
        for stop in range(1, len(names) + 1):
            selection = names[:stop]
            self.assertEqual(selection, df._[selection].columns.tolist())

        # access by regular expression
        for pattern, expected in (("a_.", [a_x]), ("..y", [b_y, c_y])):
            self.assertListEqual(expected, df._[pattern].columns.tolist())
Ejemplo n.º 28
0
    def test_stratified_random_splitter_multi_index_row(self):
        """The rare classes 2 and 3 show up in both sets when the rows carry a two level index."""
        # given the same features and labels once per group of the first index level
        df = pd.DataFrame({
            "featureA": [1, 2, 3, 4, 5, 6, 7, 8, 9, 10] * 2,
            "labelA": [1, 1, 1, 1, 1, 1, 2, 2, 3, 3] * 2
        })
        df.index = pd.MultiIndex.from_product([["A", "B"], range(10)])

        # when
        splitter = stratified_random_splitter(test_size=0.5,
                                              partition_row_multi_index=True)
        train_ix, test_ix = splitter(df.index, y=df[["labelA"]])
        print(train_ix.tolist(), test_ix.tolist())

        # then each rare class is represented in each of the train and test sets
        for partition in (train_ix, test_ix):
            for rare_class in (2, 3):
                self.assertIn(rare_class, df.loc[partition, "labelA"].to_list())
Ejemplo n.º 29
0
 def test_lag_smoothing_nan(self):
     """Rows that turn into NaN because of a shifting lag smoother are dropped from the lagged frame."""
     # given
     df = pd.DataFrame({"featureA": [1, 2, 3, 4, 5, 6, 7, 8, 9, 10]})
     # original:          1, 2, 3, 4, 5, 6, 7, 8, 9, 10
     # lag 1:                1, 2, 3, 4, 5, 6, 7, 8, 9
     # lag 1 + shift 2:            1, 2, 3, 4, 5, 6, 7
     #                             ^ the frame starts here once the NaN rows are dropped

     # when lag smoothing is enabled using shift (which introduces NaN into the data frame)
     smoothing = {1: lambda df: df["featureA"].shift(2)}
     lagged = lag_columns(df[["featureA"]],
                          feature_lags=[0, 1],
                          lag_smoothing=smoothing)
     rnn = lagged.dropna()

     # then: (lag, row position, expected value)
     expectations = [
         (0, 0, 4),
         (1, 0, 1.0),
         (0, -1, 10),
         (1, -1, 7.0),
     ]
     for lag, position, expected in expectations:
         self.assertAlmostEqual(rnn[lag, "featureA"].iloc[position], expected)
Ejemplo n.º 30
0
    def __init__(self,
                 df: Typing.PatchedDataFrame,
                 clip_profit_at=0,
                 classes=None,
                 **kwargs):
        """
        Collect the row labels of each confusion matrix cell and the loss implied by each prediction.

        :param df: frame holding the target, prediction and gross loss columns
        :param clip_profit_at: upper bound stored for clipping losses later on
        :param classes: number of rows/columns of the confusion matrix, if None the number of distinct
            truth values is used
        :param kwargs: accepted but not used by the code visible here
        """
        super().__init__(df)
        self.clip_profit_at = clip_profit_at
        self.targets = df[TARGET_COLUMN_NAME]

        # calculate confusion indices
        truth, prediction = self._fix_label_prediction_representation()
        distinct_values = len({*truth.reshape(
            (-1, ))}) if classes is None else classes
        # one list per (truth, prediction) cell - presumably an object array of empty lists, TODO confirm
        cm = empty_lists((distinct_values, distinct_values))

        # collect the row label of every sample in the cell of its (truth, prediction) pair
        for i, (t, p) in enumerate(zip(truth, prediction)):
            cm[int(t), int(p)].append(self.df.index[i])

        self.confusion_indices = cm

        # we can calculate the gross loss from the predicted band and the true price,
        #  therefore we need to pass the true price as gross loss such that we calculate the real loss
        self.df_gross_loss = pd.DataFrame(
            {
                # the buckets as derived from the targets of each row by get_buckets
                "bucket":
                df[[TARGET_COLUMN_NAME]].apply(get_buckets, axis=1, raw=True),
                # position of the highest value within the prediction of each row
                "pidx":
                df.apply(
                    lambda r: int(r[PREDICTION_COLUMN_NAME]._.values.argmax()),
                    axis=1,
                    raw=False),
                # first column of the gross loss columns
                "price":
                df[GROSS_LOSS_COLUMN_NAME].values[:, 0]
            },
            index=df.index)

        # find target for predicted value
        # buckets up to the middle are measured against the first element of the predicted bucket, the others
        # against its second element (presumably the lower and upper bound - verify against get_buckets);
        # NaN losses are replaced by 0
        mid = self.targets.shape[1] / 2.0
        self.df_gross_loss["loss"] = self.df_gross_loss.apply(
            lambda r: (r["price"] - r["bucket"][r["pidx"]][0])
            if r["pidx"] <= mid else (r["bucket"][r["pidx"]][1] - r["price"]),
            axis=1,
            raw=False).fillna(0)