def test_intersection_of_index(self):
    """The intersection of three frame indices keeps only the labels shared by all."""
    complete = pd.DataFrame({}, index=[1, 2, 3, 4])
    missing_first = pd.DataFrame({}, index=[2, 3, 4])
    missing_second = pd.DataFrame({}, index=[1, 3, 4])

    shared = intersection_of_index(complete, missing_first, missing_second)

    # only 3 and 4 occur in every frame
    self.assertListEqual([3, 4], shared.tolist())
def test_intersection_of_tuples(self):
    """Frames wrapped in a MultiFrameDecorator take part in the index intersection."""
    df1 = pd.DataFrame({}, index=[1, 2, 3, 4])
    df2 = pd.DataFrame({}, index=[2, 3, 4])
    df3 = pd.DataFrame({}, index=[1, 3, 4])

    # the decorated pair is passed once as the last and once as the first argument
    wrapped_last = MultiFrameDecorator([df2, df3], True)
    index1 = intersection_of_index(df1, wrapped_last)
    wrapped_first = MultiFrameDecorator([df1, df2], True)
    index2 = intersection_of_index(wrapped_first, df3)

    for shared in (index1, index2):
        self.assertListEqual([3, 4], shared.tolist())
def test_similar_columns_multi_index(self):
    """same_columns_after_level accepts a full product and rejects differing sub-columns."""
    # every top level column ("a", "b") has the same sub columns 0, 1, 2
    product_columns = pd.MultiIndex.from_product([["a", "b"], range(3)])
    # "a" has sub columns 1, 2 whereas "b" has 1, 3
    ragged_columns = pd.MultiIndex.from_tuples([
        ("a", 1),
        ("a", 2),
        ("b", 1),
        ("b", 3),
    ])

    df1 = pd.DataFrame({}, index=[1, 2, 3, 4], columns=product_columns)
    df2 = pd.DataFrame({}, index=[1, 2, 3, 4], columns=ragged_columns)

    self.assertTrue(same_columns_after_level(df1))
    self.assertFalse(same_columns_after_level(df2))
def trails(self):
    """
    Tabular view of the hyper parameter optimization results.

    The results are read from ``self._trails.results`` (the trials object as used by
    `Hyperopt <https://github.com/hyperopt/hyperopt/wiki/FMin>`_). The nested
    ``"parameter"`` entry of each result is expanded into columns of its own.

    :return: a DataFrame with one row per result, or None if no trails object is available
    """
    if self._trails is None:
        return None

    results = self._trails.results
    # everything except the nested parameter dict
    summary = pd.DataFrame(results).drop("parameter", axis=1)
    # one column per parameter, aligned on the positional index
    parameters = pd.DataFrame([result['parameter'] for result in results])
    return summary.join(parameters)
def map_prediction_to_target(df, prediction, targets):
    """
    Flatten the predictions of a frame into a single "prediction" column which is
    indexed by (row label, target).

    The number of predictions per row (n) is compared to the number of targets per row (m):

    * n == m: each prediction is paired with its (rounded) target value
    * n == m - 1: each prediction is paired with the string representation of the
      range spanned by two neighbouring targets
    * n == m + 1: as above, but the targets are extended by -inf and +inf

    :param df: the data frame holding predictions and targets
    :param prediction: selector of the prediction, resolved by get_pandas_object
    :param targets: selector of the targets, resolved by get_pandas_object
    :return: DataFrame with a "prediction" column and a 2 level row MultiIndex
    :raises ValueError: if the number of predictions fits none of the three cases
    """

    def _round(val, d):
        # only floats get rounded, everything else passes through untouched
        if isinstance(val, float):
            return round(val, d)
        return val

    def _neighbouring_pairs(bounds):
        # [b0, b1, b2] -> [(b0, b1), (b1, b2)] with both ends rounded to 2 digits
        return [(_round(lower, 2), _round(upper, 2))
                for lower, upper in zip(bounds[:-1], bounds[1:])]

    dfp = get_pandas_object(df, prediction)
    p = dfp._.values.reshape((len(df), -1))
    dft = get_pandas_object(df, targets)
    t = dft._.values.reshape((len(df), -1))

    index = []
    if p.shape[1] == t.shape[1]:
        # 1:1 mapping
        for date in df.index:
            for target in dft.loc[date].values:
                index.append((date, _round(target, 2)))
    elif p.shape[1] == t.shape[1] - 1:
        # we need to build ranges
        for date in df.index:
            for target in _neighbouring_pairs(dft.loc[date].tolist()):
                index.append((date, f"{target}"))
    elif p.shape[1] == t.shape[1] + 1:
        # mapping of the left and right extremes using +/- inf
        for date in df.index:
            bounds = [-np.inf, *dft.loc[date]._.values.tolist(), np.inf]
            for target in _neighbouring_pairs(bounds):
                index.append((date, target))
    else:
        raise ValueError(f"unable to match {p.shape[1]} predictions to {t.shape[1]} +/-1 targets")

    flat_predictions = p.reshape((-1,))
    return pd.DataFrame({"prediction": flat_predictions}, index=pd.MultiIndex.from_tuples(index))
def test_call_if_not_none(self):
    """call_if_not_none returns None for None and otherwise the result of the named method."""
    nan_frame = pd.DataFrame({"a": [np.nan]})
    nothing = None

    self.assertIsNone(call_if_not_none(nothing, 'dropna'))

    without_nan = call_if_not_none(nan_frame, 'dropna')
    self.assertEqual(0, len(without_nan))

    # the source frame still holds its single row
    self.assertEqual(1, len(nan_frame))
def test_get_pandas_obj(self):
    """get_pandas_object is expected to resolve each supported kind of selector."""
    df = pd.DataFrame({"hallo": [1, 2, 3]})
    dfmi = df.copy()
    dfmi.columns = pd.MultiIndex.from_product([["a"], dfmi.columns])

    # None selector
    self.assertIsNone(get_pandas_object(df, None))

    # column name
    by_name = get_pandas_object(df, "hallo")
    self.assertListEqual([1, 2, 3], by_name.to_list())

    # Constant
    constant = get_pandas_object(df, Constant(9))
    self.assertListEqual([9, 9, 9], constant.to_list())

    # custom handler registered for the selector's type
    float_handler = {float: lambda df, item, **kwargs: df["hallo"] * item}
    scaled = get_pandas_object(df, 2.0, float_handler)
    self.assertListEqual([2., 4., 6.], scaled.to_list())

    # an already evaluated pandas object
    doubled = get_pandas_object(df, df["hallo"] * 2)
    self.assertListEqual([2, 4, 6], doubled.to_list())

    # a callable receiving the frame
    lazily_doubled = get_pandas_object(df, lambda df: df["hallo"] * 2)
    self.assertListEqual([2, 4, 6], lazily_doubled.to_list())

    # a name matching nothing results in a frame without columns
    self.assertEqual((3, 0), get_pandas_object(df, "allo").shape)

    # multi index columns: 2nd level name, 1st level name and regex
    for selector in ("hallo", "a", ".*ll.*"):
        selected = get_pandas_object(dfmi, selector)
        self.assertListEqual([1, 2, 3], selected.iloc[:, 0].to_list())

    self.assertEqual((3, 0), get_pandas_object(dfmi, "allo").shape)
def test_partial_fit_classification(self):
    """Ten single-iteration partial fits are expected to match one ten-iteration fit."""
    features, labels = make_classification(100, 2, 1, 0, n_clusters_per_class=1)
    df = pd.DataFrame(features)
    df["label"] = labels

    # batch wise fitting: max_iter=1 repeated for 10 fold epochs
    with df.model() as m:
        partial_model = SkModel(
            MLPClassifier(max_iter=1, random_state=42),
            FeaturesAndLabels(features=[0, 1], labels=['label']),
            classes=np.unique(labels)
        )
        partial_parameter = FittingParameter(
            stratified_random_splitter(0.3),
            batch_size=10,
            fold_epochs=10,
        )
        fit_partial = m.fit(partial_model, partial_parameter)

    # fitting in one go with max_iter=10
    with df.model() as m:
        full_model = SkModel(
            MLPClassifier(max_iter=10, random_state=42),
            FeaturesAndLabels(features=[0, 1], labels=['label'])
        )
        fit = m.fit(full_model, FittingParameter(stratified_random_splitter(0.3)))

    # compare the last column of the first predicted row
    expected = df.model.predict(fit.model).iloc[0, -1]
    actual = df.model.predict(fit_partial.model).iloc[0, -1]
    self.assertAlmostEqual(expected, actual, 4)
def test_multindex_row_multi_samples(self):
    """Fit and sample predictions on toy regression data with a MultiIndex on the rows."""
    # given some toy regression data while we provide a multiindex for the rows
    a_values = [-1.0, 0.0, 1.0, 2.0, 3.0, 4.0]
    b_values = [-2.0, 1.0, 4.0, 7.0, 10.0, 13.0]
    df = pd.DataFrame(
        {"a": a_values * 2, "b": b_values * 2},
        index=pd.MultiIndex.from_product([["A", "B"], range(6)]))

    # and a model
    model = self.provide_regression_model(
        FeaturesAndLabels(features=["a"], labels=["b"]))

    # when we fit the model
    batch_size, epochs = self.provide_batch_size_and_epoch()
    with df.model() as m:
        splitter = random_splitter(0.3, partition_row_multi_index=True)
        parameter = FittingParameter(splitter=splitter, batch_size=batch_size, epochs=epochs)
        fit = m.fit(model, parameter, verbose=0)

    # the summaries are expected to hold 8 training and 4 test rows
    self.assertEqual(8, len(fit.training_summary.df))
    self.assertEqual(4, len(fit.test_summary.df))

    # and a prediction with 2 samples
    prediction = df.model.predict(fit.model, samples=2)
    self.assertEqual(2, len(prediction.iloc[:, 0]._.values))
    for group in ("A", "B"):
        self.assertEqual((6, 2), prediction.loc[group].iloc[:, 0]._.values.shape)
def test__item_multi_index(self):
    """Item access via the `_` accessor on a frame with 2 level MultiIndex columns."""
    first_level = [a_x, b_x, c_x]
    second_level = [a_y, b_y, c_y]
    df = pd.DataFrame({}, columns=pd.MultiIndex.from_product([first_level, second_level]))

    # test ordinary access single
    for column in df.columns.tolist():
        self.assertEqual(column, df._[column].name)

    # test ordinary access multi (a growing list of column tuples)
    selected = []
    for column in df.columns.tolist():
        selected.append(column)
        self.assertEqual(selected, df._[selected].columns.tolist())

    # test 1st level
    self.assertListEqual([a_y, b_y, c_y], df._[a_x].columns.tolist())

    # test 2nd level
    expected_second_level = [(b_y, a_x), (b_y, b_x), (b_y, c_x)]
    self.assertListEqual(expected_second_level, df._[b_y].columns.tolist())

    # test regex
    expected_regex = [(a_x, a_y), (a_x, b_y), (a_x, c_y), (b_x, a_y), (c_x, a_y)]
    self.assertListEqual(expected_regex, df._["a_."].columns.tolist())
def test_regressor(self):
    """Fit a regression model on toy data, predict, and round trip it through save/load."""
    # given some toy regression data
    df = pd.DataFrame({
        "a": [-1.0, 0.0, 1.0, 2.0, 3.0, 4.0],
        "b": [-2.0, 1.0, 4.0, 7.0, 10.0, 13.0]
    })

    # and a model
    model = self.provide_regression_model(
        FeaturesAndLabels(features=["a"], labels=["b"]))

    # when we fit the model
    fit = df.model.fit(model, RandomSplits(0.3), verbose=0, epochs=500)
    print(fit.training_summary.df)

    # then we can predict
    prediction = df.model.predict(fit.model)
    np.testing.assert_array_almost_equal(prediction.iloc[:, 0].values,
                                         df["b"].values, 1)

    # and save and load the model
    temp = os.path.join(tempfile.gettempdir(), str(uuid.uuid4()))
    try:
        fit.model.save(temp)
        copy = Model.load(temp)
        # NOTE(review): check_less_precise is deprecated since pandas 1.1 in favour of
        # rtol/atol; kept as is to stay compatible with the pandas version in use
        pd.testing.assert_frame_equal(df.model.predict(fit.model),
                                      df.model.predict(copy),
                                      check_less_precise=True)
    finally:
        # save() may fail before the file exists; an unconditional os.remove would then
        # raise FileNotFoundError and bury the failure we actually care about
        if os.path.exists(temp):
            os.remove(temp)
def test_custom_objects(self):
    """A keras model compiled with a custom loss function survives a save/load round trip."""
    df = pd.DataFrame({
        "a": [-1.0, 0.0, 1.0, 2.0, 3.0, 4.0],
        "b": [-2.0, 1.0, 4.0, 7.0, 10.0, 13.0]
    })

    def model_provider():
        # the custom loss is returned alongside the compiled model
        model = Sequential([Dense(units=1, input_shape=(1, ))])
        model.compile(optimizer='sgd', loss=custom_loss_function)
        return model, custom_loss_function

    model = KerasModel(
        model_provider,
        FeaturesAndLabels(["a"], ["b"]),
    )

    fit = df.model.fit(model)

    temp = os.path.join(tempfile.gettempdir(), str(uuid.uuid4()))
    try:
        fit.model.save(temp)
        copy = Model.load(temp)
        # NOTE(review): check_less_precise is deprecated since pandas 1.1 in favour of
        # rtol/atol; kept as is to stay compatible with the pandas version in use
        pd.testing.assert_frame_equal(df.model.predict(fit.model),
                                      df.model.predict(copy),
                                      check_less_precise=True)
    finally:
        # save() may fail before the file exists; an unconditional os.remove would then
        # raise FileNotFoundError and bury the failure we actually care about
        if os.path.exists(temp):
            os.remove(temp)
def test_partial_fit_regression(self):
    """Ten single-iteration partial fits are expected to match one ten-iteration fit."""
    features, label = make_regression(100, 2, 1)
    df = pd.DataFrame(features)
    df["label"] = label

    # batch wise fitting: max_iter=1 repeated for 10 fold epochs
    with df.model() as m:
        partial_model = SkModel(
            MLPRegressor(max_iter=1, random_state=42),
            FeaturesAndLabels(features=[0, 1], labels=['label'])
        )
        partial_parameter = FittingParameter(
            naive_splitter(0.3),
            batch_size=10,
            fold_epochs=10
        )
        fit_partial = m.fit(partial_model, partial_parameter)

    # fitting in one go with max_iter=10
    with df.model() as m:
        full_model = SkModel(
            MLPRegressor(max_iter=10, random_state=42),
            FeaturesAndLabels(features=[0, 1], labels=['label'])
        )
        fit = m.fit(full_model, FittingParameter(naive_splitter(0.3)))

    # compare the last column of the first predicted row
    expected = df.model.predict(fit.model).iloc[0, -1]
    actual = df.model.predict(fit_partial.model).iloc[0, -1]
    self.assertAlmostEqual(expected, actual, 4)
def _gross_confusion(self):
    """
    Mean clipped loss per cell of the confusion matrix.

    For every cell of ``self.confusion_indices`` the "loss" values of the index labels
    stored in that cell are capped at ``self.clip_profit_at`` and averaged.

    :return: DataFrame with the same shape as ``self.confusion_indices``
    """
    matrix = np.empty(self.confusion_indices.shape)

    for cell in np.ndindex(matrix.shape):
        members = self.confusion_indices[cell]
        losses = self.df_gross_loss["loss"].loc[members]
        # values above the clip level are capped before averaging
        matrix[cell] = losses.clip(upper=self.clip_profit_at).mean()

    return pd.DataFrame(matrix)
def test_multi_model(self):
    """Fit a MultiModel wrapping two classifiers on toy data and predict the tail."""
    # given some toy classification data
    df = pd.DataFrame({
        "a": [1, 0, 1, 0, 1, 0, 1, 0],
        "b": [0, 0, 1, 1, 0, 0, 1, 1],
        "c": [1, 0, 0, 1, 1, 0, 0, 1],
    })

    classifier = MLPClassifier(activation='logistic',
                               max_iter=1000,
                               hidden_layer_sizes=(3, ),
                               alpha=0.001,
                               solver='lbfgs',
                               random_state=42)
    # the label lambda receives the model index as `i`
    features_and_labels = FeaturesAndLabels(
        features=["a", "b"],
        labels=[lambda df, i: df["c"].rename(f"c_{i}")],
        label_type=int)
    sub_model = SkModel(classifier,
                        features_and_labels,
                        summary_provider=ClassificationSummary)
    model = MultiModel(sub_model,
                       2,
                       model_index_variable="i",
                       summary_provider=MultiModelSummary)

    fit = df.model.fit(model, NaiveSplitter(0.49), epochs=1500, verbose=True)
    print(fit.training_summary._repr_html_()[:100])

    pdf = df.model.predict(fit.model, tail=2)
    print(pdf)
def _init(self) -> Tuple[np.ndarray, np.ndarray]:
    """
    Set up the state for a new run and return the current state.

    A symbol is drawn at random from ``self.symbols``, its data and feature frames are
    fetched through ``self.cache`` and the start index (``self._state_idx``) as well as
    the last index (``self._last_index``) are chosen depending on ``self.mode``.
    The observation space is only created if it is still None.

    :return: the result of ``self._current_state()``
    """
    # this is reinforcement learning, we can only extract features (no labels)!
    self._symbol = np.random.choice(self.symbols, 1).item()
    self._df = self.cache.get_data_or_fetch(self._symbol)
    self._features, self._features_index = self.cache.get_feature_frames_or_fetch(
        self._df, self._symbol, self.features_and_labels)
    # features may come as a tuple of feature sets, then the first one defines the length
    self._is_feature_tuple = isinstance(self._features, tuple)
    nr_of_samples = len(
        self._features[0]) if self._is_feature_tuple else len(
            self._features)

    if self.mode in ['train', 'test']:
        if self.mode == 'train':
            # training uses the first 80% of the samples
            self._last_index = int(nr_of_samples * 0.8)
            # allow at least one step
            # NOTE(review): np.random.randint needs low < high, so this raises a
            # ValueError unless _last_index > 2 -- confirm small frames can not get here
            self._state_idx = np.random.randint(1, self._last_index - 1, 1).item()
        if self.mode == 'test':
            self._last_index = nr_of_samples
            # if min samples is infinity then we start from the first index of the test data
            # NOTE(review): np.random.randint needs low < high, so test_end_idx has to be
            # greater than test_start_idx -- confirm how min_training_samples is expected
            # to be set for the case described above
            test_start_idx = int(nr_of_samples * 0.8)
            test_end_idx = min(nr_of_samples - self.min_training_samples, test_start_idx + 1)
            self._state_idx = np.random.randint(test_start_idx, test_end_idx, 1).item()
    else:
        # any other mode starts at the very first sample and runs over all of them
        self._last_index = nr_of_samples
        self._labels = pd.DataFrame({})
        self._sample_weights = pd.DataFrame({})
        self._gross_loss = pd.DataFrame({})
        self._state_idx = 0

    if self.observation_space is None:
        # a tuple of (feature space, strategy state space); shape[1:] drops the first
        # axis of the features
        self.observation_space = gym.spaces.Tuple(
            (SpaceUtils.unbounded_tuple_boxes(
                *[f.shape[1:] for f in self._features])
             if self._is_feature_tuple else
             SpaceUtils.unbounded_box(self._features.shape[1:]),
             SpaceUtils.unbounded_box(
                 self.strategy.current_state().shape)))

    self._start_idx = self._state_idx
    # already done if the start index is not before the last index
    self.done = self._state_idx >= self._last_index
    return self._current_state()
def test_auto_encoder(self):
    """Fit an auto encoder, encode and decode, and repeat both after a save/load round trip."""
    # given the implementation can handle auto encoders
    model = self.provide_auto_encoder_model(
        FeaturesAndLabels(["a", "b"], ["a", "b"]))

    if model is None:
        # no auto encoder provided by this implementation, nothing to test
        return

    # and some toy classification data
    df = pd.DataFrame({
        "a": [1, 0, 1, 0],
        "b": [0, 1, 0, 1],
    })

    # when we fit the model
    fit = df.model.fit(model, NaiveSplitter(0.49), verbose=0, epochs=500)
    print(fit.training_summary.df)

    # then we can encode
    encoded_prediction = df.model.predict(fit.model.as_encoder())
    print(encoded_prediction)

    # and we can decode
    decoder_features = encoded_prediction.columns.to_list()[0:1]
    decoded_prediction = encoded_prediction.model.predict(
        fit.model.as_decoder(decoder_features))
    print(decoded_prediction)

    np.testing.assert_array_almost_equal(
        decoded_prediction["prediction"].values > 0.5,
        df[["a", "b"]].values)

    # and we can encode and decode after save and load
    temp = os.path.join(tempfile.gettempdir(), str(uuid.uuid4()))
    try:
        fit.model.save(temp)
        copy = Model.load(temp)

        # NOTE(review): check_less_precise is deprecated since pandas 1.1 in favour of
        # rtol/atol; kept as is to stay compatible with the pandas version in use
        pd.testing.assert_frame_equal(
            df.model.predict(fit.model.as_encoder()),
            df.model.predict(copy.as_encoder()),
            check_less_precise=True)

        pd.testing.assert_frame_equal(
            encoded_prediction.model.predict(
                fit.model.as_decoder(decoder_features)),
            encoded_prediction.model.predict(
                copy.as_decoder(decoder_features)),
            check_less_precise=True)
    finally:
        # save() may fail before the file exists; an unconditional os.remove would then
        # raise FileNotFoundError and bury the failure we actually care about
        if os.path.exists(temp):
            os.remove(temp)
def test_make_training_data(self):
    """A random split with test_size=0.5 of 5 rows is expected to yield 2 train and 3 test rows."""
    # given
    values = [1, 2, 3, 4, 5]
    df = pd.DataFrame({"featureA": values, "labelA": values})

    # when
    splitter = random_splitter(test_size=0.5)
    train_ix, test_ix = splitter(df.index)

    # then
    self.assertEqual(2, len(train_ix))
    self.assertEqual(3, len(test_ix))
def test_multi_index_nested_values(self):
    """Shapes of `.ml.values` for MultiIndex columns holding scalars and nested arrays."""

    def nested_column():
        # five rows, each cell holding its own (2, 4) array of ones
        return [np.ones((2, 4)) for _ in range(5)]

    # given
    df = pd.DataFrame(
        {
            ("A", "a"): [1, 2, 3, 4, 5],
            ("A", "b"): [3, 2, 1, 0, 0],
            ("A", "c"): [3, 2, 1, 0, 0],
            ("B", "a"): [1, 2, 3, 1, 2],
            ("B", "b"): [3, 2, 1, 0, 1],
            ("B", "c"): [3, 2, 1, 0, 1],
            ("C", "a"): nested_column(),
            ("C", "b"): nested_column(),
            ("C", "c"): nested_column(),
            ("D", "a"): nested_column(),
        },
        index=[1, 2, 3, 4, 5],
    )
    df.columns = pd.MultiIndex.from_tuples(df.columns.tolist())

    # when
    print(df)
    single_group = df[["A"]].ml.values
    two_groups = df[["A", "B"]].ml.values
    nested_group = df["C"].ml.values
    nested_label = df["D"].ml.values

    # then
    print(single_group.shape, two_groups.shape, nested_group.shape,
          nested_label.shape)
    self.assertEqual((5, 1, 3), single_group.shape)
    self.assertEqual((5, 2, 3), two_groups.shape)
    self.assertEqual((5, 3, 2, 4), nested_group.shape)
    self.assertEqual((5, 1, 2, 4), nested_label.shape)
def test_naive_splitter(self):
    """naive_splitter(0.3) on 5 rows is expected to train on the first 3 and test on the last 2."""
    # given
    values = [1, 2, 3, 4, 5]
    df = pd.DataFrame({"featureA": values, "labelA": values})

    # when
    splitter = naive_splitter(0.3)
    train_ix, test_ix = splitter(df.index)

    # then
    print(train_ix, test_ix)
    self.assertListEqual([0, 1, 2], train_ix.tolist())
    self.assertListEqual([3, 4], test_ix.tolist())
def test_youngest_portion(self):
    """With youngest_size set, the test set is expected to end with the 2 most recent rows."""
    # given
    values = [1, 2, 3, 4, 5, 6, 7, 8, 9, 10]
    df = pd.DataFrame({"featureA": values, "labelA": values})

    # when
    splitter = random_splitter(test_size=0.6, youngest_size=0.25)
    train_ix, test_ix = splitter(df.index)

    # then
    self.assertEqual(6, len(test_ix))
    youngest = test_ix[-2:]
    np.testing.assert_array_equal(youngest, np.array([8, 9]))
def test_no_training_data(self):
    """A test size of 0 and the dummy splitter are both expected to train on the whole index."""
    # given
    values = [1, 2, 3, 4, 5]
    df = pd.DataFrame({"featureA": values, "labelA": values})

    # when
    train_ix, test_ix = random_splitter(0)(df.index)
    train_ix2, test_ix2 = dummy_splitter(df.index)

    # then
    for expected in (df.index.values, train_ix2.values):
        np.testing.assert_array_almost_equal(train_ix.values, expected)
    self.assertEqual(0, len(test_ix))
def test_stratified_random_splitter(self):
    """The rare classes 2 and 3 are expected to show up in the train and in the test set."""
    # given
    df = pd.DataFrame({
        "featureA": [1, 2, 3, 4, 5, 6, 7, 8, 9, 10],
        "labelA": [1, 1, 1, 1, 1, 1, 2, 2, 3, 3]
    })

    # when
    splitter = stratified_random_splitter(test_size=0.5)
    train_ix, test_ix = splitter(df.index, y=df[["labelA"]])

    # then each class is represented similarly often in each train and test set
    for ix in (train_ix, test_ix):
        for rare_class in (2, 3):
            self.assertIn(rare_class, df.loc[ix, "labelA"].to_list())
def test_naive_splitter_multi_index_row(self):
    """With partition_row_multi_index each top level group is expected to be split on its own."""
    # given
    df = pd.DataFrame({"featureA": range(10), "labelA": range(10)})
    df.index = pd.MultiIndex.from_product([["A", "B"], range(5)])

    # when
    splitter = naive_splitter(0.3, partition_row_multi_index=True)
    train_ix, test_ix = splitter(df.index)

    # then
    print(train_ix.tolist(), test_ix.tolist())
    expected_train = [('A', 0), ('A', 1), ('A', 2),
                      ('B', 0), ('B', 1), ('B', 2)]
    expected_test = [('A', 3), ('A', 4), ('B', 3), ('B', 4)]
    self.assertListEqual(expected_train, train_ix.tolist())
    self.assertListEqual(expected_test, test_ix.tolist())
def test_random_splitter_multi_index_row(self):
    """The youngest row of each top level group is expected to end up in the test set."""
    # given
    df = pd.DataFrame({"featureA": range(10), "labelA": range(10)})
    df.index = pd.MultiIndex.from_product([["A", "B"], range(5)])

    # when
    splitter = random_splitter(test_size=0.6,
                               youngest_size=0.25,
                               partition_row_multi_index=True)
    train_ix, test_ix = splitter(df.index)
    print(train_ix.tolist(), test_ix.tolist())

    # then
    self.assertEqual(8, len(test_ix))
    for youngest in (('A', 4), ('B', 4)):
        self.assertIn(youngest, test_ix)
def test_add_multi_index(self):
    """add_multi_index is expected to insert a constant level at the requested position."""
    df = pd.DataFrame({}, index=[1, 2, 3, 4])

    df1 = add_multi_index(df, "A", axis=0)
    df2 = add_multi_index(df1, "B", axis=0, level=1)
    df3 = add_multi_index(df1, "B", axis=0, level=2)

    expectations = (
        (df1, [("A", 1), ("A", 2), ("A", 3), ("A", 4)]),
        (df2, [("A", "B", 1), ("A", "B", 2), ("A", "B", 3), ("A", "B", 4)]),
        (df3, [(1, "A", "B"), (2, "A", "B"), (3, "A", "B"), (4, "A", "B")]),
    )
    for frame, expected_index in expectations:
        self.assertListEqual(frame.index.to_list(), expected_index)
def test__item_normal_index(self):
    """Item access via the `_` accessor on a frame with plain (single level) columns."""
    df = pd.DataFrame({}, columns=[a_x, b_y, c_y])

    # test ordinary access
    for column in df.columns.tolist():
        self.assertEqual(column, df._[column].name)

    # test ordinary access multi (a growing list of column names)
    selected = []
    for column in df.columns.tolist():
        selected.append(column)
        self.assertEqual(selected, df._[selected].columns.tolist())

    # test regex
    regex_expectations = (("a_.", [a_x]), ("..y", [b_y, c_y]))
    for pattern, expected_columns in regex_expectations:
        self.assertListEqual(expected_columns, df._[pattern].columns.tolist())
def test_stratified_random_splitter_multi_index_row(self):
    """Stratified splitting per top level group is expected to keep the rare classes on both sides."""
    # given
    df = pd.DataFrame({
        "featureA": [1, 2, 3, 4, 5, 6, 7, 8, 9, 10] * 2,
        "labelA": [1, 1, 1, 1, 1, 1, 2, 2, 3, 3] * 2
    })
    df.index = pd.MultiIndex.from_product([["A", "B"], range(10)])

    # when
    splitter = stratified_random_splitter(test_size=0.5,
                                          partition_row_multi_index=True)
    train_ix, test_ix = splitter(df.index, y=df[["labelA"]])
    print(train_ix.tolist(), test_ix.tolist())

    # then each class is represented similarly often in each train and test set
    for ix in (train_ix, test_ix):
        for rare_class in (2, 3):
            self.assertIn(rare_class, df.loc[ix, "labelA"].to_list())
def test_lag_smoothing_nan(self):
    """Rows holding NaN introduced by a shifting lag smoother are dropped from the result."""
    # given
    df = pd.DataFrame({"featureA": [1, 2, 3, 4, 5, 6, 7, 8, 9, 10]})
    #  1, 2, 3, 4, 5, 6, 7, 8, 9, 10]   # original
    #     1, 2, 3, 4, 5, 6, 7, 8, 9]    # lag 1
    #           1, 2, 3, 4, 5, 6, 7]    # lag 1 + shift 2
    #           ^
    #           this is where the df starts

    # when lag smoothing is enabled using shift (which is introducing nan into the data frame)
    smoothing = {1: lambda df: df["featureA"].shift(2)}
    lagged = lag_columns(df[["featureA"]],
                         feature_lags=[0, 1],
                         lag_smoothing=smoothing)
    rnn = lagged.dropna()

    # then: (lag, row position, expected value)
    expectations = (
        (0, 0, 4),
        (1, 0, 1.0),
        (0, -1, 10),
        (1, -1, 7.0),
    )
    for lag, position, expected in expectations:
        self.assertAlmostEqual(rnn[lag, "featureA"].iloc[position], expected)
def __init__(self,
             df: Typing.PatchedDataFrame,
             clip_profit_at=0,
             classes=None,
             **kwargs):
    """
    Prepare the confusion indices and the gross loss per row.

    :param df: frame providing the target, prediction and gross loss columns
    :param clip_profit_at: stored as ``self.clip_profit_at``
    :param classes: number of classes; if None the number of distinct truth values is used
    :param kwargs: accepted but not used here
    """
    super().__init__(df)
    self.clip_profit_at = clip_profit_at
    self.targets = df[TARGET_COLUMN_NAME]

    # calculate confusion indices
    truth, prediction = self._fix_label_prediction_representation()
    if classes is None:
        distinct_values = len(set(truth.reshape((-1, ))))
    else:
        distinct_values = classes

    # cell [t, p] collects the index labels of rows with truth t and prediction p
    cm = empty_lists((distinct_values, distinct_values))
    for row_nr, (true_class, predicted_class) in enumerate(zip(truth, prediction)):
        cm[int(true_class), int(predicted_class)].append(self.df.index[row_nr])

    self.confusion_indices = cm

    # we can calculate the gross loss from the predicted band and the true price,
    # therefore we need to pass the true price as gross loss such that we calculate the real loss
    buckets = df[[TARGET_COLUMN_NAME]].apply(get_buckets, axis=1, raw=True)
    predicted_index = df.apply(
        lambda r: int(r[PREDICTION_COLUMN_NAME]._.values.argmax()),
        axis=1,
        raw=False)
    prices = df[GROSS_LOSS_COLUMN_NAME].values[:, 0]

    self.df_gross_loss = pd.DataFrame(
        {
            "bucket": buckets,
            "pidx": predicted_index,
            "price": prices
        },
        index=df.index)

    # find target for predicted value
    mid = self.targets.shape[1] / 2.0

    def _loss(r):
        # up to the middle the price is measured against the first bucket element,
        # above the middle the second bucket element is measured against the price
        if r["pidx"] <= mid:
            return r["price"] - r["bucket"][r["pidx"]][0]
        return r["bucket"][r["pidx"]][1] - r["price"]

    losses = self.df_gross_loss.apply(_loss, axis=1, raw=False)
    self.df_gross_loss["loss"] = losses.fillna(0)