Esempio n. 1
0
    def test_no_ids_present(self):
        """Fitting with an ``X`` indexed only by ``-999`` must raise ``AttributeError``.

        The error message is expected to match "The ids of the time series container".
        """
        augmenter = RelevantFeatureAugmenter(
            kind_to_fc_parameters=self.kind_to_fc_parameters,
            filter_only_tsfresh_features=False,
            column_value="val",
            column_id="id",
            column_sort="sort",
            column_kind="kind",
        )

        df, y = self.create_test_data_sample_with_target()
        augmenter.set_timeseries_container(df)

        # NOTE(review): -999 is presumably not one of the ids in ``df`` -- confirm against the fixture.
        X_with_wrong_ids = pd.DataFrame(index=[-999])

        with self.assertRaisesRegex(AttributeError, r"The ids of the time series container"):
            augmenter.fit(X_with_wrong_ids, y)
    def test_does_impute(self, calculate_relevance_table_mock):
        """Check that the relevance-table mock receives a feature matrix without nulls.

        ``calculate_relevance_table_mock`` is injected from outside this method
        (presumably by a ``mock.patch`` decorator -- not visible here). The test
        asserts the mock is called exactly once during ``fit`` and that its first
        positional argument contains no null values.
        """
        timeseries = pd.DataFrame([[1, 1, 1], [2, 1, 1]], columns=['id', 'time', 'value'])
        X = pd.DataFrame(index=[1])
        y = pd.Series([0, 1])

        # The mocked relevance calculation returns an empty table with the expected columns.
        empty_relevance_table = pd.DataFrame(columns=['feature', 'p_value', 'relevant'])
        calculate_relevance_table_mock.return_value = empty_relevance_table

        # NOTE(review): a lag of 2 on one-sample series presumably yields NaN before imputation -- confirm.
        augmenter = RelevantFeatureAugmenter(
            column_id='id',
            column_sort='time',
            default_fc_parameters={"autocorrelation": [{'lag': 2}]},
        )
        augmenter.set_timeseries_container(timeseries)
        augmenter.fit(X, y)

        assert calculate_relevance_table_mock.call_count == 1

        positional_args, _ = calculate_relevance_table_mock.call_args
        passed_features = positional_args[0]
        assert not passed_features.isnull().any().any()
Esempio n. 3
0
    def test_filter_only_tsfresh_features_true(self):
        """
        With ``filter_only_tsfresh_features=True`` a column that is already present in ``X``
        ("pre_feature") must show up exactly once in the output of both ``transform`` and
        ``fit_transform``.
        """

        def occurrences(frame, name):
            # Number of columns of ``frame`` that are equal to ``name``.
            return sum(name == column for column in frame.columns)

        augmenter = RelevantFeatureAugmenter(kind_to_fc_parameters=self.kind_to_fc_parameters,
                                             filter_only_tsfresh_features=True,
                                             column_value="val", column_id="id",
                                             column_sort="sort", column_kind="kind")

        y = pd.Series({10: 1, 500: 0})
        X = pd.DataFrame(index=[10, 500])
        X["pre_feature"] = 0

        augmenter.set_timeseries_container(self.test_df)

        # Path 1: separate fit and transform (transform works on a copy of X).
        augmenter.fit(X, y)
        transformed_X = augmenter.transform(X.copy())

        # Path 2: combined fit_transform on the same inputs.
        fit_transformed_X = augmenter.fit_transform(X, y)

        self.assertEqual(occurrences(transformed_X, "pre_feature"), 1)
        self.assertEqual(occurrences(fit_transformed_X, "pre_feature"), 1)
Esempio n. 4
0
    def test_filter_only_tsfresh_features_false(self):
        """
        With ``filter_only_tsfresh_features=False`` the columns already present in ``X`` take
        part in the filtering: "pre_keep" (a copy of the target) must appear exactly once and
        "pre_drop" (constant 0) must be absent, for both ``transform`` and ``fit_transform``.
        """

        def occurrences(frame, name):
            # Number of columns of ``frame`` that are equal to ``name``.
            return sum(name == column for column in frame.columns)

        augmenter = RelevantFeatureAugmenter(kind_to_fc_parameters=self.kind_to_fc_parameters,
                                             filter_only_tsfresh_features=False,
                                             column_value="val", column_id="id",
                                             column_sort="sort", column_kind="kind")

        df, y = self.create_test_data_sample_with_target()
        X = pd.DataFrame(index=np.unique(df.id))
        X["pre_drop"] = 0   # constant column
        X["pre_keep"] = y   # identical to the target

        augmenter.set_timeseries_container(df)
        augmenter.fit(X, y)
        transformed_X = augmenter.transform(X.copy())

        fit_transformed_X = augmenter.fit_transform(X, y)

        self.assertEqual(occurrences(transformed_X, "pre_keep"), 1)
        self.assertEqual(occurrences(transformed_X, "pre_drop"), 0)
        self.assertEqual(occurrences(fit_transformed_X, "pre_keep"), 1)
        self.assertEqual(occurrences(fit_transformed_X, "pre_drop"), 0)
    def test_no_timeseries(self):
        """``fit`` must raise ``RuntimeError`` when no time series container was set."""
        augmenter = RelevantFeatureAugmenter()

        X = pd.DataFrame()
        # An explicit dtype avoids the pandas warning about the default dtype
        # of an empty Series; the values are irrelevant for this test.
        y = pd.Series(dtype="float64")

        self.assertRaises(RuntimeError, augmenter.fit, X, y)
Esempio n. 6
0
def test_relevant_augmentor_cross_validated():
    """
    Validates that the RelevantFeatureAugmenter can be cloned in pipelines, see issue 537
    """
    n = 16  # number of samples, needs to be divisible by 4
    quarter = n // 4
    half = n // 2
    index = range(n)

    # Every id contributes two observations (times 10 and 11).
    # The first half of the ids (label 0) gets small values, the second half (label 1) large ones.
    low_values = [0, 1] * quarter + [1, 2] * quarter
    high_values = [10, 11] * quarter + [12, 14] * quarter
    df_ts = pd.DataFrame({
        "time": [10, 11] * n,
        "id": np.repeat(index, 2),
        "value": low_values + high_values,
    })

    y = pd.Series(data=[0] * half + [1] * half, index=index)
    X = pd.DataFrame(index=index)

    augmenter = RelevantFeatureAugmenter(column_id='id',
                                         column_sort='time',
                                         timeseries_container=df_ts,
                                         default_fc_parameters=MinimalFCParameters(),
                                         disable_progressbar=True,
                                         show_warnings=False,
                                         fdr_level=0.90)
    steps = [
        ('augmenter', augmenter),
        ('classifier', RandomForestClassifier(random_state=1)),
    ]
    pipeline = Pipeline(steps)

    # Both folds are expected to be classified perfectly.
    scores = model_selection.cross_val_score(pipeline, X, y, cv=2)
    expected_scores = np.array([1, 1])
    assert (scores == expected_scores).all()
Esempio n. 7
0
    def test_no_timeseries(self):
        """``fit`` and ``fit_transform`` must raise ``RuntimeError`` when no container was set."""
        augmenter = RelevantFeatureAugmenter()

        empty_X = pd.DataFrame()
        empty_y = pd.Series(dtype="float64")

        with self.assertRaises(RuntimeError):
            augmenter.fit(empty_X, empty_y)
        with self.assertRaises(RuntimeError):
            augmenter.fit_transform(empty_X, empty_y)
Esempio n. 8
0
    def test_multiclass_selection(self):
        """``fit_transform`` with ``multiclass=True`` and ``n_significant=3`` must return 4 columns."""
        df, y = self.create_test_data_sample_with_multiclass_target()
        X = pd.DataFrame(index=np.unique(df.id))

        augmenter = RelevantFeatureAugmenter(column_value="val", column_id="id",
                                             column_sort="sort", column_kind="kind",
                                             multiclass=True, n_significant=3)
        augmenter.set_timeseries_container(df)

        fit_transformed_X = augmenter.fit_transform(X, y)
        selected_columns = fit_transformed_X.columns

        self.assertEqual(len(selected_columns), 4)
Esempio n. 9
0
    def test_nothing_relevant(self):
        """After fitting on ``self.test_df``, ``transform`` must add no columns and keep the index of ``X``."""
        ids = [10, 500]
        y = pd.Series({10: 1, 500: 0})
        X = pd.DataFrame(index=ids)

        augmenter = RelevantFeatureAugmenter(
            kind_to_fc_parameters=self.kind_to_fc_parameters,
            column_value="val",
            column_id="id",
            column_sort="sort",
            column_kind="kind",
        )
        augmenter.set_timeseries_container(self.test_df)
        augmenter.fit(X, y)

        # Transform a copy so that ``X`` itself stays available for the index comparison.
        transformed_X = augmenter.transform(X.copy())

        result_columns = list(transformed_X.columns)
        result_index = list(transformed_X.index)
        self.assertEqual(result_columns, [])
        self.assertEqual(result_index, list(X.index))
    def test_does_impute(self, calculate_relevance_table_mock):
        """Check that the relevance-table mock receives a feature matrix without nulls.

        ``calculate_relevance_table_mock`` is supplied from outside this method
        (presumably by a ``mock.patch`` decorator -- not visible here). ``fit`` must
        call it exactly once, and its first positional argument must be free of nulls.
        """
        columns = ['id', 'time', 'value']
        rows = [[1, 1, 1], [2, 1, 1]]
        df = pd.DataFrame(rows, columns=columns)

        X = pd.DataFrame(index=[1])
        y = pd.Series([0, 1])

        # Empty relevance table, so the mocked call returns a frame with the expected columns.
        calculate_relevance_table_mock.return_value = pd.DataFrame(
            columns=['feature', 'p_value', 'relevant'])

        # NOTE(review): a lag of 2 on one-sample series presumably yields NaN before imputation -- confirm.
        fc_parameters = {"autocorrelation": [{'lag': 2}]}
        augmenter = RelevantFeatureAugmenter(
            column_id='id', column_sort='time', default_fc_parameters=fc_parameters)
        augmenter.set_timeseries_container(df)
        augmenter.fit(X, y)

        assert calculate_relevance_table_mock.call_count == 1

        call_positional, _ = calculate_relevance_table_mock.call_args
        feature_matrix = call_positional[0]
        assert not feature_matrix.isnull().any().any()
Esempio n. 11
0
    def test_impute_works(self):
        """Fit and transform with an extra "kurtosis" setting for kind "a".

        The test passes when ``transform`` returns no columns and keeps the index of ``X``.
        """
        # NOTE(review): presumably kurtosis produces NaN on this fixture, exercising imputation -- confirm.
        settings_for_kind_a = self.extraction_settings.kind_to_calculation_settings_mapping["a"]
        settings_for_kind_a.update({"kurtosis": None})

        augmenter = RelevantFeatureAugmenter(
            feature_extraction_settings=self.extraction_settings,
            column_value="val",
            column_id="id",
            column_sort="sort",
            column_kind="kind",
        )

        y = pd.Series({10: 1, 500: 0})
        X = pd.DataFrame(index=[10, 500])

        augmenter.set_timeseries_container(self.test_df)
        augmenter.fit(X, y)

        transformed_X = augmenter.transform(X.copy())

        result_columns = list(transformed_X.columns)
        result_index = list(transformed_X.index)
        self.assertEqual(result_columns, [])
        self.assertEqual(result_index, list(X.index))
    def test_nothing_relevant(self):
        """After fitting on ``self.test_df``, ``transform`` must add no columns and keep the index of ``X``."""
        ids = [1, 5]
        y = pd.Series({1: 1, 5: 0})
        X = pd.DataFrame(index=ids)

        augmenter = RelevantFeatureAugmenter(feature_extraction_settings=self.extraction_settings,
                                             column_value="val", column_id="id",
                                             column_sort="sort", column_kind="kind")
        augmenter.set_timeseries_container(self.test_df)
        augmenter.fit(X, y)

        # Transform a copy so that ``X`` itself stays available for the index comparison.
        transformed_X = augmenter.transform(X.copy())

        result_columns = list(transformed_X.columns)
        result_index = list(transformed_X.index)
        self.assertEqual(result_columns, [])
        self.assertEqual(result_index, list(X.index))
    def test_nothing_relevant(self):
        """With no relevant features, ``transform`` yields no columns but the same index as ``X``."""
        augmenter = RelevantFeatureAugmenter(
            kind_to_fc_parameters=self.kind_to_fc_parameters,
            column_value="val",
            column_id="id",
            column_sort="sort",
            column_kind="kind",
        )

        X = pd.DataFrame(index=[10, 500])
        y = pd.Series({10: 1, 500: 0})

        augmenter.set_timeseries_container(self.test_df)
        augmenter.fit(X, y)

        transformed_X = augmenter.transform(X.copy())

        expected_index = list(X.index)
        self.assertEqual(list(transformed_X.columns), [])
        self.assertEqual(list(transformed_X.index), expected_index)
    def test_evaluate_only_added_features_true(self):
        """
        With ``filter_only_tsfresh_features=True`` a column that already exists in ``X``
        ("pre_feature") must appear exactly once in the output of ``transform``.
        """
        y = pd.Series({10: 1, 500: 0})
        X = pd.DataFrame(index=[10, 500])
        X["pre_feature"] = 0

        augmenter = RelevantFeatureAugmenter(
            kind_to_fc_parameters=self.kind_to_fc_parameters,
            filter_only_tsfresh_features=True,
            column_value="val",
            column_id="id",
            column_sort="sort",
            column_kind="kind",
        )
        augmenter.set_timeseries_container(self.test_df)
        augmenter.fit(X, y)
        transformed_X = augmenter.transform(X.copy())

        pre_feature_hits = sum("pre_feature" == column for column in transformed_X.columns)
        self.assertEqual(pre_feature_hits, 1)
    def test_evaluate_only_added_features_false(self):
        """
        With ``filter_only_tsfresh_features=False`` the columns already present in ``X`` are
        filtered too: "pre_keep" (a copy of the target) must appear exactly once in the output
        of ``transform`` and "pre_drop" (constant 0) must not appear at all.
        """
        df, y = self.create_test_data_sample_with_target()
        X = pd.DataFrame(index=np.unique(df.id))
        X["pre_drop"] = 0   # constant column
        X["pre_keep"] = y   # identical to the target

        augmenter = RelevantFeatureAugmenter(
            kind_to_fc_parameters=self.kind_to_fc_parameters,
            filter_only_tsfresh_features=False,
            column_value="val",
            column_id="id",
            column_sort="sort",
            column_kind="kind",
        )
        augmenter.set_timeseries_container(df)
        augmenter.fit(X, y)
        transformed_X = augmenter.transform(X.copy())

        kept = sum("pre_keep" == column for column in transformed_X.columns)
        dropped = sum("pre_drop" == column for column in transformed_X.columns)
        self.assertEqual(kept, 1)
        self.assertEqual(dropped, 0)
    def test_not_fitted(self):
        """Calling ``transform`` on a freshly constructed augmenter must raise ``RuntimeError``."""
        augmenter = RelevantFeatureAugmenter()
        empty_X = pd.DataFrame()

        with self.assertRaises(RuntimeError):
            augmenter.transform(empty_X)