def test_real_target_binary_features(self):
        # Real-valued target: only the columns derived from the target may be selected.
        n_samples = 1000
        y = pd.Series(np.random.normal(0, 1, n_samples))
        X = pd.DataFrame(index=range(n_samples))

        # Relevant columns: the target shifted by the difference of two Bernoulli draws.
        for column, shift_prob in (("rel1", 0.20), ("rel2", 0.10)):
            shifted = y - np.random.binomial(1, shift_prob, n_samples) + np.random.binomial(1, shift_prob, n_samples)
            shifted[shifted == -1] = 0
            shifted[shifted == 2] = 1
            X[column] = shifted

        # Irrelevant columns: binomial draws with zero trials, so every entry is 0.
        for position, prob in enumerate((0.1, 0.15, 0.05, 0.2, 0.25, 0.01), start=1):
            X["irr{}".format(position)] = np.random.binomial(0, prob, n_samples)

        relevance = calculate_relevance_table(X, y)
        selected = relevance.loc[relevance.relevant].feature

        # Every selected feature has to be one of the target-derived ones ...
        relevant_names = ['rel1', 'rel2']
        for name in selected:
            self.assertIn(name, relevant_names)

        # ... and at least one feature must have been selected at all.
        self.assertGreater(len(selected), 0)
Ejemplo n.º 2
0
    def test_constant_feature_irrelevant(self, y_binary):
        # A feature that holds one repeated value is expected to be reported with
        # type 'constant', a NaN p-value and relevant == False.
        X = pd.DataFrame([1, 1, 1], columns=['feature_binary'])

        relevance_table = calculate_relevance_table(X, y_binary)

        # The table is indexed by feature name, so the first row is addressed by
        # position through .iloc; a plain [0] on a label-indexed Series relies on
        # the deprecated positional fallback of Series.__getitem__.
        assert "feature_binary" == relevance_table.index[0]
        assert 'constant' == relevance_table.type.iloc[0]
        assert np.isnan(relevance_table.p_value.iloc[0])
        assert False == relevance_table.relevant.iloc[0]
    def test_binary_target_binary_features(self):
        # Binomial random variables and binomial target
        n_samples = 5000
        y = pd.Series(np.random.binomial(1, 0.5, n_samples))
        X = pd.DataFrame(index=range(n_samples))

        # Irrelevant features irr0..irr19: Bernoulli draws made independently of y.
        for i in range(10):
            X["irr{}".format(i)] = np.random.binomial(1, 0.1, n_samples)

        for i in range(10, 20):
            X["irr{}".format(i)] = np.random.binomial(1, 0.8, n_samples)

        # Relevant features rel1..rel5: the target plus the difference of two
        # Bernoulli draws (with growing probability), mapped back into {0, 1}.
        for i, noise_prob in enumerate((0.01, 0.05, 0.10, 0.15, 0.20), start=1):
            z = y - np.random.binomial(1, noise_prob, n_samples) + np.random.binomial(1, noise_prob, n_samples)
            z[z == -1] = 0
            z[z == 2] = 1
            X["rel{}".format(i)] = z

        df_bh = calculate_relevance_table(X, y)
        feat_rej = df_bh.loc[df_bh.relevant].feature

        # Make sure all selected variables are relevant
        for kept_feature in feat_rej:
            self.assertIn(kept_feature, ['rel1', 'rel2', 'rel3', 'rel4', 'rel5'])

        self.assertGreater(len(feat_rej), 0)

        # Test type outputs
        for i in range(1, 6):
            row = df_bh.loc["rel{}".format(i)]
            self.assertEqual(row.feature, "rel{}".format(i))
            self.assertEqual(row.type, "binary")

        # The irrelevant features are numbered from 0, so the check starts at 0
        # as well; otherwise irr0 would never be verified.
        for i in range(20):
            row = df_bh.loc["irr{}".format(i)]
            self.assertEqual(row.feature, "irr{}".format(i))
            self.assertEqual(row.type, "binary")

            self.assertEqual(row.relevant, False)
    def test_binary_target_mixed_case(self):
        # Binary target with a mix of binary, real-valued and constant features.
        # The seed is fixed, so the order of the random draws below must not change.
        np.random.seed(42)
        n_samples = 1000
        y = pd.Series(np.random.binomial(1, 0.5, n_samples))
        X = pd.DataFrame(index=range(n_samples))

        noisy_target = y - np.random.binomial(1, 0.1, n_samples) + np.random.binomial(1, 0.1, n_samples)
        noisy_target[noisy_target == -1] = 0
        noisy_target[noisy_target == 2] = 1

        X["rel1"] = noisy_target
        X["rel2"] = y * np.abs(np.random.normal(0, 1, n_samples)) + np.random.normal(0, 1, n_samples)
        X["rel3"] = y + np.random.normal(0, 0.3, n_samples)
        X["rel4"] = y ** 2 + np.random.normal(0, 1, n_samples)
        X["rel5"] = np.sqrt(y) + np.random.binomial(2, 0.1, n_samples)

        X["irr_constant"] = 1.113344

        # irr1..irr9 repeat the pattern normal / poisson / binomial three times.
        for first in (1, 4, 7):
            X["irr{}".format(first)] = np.random.normal(0, 1, n_samples)
            X["irr{}".format(first + 1)] = np.random.poisson(1, n_samples)
            X["irr{}".format(first + 2)] = np.random.binomial(1, 0.3, n_samples)

        relevance = calculate_relevance_table(X, y)
        selected = relevance.loc[relevance.relevant].feature

        # Nothing outside the relevant features may be selected ...
        relevant_names = ['rel1', 'rel2', 'rel3', 'rel4', 'rel5']
        for name in selected:
            self.assertIn(name, relevant_names)

        # ... and the selection must not be empty.
        self.assertGreater(len(selected), 0)

        # Reported feature types: rel1 is the only binary relevant feature.
        for name in relevant_names:
            row = relevance.loc[name]
            self.assertEqual(row.feature, name)
            self.assertEqual(row.type, "binary" if name == "rel1" else "real")

        # Every third irrelevant feature is a binomial draw and hence binary.
        binary_irrelevant = {"irr3", "irr6", "irr9"}
        for i in range(1, 10):
            name = "irr{}".format(i)
            row = relevance.loc[name]
            self.assertEqual(row.feature, name)
            self.assertEqual(row.type, "binary" if name in binary_irrelevant else "real")

            self.assertEqual(row.relevant, False)
Ejemplo n.º 5
0
    def test_target_binary_calls_correct_tests(
            self, significance_test_feature_binary_mock,
            significance_test_feature_real_mock, X, y_binary):
        # Each mocked significance test returns a fixed p-value, which has to show
        # up in the row of the matching feature; both mocks are expected to have
        # been called exactly twice.
        binary_p_value, real_p_value = 0.5, 0.7
        significance_test_feature_binary_mock.return_value = binary_p_value
        significance_test_feature_real_mock.return_value = real_p_value

        table = calculate_relevance_table(X, y_binary, n_jobs=0)

        assert table.loc['feature_binary'].p_value == binary_p_value
        assert table.loc['feature_real'].p_value == real_p_value
        assert significance_test_feature_binary_mock.call_count == 2
        assert significance_test_feature_real_mock.call_count == 2
    def test_real_target_mixed_case(self):
        # Real-valued target with a mix of binary and real-valued features.
        n_samples = 5000
        y = pd.Series(np.random.normal(0, 1, n_samples))
        X = pd.DataFrame(index=range(n_samples))

        # rel1 is 1 where the target is positive and 0 everywhere else.
        positive_indicator = y.copy()
        positive_indicator[positive_indicator <= 0] = 0
        positive_indicator[positive_indicator > 0] = 1

        X["rel1"] = positive_indicator
        X["rel2"] = y
        X["rel3"] = y ** 2
        X["rel4"] = np.sqrt(abs(y))

        # irr1..irr9 repeat the pattern normal / poisson / binomial three times,
        # each round with its own binomial success probability.
        for first, binomial_prob in ((1, 0.1), (4, 0.05), (7, 0.2)):
            X["irr{}".format(first)] = np.random.normal(0, 1, n_samples)
            X["irr{}".format(first + 1)] = np.random.poisson(1, n_samples)
            X["irr{}".format(first + 2)] = np.random.binomial(1, binomial_prob, n_samples)

        relevance = calculate_relevance_table(X, y)
        selected = relevance.loc[relevance.relevant].feature

        # Nothing outside the relevant features may be selected ...
        relevant_names = ['rel1', 'rel2', 'rel3', 'rel4']
        for name in selected:
            self.assertIn(name, relevant_names)

        # ... and the selection must not be empty.
        self.assertGreater(len(selected), 0)

        # Reported feature types: rel1 is the only binary relevant feature.
        for name in relevant_names:
            row = relevance.loc[name]
            self.assertEqual(row.feature, name)
            self.assertEqual(row.type, "binary" if name == "rel1" else "real")

        # Every third irrelevant feature is a binomial draw and hence binary.
        binary_irrelevant = {"irr3", "irr6", "irr9"}
        for i in range(1, 10):
            name = "irr{}".format(i)
            row = relevance.loc[name]
            self.assertEqual(row.feature, name)
            self.assertEqual(row.type, "binary" if name in binary_irrelevant else "real")

            self.assertEqual(row.relevant, False)
Ejemplo n.º 7
0
    def test_target_real_calls_correct_tests(
            self, significance_test_feature_binary_mock,
            significance_test_feature_real_mock, X, y_real):
        # Each mocked significance test returns a fixed p-value, which has to show
        # up in the row of the matching feature; each mock must have been called
        # exactly once, with its feature column and the real-valued target.
        binary_p_value, real_p_value = 0.5, 0.7
        significance_test_feature_binary_mock.return_value = binary_p_value
        significance_test_feature_real_mock.return_value = real_p_value

        table = calculate_relevance_table(X, y_real, n_jobs=0)

        assert table.loc['feature_binary'].p_value == binary_p_value
        assert table.loc['feature_real'].p_value == real_p_value

        binary_column = X['feature_binary']
        significance_test_feature_binary_mock.assert_called_once_with(binary_column, y=y_real)
        real_column = X['feature_real']
        significance_test_feature_real_mock.assert_called_once_with(real_column, y=y_real)
Ejemplo n.º 8
0
    def test_warning_for_no_relevant_feature(
            self, significance_test_feature_binary_mock,
            significance_test_feature_real_mock, X, y_real):
        # Both mocked tests report a p-value of 0.95; the test expects the
        # "no relevant feature" warning to be logged in that situation.
        for test_mock in (significance_test_feature_binary_mock,
                          significance_test_feature_real_mock):
            test_mock.return_value = 0.95

        expected_message = (
            'No feature was found relevant for regression for fdr level = 0.05. '
            'Consider using a lower fdr level or other features.')

        with mock.patch('logging.Logger.warning') as warning_mock:
            calculate_relevance_table(X, y_real, n_jobs=0, ml_task="regression")
            warning_mock.assert_called_with(expected_message)
    def test_all_features_bad(self):
        # Real-valued target and only all-zero features: nothing may be selected.
        n_samples = 1000
        y = pd.Series(np.random.normal(0, 1, n_samples))
        X = pd.DataFrame(index=range(n_samples))

        # Binomial draws with zero trials, so every entry of every column is 0.
        for position, prob in enumerate((0.1, 0.15, 0.05, 0.2, 0.25, 0.01), start=1):
            X["irr{}".format(position)] = np.random.binomial(0, prob, n_samples)

        relevance = calculate_relevance_table(X, y)
        selected = relevance.loc[relevance.relevant].feature

        self.assertEqual(len(selected), 0)
Ejemplo n.º 10
0
    def fit(self, X, y):
        """
        Determine which of the features in X are relevant with respect to the target y.

        The relevance of every column of X is computed by ``calculate_relevance_table``, using
        the settings stored on this estimator (``ml_task``, ``n_jobs``, ``chunksize``,
        ``fdr_level``, ``hypotheses_independent`` and ``test_for_binary_target_real_feature``).
        All columns in the input data sample are treated as features. The index of all
        rows in X must be present in y.

        The outcome is stored on the estimator: ``relevant_features`` (names of the features
        flagged as relevant), ``p_values`` (one p-value per row of the relevance table),
        ``feature_importances_`` (``1.0 - p_values``) and ``features`` (the index of the
        relevance table as a list).

        :param X: data sample with the features, which will be classified as relevant or not
        :type X: pandas.DataFrame or numpy.array

        :param y: target vector to be used to classify the features
        :type y: pandas.Series or numpy.array

        :return: the fitted estimator with the information about which features are relevant
        :rtype: FeatureSelector
        """
        # Anything that is not already a pandas object is copied and wrapped.
        features = X if isinstance(X, pd.DataFrame) else pd.DataFrame(X.copy())
        target = y if isinstance(y, pd.Series) else pd.Series(y.copy())

        table = calculate_relevance_table(
            features,
            target,
            ml_task=self.ml_task,
            n_jobs=self.n_jobs,
            chunksize=self.chunksize,
            fdr_level=self.fdr_level,
            hypotheses_independent=self.hypotheses_independent,
            test_for_binary_target_real_feature=self.test_for_binary_target_real_feature,
        )

        relevant_rows = table.loc[table.relevant]
        self.relevant_features = relevant_rows.feature.tolist()
        self.feature_importances_ = 1.0 - table.p_value.values
        self.p_values = table.p_value.values
        self.features = table.index.tolist()

        return self
    def test_binomial_target_realvalued_features(self):
        # Real valued random variables and binomial target
        n_samples = 5000
        y = pd.Series(np.random.binomial(1, 0.5, n_samples))
        X = pd.DataFrame(index=range(n_samples))

        # Irrelevant features irr0..irr29: normal draws made independently of y,
        # in three groups of ten with growing standard deviation.
        for i in range(10):
            X["irr{}".format(i)] = np.random.normal(1, 0.3, n_samples)

        for i in range(10, 20):
            X["irr{}".format(i)] = np.random.normal(1, 0.5, n_samples)

        for i in range(20, 30):
            X["irr{}".format(i)] = np.random.normal(1, 0.8, n_samples)

        # Relevant features: different noisy functions of the target.
        X["rel1"] = y * np.random.normal(0, 1, n_samples) + np.random.normal(0, 1, n_samples)
        X["rel2"] = y + np.random.normal(0, 1, n_samples)
        X["rel3"] = y ** 2 + np.random.normal(0, 1, n_samples)
        X["rel4"] = np.sqrt(y) + np.random.binomial(2, 0.1, n_samples)

        df_bh = calculate_relevance_table(X, y)
        feat_rej = df_bh.loc[df_bh.relevant].feature

        # Make sure all selected variables are relevant
        for kept_feature in feat_rej:
            self.assertIn(kept_feature, ['rel1', 'rel2', 'rel3', 'rel4'])

        self.assertGreater(len(feat_rej), 0)

        # Test type outputs
        for i in range(1, 5):
            row = df_bh.loc["rel{}".format(i)]
            self.assertEqual(row.feature, "rel{}".format(i))
            self.assertEqual(row.type, "real")

        # The irrelevant features are numbered from 0, so the check starts at 0
        # as well; otherwise irr0 would never be verified.
        for i in range(30):
            row = df_bh.loc["irr{}".format(i)]
            self.assertEqual(row.feature, "irr{}".format(i))
            self.assertEqual(row.type, "real")

            self.assertEqual(row.relevant, False)
Ejemplo n.º 12
0
 def test_restrict_ml_task_options(self, X, y_binary):
     # 'some_other_task' is not an accepted ml_task value, so a ValueError is expected.
     unsupported_task = 'some_other_task'
     with pytest.raises(ValueError):
         calculate_relevance_table(X, y_binary, ml_task=unsupported_task)
Ejemplo n.º 13
0
def select_features(X, y, test_for_binary_target_binary_feature=defaults.TEST_FOR_BINARY_TARGET_BINARY_FEATURE,
                    test_for_binary_target_real_feature=defaults.TEST_FOR_BINARY_TARGET_REAL_FEATURE,
                    test_for_real_target_binary_feature=defaults.TEST_FOR_REAL_TARGET_BINARY_FEATURE,
                    test_for_real_target_real_feature=defaults.TEST_FOR_REAL_TARGET_REAL_FEATURE,
                    fdr_level=defaults.FDR_LEVEL, hypotheses_independent=defaults.HYPOTHESES_INDEPENDENT,
                    n_jobs=defaults.N_PROCESSES, chunksize=defaults.CHUNKSIZE,
                    ml_task='auto'):
    """
    Check the significance of all features (columns) of feature matrix X and return a possibly reduced feature matrix
    only containing relevant features.

    The feature matrix must be a pandas.DataFrame in the format:

        +-------+-----------+-----------+-----+-----------+
        | index | feature_1 | feature_2 | ... | feature_N |
        +=======+===========+===========+=====+===========+
        | A     | ...       | ...       | ... | ...       |
        +-------+-----------+-----------+-----+-----------+
        | B     | ...       | ...       | ... | ...       |
        +-------+-----------+-----------+-----+-----------+
        | ...   | ...       | ...       | ... | ...       |
        +-------+-----------+-----------+-----+-----------+
        | ...   | ...       | ...       | ... | ...       |
        +-------+-----------+-----------+-----+-----------+
        | ...   | ...       | ...       | ... | ...       |
        +-------+-----------+-----------+-----+-----------+


    Each column will be handled as a feature and tested for its significance to the target.

    The target vector must be a pandas.Series or numpy.array in the form

        +-------+--------+
        | index | target |
        +=======+========+
        | A     | ...    |
        +-------+--------+
        | B     | ...    |
        +-------+--------+
        | .     | ...    |
        +-------+--------+
        | .     | ...    |
        +-------+--------+

    and must contain all id's that are in the feature matrix. If y is a numpy.array without index, it must have
    the same length as X; its entries are matched to the rows of X by position.

    Examples
    ========

    >>> from pai_tsfresh.examples import load_robot_execution_failures
    >>> from pai_tsfresh import extract_features, select_features
    >>> df, y = load_robot_execution_failures()
    >>> X_extracted = extract_features(df, column_id='id', column_sort='time')
    >>> X_selected = select_features(X_extracted, y)

    :param X: Feature matrix in the format mentioned before which will be reduced to only the relevant features.
              It can contain both binary or real-valued features at the same time.
    :type X: pandas.DataFrame

    :param y: Target vector which is needed to test which features are relevant. Can be binary or real-valued.
    :type y: pandas.Series or numpy.ndarray

    :param test_for_binary_target_binary_feature: Which test to be used for binary target, binary feature (currently unused)
    :type test_for_binary_target_binary_feature: str

    :param test_for_binary_target_real_feature: Which test to be used for binary target, real feature
    :type test_for_binary_target_real_feature: str

    :param test_for_real_target_binary_feature: Which test to be used for real target, binary feature (currently unused)
    :type test_for_real_target_binary_feature: str

    :param test_for_real_target_real_feature: Which test to be used for real target, real feature (currently unused)
    :type test_for_real_target_real_feature: str

    :param fdr_level: The FDR level that should be respected, this is the theoretical expected percentage of irrelevant
                      features among all created features.
    :type fdr_level: float

    :param hypotheses_independent: Can the significance of the features be assumed to be independent?
                                   Normally, this should be set to False as the features are never
                                   independent (e.g. mean and median)
    :type hypotheses_independent: bool

    :param n_jobs: Number of processes to use during the p-value calculation
    :type n_jobs: int

    :param chunksize: The size of one chunk that is submitted to the worker
        process for the parallelisation.  Where one chunk is defined as a
        singular time series for one id and one kind. If you set the chunksize
        to 10, then it means that one task is to calculate all features for 10
        time series.  If it is set it to None, depending on distributor,
        heuristics are used to find the optimal chunksize. If you get out of
        memory exceptions, you can try it with the dask distributor and a
        smaller chunksize.
    :type chunksize: None or int

    :param ml_task: The intended machine learning task. Either `'classification'`, `'regression'` or `'auto'`.
                    Defaults to `'auto'`, meaning the intended task is inferred from `y`.
                    If `y` has a boolean, integer or object dtype, the task is assumed to be classification,
                    else regression.
    :type ml_task: str

    :return: The same DataFrame as X, but possibly with reduced number of columns ( = features).
    :rtype: pandas.DataFrame

    :raises: ``TypeError`` when y is neither a pandas.Series nor a numpy.ndarray.
             ``ValueError`` when the target vector does not fit to the feature matrix
             or `ml_task` is not one of `'auto'`, `'classification'` or `'regression'`.
    """
    check_for_nans_in_columns(X)

    if not isinstance(y, (pd.Series, np.ndarray)):
        raise TypeError("The type of target vector y must be one of: pandas.Series, numpy.ndarray")

    if len(X) < 2:
        raise ValueError("X must contain at least two samples.")
    if len(set(y)) == 1:
        raise ValueError("y contains only one kind of label, no feature selection possible.")

    if isinstance(y, pd.Series):
        # A Series carries its own index, so it only has to cover every row of X.
        if not X.index.isin(y.index).all():
            raise ValueError("Index of X must be a subset of y's index")
    elif isinstance(y, np.ndarray):
        # An array has no index and is aligned to X by position, which only works
        # when both have the same length. A longer y used to slip through this
        # check and fail inside the pandas.Series constructor with a generic
        # length-mismatch error.
        if len(y) < len(X):
            raise ValueError("Target vector y is shorter than feature matrix X")
        if len(y) > len(X):
            raise ValueError("Target vector y is longer than feature matrix X")

        y = pd.Series(y, index=X.index)

    relevance_table = calculate_relevance_table(
        X, y, ml_task=ml_task, n_jobs=n_jobs, chunksize=chunksize,
        test_for_binary_target_real_feature=test_for_binary_target_real_feature,
        fdr_level=fdr_level, hypotheses_independent=hypotheses_independent,
    )

    # Keep only the columns whose row in the relevance table is flagged relevant.
    relevant_features = relevance_table[relevance_table.relevant].feature

    return X.loc[:, relevant_features]