Example #1
    def test_multiclass_n_significant_error(self, X, y_binary):
        with pytest.raises(AssertionError):
            calculate_relevance_table(X,
                                      y_binary,
                                      multiclass=True,
                                      n_significant=3,
                                      ml_task="classification")
Example #2
    def test_real_target_binary_features(self):
        # Mixed case with real target
        y = pd.Series(np.random.normal(0, 1, 1000))
        X = pd.DataFrame(index=range(1000))

        z = y - np.random.binomial(1, 0.20, 1000) + np.random.binomial(1, 0.20, 1000)
        z[z == -1] = 0
        z[z == 2] = 1
        X["rel1"] = z

        z = y - np.random.binomial(1, 0.10, 1000) + np.random.binomial(1, 0.10, 1000)
        z[z == -1] = 0
        z[z == 2] = 1
        X["rel2"] = z

        X["irr1"] = np.random.binomial(0, 0.1, 1000)
        X["irr2"] = np.random.binomial(0, 0.15, 1000)
        X["irr3"] = np.random.binomial(0, 0.05, 1000)
        X["irr4"] = np.random.binomial(0, 0.2, 1000)
        X["irr5"] = np.random.binomial(0, 0.25, 1000)
        X["irr6"] = np.random.binomial(0, 0.01, 1000)

        df_bh = calculate_relevance_table(X, y)
        feat_rej = df_bh.loc[df_bh.relevant].feature

        # Make sure all selected variables are relevant
        for kept_feature in feat_rej:
            self.assertIn(kept_feature, ['rel1', 'rel2'])

        self.assertGreater(len(feat_rej), 0)
Example #3
    def test_target_real_calls_correct_tests(
        self,
        significance_test_feature_binary_mock,
        significance_test_feature_real_mock,
        X,
        y_real,
    ):
        significance_test_feature_binary_mock.return_value = 0.5
        significance_test_feature_real_mock.return_value = 0.7

        relevance_table = calculate_relevance_table(X, y_real, n_jobs=0)

        assert 0.5 == relevance_table.loc["feature_binary"].p_value
        assert 0.7 == relevance_table.loc["feature_real"].p_value

        assert significance_test_feature_binary_mock.call_count == 1
        pd.testing.assert_series_equal(
            significance_test_feature_binary_mock.call_args[0][0],
            X["feature_binary"])
        pd.testing.assert_series_equal(
            significance_test_feature_binary_mock.call_args[1]["y"], y_real)

        assert significance_test_feature_real_mock.call_count == 1
        pd.testing.assert_series_equal(
            significance_test_feature_real_mock.call_args[0][0],
            X["feature_real"])
        pd.testing.assert_series_equal(
            significance_test_feature_real_mock.call_args[1]["y"], y_real)
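The two mocks above are presumably injected by stacked mock.patch decorators, which are not shown. A sketch of the plausible wiring, assuming the significance tests are patched where tsfresh's relevance module looks them up (the exact patch targets are an assumption; the argument order follows mock's rule that the bottom-most decorator supplies the first mock parameter):

from unittest import mock

@mock.patch("tsfresh.feature_selection.relevance.target_real_feature_real_test")
@mock.patch("tsfresh.feature_selection.relevance.target_real_feature_binary_test")
def test_target_real_calls_correct_tests(self, significance_test_feature_binary_mock,
                                         significance_test_feature_real_mock, X, y_real):
    ...  # body as above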
Example #4
    def test_multiclass_correct_features_relevant(self, y_multi):
        X_multi = pd.DataFrame()
        X_multi["relevant_0"] = np.concatenate(
            [np.zeros(298), np.array([0.01, -0.01])])
        X_multi["relevant_3"] = X_multi["relevant_0"].copy()
        X_multi["relevant_3"][y_multi == 0] = np.random.uniform(2, 3, 100)
        X_multi["relevant_2"] = X_multi["relevant_3"].copy()
        X_multi["relevant_2"][y_multi == 1] = np.random.uniform(-2, -1, 100)

        relevance_table = calculate_relevance_table(X_multi,
                                                    y_multi,
                                                    multiclass=True,
                                                    ml_task="classification",
                                                    n_significant=3)
        assert relevance_table.loc["relevant_3", "relevant"]
        assert not relevance_table.loc["relevant_2", "relevant"]
        assert not relevance_table.loc["relevant_0", "relevant"]

        # under a one vs. rest scheme the distributions of all 3 classes are separated
        # well enough for this feature to be significant for every class
        assert relevance_table.loc["relevant_3", "n_significant"] == 3

        # the values of this feature for y_multi == 0 lie inside the range of its values
        # for y_multi != 0, so it fails the Mann-Whitney U test for that class under a
        # one vs. rest scheme
        assert relevance_table.loc["relevant_2", "n_significant"] == 2
        assert relevance_table.loc["relevant_0", "n_significant"] == 0
Example #5
    def fit(self, X, y):
        """
        Extract the information of which features are relevant for the given target.

        For more information, please see the :func:`~tsfresh.feature_selection.feature_selector.check_fs_sig_bh`
        function. All columns in the input data sample are treated as features. The index of all
        rows in X must be present in y.

        :param X: data sample with the features, which will be classified as relevant or not
        :type X: pandas.DataFrame or numpy.array

        :param y: target vector to be used, to classify the features
        :type y: pandas.Series or numpy.array

        :return: the fitted estimator with the information, which features are relevant
        :rtype: FeatureSelector
        """
        if not isinstance(X, pd.DataFrame):
            X = pd.DataFrame(X.copy())

        if not isinstance(y, pd.Series):
            y = pd.Series(y.copy())

        relevance_table = calculate_relevance_table(
                                X, y, ml_task=self.ml_task, n_jobs=self.n_jobs,
                                chunksize=self.chunksize, fdr_level=self.fdr_level,
                                hypotheses_independent=self.hypotheses_independent,
                                test_for_binary_target_real_feature=self.test_for_binary_target_real_feature)
        self.relevant_features = relevance_table.loc[relevance_table.relevant].feature.tolist()
        self.feature_importances_ = 1.0 - relevance_table.p_value.values
        self.p_values = relevance_table.p_value.values
        self.features = relevance_table.index.tolist()

        return self
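Assuming this fit belongs to tsfresh's FeatureSelector transformer (which exposes exactly these attributes), a short usage sketch; X_train and y_train are placeholders:

from tsfresh.transformers import FeatureSelector

selector = FeatureSelector(fdr_level=0.05)     # ml_task defaults to 'auto'
selector.fit(X_train, y_train)                 # builds the relevance table internally
X_train_relevant = X_train[selector.relevant_features]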
Example #6
    def test_all_features_good(self):
        # Mixed case with real target
        y = pd.Series(np.random.normal(0, 1, 1000))
        X = pd.DataFrame(index=range(1000))

        z = y - np.random.binomial(1, 0.20, 1000) + np.random.binomial(
            1, 0.20, 1000)
        z[z == -1] = 0
        z[z == 2] = 1
        X["rel1"] = z

        z = y - np.random.binomial(1, 0.10, 1000) + np.random.binomial(
            1, 0.10, 1000)
        z[z == -1] = 0
        z[z == 2] = 1
        X["rel2"] = z

        df_bh = calculate_relevance_table(X, y)
        feat_rej = df_bh.loc[df_bh.relevant].feature

        # Make sure all selected variables are relevant
        for kept_feature in feat_rej:
            self.assertIn(kept_feature, ['rel1', 'rel2'])

        self.assertGreater(len(feat_rej), 0)
Example #7
def calculate_feature_relevance(app_dir, valid_features_list, df_labels):
    excel_writer1 = pandas.ExcelWriter(os.path.join(app_dir, 'feature_p_value.xlsx'))
    excel_writer2 = pandas.ExcelWriter(os.path.join(app_dir, 'feature_kendall.xlsx'))
    sheet_count = 0
    for df_valid_features in valid_features_list:  # one sheet of features
        # calculate p-value
        print(sheet_count)
        df_p_value = calculate_relevance_table(df_valid_features, df_labels['label'], ml_task='classification')
        df_p_value.to_excel(excel_writer1, '%d' % sheet_count)

        # arrange data format
        df_valid_features.reset_index(inplace=True)
        # print(df_valid_features.columns)
        filtered_feature = {'id', }
        for _, row in df_p_value.iterrows():
            if row['relevant']:
                filtered_feature.add(row['feature'])
        filtered_by_p = df_valid_features[list(filtered_feature)]
        # print(filtered_by_p[list(filtered_feature)[-1]])
        # print(filtered_by_p[list(filtered_feature)[-2]])
        # print(filtered_by_p['id'])
        # print(df_labels['id'])
        feature_with_label = pandas.merge(filtered_by_p, df_labels, on='id').drop(columns='id')
        # print(feature_with_label)
        # sys.exit(0)

        # calculate kendall correlation coefficient
        feature_kendall = feature_with_label.corr('kendall')['label'].drop('label').to_frame()
        feature_kendall.rename(columns={'label': 'kendall_correlation_coefficient'}, inplace=True)
        df_kendall = pandas.DataFrame(feature_kendall)
        df_kendall.to_excel(excel_writer2, '%d' % sheet_count)

        sheet_count += 1
    excel_writer1.close()  # ExcelWriter.close() writes the file; save() was removed in pandas 2.0
    excel_writer2.close()
Example #8
    def test_constant_feature_irrelevant(self, y_binary):
        X = pd.DataFrame([1, 1, 1], columns=['feature_binary'])

        relevance_table = calculate_relevance_table(X, y_binary)
        assert "feature_binary" == relevance_table.index[0]
        assert 'constant' == relevance_table.type[0]
        assert np.isnan(relevance_table.p_value[0])
        assert not relevance_table.relevant[0]
Example #10
    def test_binary_target_mixed_case(self):
        # Mixed case with binomial target
        np.random.seed(42)
        y = pd.Series(np.random.binomial(1, 0.5, 1000))
        X = pd.DataFrame(index=range(1000))

        z = y - np.random.binomial(1, 0.1, 1000) + np.random.binomial(
            1, 0.1, 1000)
        z[z == -1] = 0
        z[z == 2] = 1

        X["rel1"] = z
        X["rel2"] = y * np.abs(np.random.normal(
            0, 1, 1000)) + np.random.normal(0, 1, 1000)
        X["rel3"] = y + np.random.normal(0, 0.3, 1000)
        X["rel4"] = y**2 + np.random.normal(0, 1, 1000)
        X["rel5"] = np.sqrt(y) + np.random.binomial(2, 0.1, 1000)

        X["irr_constant"] = 1.113344

        X["irr1"] = np.random.normal(0, 1, 1000)
        X["irr2"] = np.random.poisson(1, 1000)
        X["irr3"] = np.random.binomial(1, 0.3, 1000)
        X["irr4"] = np.random.normal(0, 1, 1000)
        X["irr5"] = np.random.poisson(1, 1000)
        X["irr6"] = np.random.binomial(1, 0.3, 1000)
        X["irr7"] = np.random.normal(0, 1, 1000)
        X["irr8"] = np.random.poisson(1, 1000)
        X["irr9"] = np.random.binomial(1, 0.3, 1000)

        df_bh = calculate_relevance_table(X, y)
        feat_rej = df_bh.loc[df_bh.relevant].feature

        # Make sure all selected variables are relevant
        for kept_feature in feat_rej:
            self.assertIn(kept_feature,
                          ['rel1', 'rel2', 'rel3', 'rel4', 'rel5'])

        self.assertGreater(len(feat_rej), 0)

        # Test type outputs
        for i in range(1, 6):
            row = df_bh.loc["rel{}".format(i)]
            self.assertEqual(row.feature, "rel{}".format(i))
            if i == 1:
                self.assertEqual(row.type, "binary")
            else:
                self.assertEqual(row.type, "real")

        for i in range(1, 10):
            row = df_bh.loc["irr{}".format(i)]
            self.assertEqual(row.feature, "irr{}".format(i))
            if i not in [3, 6, 9]:
                self.assertEqual(row.type, "real")
            else:
                self.assertEqual(row.type, "binary")

            self.assertEqual(row.relevant, False)
Example #11
    def test_binary_target_binary_features(self):
        # Binomial random variables and binomial target
        y = pd.Series(np.random.binomial(1, 0.5, 5000))
        X = pd.DataFrame(index=range(5000))

        for i in range(10):
            X["irr{}".format(i)] = np.random.binomial(1, 0.1, 5000)

        for i in range(10, 20):
            X["irr{}".format(i)] = np.random.binomial(1, 0.8, 5000)

        z = y - np.random.binomial(1, 0.01, 5000) + np.random.binomial(1, 0.01, 5000)
        z[z == -1] = 0
        z[z == 2] = 1
        X["rel1"] = z

        z = y - np.random.binomial(1, 0.05, 5000) + np.random.binomial(1, 0.05, 5000)
        z[z == -1] = 0
        z[z == 2] = 1
        X["rel2"] = z

        z = y - np.random.binomial(1, 0.10, 5000) + np.random.binomial(1, 0.10, 5000)
        z[z == -1] = 0
        z[z == 2] = 1
        X["rel3"] = z

        z = y - np.random.binomial(1, 0.15, 5000) + np.random.binomial(1, 0.15, 5000)
        z[z == -1] = 0
        z[z == 2] = 1
        X["rel4"] = z

        z = y - np.random.binomial(1, 0.20, 5000) + np.random.binomial(1, 0.20, 5000)
        z[z == -1] = 0
        z[z == 2] = 1
        X["rel5"] = z

        df_bh = calculate_relevance_table(X, y)
        feat_rej = df_bh.loc[df_bh.relevant].feature

        # Make sure all selected variables are relevant
        for kept_feature in feat_rej:
            self.assertIn(kept_feature, ['rel1', 'rel2', 'rel3', 'rel4', 'rel5'])

        self.assertGreater(len(feat_rej), 0)

        # Test type outputs
        for i in range(1, 6):
            row = df_bh.loc["rel{}".format(i)]
            self.assertEqual(row.feature, "rel{}".format(i))
            self.assertEqual(row.type, "binary")

        for i in range(1, 20):
            row = df_bh.loc["irr{}".format(i)]
            self.assertEqual(row.feature, "irr{}".format(i))
            self.assertEqual(row.type, "binary")

            self.assertEqual(row.relevant, False)
Example #12
    def test_warning_for_no_relevant_feature(self, significance_test_feature_binary_mock,
                                             significance_test_feature_real_mock, X, y_real):
        significance_test_feature_binary_mock.return_value = 0.95
        significance_test_feature_real_mock.return_value = 0.95

        with mock.patch('logging.Logger.warning') as m:
            relevance_table = calculate_relevance_table(X, y_real, n_jobs=0, ml_task="regression")
            m.assert_called_with('No feature was found relevant for regression for fdr level = 0.05. '
                                 'Consider using a lower fdr level or other features.')
Example #13
    def select_relevant_features(self, X, y):
        '''Select statistically significant features while computing the relevance of these features.'''
        # calculate relevance tables for each binary class pair
        relevance_tables = list()
        for label in np.unique(y):
            y_binary = (y == label)
            relevance_tables.append(
                (label,
                 calculate_relevance_table(X,
                                           y_binary,
                                           fdr_level=self.fdr_level,
                                           n_jobs=self.n_jobs)))

        # concatenate relevance tables
        relevance_table_concat = pd.concat(
            [table for (_label, table) in relevance_tables])

        # perform the Benjamini-Hochberg procedure on the pooled table
        relevance_table_benjamini = benjamini_hochberg_test(
            relevance_table_concat,
            hypotheses_independent=False,
            fdr_level=self.fdr_level)

        # remove irrelevant features from the table
        relevance_table_benjamini = relevance_table_benjamini[
            relevance_table_benjamini.relevant]

        # keep only the features that are relevant for every class, i.e. that
        # occur once per class in the pooled table
        feature_occurrences = relevance_table_benjamini.feature.value_counts()
        relevant_features = feature_occurrences[feature_occurrences == len(
            y.unique())].index.values
        occurrence_counts = feature_occurrences.value_counts()
        for i in range(1, 4):
            try:
                logging.info(
                    'Number of features occurring {} time(s) among the relevant features selected by the Benjamini-Hochberg test: {}'
                    .format(i, occurrence_counts[i]))
            except (KeyError, IndexError):
                # no feature occurs the corresponding number of times
                pass
        # build final relevance table
        relevance_table_final = pd.DataFrame({
            'feature':
            relevant_features,
            'p_value': [
                relevance_table_benjamini.loc[f].p_value.max()
                for f in relevant_features
            ],
            'occurrence': [feature_occurrences[f] for f in relevant_features]
        }).sort_values(by=['p_value', 'occurrence']).reset_index(drop=True)
        logging.info(
            "Number of relevant features for all classes: {}/{}".format(
                relevance_table_final.shape[0], X.shape[1]))

        return relevance_table_final
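The benjamini_hochberg_test helper used above is the correction step tsfresh itself applies inside calculate_relevance_table; a plausible import for reusing it on the pooled table (path assumed from the tsfresh source layout):

from tsfresh.feature_selection.relevance import (benjamini_hochberg_test,
                                                 calculate_relevance_table)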
Example #14
    def test_warning_for_no_relevant_feature(self, significance_test_feature_binary_mock,
                                             significance_test_feature_real_mock, X, y_real):
        significance_test_feature_binary_mock.return_value = 0.95
        significance_test_feature_real_mock.return_value = 0.95

        with mock.patch('logging.Logger.warning') as m:
            _ = calculate_relevance_table(X, y_real, n_jobs=0, ml_task="regression")
            m.assert_called_with("No feature was found relevant for regression for fdr level = 0.05 (which corresponds " 
                                 "to the maximal percentage of irrelevant features, consider using an higher fdr level "
                                 "or add other features.")
Example #15
    def test_target_binary_calls_correct_tests(self, significance_test_feature_binary_mock,
                                               significance_test_feature_real_mock, X, y_binary):
        significance_test_feature_binary_mock.return_value = 0.5
        significance_test_feature_real_mock.return_value = 0.7
        relevance_table = calculate_relevance_table(X, y_binary, n_jobs=0)

        assert 0.5 == relevance_table.loc['feature_binary'].p_value
        assert 0.7 == relevance_table.loc['feature_real'].p_value
        assert 2 == significance_test_feature_binary_mock.call_count
        assert 2 == significance_test_feature_real_mock.call_count
Example #18
    def test_target_real_calls_correct_tests(self, significance_test_feature_binary_mock,
                                             significance_test_feature_real_mock, X, y_real):
        significance_test_feature_binary_mock.return_value = 0.5
        significance_test_feature_real_mock.return_value = 0.7

        relevance_table = calculate_relevance_table(X, y_real, n_jobs=0)

        assert 0.5 == relevance_table.loc['feature_binary'].p_value
        assert 0.7 == relevance_table.loc['feature_real'].p_value
        significance_test_feature_binary_mock.assert_called_once_with(X['feature_binary'], y=y_real)
        significance_test_feature_real_mock.assert_called_once_with(X['feature_real'], y=y_real)
Example #20
def perform_fresh_pca_after(X_train, y_train, X_test, y_test):
  log('Processing fresh_pca_after')
  fresh_train_X, fresh_train_y = raw_to_tsfresh(X_train, y_train)
  fresh_test_X, fresh_test_y = raw_to_tsfresh(X_test, y_test)

  # Run the feature extraction and relevance tests ONLY on the train
  # data set.
  extracted_train = extract_features(fresh_train_X, column_id='id', column_value='value')

  # For some reason, tsfresh extracts features that contain NaN,
  # infinity or None. This breaks the PCA step, so we drop columns that
  # contain these values. I know of nothing else to do here.
  extracted_train = extracted_train.dropna(axis='columns')

  filtered_train = None
  # execute at different fdr levels to try to make FRESH more robust
  for fdr in [0.05, 0.01, 0.005, 0.001]:
      R = calculate_relevance_table(extracted_train, y_train.squeeze(), fdr_level=fdr)
      filtered_train = filter_features(extracted_train, R)
      if (filtered_train.shape[1] > 0):
          break

  # Perform PCA on the filtered set of features
  pca_train = PCAForPandas(n_components=0.95, svd_solver='full')
  filtered_train = pca_train.fit_transform(filtered_train)

  # Extract features from the test set, but then apply the same relevant
  # features that we used from the train set
  extracted_test = extract_features(fresh_test_X, column_id='id', column_value='value')
  extracted_test = extracted_test.dropna(axis='columns')

  filtered_test = filter_features(extracted_test, R)

  filtered_test = pca_train.transform(filtered_test)

  # Train classifiers on the train set
  clf = build_rfc()
  trained_model = clf.fit(filtered_train, y_train.squeeze())
  rfc_predicted = list(map(lambda v: int(v), clf.predict(filtered_test)))

  actual = y_test.squeeze().tolist()

  # Create and fit an AdaBoosted decision tree
  bdt = build_ada()
  trained_model = bdt.fit(filtered_train, y_train.squeeze())
  ada_predicted = list(map(lambda v: int(v), bdt.predict(filtered_test)))

  return {
    'rfc':  accuracy_rate(rfc_predicted, actual),
    'ada': accuracy_rate(ada_predicted, actual),
    'rfc_count': len(clf.estimators_),
    'ada_count': len(bdt.estimators_),
  }
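filter_features is a local helper that is not part of tsfresh and is not shown in this excerpt. A plausible implementation under that assumption, keeping only the columns the relevance table marked as relevant:

def filter_features(df, relevance_table):
    # keep only extracted columns that the relevance table flagged as relevant
    relevant = relevance_table[relevance_table.relevant].feature
    return df.loc[:, [c for c in relevant if c in df.columns]]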
Example #21
    def test_real_target_mixed_case(self):
        # Mixed case with real target
        y = pd.Series(np.random.normal(0, 1, 5000))
        X = pd.DataFrame(index=range(5000))

        z = y.copy()
        z[z <= 0] = 0
        z[z > 0] = 1

        X["rel1"] = z
        X["rel2"] = y
        X["rel3"] = y ** 2
        X["rel4"] = np.sqrt(abs(y))

        X["irr1"] = np.random.normal(0, 1, 5000)
        X["irr2"] = np.random.poisson(1, 5000)
        X["irr3"] = np.random.binomial(1, 0.1, 5000)
        X["irr4"] = np.random.normal(0, 1, 5000)
        X["irr5"] = np.random.poisson(1, 5000)
        X["irr6"] = np.random.binomial(1, 0.05, 5000)
        X["irr7"] = np.random.normal(0, 1, 5000)
        X["irr8"] = np.random.poisson(1, 5000)
        X["irr9"] = np.random.binomial(1, 0.2, 5000)

        df_bh = calculate_relevance_table(X, y)
        feat_rej = df_bh.loc[df_bh.relevant].feature

        # Make sure all selected variables are relevant
        for kept_feature in feat_rej:
            self.assertIn(kept_feature, ['rel1', 'rel2', 'rel3', 'rel4'])

        self.assertGreater(len(feat_rej), 0)

        # Test type outputs
        for i in range(1, 5):
            row = df_bh.loc["rel{}".format(i)]
            self.assertEqual(row.feature, "rel{}".format(i))
            if i == 1:
                self.assertEqual(row.type, "binary")
            else:
                self.assertEqual(row.type, "real")

        for i in range(1, 10):
            row = df_bh.loc["irr{}".format(i)]
            self.assertEqual(row.feature, "irr{}".format(i))
            if i in [3, 6, 9]:
                self.assertEqual(row.type, "binary")
            else:
                self.assertEqual(row.type, "real")

            self.assertEqual(row.relevant, False)
Example #23
def significance_test():
    # df is assumed to be a module-level DataFrame with the class label in its last column
    X = df.iloc[:, :-1]
    dummy = np.random.rand(len(df.index))
    X['dummy'] = dummy

    y = df.iloc[:, -1]

    le = LabelEncoder()
    le.fit(
        ['cel', 'cla', 'flu', 'gac', 'gel', 'org', 'pia', 'sax', 'tru', 'vio'])
    y_t = le.transform(y)
    y_t = pd.Series(y_t)

    print(calculate_relevance_table(X, y_t, ml_task='classification'))
Example #25
    def test_all_features_bad(self):
        # Mixed case with real target
        y = pd.Series(np.random.normal(0, 1, 1000))
        X = pd.DataFrame(index=range(1000))

        X["irr1"] = np.random.binomial(0, 0.1, 1000)
        X["irr2"] = np.random.binomial(0, 0.15, 1000)
        X["irr3"] = np.random.binomial(0, 0.05, 1000)
        X["irr4"] = np.random.binomial(0, 0.2, 1000)
        X["irr5"] = np.random.binomial(0, 0.25, 1000)
        X["irr6"] = np.random.binomial(0, 0.01, 1000)

        df_bh = calculate_relevance_table(X, y)
        feat_rej = df_bh.loc[df_bh.relevant].feature

        self.assertEqual(len(feat_rej), 0)
Example #27
    def test_warning_for_no_relevant_feature(
            self, significance_test_feature_binary_mock,
            significance_test_feature_real_mock, X, y_real):
        significance_test_feature_binary_mock.return_value = 0.95
        significance_test_feature_real_mock.return_value = 0.95

        with pytest.warns(RuntimeWarning) as record:
            _ = calculate_relevance_table(X,
                                          y_real,
                                          n_jobs=0,
                                          ml_task="regression",
                                          show_warnings=True)
            assert len(record) == 1
            assert str(record[0].message) == (
                "No feature was found relevant for regression for fdr level = 0.05 (which corresponds "
                "to the maximal percentage of irrelevant features, consider using an higher fdr level "
                "or add other features.")
Example #28
def perform_fresh(X_train, y_train, X_test, y_test):
  log('Processing fresh')
  fresh_train_X, fresh_train_y = raw_to_tsfresh(X_train, y_train)
  fresh_test_X, fresh_test_y = raw_to_tsfresh(X_test, y_test)

  # Run the feature extraction and relevance tests ONLY on the train
  # data set.
  extracted_train = extract_features(fresh_train_X, column_id='id', column_value='value')
  extracted_train = extracted_train.dropna(axis='columns')

  # We run FRESH and its variants first at the default fdr level of 0.05,
  # but if it returns 0 features (why?) then we lower the value and try
  # again.  
  filtered_train = None
  for fdr in [0.05, 0.01, 0.005, 0.001, 0.00001]:
      log('Using ' + str(fdr))
      R = calculate_relevance_table(extracted_train, y_train.squeeze(), fdr_level=fdr)
      filtered_train = filter_features(extracted_train, R)
      if (filtered_train.shape[1] > 0):
          break

  # Extract features from the test set, but then apply the same relevant
  # features that we used from the train set
  extracted_test = extract_features(fresh_test_X, column_id='id', column_value='value')
  extracted_test = extracted_test.dropna(axis='columns')
  filtered_test = filter_features(extracted_test, R)

  # Train classifiers on the train set
  clf = build_rfc()
  trained_model = clf.fit(filtered_train, y_train.squeeze())
  rfc_predicted = list(map(lambda v: int(v), clf.predict(filtered_test)))

  actual = y_test.squeeze().tolist()

  # Create and fit an AdaBoosted decision tree
  bdt = build_ada()
  trained_model = bdt.fit(filtered_train, y_train.squeeze())
  ada_predicted = list(map(lambda v: int(v), bdt.predict(filtered_test)))

  return {
    'rfc':  accuracy_rate(rfc_predicted, actual),
    'ada': accuracy_rate(ada_predicted, actual),
    'rfc_count': len(clf.estimators_),
    'ada_count': len(bdt.estimators_),
  }
Example #29
    def test_binomial_target_realvalued_features(self):
        # Real valued random variables and binomial target
        y = pd.Series(np.random.binomial(1, 0.5, 5000))
        X = pd.DataFrame(index=range(5000))

        for i in range(10):
            X["irr{}".format(i)] = np.random.normal(1, 0.3, 5000)

        for i in range(10, 20):
            X["irr{}".format(i)] = np.random.normal(1, 0.5, 5000)

        for i in range(20, 30):
            X["irr{}".format(i)] = np.random.normal(1, 0.8, 5000)

        X["rel1"] = y * np.random.normal(0, 1, 5000) + np.random.normal(
            0, 1, 5000)
        X["rel2"] = y + np.random.normal(0, 1, 5000)
        X["rel3"] = y**2 + np.random.normal(0, 1, 5000)
        X["rel4"] = np.sqrt(y) + np.random.binomial(2, 0.1, 5000)

        df_bh = calculate_relevance_table(X, y)
        feat_rej = df_bh.loc[df_bh.relevant].feature

        # Make sure all selected variables are relevant
        for kept_feature in feat_rej:
            self.assertIn(kept_feature, ['rel1', 'rel2', 'rel3', 'rel4'])

        self.assertGreater(len(feat_rej), 0)

        # Test type outputs
        for i in range(1, 5):
            row = df_bh.loc["rel{}".format(i)]
            self.assertEqual(row.feature, "rel{}".format(i))
            self.assertEqual(row.type, "real")

        for i in range(1, 30):
            row = df_bh.loc["irr{}".format(i)]
            self.assertEqual(row.feature, "irr{}".format(i))
            self.assertEqual(row.type, "real")

            self.assertEqual(row.relevant, False)
Example #30
    def calculate_features_importance(self):

        x_train, y_train, x_test, y_test = self.create_train_test_data()

        Verbose.instance.print(1, f'Calculating importance for {len(self._features)} features')

        # x and y must be a pd.DataFrame and a pd.Series for tsfresh
        x = pd.DataFrame(x_train)
        y = pd.Series(np.array(y_train).reshape(len(y_train)), index=x.index)

        t = calculate_relevance_table(x, y, **self._tsfresh_args).reindex(index=self._features)
        t.loc[~t['relevant'], 'p_value'] = t['p_value'].max()

        self._ranks = t['p_value'].tolist()

        self.logger.log({
            'date': datetime.datetime.now(),
            'name': self.name,
            'all_features': json.dumps(self._features),
            'ranking': json.dumps(self.ranks)
        })
Example #31
def calculate_feature_relevance(app_dir, valid_features, df_labels, postfixs):
    for df_label, postfix in zip(df_labels, postfixs):
        # calculate p-value
        excel_writer1 = pandas.ExcelWriter(
            os.path.join(app_dir, 'feature_p_value_%s.xlsx' % postfix))
        try:
            df_p_value = calculate_relevance_table(valid_features,
                                                   df_label['label'],
                                                   ml_task='classification')
        except Exception:
            print('EXCEPTION')
            return
        df_p_value.to_excel(excel_writer1)
        excel_writer1.close()

        # arrange data format
        valid_features.reset_index(inplace=True)
        filtered_feature = {
            'id',
        }
        for _, row in df_p_value.iterrows():
            if row['relevant']:
                filtered_feature.add(row['feature'])
        if len(filtered_feature) <= 1:  # No correlated feature
            print('NO')
            return
        filtered_by_p = valid_features.loc[:, list(filtered_feature)]
        feature_with_label = pandas.merge(filtered_by_p, df_label,
                                          on='id').drop(columns='id')

        # calculate kendall correlation coefficient
        feature_kendall = feature_with_label.corr('kendall')['label'].drop(
            'label').to_frame()
        feature_kendall.rename(
            columns={'label': 'kendall_correlation_coefficient'}, inplace=True)
        excel_writer2 = pandas.ExcelWriter(
            os.path.join(app_dir, 'feature_kendall_%s.xlsx' % postfix))
        df_kendall = pandas.DataFrame(feature_kendall)
        df_kendall.to_excel(excel_writer2)
        excel_writer2.close()
Example #34
    def load(self, trial_ids, iid=True):
        X_s = []
        y_s = []
        for trial_id in trial_ids:
            devices = self._load_devices(trial_id)
            X = self._extract_features(devices, trial_id)
            y = pd.Series(data=self._create_reliability_label(devices))
            X.sort_index(axis=1, inplace=True)
            if iid:
                idx_iid = y.iloc[::self.window_size].index.values
                X = X.loc[idx_iid]
                y = y.loc[idx_iid]
            X_s.append(X)
            y_s.append(y)
        X = pd.concat(X_s, sort=True)
        y = pd.concat(y_s)
        if self.selected_features is None:
            rel_table = calculate_relevance_table(X, y, n_jobs=N_JOBS)
            rel_table = rel_table.loc[rel_table['relevant']]
            sorted_features = rel_table.sort_values(by='p_value')
            feature_names = sorted_features.index.tolist()
            if self.feature_limit is not None:
                feature_names = feature_names[:self.feature_limit]
                assert len(feature_names) == self.feature_limit
            X = X[feature_names]
            self.selected_features = feature_names
        else:
            X = X[self.selected_features]
        print("Data loaded for trials: " +
              ', '.join([str(x) for x in trial_ids]))
        print("X shape: {}, y shape: {}".format(X.shape, y.shape))
        print_label_counts(y)
        # print("Features used: ")
        # from pprint import pprint
        # pprint(self.selected_features)
        return X, y
Example #35
def select_features(X, y, test_for_binary_target_binary_feature=defaults.TEST_FOR_BINARY_TARGET_BINARY_FEATURE,
                    test_for_binary_target_real_feature=defaults.TEST_FOR_BINARY_TARGET_REAL_FEATURE,
                    test_for_real_target_binary_feature=defaults.TEST_FOR_REAL_TARGET_BINARY_FEATURE,
                    test_for_real_target_real_feature=defaults.TEST_FOR_REAL_TARGET_REAL_FEATURE,
                    fdr_level=defaults.FDR_LEVEL, hypotheses_independent=defaults.HYPOTHESES_INDEPENDENT,
                    n_jobs=defaults.N_PROCESSES, chunksize=defaults.CHUNKSIZE,
                    ml_task='auto'):
    """
    Check the significance of all features (columns) of feature matrix X and return a possibly reduced feature matrix
    only containing relevant features.

    The feature matrix must be a pandas.DataFrame in the format:

        +-------+-----------+-----------+-----+-----------+
        | index | feature_1 | feature_2 | ... | feature_N |
        +=======+===========+===========+=====+===========+
        | A     | ...       | ...       | ... | ...       |
        +-------+-----------+-----------+-----+-----------+
        | B     | ...       | ...       | ... | ...       |
        +-------+-----------+-----------+-----+-----------+
        | ...   | ...       | ...       | ... | ...       |
        +-------+-----------+-----------+-----+-----------+
        | ...   | ...       | ...       | ... | ...       |
        +-------+-----------+-----------+-----+-----------+
        | ...   | ...       | ...       | ... | ...       |
        +-------+-----------+-----------+-----+-----------+


    Each column will be handled as a feature and tested for its significance to the target.

    The target vector must be a pandas.Series or numpy.array in the form

        +-------+--------+
        | index | target |
        +=======+========+
        | A     | ...    |
        +-------+--------+
        | B     | ...    |
        +-------+--------+
        | .     | ...    |
        +-------+--------+
        | .     | ...    |
        +-------+--------+

    and must contain all id's that are in the feature matrix. If y is a numpy.array without index, it is assumed
    that y has the same order and length as X and that the rows correspond to each other.

    Examples
    ========

    >>> from tsfresh.examples import load_robot_execution_failures
    >>> from tsfresh import extract_features, select_features
    >>> df, y = load_robot_execution_failures()
    >>> X_extracted = extract_features(df, column_id='id', column_sort='time')
    >>> X_selected = select_features(X_extracted, y)

    :param X: Feature matrix in the format mentioned before which will be reduced to only the relevant features.
              It can contain both binary or real-valued features at the same time.
    :type X: pandas.DataFrame

    :param y: Target vector which is needed to test which features are relevant. Can be binary or real-valued.
    :type y: pandas.Series or numpy.ndarray

    :param test_for_binary_target_binary_feature: Which test should be used for binary target, binary feature (currently unused)
    :type test_for_binary_target_binary_feature: str

    :param test_for_binary_target_real_feature: Which test should be used for binary target, real feature
    :type test_for_binary_target_real_feature: str

    :param test_for_real_target_binary_feature: Which test should be used for real target, binary feature (currently unused)
    :type test_for_real_target_binary_feature: str

    :param test_for_real_target_real_feature: Which test should be used for real target, real feature (currently unused)
    :type test_for_real_target_real_feature: str

    :param fdr_level: The FDR level that should be respected, this is the theoretical expected percentage of irrelevant
                      features among all created features.
    :type fdr_level: float

    :param hypotheses_independent: Can the significance of the features be assumed to be independent?
                                   Normally, this should be set to False as the features are never
                                   independent (e.g. mean and median)
    :type hypotheses_independent: bool

    :param n_jobs: Number of processes to use during the p-value calculation
    :type n_jobs: int

    :param chunksize: The size of one chunk that is submitted to the worker
        process for the parallelisation.  Here one chunk is defined as the
        data for one feature. If you set the chunksize to 10, then one task
        is to filter 10 features. If it is set to None, heuristics, depending
        on the distributor, are used to find the optimal chunksize. If you
        get out of memory exceptions, you can try the dask distributor with
        a smaller chunksize.
    :type chunksize: None or int

    :param ml_task: The intended machine learning task. Either `'classification'`, `'regression'` or `'auto'`.
                    Defaults to `'auto'`, meaning the intended task is inferred from `y`.
                    If `y` has a boolean, integer or object dtype, the task is assumed to be classification,
                    else regression.
    :type ml_task: str

    :return: The same DataFrame as X, but possibly with a reduced number of columns (= features).
    :rtype: pandas.DataFrame

    :raises: ``ValueError`` when the target vector does not fit to the feature matrix
             or `ml_task` is not one of `'auto'`, `'classification'` or `'regression'`.
    """
    assert isinstance(X, pd.DataFrame), "Please pass features in X as pandas.DataFrame."
    check_for_nans_in_columns(X)
    assert isinstance(y, (pd.Series, np.ndarray)), "The type of target vector y must be one of: " \
                                                   "pandas.Series, numpy.ndarray"
    assert len(y) > 1, "y must contain at least two samples."
    assert len(X) == len(y), "X and y must contain the same number of samples."
    assert len(set(y)) > 1, "Feature selection is only possible if more than 1 label/class is provided"

    if isinstance(y, pd.Series) and set(X.index) != set(y.index):
        raise ValueError("Index of X and y must be identical if provided")

    if isinstance(y, np.ndarray):
        y = pd.Series(y, index=X.index)

    relevance_table = calculate_relevance_table(
        X, y, ml_task=ml_task, n_jobs=n_jobs, chunksize=chunksize,
        test_for_binary_target_real_feature=test_for_binary_target_real_feature,
        fdr_level=fdr_level, hypotheses_independent=hypotheses_independent,
    )

    relevant_features = relevance_table[relevance_table.relevant].feature

    return X.loc[:, relevant_features]
Example #36
    def fit(self, X, y):
        """
        Extract the information of which features are relevant for the given target.

        For more information, please see the :func:`~tsfresh.feature_selection.feature_selector.check_fs_sig_bh`
        function. All columns in the input data sample are treated as features. The index of all
        rows in X must be present in y.

        :param X: data sample with the features, which will be classified as relevant or not
        :type X: pandas.DataFrame or numpy.array

        :param y: target vector to be used, to classify the features
        :type y: pandas.Series or numpy.array

        :return: the fitted estimator with the information, which features are relevant
        :rtype: FeatureSelector
        """
        if not isinstance(X, pd.DataFrame):
            X = pd.DataFrame(X.copy())

        if not isinstance(y, pd.Series):
            y = pd.Series(y.copy())

        relevance_table = calculate_relevance_table(
            X,
            y,
            ml_task=self.ml_task,
            multiclass=self.multiclass,
            n_significant=self.n_significant,
            n_jobs=self.n_jobs,
            chunksize=self.chunksize,
            fdr_level=self.fdr_level,
            hypotheses_independent=self.hypotheses_independent,
            test_for_binary_target_real_feature=self.test_for_binary_target_real_feature,
        )
        self.relevant_features = relevance_table.loc[
            relevance_table.relevant].feature.tolist()

        if self.multiclass:
            p_values_table = relevance_table.filter(regex="^p_value_*", axis=1)
            if self.multiclass_p_values == "all":
                self.p_values = p_values_table
                self.feature_importances_ = 1.0 - p_values_table
                self.feature_importances_.columns = (
                    # strip the "p_value_" prefix; str.lstrip would strip a
                    # character set rather than the literal prefix
                    self.feature_importances_.columns.str.replace(
                        "^p_value_", "", regex=True))
                self.feature_importances_ = self.feature_importances_.add_prefix(
                    "importance_")
            elif self.multiclass_p_values == "min":
                self.p_values = p_values_table.min(axis=1).values
            elif self.multiclass_p_values == "max":
                self.p_values = p_values_table.max(axis=1).values
            elif self.multiclass_p_values == "avg":
                self.p_values = p_values_table.mean(axis=1).values

            if self.multiclass_p_values != "all":
                # raise p_values to the power of n_significant to increase importance
                # of features which are significant for more classes
                self.feature_importances_ = (
                    1.0 - self.p_values**relevance_table.n_significant.values)
        else:
            self.feature_importances_ = 1.0 - relevance_table.p_value.values
            self.p_values = relevance_table.p_value.values

        self.features = relevance_table.index.tolist()

        return self
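A toy illustration of the aggregation above, with invented p-values: under multiclass_p_values='min', a feature that is significant for more classes has its (small) p-value raised to a higher power, pushing its importance towards 1.

import numpy as np
import pandas as pd

p_values_table = pd.DataFrame({"p_value_0": [0.01, 0.20],
                               "p_value_1": [0.03, 0.40]})
n_significant = np.array([2, 0])            # classes each feature is significant for
p_min = p_values_table.min(axis=1).values   # [0.01, 0.20]
importances = 1.0 - p_min ** n_significant  # [0.9999, 0.0]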
Example #37
def select_features(X,
                    y,
                    test_for_binary_target_binary_feature=defaults.
                    TEST_FOR_BINARY_TARGET_BINARY_FEATURE,
                    test_for_binary_target_real_feature=defaults.
                    TEST_FOR_BINARY_TARGET_REAL_FEATURE,
                    test_for_real_target_binary_feature=defaults.
                    TEST_FOR_REAL_TARGET_BINARY_FEATURE,
                    test_for_real_target_real_feature=defaults.
                    TEST_FOR_REAL_TARGET_REAL_FEATURE,
                    fdr_level=defaults.FDR_LEVEL,
                    hypotheses_independent=defaults.HYPOTHESES_INDEPENDENT,
                    n_jobs=defaults.N_PROCESSES,
                    show_warnings=defaults.SHOW_WARNINGS,
                    chunksize=defaults.CHUNKSIZE,
                    ml_task='auto'):
    """
    Check the significance of all features (columns) of feature matrix X and return a possibly reduced feature matrix
    only containing relevant features.

    The feature matrix must be a pandas.DataFrame in the format:

        +-------+-----------+-----------+-----+-----------+
        | index | feature_1 | feature_2 | ... | feature_N |
        +=======+===========+===========+=====+===========+
        | A     | ...       | ...       | ... | ...       |
        +-------+-----------+-----------+-----+-----------+
        | B     | ...       | ...       | ... | ...       |
        +-------+-----------+-----------+-----+-----------+
        | ...   | ...       | ...       | ... | ...       |
        +-------+-----------+-----------+-----+-----------+
        | ...   | ...       | ...       | ... | ...       |
        +-------+-----------+-----------+-----+-----------+
        | ...   | ...       | ...       | ... | ...       |
        +-------+-----------+-----------+-----+-----------+


    Each column will be handled as a feature and tested for its significance to the target.

    The target vector must be a pandas.Series or numpy.array in the form

        +-------+--------+
        | index | target |
        +=======+========+
        | A     | ...    |
        +-------+--------+
        | B     | ...    |
        +-------+--------+
        | .     | ...    |
        +-------+--------+
        | .     | ...    |
        +-------+--------+

    and must contain all id's that are in the feature matrix. If y is a numpy.array without index, it is assumed
    that y has the same order and length as X and that the rows correspond to each other.

    Examples
    ========

    >>> from tsfresh.examples import load_robot_execution_failures
    >>> from tsfresh import extract_features, select_features
    >>> df, y = load_robot_execution_failures()
    >>> X_extracted = extract_features(df, column_id='id', column_sort='time')
    >>> X_selected = select_features(X_extracted, y)

    :param X: Feature matrix in the format mentioned before which will be reduced to only the relevant features.
              It can contain both binary or real-valued features at the same time.
    :type X: pandas.DataFrame

    :param y: Target vector which is needed to test which features are relevant. Can be binary or real-valued.
    :type y: pandas.Series or numpy.ndarray

    :param test_for_binary_target_binary_feature: Which test should be used for binary target, binary feature
                                                  (currently unused)
    :type test_for_binary_target_binary_feature: str

    :param test_for_binary_target_real_feature: Which test should be used for binary target, real feature
    :type test_for_binary_target_real_feature: str

    :param test_for_real_target_binary_feature: Which test should be used for real target, binary feature (currently unused)
    :type test_for_real_target_binary_feature: str

    :param test_for_real_target_real_feature: Which test should be used for real target, real feature (currently unused)
    :type test_for_real_target_real_feature: str

    :param fdr_level: The FDR level that should be respected, this is the theoretical expected percentage of irrelevant
                      features among all created features.
    :type fdr_level: float

    :param hypotheses_independent: Can the significance of the features be assumed to be independent?
                                   Normally, this should be set to False as the features are never
                                   independent (e.g. mean and median)
    :type hypotheses_independent: bool

    :param n_jobs: Number of processes to use during the p-value calculation
    :type n_jobs: int

    :param show_warnings: Show warnings during the p-value calculation (needed for debugging of calculators).
    :type show_warnings: bool

    :param chunksize: The size of one chunk that is submitted to the worker
        process for the parallelisation, where one chunk is defined as
        the data for one feature. If you set the chunksize to 10, one
        task is to filter 10 features. If set to None, heuristics
        (depending on the distributor) are used to find the optimal
        chunksize. If you run into out-of-memory exceptions, try the
        dask distributor with a smaller chunksize.
    :type chunksize: None or int

    :param ml_task: The intended machine learning task. Either `'classification'`, `'regression'` or `'auto'`.
                    Defaults to `'auto'`, meaning the intended task is inferred from `y`.
                    If `y` has a boolean, integer or object dtype, the task is assumed to be classification,
                    else regression.
    :type ml_task: str

    :return: The same DataFrame as X, but possibly with a reduced number of columns (= features).
    :rtype: pandas.DataFrame

    :raises: ``ValueError`` when the target vector does not fit to the feature matrix
             or `ml_task` is not one of `'auto'`, `'classification'` or `'regression'`.
    """
    assert isinstance(X, pd.DataFrame), "Please pass features in X as pandas.DataFrame."
    check_for_nans_in_columns(X)
    assert isinstance(y, (pd.Series, np.ndarray)), \
        "The type of target vector y must be one of: pandas.Series, numpy.ndarray"
    assert len(y) > 1, "y must contain at least two samples."
    assert len(X) == len(y), "X and y must contain the same number of samples."
    assert len(set(y)) > 1, \
        "Feature selection is only possible if more than 1 label/class is provided"

    if isinstance(y, pd.Series) and set(X.index) != set(y.index):
        raise ValueError("Index of X and y must be identical if provided")

    if isinstance(y, np.ndarray):
        y = pd.Series(y, index=X.index)

    relevance_table = calculate_relevance_table(
        X,
        y,
        ml_task=ml_task,
        n_jobs=n_jobs,
        show_warnings=show_warnings,
        chunksize=chunksize,
        test_for_binary_target_real_feature=test_for_binary_target_real_feature,
        fdr_level=fdr_level,
        hypotheses_independent=hypotheses_independent,
    )

    relevant_features = relevance_table[relevance_table.relevant].feature

    return X.loc[:, relevant_features]
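
The `ml_task='auto'` rule documented above reduces to a dtype check on `y`. The following is a minimal sketch of that rule; `infer_ml_task_sketch` is a hypothetical name used for illustration (tsfresh has its own internal helper), and it assumes the documented dtype rule is exhaustive:

import pandas as pd

def infer_ml_task_sketch(y):
    # Documented rule: boolean, integer and object dtypes imply
    # classification; everything else (e.g. float) implies regression.
    if y.dtype.kind in ("b", "i", "O"):
        return "classification"
    return "regression"

assert infer_ml_task_sketch(pd.Series([0, 1, 1])) == "classification"
assert infer_ml_task_sketch(pd.Series([0.5, 1.2])) == "regression"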
    def test_binary_target_binary_features(self):
        # Binomial random variables and binomial target
        y = pd.Series(np.random.binomial(1, 0.5, 5000))
        X = pd.DataFrame(index=range(5000))

        for i in range(10):
            X["irr{}".format(i)] = np.random.binomial(1, 0.1, 5000)

        for i in range(10, 20):
            X["irr{}".format(i)] = np.random.binomial(1, 0.8, 5000)

        z = y - np.random.binomial(1, 0.01, 5000) + np.random.binomial(1, 0.01, 5000)
        z[z == -1] = 0
        z[z == 2] = 1
        X["rel1"] = z

        z = y - np.random.binomial(1, 0.05, 5000) + np.random.binomial(1, 0.05, 5000)
        z[z == -1] = 0
        z[z == 2] = 1
        X["rel2"] = z

        z = y - np.random.binomial(1, 0.10, 5000) + np.random.binomial(1, 0.10, 5000)
        z[z == -1] = 0
        z[z == 2] = 1
        X["rel3"] = z

        z = y - np.random.binomial(1, 0.15, 5000) + np.random.binomial(1, 0.15, 5000)
        z[z == -1] = 0
        z[z == 2] = 1
        X["rel4"] = z

        z = y - np.random.binomial(1, 0.20, 5000) + np.random.binomial(1, 0.20, 5000)
        z[z == -1] = 0
        z[z == 2] = 1
        X["rel5"] = z

        df_bh = calculate_relevance_table(X, y)
        feat_rej = df_bh.loc[df_bh.relevant].feature

        # Make sure all selected variables are relevant
        for kept_feature in feat_rej:
            self.assertIn(kept_feature,
                          ['rel1', 'rel2', 'rel3', 'rel4', 'rel5'])

        self.assertGreater(len(feat_rej), 0)

        # Test type outputs
        for i in range(1, 6):
            row = df_bh.loc["rel{}".format(i)]
            self.assertEqual(row.feature, "rel{}".format(i))
            self.assertEqual(row.type, "binary")

        # Check all 20 irrelevant features (irr0 .. irr19).
        for i in range(20):
            row = df_bh.loc["irr{}".format(i)]
            self.assertEqual(row.feature, "irr{}".format(i))
            self.assertEqual(row.type, "binary")

            self.assertEqual(row.relevant, False)
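
The relevant flags asserted above come from a Benjamini-Hochberg style multiple-testing correction at the configured fdr_level. Below is a textbook step-up sketch of that procedure; it illustrates what fdr_level controls and is not tsfresh's internal implementation:

import numpy as np

def benjamini_hochberg_sketch(p_values, fdr_level=0.05):
    # Step-up rule: sort the m p-values ascending and find the largest
    # k with p_(k) <= (k / m) * fdr_level; the k smallest p-values are
    # declared relevant.
    p = np.asarray(p_values, dtype=float)
    m = len(p)
    order = np.argsort(p)
    passed = p[order] <= fdr_level * np.arange(1, m + 1) / m
    relevant = np.zeros(m, dtype=bool)
    if passed.any():
        k = np.nonzero(passed)[0].max()
        relevant[order[:k + 1]] = True
    return relevant

# benjamini_hochberg_sketch([0.001, 0.02, 0.8]) -> [True, True, False]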
    def fit(self, train_files):
        window_data, shap_window_data = self.prep_data(train_files)

        # Extract clinical variables
        clin_features = []
        for file in train_files:
            names, values = self.read_clin_fn(file)
            clin_features.append([file] + values)
        clin_df = pd.DataFrame(clin_features, columns=['file'] + names)
        clin_df = self.proc_clin_fn(clin_df)

        # Extract features for each channel separately
        features_per_channel = []
        self.feature_extractors_per_channel = {}
        for ch in range(window_data.windows.shape[1]):
            self.feature_extractors_per_channel[ch] = []
            for feature_extractor in self.features:
                self.feature_extractors_per_channel[ch].append(
                    feature_extractor())

            channel_features = []
            for f in self.feature_extractors_per_channel[ch]:
                features = f.fit_transform(window_data.windows[:, ch, :],
                                           window_data.labels)
                features = pd.DataFrame(
                    features,
                    columns=['{}_ch{}'.format(x, ch) for x in f.names_])
                channel_features.append(features)
            features_per_channel.append(pd.concat(channel_features, axis=1))

        short_features_per_channel = []
        self.short_feature_extractors_per_channel = {}
        for ch in range(window_data.windows.shape[1]):
            self.short_feature_extractors_per_channel[ch] = []
            for feature_extractor in self.short_features:
                self.short_feature_extractors_per_channel[ch].append(
                    feature_extractor())

            channel_features = []
            for f in self.short_feature_extractors_per_channel[ch]:
                f.fit(shap_window_data.windows[:, ch, :],
                      shap_window_data.labels)
                features = f.transform(window_data.windows[:, ch, :],
                                       window_data.labels)
                features = pd.DataFrame(
                    features,
                    columns=['{}_ch{}'.format(x, ch) for x in f.names_])
                channel_features.append(features)
            short_features_per_channel.append(
                pd.concat(channel_features, axis=1))

        features_multi_channel = []
        for f in self.multi_channel_features:
            features = f.fit_transform(window_data.windows, window_data.labels)
            # Multi-channel features span all channels, so label them
            # with a '_multi' suffix instead of reusing the stale `ch`
            # index left over from the per-channel loops above.
            features = pd.DataFrame(
                features,
                columns=['{}_multi'.format(x) for x in f.names_])
            features_multi_channel.append(features)

        # Concatenate the features of different channels together
        train_features = pd.concat(features_per_channel +
                                   short_features_per_channel +
                                   features_multi_channel,
                                   axis=1)
        train_features['file'] = window_data.files
        train_features = train_features.merge(clin_df, on='file')

        # Create our X and y
        X_train = train_features
        y_train = np.array(window_data.labels)
        for col in ['ID', 'file']:
            if col in X_train.columns:
                X_train = X_train.drop(col, axis=1)

        X_train = X_train.astype(float)

        # useless_features = self.remove_features(X_train)
        # X_train = X_train.drop(useless_features, axis=1)

        # Now apply hypothesis testing on remaining features
        rel_table = calculate_relevance_table(X_train, pd.Series(y_train))
        self.rel_features = list(rel_table[rel_table['p_value'] <= 0.05].index)

        X_train = X_train[self.rel_features]

        # Create a validation set for early stopping; split by file so
        # that windows from one recording never land on both sides.
        val_files = np.random.choice(train_files,
                                     size=int(0.1 * len(train_files)),
                                     replace=False)
        val_mask = np.isin(window_data.files, val_files)
        X_val = X_train.loc[val_mask, :]
        y_val = y_train[val_mask]
        X_train = X_train.loc[~val_mask, :]
        y_train = y_train[~val_mask]

        # Fit our gradient boosting classifier
        self.clf = CatBoostClassifier(
            iterations=10000,
            od_type='Iter',
            od_wait=50,
            objective='CrossEntropy',
            random_seed=2018,
            #eval_metric='AUC',
            use_best_model=True,
            task_type='CPU')

        self.clf.fit(X_train, y_train, eval_set=(X_val, y_val), verbose=100)

        return train_features
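
Note that the fit method above keeps every feature whose raw p_value is at most 0.05, rather than using the FDR-adjusted relevant column of the relevance table. A hedged, self-contained sketch of the stricter alternative (the synthetic X_train / y_train below are stand-ins for the feature matrix built in fit):

import numpy as np
import pandas as pd
from tsfresh.feature_selection.relevance import calculate_relevance_table

rng = np.random.RandomState(0)
X_train = pd.DataFrame({"rel": rng.binomial(1, 0.5, 500),
                        "irr": rng.binomial(1, 0.5, 500)})
y_train = pd.Series(X_train["rel"] ^ rng.binomial(1, 0.05, 500))

rel_table = calculate_relevance_table(X_train, y_train, fdr_level=0.05)

# Select on the Benjamini-Hochberg-controlled 'relevant' flag instead
# of thresholding raw p-values.
fdr_features = list(rel_table[rel_table.relevant].feature)
X_train = X_train[fdr_features]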
Example #40
 def test_multiclass_requires_classification(self, X, y_real):
     with pytest.raises(AssertionError):
         calculate_relevance_table(X,
                                   y_real,
                                   multiclass=True,
                                   ml_task="regression")
Example #41
    def test_multiclass_relevance_table_columns(self, X, y_binary):
        y = y_binary.copy()
        y[2] = 2
        relevance_table = calculate_relevance_table(X, y, multiclass=True)

        assert len(relevance_table.columns) == 10
Example #42
 def test_restrict_ml_task_options(self, X, y_binary):
     with pytest.raises(ValueError):
         calculate_relevance_table(X, y_binary, ml_task='some_other_task')