def test_multiclass_n_significant_error(self, X, y_binary):
    with pytest.raises(AssertionError):
        calculate_relevance_table(X, y_binary, multiclass=True, n_significant=3,
                                  ml_task="classification")
def test_real_target_binary_features(self):
    # Mixed case with real target
    y = pd.Series(np.random.normal(0, 1, 1000))
    X = pd.DataFrame(index=range(1000))

    z = y - np.random.binomial(1, 0.20, 1000) + np.random.binomial(1, 0.20, 1000)
    z[z == -1] = 0
    z[z == 2] = 1
    X["rel1"] = z

    z = y - np.random.binomial(1, 0.10, 1000) + np.random.binomial(1, 0.10, 1000)
    z[z == -1] = 0
    z[z == 2] = 1
    X["rel2"] = z

    # note: np.random.binomial with n=0 always draws 0, so these columns
    # are constant and therefore trivially irrelevant
    X["irr1"] = np.random.binomial(0, 0.1, 1000)
    X["irr2"] = np.random.binomial(0, 0.15, 1000)
    X["irr3"] = np.random.binomial(0, 0.05, 1000)
    X["irr4"] = np.random.binomial(0, 0.2, 1000)
    X["irr5"] = np.random.binomial(0, 0.25, 1000)
    X["irr6"] = np.random.binomial(0, 0.01, 1000)

    df_bh = calculate_relevance_table(X, y)
    feat_rej = df_bh.loc[df_bh.relevant].feature

    # Make sure all selected variables are relevant
    for kept_feature in feat_rej:
        self.assertIn(kept_feature, ['rel1', 'rel2'])
    self.assertGreater(len(feat_rej), 0)
def test_target_real_calls_correct_tests(
    self,
    significance_test_feature_binary_mock,
    significance_test_feature_real_mock,
    X,
    y_real,
):
    significance_test_feature_binary_mock.return_value = 0.5
    significance_test_feature_real_mock.return_value = 0.7
    relevance_table = calculate_relevance_table(X, y_real, n_jobs=0)

    assert 0.5 == relevance_table.loc["feature_binary"].p_value
    assert 0.7 == relevance_table.loc["feature_real"].p_value

    assert significance_test_feature_binary_mock.call_count == 1
    pd.testing.assert_series_equal(
        significance_test_feature_binary_mock.call_args[0][0], X["feature_binary"])
    pd.testing.assert_series_equal(
        significance_test_feature_binary_mock.call_args[1]["y"], y_real)

    assert significance_test_feature_real_mock.call_count == 1
    pd.testing.assert_series_equal(
        significance_test_feature_real_mock.call_args[0][0], X["feature_real"])
    pd.testing.assert_series_equal(
        significance_test_feature_real_mock.call_args[1]["y"], y_real)
def test_multiclass_correct_features_relevant(self, y_multi):
    X_multi = pd.DataFrame()
    X_multi["relevant_0"] = np.concatenate([np.zeros(298), np.array([0.01, -0.01])])
    X_multi["relevant_3"] = X_multi["relevant_0"].copy()
    X_multi["relevant_3"][y_multi == 0] = np.random.uniform(2, 3, 100)
    X_multi["relevant_2"] = X_multi["relevant_3"].copy()
    X_multi["relevant_2"][y_multi == 1] = np.random.uniform(-2, -1, 100)

    relevance_table = calculate_relevance_table(
        X_multi, y_multi, multiclass=True, ml_task="classification", n_significant=3)
    assert relevance_table.loc["relevant_3", "relevant"]
    assert not relevance_table.loc["relevant_2", "relevant"]
    assert not relevance_table.loc["relevant_0", "relevant"]

    # the distributions of all 3 classes under a one vs. rest scheme will be
    # separated enough for this feature to be relevant for predicting 3 classes
    assert relevance_table.loc["relevant_3", "n_significant"] == 3
    # due to the distribution of this feature where y_multi == 0 being contained
    # inside the range of y_multi != 0, it will not pass the Mann-Whitney U test
    # under a one vs. rest scheme for that class
    assert relevance_table.loc["relevant_2", "n_significant"] == 2
    assert relevance_table.loc["relevant_0", "n_significant"] == 0
def fit(self, X, y):
    """
    Extract the information on which of the features are relevant using the given target.

    For more information, please see the
    :func:`~tsfresh.feature_selection.feature_selector.check_fs_sig_bh` function.
    All columns in the input data sample are treated as features. The index of all
    rows in X must be present in y.

    :param X: data sample with the features, which will be classified as relevant or not
    :type X: pandas.DataFrame or numpy.array

    :param y: target vector to be used, to classify the features
    :type y: pandas.Series or numpy.array

    :return: the fitted estimator with the information, which features are relevant
    :rtype: FeatureSelector
    """
    if not isinstance(X, pd.DataFrame):
        X = pd.DataFrame(X.copy())

    if not isinstance(y, pd.Series):
        y = pd.Series(y.copy())

    relevance_table = calculate_relevance_table(
        X, y,
        ml_task=self.ml_task,
        n_jobs=self.n_jobs,
        chunksize=self.chunksize,
        fdr_level=self.fdr_level,
        hypotheses_independent=self.hypotheses_independent,
        test_for_binary_target_real_feature=self.test_for_binary_target_real_feature)

    self.relevant_features = relevance_table.loc[relevance_table.relevant].feature.tolist()
    self.feature_importances_ = 1.0 - relevance_table.p_value.values
    self.p_values = relevance_table.p_value.values
    self.features = relevance_table.index.tolist()

    return self
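A minimal usage sketch for the `fit` method above, assuming it belongs to tsfresh's `FeatureSelector` transformer; the synthetic data, the seed, and the `fdr_level` value are illustrative assumptions, not taken from the snippet:

import numpy as np
import pandas as pd
from tsfresh.transformers import FeatureSelector

# Illustrative data: one column that tracks the target, one pure-noise column.
rng = np.random.RandomState(0)
y = pd.Series(rng.binomial(1, 0.5, 500))
X = pd.DataFrame({
    "informative": y + rng.normal(0, 0.1, 500),  # closely follows the target
    "noise": rng.normal(0, 1, 500),              # independent of the target
})

selector = FeatureSelector(fdr_level=0.05)
selector.fit(X, y)
print(selector.relevant_features)  # expected to contain "informative" but not "noise"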
def test_all_features_good(self):
    # Only relevant features, real target
    y = pd.Series(np.random.normal(0, 1, 1000))
    X = pd.DataFrame(index=range(1000))

    z = y - np.random.binomial(1, 0.20, 1000) + np.random.binomial(1, 0.20, 1000)
    z[z == -1] = 0
    z[z == 2] = 1
    X["rel1"] = z

    z = y - np.random.binomial(1, 0.10, 1000) + np.random.binomial(1, 0.10, 1000)
    z[z == -1] = 0
    z[z == 2] = 1
    X["rel2"] = z

    df_bh = calculate_relevance_table(X, y)
    feat_rej = df_bh.loc[df_bh.relevant].feature

    # Make sure all selected variables are relevant
    for kept_feature in feat_rej:
        self.assertIn(kept_feature, ['rel1', 'rel2'])
    self.assertGreater(len(feat_rej), 0)
def calculate_feature_relevance(app_dir, valid_features_list, df_labels):
    excel_writer1 = pandas.ExcelWriter(os.path.join(app_dir, 'feature_p_value.xlsx'))
    excel_writer2 = pandas.ExcelWriter(os.path.join(app_dir, 'feature_kendall.xlsx'))

    sheet_count = 0
    for df_valid_features in valid_features_list:  # one sheet of features
        # calculate p-value
        print(sheet_count)
        df_p_value = calculate_relevance_table(df_valid_features, df_labels['label'],
                                               ml_task='classification')
        df_p_value.to_excel(excel_writer1, '%d' % sheet_count)

        # arrange data format
        df_valid_features.reset_index(inplace=True)
        filtered_feature = {'id', }
        for _, row in df_p_value.iterrows():
            if row['relevant']:
                filtered_feature.add(row['feature'])
        filtered_by_p = df_valid_features[list(filtered_feature)]
        feature_with_label = pandas.merge(filtered_by_p, df_labels, on='id').drop('id', axis=1)

        # calculate kendall correlation coefficient
        feature_kendall = feature_with_label.corr('kendall')['label'].drop('label', axis=0).to_frame()
        feature_kendall.rename(columns={'label': 'kendall_correlation_coefficient'}, inplace=True)
        df_kendall = pandas.DataFrame(feature_kendall)
        df_kendall.to_excel(excel_writer2, '%d' % sheet_count)

        sheet_count += 1

    excel_writer1.save()
    excel_writer2.save()
def test_constant_feature_irrelevant(self, y_binary):
    X = pd.DataFrame([1, 1, 1], columns=['feature_binary'])

    relevance_table = calculate_relevance_table(X, y_binary)

    assert "feature_binary" == relevance_table.index[0]
    assert 'constant' == relevance_table.type[0]
    assert np.isnan(relevance_table.p_value[0])
    assert not relevance_table.relevant[0]
def test_binary_target_mixed_case(self):
    # Mixed case with binomial target
    np.random.seed(42)
    y = pd.Series(np.random.binomial(1, 0.5, 1000))
    X = pd.DataFrame(index=range(1000))

    z = y - np.random.binomial(1, 0.1, 1000) + np.random.binomial(1, 0.1, 1000)
    z[z == -1] = 0
    z[z == 2] = 1
    X["rel1"] = z
    X["rel2"] = y * np.abs(np.random.normal(0, 1, 1000)) + np.random.normal(0, 1, 1000)
    X["rel3"] = y + np.random.normal(0, 0.3, 1000)
    X["rel4"] = y ** 2 + np.random.normal(0, 1, 1000)
    X["rel5"] = np.sqrt(y) + np.random.binomial(2, 0.1, 1000)

    X["irr_constant"] = 1.113344

    X["irr1"] = np.random.normal(0, 1, 1000)
    X["irr2"] = np.random.poisson(1, 1000)
    X["irr3"] = np.random.binomial(1, 0.3, 1000)
    X["irr4"] = np.random.normal(0, 1, 1000)
    X["irr5"] = np.random.poisson(1, 1000)
    X["irr6"] = np.random.binomial(1, 0.3, 1000)
    X["irr7"] = np.random.normal(0, 1, 1000)
    X["irr8"] = np.random.poisson(1, 1000)
    X["irr9"] = np.random.binomial(1, 0.3, 1000)

    df_bh = calculate_relevance_table(X, y)
    feat_rej = df_bh.loc[df_bh.relevant].feature

    # Make sure all selected variables are relevant
    for kept_feature in feat_rej:
        self.assertIn(kept_feature, ['rel1', 'rel2', 'rel3', 'rel4', 'rel5'])
    self.assertGreater(len(feat_rej), 0)

    # Test type outputs
    for i in range(1, 6):
        row = df_bh.loc["rel{}".format(i)]
        self.assertEqual(row.feature, "rel{}".format(i))
        if i == 1:
            self.assertEqual(row.type, "binary")
        else:
            self.assertEqual(row.type, "real")

    for i in range(1, 10):
        row = df_bh.loc["irr{}".format(i)]
        self.assertEqual(row.feature, "irr{}".format(i))
        if i not in [3, 6, 9]:
            self.assertEqual(row.type, "real")
        else:
            self.assertEqual(row.type, "binary")
        self.assertEqual(row.relevant, False)
def test_binary_target_binary_features(self):
    # Binomial random variables and binomial target
    y = pd.Series(np.random.binomial(1, 0.5, 5000))
    X = pd.DataFrame(index=range(5000))

    for i in range(10):
        X["irr{}".format(i)] = np.random.binomial(1, 0.1, 5000)

    for i in range(10, 20):
        X["irr{}".format(i)] = np.random.binomial(1, 0.8, 5000)

    z = y - np.random.binomial(1, 0.01, 5000) + np.random.binomial(1, 0.01, 5000)
    z[z == -1] = 0
    z[z == 2] = 1
    X["rel1"] = z

    z = y - np.random.binomial(1, 0.05, 5000) + np.random.binomial(1, 0.05, 5000)
    z[z == -1] = 0
    z[z == 2] = 1
    X["rel2"] = z

    z = y - np.random.binomial(1, 0.10, 5000) + np.random.binomial(1, 0.10, 5000)
    z[z == -1] = 0
    z[z == 2] = 1
    X["rel3"] = z

    z = y - np.random.binomial(1, 0.15, 5000) + np.random.binomial(1, 0.15, 5000)
    z[z == -1] = 0
    z[z == 2] = 1
    X["rel4"] = z

    z = y - np.random.binomial(1, 0.20, 5000) + np.random.binomial(1, 0.20, 5000)
    z[z == -1] = 0
    z[z == 2] = 1
    X["rel5"] = z

    df_bh = calculate_relevance_table(X, y)
    feat_rej = df_bh.loc[df_bh.relevant].feature

    # Make sure all selected variables are relevant
    for kept_feature in feat_rej:
        self.assertIn(kept_feature, ['rel1', 'rel2', 'rel3', 'rel4', 'rel5'])
    self.assertGreater(len(feat_rej), 0)

    # Test type outputs
    for i in range(1, 6):
        row = df_bh.loc["rel{}".format(i)]
        self.assertEqual(row.feature, "rel{}".format(i))
        self.assertEqual(row.type, "binary")

    for i in range(1, 20):
        row = df_bh.loc["irr{}".format(i)]
        self.assertEqual(row.feature, "irr{}".format(i))
        self.assertEqual(row.type, "binary")
        self.assertEqual(row.relevant, False)
def test_warning_for_no_relevant_feature(self, significance_test_feature_binary_mock,
                                         significance_test_feature_real_mock, X, y_real):
    significance_test_feature_binary_mock.return_value = 0.95
    significance_test_feature_real_mock.return_value = 0.95

    with mock.patch('logging.Logger.warning') as m:
        relevance_table = calculate_relevance_table(X, y_real, n_jobs=0, ml_task="regression")
        m.assert_called_with('No feature was found relevant for regression for fdr level = 0.05. '
                             'Consider using a lower fdr level or other features.')
def select_relevant_features(self, X, y):
    '''Select statistically significant features while computing the relevance of these features.'''
    # calculate a relevance table for each class (one vs. rest)
    relevance_tables = list()
    for label in np.unique(y):
        y_binary = (y == label)
        relevance_tables.append(
            (label, calculate_relevance_table(X, y_binary, fdr_level=self.fdr_level,
                                              n_jobs=self.n_jobs)))

    # concatenate relevance tables
    relevance_table_concat = pd.concat([table for (label, table) in relevance_tables])

    # perform benjamini hochberg test
    relevance_table_benjamini = benjamini_hochberg_test(
        relevance_table_concat, hypotheses_independent=False, fdr_level=self.fdr_level)

    # remove irrelevant features from the table
    relevance_table_benjamini = relevance_table_benjamini[
        relevance_table_benjamini.relevant == True]

    # keep only features that are relevant for every class
    feature_occurrences = relevance_table_benjamini.feature.value_counts()
    relevant_features = feature_occurrences[
        feature_occurrences == len(y.unique())].index.values

    occurrence_counts = feature_occurrences.value_counts()
    for i in range(1, 4):
        try:
            logging.info(
                'Number of features that occurred {} time(s) among the relevant features '
                'selected after the benjamini hochberg test: {}'.format(i, occurrence_counts[i]))
        except (KeyError, IndexError):
            # no feature occurred the corresponding number of times
            pass

    # build final relevance table
    relevance_table_final = pd.DataFrame({
        'feature': relevant_features,
        'p_value': [relevance_table_benjamini.loc[f].p_value.max()
                    for f in relevant_features],
        'occurrence': [feature_occurrences[f] for f in relevant_features]
    }).sort_values(by=['p_value', 'occurrence']).reset_index(drop=True)

    logging.info("Number of relevant features for all classes: {}/{}".format(
        relevance_table_final.shape[0], X.shape[1]))

    return relevance_table_final
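A hedged usage sketch for the one-vs-rest selection above. The stand-in `_Selector` class is hypothetical (the snippet does not show the class this method lives on), and the import paths follow tsfresh's module layout as an assumption:

import logging
import numpy as np
import pandas as pd
# assumed import paths; adjust to your tsfresh version if needed
from tsfresh.feature_selection.relevance import calculate_relevance_table
from tsfresh.feature_selection.benjamini_hochberg_test import benjamini_hochberg_test

class _Selector:
    # minimal stand-in exposing only the attributes the method reads
    fdr_level = 0.05
    n_jobs = 0
    select_relevant_features = select_relevant_features  # reuse the method above

rng = np.random.RandomState(42)
y = pd.Series(rng.choice([0, 1, 2], size=600))
X = pd.DataFrame({
    # indicator of class 1: shifts the distribution in every one-vs-rest split
    "ind1": (y == 1).astype(float) + rng.normal(0, 0.05, 600),
    "noise": rng.normal(0, 1, 600),  # unrelated to the target
})

table = _Selector().select_relevant_features(X, y)
# "ind1" should be relevant in all three one-vs-rest comparisons
# (occurrence == 3) and survive, while "noise" should be dropped.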
def test_warning_for_no_relevant_feature(self, significance_test_feature_binary_mock,
                                         significance_test_feature_real_mock, X, y_real):
    significance_test_feature_binary_mock.return_value = 0.95
    significance_test_feature_real_mock.return_value = 0.95

    with mock.patch('logging.Logger.warning') as m:
        _ = calculate_relevance_table(X, y_real, n_jobs=0, ml_task="regression")
        m.assert_called_with("No feature was found relevant for regression for fdr level = 0.05 (which corresponds "
                             "to the maximal percentage of irrelevant features, consider using an higher fdr level "
                             "or add other features.")
def test_target_binary_calls_correct_tests(self, significance_test_feature_binary_mock,
                                           significance_test_feature_real_mock, X, y_binary):
    significance_test_feature_binary_mock.return_value = 0.5
    significance_test_feature_real_mock.return_value = 0.7
    relevance_table = calculate_relevance_table(X, y_binary, n_jobs=0)

    assert 0.5 == relevance_table.loc['feature_binary'].p_value
    assert 0.7 == relevance_table.loc['feature_real'].p_value
    assert 2 == significance_test_feature_binary_mock.call_count
    assert 2 == significance_test_feature_real_mock.call_count
def test_target_real_calls_correct_tests(self, significance_test_feature_binary_mock,
                                         significance_test_feature_real_mock, X, y_real):
    significance_test_feature_binary_mock.return_value = 0.5
    significance_test_feature_real_mock.return_value = 0.7
    relevance_table = calculate_relevance_table(X, y_real, n_jobs=0)

    assert 0.5 == relevance_table.loc['feature_binary'].p_value
    assert 0.7 == relevance_table.loc['feature_real'].p_value
    significance_test_feature_binary_mock.assert_called_once_with(X['feature_binary'], y=y_real)
    significance_test_feature_real_mock.assert_called_once_with(X['feature_real'], y=y_real)
def perform_fresh_pca_after(X_train, y_train, X_test, y_test):
    log('Processing fresh_pca_after')
    fresh_train_X, fresh_train_y = raw_to_tsfresh(X_train, y_train)
    fresh_test_X, fresh_test_y = raw_to_tsfresh(X_test, y_test)

    # Run the feature extraction and relevance tests ONLY on the train
    # data set.
    extracted_train = extract_features(fresh_train_X, column_id='id', column_value='value')

    # For some reason, tsfresh is extracting features that contain NaN,
    # Infinity or None. This breaks the PCA step. To avoid this, we drop
    # columns that contain these values. I know of nothing else to do here.
    extracted_train = extracted_train.dropna(axis='columns')

    filtered_train = None
    # execute at different fdr levels to try to make FRESH more robust
    for fdr in [0.05, 0.01, 0.005, 0.001]:
        R = calculate_relevance_table(extracted_train, y_train.squeeze(), fdr_level=fdr)
        filtered_train = filter_features(extracted_train, R)
        if filtered_train.shape[1] > 0:
            break

    # Perform PCA on the filtered set of features
    pca_train = PCAForPandas(n_components=0.95, svd_solver='full')
    filtered_train = pca_train.fit_transform(filtered_train)

    # Extract features from the test set, but then apply the same relevant
    # features that we used from the train set
    extracted_test = extract_features(fresh_test_X, column_id='id', column_value='value')
    extracted_test = extracted_test.dropna(axis='columns')
    filtered_test = filter_features(extracted_test, R)
    filtered_test = pca_train.transform(filtered_test)

    # Train classifiers on the train set
    clf = build_rfc()
    clf.fit(filtered_train, y_train.squeeze())
    rfc_predicted = list(map(lambda v: int(v), clf.predict(filtered_test)))

    actual = y_test.squeeze().tolist()

    # Create and fit an AdaBoosted decision tree
    bdt = build_ada()
    bdt.fit(filtered_train, y_train.squeeze())
    ada_predicted = list(map(lambda v: int(v), bdt.predict(filtered_test)))

    return {
        'rfc': accuracy_rate(rfc_predicted, actual),
        'ada': accuracy_rate(ada_predicted, actual),
        'rfc_count': len(clf.estimators_),
        'ada_count': len(bdt.estimators_),
    }
def test_real_target_mixed_case(self):
    # Mixed case with real target
    y = pd.Series(np.random.normal(0, 1, 5000))
    X = pd.DataFrame(index=range(5000))

    z = y.copy()
    z[z <= 0] = 0
    z[z > 0] = 1
    X["rel1"] = z
    X["rel2"] = y
    X["rel3"] = y ** 2
    X["rel4"] = np.sqrt(abs(y))

    X["irr1"] = np.random.normal(0, 1, 5000)
    X["irr2"] = np.random.poisson(1, 5000)
    X["irr3"] = np.random.binomial(1, 0.1, 5000)
    X["irr4"] = np.random.normal(0, 1, 5000)
    X["irr5"] = np.random.poisson(1, 5000)
    X["irr6"] = np.random.binomial(1, 0.05, 5000)
    X["irr7"] = np.random.normal(0, 1, 5000)
    X["irr8"] = np.random.poisson(1, 5000)
    X["irr9"] = np.random.binomial(1, 0.2, 5000)

    df_bh = calculate_relevance_table(X, y)
    feat_rej = df_bh.loc[df_bh.relevant].feature

    # Make sure all selected variables are relevant
    for kept_feature in feat_rej:
        self.assertIn(kept_feature, ['rel1', 'rel2', 'rel3', 'rel4'])
    self.assertGreater(len(feat_rej), 0)

    # Test type outputs
    for i in range(1, 5):
        row = df_bh.loc["rel{}".format(i)]
        self.assertEqual(row.feature, "rel{}".format(i))
        if i == 1:
            self.assertEqual(row.type, "binary")
        else:
            self.assertEqual(row.type, "real")

    for i in range(1, 10):
        row = df_bh.loc["irr{}".format(i)]
        self.assertEqual(row.feature, "irr{}".format(i))
        if i in [3, 6, 9]:
            self.assertEqual(row.type, "binary")
        else:
            self.assertEqual(row.type, "real")
        self.assertEqual(row.relevant, False)
def significance_test():
    X = df.iloc[:, :-1]
    dummy = np.random.rand(len(df.index))
    X['dummy'] = dummy
    y = df.iloc[:, -1]

    le = LabelEncoder()
    le.fit(['cel', 'cla', 'flu', 'gac', 'gel', 'org', 'pia', 'sax', 'tru', 'vio'])
    y_t = le.transform(y)
    y_t = pd.Series(y_t)

    print(calculate_relevance_table(X, y_t, ml_task='classification'))
def test_all_features_bad(self):
    # Only irrelevant features, real target
    y = pd.Series(np.random.normal(0, 1, 1000))
    X = pd.DataFrame(index=range(1000))

    X["irr1"] = np.random.binomial(0, 0.1, 1000)
    X["irr2"] = np.random.binomial(0, 0.15, 1000)
    X["irr3"] = np.random.binomial(0, 0.05, 1000)
    X["irr4"] = np.random.binomial(0, 0.2, 1000)
    X["irr5"] = np.random.binomial(0, 0.25, 1000)
    X["irr6"] = np.random.binomial(0, 0.01, 1000)

    df_bh = calculate_relevance_table(X, y)
    feat_rej = df_bh.loc[df_bh.relevant].feature

    self.assertEqual(len(feat_rej), 0)
def test_warning_for_no_relevant_feature(self, significance_test_feature_binary_mock,
                                         significance_test_feature_real_mock, X, y_real):
    significance_test_feature_binary_mock.return_value = 0.95
    significance_test_feature_real_mock.return_value = 0.95

    with pytest.warns(RuntimeWarning) as record:
        _ = calculate_relevance_table(X, y_real, n_jobs=0, ml_task="regression",
                                      show_warnings=True)

    assert len(record) == 1
    assert str(record[0].message) == (
        "No feature was found relevant for regression for fdr level = 0.05 (which corresponds "
        "to the maximal percentage of irrelevant features, consider using an higher fdr level "
        "or add other features.")
def perform_fresh(X_train, y_train, X_test, y_test):
    log('Processing fresh')
    fresh_train_X, fresh_train_y = raw_to_tsfresh(X_train, y_train)
    fresh_test_X, fresh_test_y = raw_to_tsfresh(X_test, y_test)

    # Run the feature extraction and relevance tests ONLY on the train
    # data set.
    extracted_train = extract_features(fresh_train_X, column_id='id', column_value='value')
    extracted_train = extracted_train.dropna(axis='columns')

    # We run FRESH and its variants first at the default fdr level of 0.05,
    # but if it returns 0 features (why?) then we lower the value and try
    # again.
    filtered_train = None
    for fdr in [0.05, 0.01, 0.005, 0.001, 0.00001]:
        log('Using ' + str(fdr))
        R = calculate_relevance_table(extracted_train, y_train.squeeze(), fdr_level=fdr)
        filtered_train = filter_features(extracted_train, R)
        if filtered_train.shape[1] > 0:
            break

    # Extract features from the test set, but then apply the same relevant
    # features that we used from the train set
    extracted_test = extract_features(fresh_test_X, column_id='id', column_value='value')
    extracted_test = extracted_test.dropna(axis='columns')
    filtered_test = filter_features(extracted_test, R)

    # Train classifiers on the train set
    clf = build_rfc()
    clf.fit(filtered_train, y_train.squeeze())
    rfc_predicted = list(map(lambda v: int(v), clf.predict(filtered_test)))

    actual = y_test.squeeze().tolist()

    # Create and fit an AdaBoosted decision tree
    bdt = build_ada()
    bdt.fit(filtered_train, y_train.squeeze())
    ada_predicted = list(map(lambda v: int(v), bdt.predict(filtered_test)))

    return {
        'rfc': accuracy_rate(rfc_predicted, actual),
        'ada': accuracy_rate(ada_predicted, actual),
        'rfc_count': len(clf.estimators_),
        'ada_count': len(bdt.estimators_),
    }
def test_binomial_target_realvalued_features(self):
    # Real valued random variables and binomial target
    y = pd.Series(np.random.binomial(1, 0.5, 5000))
    X = pd.DataFrame(index=range(5000))

    for i in range(10):
        X["irr{}".format(i)] = np.random.normal(1, 0.3, 5000)

    for i in range(10, 20):
        X["irr{}".format(i)] = np.random.normal(1, 0.5, 5000)

    for i in range(20, 30):
        X["irr{}".format(i)] = np.random.normal(1, 0.8, 5000)

    X["rel1"] = y * np.random.normal(0, 1, 5000) + np.random.normal(0, 1, 5000)
    X["rel2"] = y + np.random.normal(0, 1, 5000)
    X["rel3"] = y ** 2 + np.random.normal(0, 1, 5000)
    X["rel4"] = np.sqrt(y) + np.random.binomial(2, 0.1, 5000)

    df_bh = calculate_relevance_table(X, y)
    feat_rej = df_bh.loc[df_bh.relevant].feature

    # Make sure all selected variables are relevant
    for kept_feature in feat_rej:
        self.assertIn(kept_feature, ['rel1', 'rel2', 'rel3', 'rel4'])
    self.assertGreater(len(feat_rej), 0)

    # Test type outputs
    for i in range(1, 5):
        row = df_bh.loc["rel{}".format(i)]
        self.assertEqual(row.feature, "rel{}".format(i))
        self.assertEqual(row.type, "real")

    for i in range(1, 30):
        row = df_bh.loc["irr{}".format(i)]
        self.assertEqual(row.feature, "irr{}".format(i))
        self.assertEqual(row.type, "real")
        self.assertEqual(row.relevant, False)
def calculate_features_importance(self):
    x_train, y_train, x_test, y_test = self.create_train_test_data()
    Verbose.instance.print(1, f'Calculating importance for {len(self._features)} features')

    # x and y must be a pd.DataFrame and pd.Series for tsfresh
    x = pd.DataFrame(x_train)
    y = pd.Series(np.array(y_train).reshape(len(y_train)), index=x.index)

    t = calculate_relevance_table(x, y, **self._tsfresh_args).reindex(index=self._features)
    # irrelevant features get the worst (largest) observed p-value as their rank
    t.loc[t['relevant'] == False, 'p_value'] = t['p_value'].max()
    self._ranks = t['p_value'].tolist()

    self.logger.log({
        'date': datetime.datetime.now(),
        'name': self.name,
        'all_features': json.dumps(self._features),
        'ranking': json.dumps(self.ranks)
    })
def calculate_feature_relevance(app_dir, valid_features, df_labels, postfixs):
    for df_label, postfix in zip(df_labels, postfixs):
        # calculate p-value
        excel_writer1 = pandas.ExcelWriter(
            os.path.join(app_dir, 'feature_p_value_%s.xlsx' % postfix))
        try:
            df_p_value = calculate_relevance_table(valid_features, df_label['label'],
                                                   ml_task='classification')
        except BaseException:
            print('EXCEPTION')
            return
        df_p_value.to_excel(excel_writer1)
        excel_writer1.save()

        # arrange data format
        valid_features.reset_index(inplace=True)
        filtered_feature = {'id', }
        for _, row in df_p_value.iterrows():
            if row['relevant']:
                filtered_feature.add(row['feature'])
        if len(filtered_feature) <= 1:
            # no relevant feature found
            print('NO')
            return
        filtered_by_p = valid_features.loc[:, list(filtered_feature)]
        feature_with_label = pandas.merge(filtered_by_p, df_label, on='id').drop('id', axis=1)

        # calculate kendall correlation coefficient
        feature_kendall = feature_with_label.corr('kendall')['label'].drop('label', axis=0).to_frame()
        feature_kendall.rename(columns={'label': 'kendall_correlation_coefficient'},
                               inplace=True)
        excel_writer2 = pandas.ExcelWriter(
            os.path.join(app_dir, 'feature_kendall_%s.xlsx' % postfix))
        df_kendall = pandas.DataFrame(feature_kendall)
        df_kendall.to_excel(excel_writer2)
        excel_writer2.save()
def load(self, trial_ids, iid=True):
    X_s = []
    y_s = []

    for trial_id in trial_ids:
        devices = self._load_devices(trial_id)
        X = self._extract_features(devices, trial_id)
        y = pd.Series(data=self._create_reliability_label(devices))
        X.sort_index(axis=1, inplace=True)

        if iid:
            idx_iid = y.iloc[::self.window_size].index.values
            X = X.loc[idx_iid]
            y = y.loc[idx_iid]

        X_s.append(X)
        y_s.append(y)

    X = pd.concat(X_s, sort=True)
    y = pd.concat(y_s)

    if self.selected_features is None:
        rel_table = calculate_relevance_table(X, y, n_jobs=N_JOBS)
        rel_table = rel_table.loc[rel_table['relevant'] == True]
        sorted_features = rel_table.sort_values(by='p_value')
        feature_names = sorted_features.index.tolist()

        if self.feature_limit is not None:
            feature_names = feature_names[:self.feature_limit]
            assert len(feature_names) == self.feature_limit

        X = X[feature_names]
        self.selected_features = feature_names
    else:
        X = X[self.selected_features]

    print("Data loaded for trials: " + ', '.join([str(x) for x in trial_ids]))
    print("X shape: {}, y shape: {}".format(X.shape, y.shape))
    print_label_counts(y)

    # print("Features used: ")
    # from pprint import pprint
    # pprint(self.selected_features)

    return X, y
def select_features(X, y,
                    test_for_binary_target_binary_feature=defaults.TEST_FOR_BINARY_TARGET_BINARY_FEATURE,
                    test_for_binary_target_real_feature=defaults.TEST_FOR_BINARY_TARGET_REAL_FEATURE,
                    test_for_real_target_binary_feature=defaults.TEST_FOR_REAL_TARGET_BINARY_FEATURE,
                    test_for_real_target_real_feature=defaults.TEST_FOR_REAL_TARGET_REAL_FEATURE,
                    fdr_level=defaults.FDR_LEVEL,
                    hypotheses_independent=defaults.HYPOTHESES_INDEPENDENT,
                    n_jobs=defaults.N_PROCESSES,
                    chunksize=defaults.CHUNKSIZE,
                    ml_task='auto'):
    """
    Check the significance of all features (columns) of feature matrix X and return a possibly
    reduced feature matrix only containing relevant features.

    The feature matrix must be a pandas.DataFrame in the format:

    +-------+-----------+-----------+-----+-----------+
    | index | feature_1 | feature_2 | ... | feature_N |
    +=======+===========+===========+=====+===========+
    | A     | ...       | ...       | ... | ...       |
    +-------+-----------+-----------+-----+-----------+
    | B     | ...       | ...       | ... | ...       |
    +-------+-----------+-----------+-----+-----------+
    | ...   | ...       | ...       | ... | ...       |
    +-------+-----------+-----------+-----+-----------+

    Each column will be handled as a feature and tested for its significance to the target.

    The target vector must be a pandas.Series or numpy.array in the form

    +-------+--------+
    | index | target |
    +=======+========+
    | A     | ...    |
    +-------+--------+
    | B     | ...    |
    +-------+--------+
    | .     | ...    |
    +-------+--------+

    and must contain all ids that are in the feature matrix. If y is a numpy.array without an
    index, it is assumed that y has the same order and length as X and that the rows correspond
    to each other.

    Examples
    ========

    >>> from tsfresh.examples import load_robot_execution_failures
    >>> from tsfresh import extract_features, select_features
    >>> df, y = load_robot_execution_failures()
    >>> X_extracted = extract_features(df, column_id='id', column_sort='time')
    >>> X_selected = select_features(X_extracted, y)

    :param X: Feature matrix in the format mentioned before which will be reduced to only the
        relevant features. It can contain both binary and real-valued features at the same time.
    :type X: pandas.DataFrame

    :param y: Target vector which is needed to test which features are relevant. Can be binary
        or real-valued.
    :type y: pandas.Series or numpy.ndarray

    :param test_for_binary_target_binary_feature: Which test to be used for binary target,
        binary feature (currently unused)
    :type test_for_binary_target_binary_feature: str

    :param test_for_binary_target_real_feature: Which test to be used for binary target,
        real feature
    :type test_for_binary_target_real_feature: str

    :param test_for_real_target_binary_feature: Which test to be used for real target,
        binary feature (currently unused)
    :type test_for_real_target_binary_feature: str

    :param test_for_real_target_real_feature: Which test to be used for real target,
        real feature (currently unused)
    :type test_for_real_target_real_feature: str

    :param fdr_level: The FDR level that should be respected, this is the theoretical expected
        percentage of irrelevant features among all created features.
    :type fdr_level: float

    :param hypotheses_independent: Can the significance of the features be assumed to be
        independent? Normally, this should be set to False as the features are never
        independent (e.g. mean and median)
    :type hypotheses_independent: bool

    :param n_jobs: Number of processes to use during the p-value calculation
    :type n_jobs: int

    :param chunksize: The size of one chunk that is submitted to the worker process for the
        parallelisation, where one chunk is defined as the data for one feature. If you set
        the chunksize to 10, then it means that one task is to test 10 features. If it is set
        to None, depending on the distributor, heuristics are used to find the optimal
        chunksize. If you get out of memory exceptions, you can try it with the dask
        distributor and a smaller chunksize.
    :type chunksize: None or int

    :param ml_task: The intended machine learning task. Either `'classification'`,
        `'regression'` or `'auto'`. Defaults to `'auto'`, meaning the intended task is
        inferred from `y`. If `y` has a boolean, integer or object dtype, the task is assumed
        to be classification, else regression.
    :type ml_task: str

    :return: The same DataFrame as X, but possibly with a reduced number of columns ( = features).
    :rtype: pandas.DataFrame

    :raises: ``ValueError`` when the target vector does not fit to the feature matrix or
        `ml_task` is not one of `'auto'`, `'classification'` or `'regression'`.
    """
    assert isinstance(X, pd.DataFrame), "Please pass features in X as pandas.DataFrame."
    check_for_nans_in_columns(X)
    assert isinstance(y, (pd.Series, np.ndarray)), "The type of target vector y must be one of: " \
                                                   "pandas.Series, numpy.ndarray"
    assert len(y) > 1, "y must contain at least two samples."
    assert len(X) == len(y), "X and y must contain the same number of samples."
    assert len(set(y)) > 1, "Feature selection is only possible if more than 1 label/class is provided"

    if isinstance(y, pd.Series) and set(X.index) != set(y.index):
        raise ValueError("Index of X and y must be identical if provided")

    if isinstance(y, np.ndarray):
        y = pd.Series(y, index=X.index)

    relevance_table = calculate_relevance_table(
        X, y, ml_task=ml_task, n_jobs=n_jobs, chunksize=chunksize,
        test_for_binary_target_real_feature=test_for_binary_target_real_feature,
        fdr_level=fdr_level, hypotheses_independent=hypotheses_independent,
    )

    relevant_features = relevance_table[relevance_table.relevant].feature

    return X.loc[:, relevant_features]
def fit(self, X, y):
    """
    Extract the information on which of the features are relevant using the given target.

    For more information, please see the
    :func:`~tsfresh.feature_selection.feature_selector.check_fs_sig_bh` function.
    All columns in the input data sample are treated as features. The index of all
    rows in X must be present in y.

    :param X: data sample with the features, which will be classified as relevant or not
    :type X: pandas.DataFrame or numpy.array

    :param y: target vector to be used, to classify the features
    :type y: pandas.Series or numpy.array

    :return: the fitted estimator with the information, which features are relevant
    :rtype: FeatureSelector
    """
    if not isinstance(X, pd.DataFrame):
        X = pd.DataFrame(X.copy())

    if not isinstance(y, pd.Series):
        y = pd.Series(y.copy())

    relevance_table = calculate_relevance_table(
        X, y,
        ml_task=self.ml_task,
        multiclass=self.multiclass,
        n_significant=self.n_significant,
        n_jobs=self.n_jobs,
        chunksize=self.chunksize,
        fdr_level=self.fdr_level,
        hypotheses_independent=self.hypotheses_independent,
        test_for_binary_target_real_feature=self.test_for_binary_target_real_feature,
    )

    self.relevant_features = relevance_table.loc[relevance_table.relevant].feature.tolist()

    if self.multiclass:
        p_values_table = relevance_table.filter(regex="^p_value_*", axis=1)
        if self.multiclass_p_values == "all":
            self.p_values = p_values_table
            self.feature_importances_ = 1.0 - p_values_table
            self.feature_importances_.columns = (
                self.feature_importances_.columns.str.lstrip("p_value"))
            self.feature_importances_ = self.feature_importances_.add_prefix("importance_")
        elif self.multiclass_p_values == "min":
            self.p_values = p_values_table.min(axis=1).values
        elif self.multiclass_p_values == "max":
            self.p_values = p_values_table.max(axis=1).values
        elif self.multiclass_p_values == "avg":
            self.p_values = p_values_table.mean(axis=1).values

        if self.multiclass_p_values != "all":
            # raise p_values to the power of n_significant to increase importance
            # of features which are significant for more classes
            self.feature_importances_ = (
                1.0 - self.p_values ** relevance_table.n_significant.values)
    else:
        self.feature_importances_ = 1.0 - relevance_table.p_value.values
        self.p_values = relevance_table.p_value.values

    self.features = relevance_table.index.tolist()

    return self
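A quick numeric illustration (values chosen arbitrarily, not taken from the snippet) of why raising the aggregated p-value to the power of `n_significant` favors features that are significant for more classes:

# Arbitrary illustrative numbers for the non-"all" aggregation above.
p_value, n_significant = 0.1, 1
importance_one_class = 1.0 - p_value ** n_significant     # 0.9

p_value, n_significant = 0.1, 3
importance_three_classes = 1.0 - p_value ** n_significant  # 0.999

# Same aggregated p-value, but the feature significant for three classes
# ends up with an importance much closer to 1.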
def select_features(X, y,
                    test_for_binary_target_binary_feature=defaults.TEST_FOR_BINARY_TARGET_BINARY_FEATURE,
                    test_for_binary_target_real_feature=defaults.TEST_FOR_BINARY_TARGET_REAL_FEATURE,
                    test_for_real_target_binary_feature=defaults.TEST_FOR_REAL_TARGET_BINARY_FEATURE,
                    test_for_real_target_real_feature=defaults.TEST_FOR_REAL_TARGET_REAL_FEATURE,
                    fdr_level=defaults.FDR_LEVEL,
                    hypotheses_independent=defaults.HYPOTHESES_INDEPENDENT,
                    n_jobs=defaults.N_PROCESSES,
                    show_warnings=defaults.SHOW_WARNINGS,
                    chunksize=defaults.CHUNKSIZE,
                    ml_task='auto'):
    """
    Check the significance of all features (columns) of feature matrix X and return a possibly
    reduced feature matrix only containing relevant features.

    The feature matrix must be a pandas.DataFrame in the format:

    +-------+-----------+-----------+-----+-----------+
    | index | feature_1 | feature_2 | ... | feature_N |
    +=======+===========+===========+=====+===========+
    | A     | ...       | ...       | ... | ...       |
    +-------+-----------+-----------+-----+-----------+
    | B     | ...       | ...       | ... | ...       |
    +-------+-----------+-----------+-----+-----------+
    | ...   | ...       | ...       | ... | ...       |
    +-------+-----------+-----------+-----+-----------+

    Each column will be handled as a feature and tested for its significance to the target.

    The target vector must be a pandas.Series or numpy.array in the form

    +-------+--------+
    | index | target |
    +=======+========+
    | A     | ...    |
    +-------+--------+
    | B     | ...    |
    +-------+--------+
    | .     | ...    |
    +-------+--------+

    and must contain all ids that are in the feature matrix. If y is a numpy.array without an
    index, it is assumed that y has the same order and length as X and that the rows correspond
    to each other.

    Examples
    ========

    >>> from tsfresh.examples import load_robot_execution_failures
    >>> from tsfresh import extract_features, select_features
    >>> df, y = load_robot_execution_failures()
    >>> X_extracted = extract_features(df, column_id='id', column_sort='time')
    >>> X_selected = select_features(X_extracted, y)

    :param X: Feature matrix in the format mentioned before which will be reduced to only the
        relevant features. It can contain both binary and real-valued features at the same time.
    :type X: pandas.DataFrame

    :param y: Target vector which is needed to test which features are relevant. Can be binary
        or real-valued.
    :type y: pandas.Series or numpy.ndarray

    :param test_for_binary_target_binary_feature: Which test to be used for binary target,
        binary feature (currently unused)
    :type test_for_binary_target_binary_feature: str

    :param test_for_binary_target_real_feature: Which test to be used for binary target,
        real feature
    :type test_for_binary_target_real_feature: str

    :param test_for_real_target_binary_feature: Which test to be used for real target,
        binary feature (currently unused)
    :type test_for_real_target_binary_feature: str

    :param test_for_real_target_real_feature: Which test to be used for real target,
        real feature (currently unused)
    :type test_for_real_target_real_feature: str

    :param fdr_level: The FDR level that should be respected, this is the theoretical expected
        percentage of irrelevant features among all created features.
    :type fdr_level: float

    :param hypotheses_independent: Can the significance of the features be assumed to be
        independent? Normally, this should be set to False as the features are never
        independent (e.g. mean and median)
    :type hypotheses_independent: bool

    :param n_jobs: Number of processes to use during the p-value calculation
    :type n_jobs: int

    :param show_warnings: Show warnings during the p-value calculation (needed for debugging
        of calculators).
    :type show_warnings: bool

    :param chunksize: The size of one chunk that is submitted to the worker process for the
        parallelisation, where one chunk is defined as the data for one feature. If you set
        the chunksize to 10, then it means that one task is to filter 10 features. If it is
        set to None, depending on the distributor, heuristics are used to find the optimal
        chunksize. If you get out of memory exceptions, you can try it with the dask
        distributor and a smaller chunksize.
    :type chunksize: None or int

    :param ml_task: The intended machine learning task. Either `'classification'`,
        `'regression'` or `'auto'`. Defaults to `'auto'`, meaning the intended task is
        inferred from `y`. If `y` has a boolean, integer or object dtype, the task is assumed
        to be classification, else regression.
    :type ml_task: str

    :return: The same DataFrame as X, but possibly with a reduced number of columns ( = features).
    :rtype: pandas.DataFrame

    :raises: ``ValueError`` when the target vector does not fit to the feature matrix or
        `ml_task` is not one of `'auto'`, `'classification'` or `'regression'`.
    """
    assert isinstance(X, pd.DataFrame), "Please pass features in X as pandas.DataFrame."
    check_for_nans_in_columns(X)
    assert isinstance(y, (pd.Series, np.ndarray)), "The type of target vector y must be one of: " \
                                                   "pandas.Series, numpy.ndarray"
    assert len(y) > 1, "y must contain at least two samples."
    assert len(X) == len(y), "X and y must contain the same number of samples."
    assert len(set(y)) > 1, "Feature selection is only possible if more than 1 label/class is provided"

    if isinstance(y, pd.Series) and set(X.index) != set(y.index):
        raise ValueError("Index of X and y must be identical if provided")

    if isinstance(y, np.ndarray):
        y = pd.Series(y, index=X.index)

    relevance_table = calculate_relevance_table(
        X, y, ml_task=ml_task, n_jobs=n_jobs, show_warnings=show_warnings,
        chunksize=chunksize,
        test_for_binary_target_real_feature=test_for_binary_target_real_feature,
        fdr_level=fdr_level, hypotheses_independent=hypotheses_independent,
    )

    relevant_features = relevance_table[relevance_table.relevant].feature

    return X.loc[:, relevant_features]
def fit(self, train_files):
    window_data, shap_window_data = self.prep_data(train_files)

    # Extract clinical variables
    clin_features = []
    for file in train_files:
        names, values = self.read_clin_fn(file)
        clin_features.append([file] + values)
    clin_df = pd.DataFrame(clin_features, columns=['file'] + names)
    clin_df = self.proc_clin_fn(clin_df)

    # Extract features for each channel separately
    features_per_channel = []
    self.feature_extractors_per_channel = {}
    for ch in range(window_data.windows.shape[1]):
        self.feature_extractors_per_channel[ch] = []
        for feature_extractor in self.features:
            self.feature_extractors_per_channel[ch].append(feature_extractor())

        channel_features = []
        for f in self.feature_extractors_per_channel[ch]:
            features = f.fit_transform(window_data.windows[:, ch, :], window_data.labels)
            features = pd.DataFrame(
                features, columns=['{}_ch{}'.format(x, ch) for x in f.names_])
            channel_features.append(features)
        features_per_channel.append(pd.concat(channel_features, axis=1))

    short_features_per_channel = []
    self.short_feature_extractors_per_channel = {}
    for ch in range(window_data.windows.shape[1]):
        self.short_feature_extractors_per_channel[ch] = []
        for feature_extractor in self.short_features:
            self.short_feature_extractors_per_channel[ch].append(feature_extractor())

        channel_features = []
        for f in self.short_feature_extractors_per_channel[ch]:
            f.fit(shap_window_data.windows[:, ch, :], shap_window_data.labels)
            features = f.transform(window_data.windows[:, ch, :], window_data.labels)
            features = pd.DataFrame(
                features, columns=['{}_ch{}'.format(x, ch) for x in f.names_])
            channel_features.append(features)
        short_features_per_channel.append(pd.concat(channel_features, axis=1))

    features_multi_channel = []
    for f in self.multi_channel_features:
        features = f.fit_transform(window_data.windows, window_data.labels)
        # the original code reused the stale per-channel loop variable `ch`
        # in these column names; a channel-independent suffix is used instead
        features = pd.DataFrame(
            features, columns=['{}_multi'.format(x) for x in f.names_])
        features_multi_channel.append(features)

    # Concatenate the features of different channels together
    train_features = pd.concat(features_per_channel + short_features_per_channel
                               + features_multi_channel, axis=1)
    train_features['file'] = window_data.files
    train_features = train_features.merge(clin_df, on='file')

    # Create our X and y
    X_train = train_features
    y_train = np.array(window_data.labels)
    for col in ['ID', 'file']:
        if col in X_train.columns:
            X_train = X_train.drop(col, axis=1)
    X_train = X_train.astype(float)

    # useless_features = self.remove_features(X_train)
    # X_train = X_train.drop(useless_features, axis=1)

    # Now apply hypothesis testing on remaining features
    rel_table = calculate_relevance_table(X_train, pd.Series(y_train))
    self.rel_features = list(rel_table[rel_table['p_value'] <= 0.05].index)
    X_train = X_train[self.rel_features]

    # Create validation set for early stopping
    val_files = np.random.choice(train_files, size=int(0.1 * len(train_files)),
                                 replace=False)
    X_val = X_train.loc[np.isin(window_data.files, val_files), :]
    y_val = y_train[np.isin(window_data.files, val_files)]
    X_train = X_train.loc[~np.isin(window_data.files, val_files), :]
    y_train = y_train[~np.isin(window_data.files, val_files)]

    # Fit our gradient boosting classifier
    self.clf = CatBoostClassifier(
        iterations=10000,
        od_type='Iter',
        od_wait=50,
        objective='CrossEntropy',
        random_seed=2018,
        # eval_metric='AUC',
        use_best_model=True,
        task_type='CPU')
    self.clf.fit(X_train, y_train, eval_set=(X_val, y_val), verbose=100)

    return train_features
def test_multiclass_requires_classification(self, X, y_real):
    with pytest.raises(AssertionError):
        calculate_relevance_table(X, y_real, multiclass=True, ml_task="regression")
def test_multiclass_relevance_table_columns(self, X, y_binary):
    y = y_binary.copy()
    y[2] = 2  # add a third class so the problem becomes multiclass

    relevance_table = calculate_relevance_table(X, y, multiclass=True)
    assert len(relevance_table.columns) == 10
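For context, a sketch of where the count of 10 comes from, assuming the per-class column layout tsfresh uses for multiclass relevance tables; the exact names below are an assumption based on that layout, not asserted by the test itself:

# Assumed column layout for a 3-class multiclass relevance table:
expected_columns = [
    "feature", "type",                         # shared metadata
    "p_value_0", "p_value_1", "p_value_2",     # one p-value per class (one vs. rest)
    "relevant_0", "relevant_1", "relevant_2",  # one relevance flag per class
    "n_significant",                           # classes the feature is significant for
    "relevant",                                # overall decision across classes
]
assert len(expected_columns) == 10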
def test_restrict_ml_task_options(self, X, y_binary):
    with pytest.raises(ValueError):
        calculate_relevance_table(X, y_binary, ml_task='some_other_task')