def featureSelection(reduced_features, labels, clnd_features, percentile, n_components, results=False):
    """Select features with a univariate F-test, or report selection statistics.

    Parameters
    ----------
    reduced_features : list
        Feature names. The first entry is skipped (``reduced_features[1:]``);
        the remaining names are paired column by column with ``clnd_features``.
    labels : array-like
        Ground-truth labels for the data points.
    clnd_features : array-like
        Feature matrix whose rows correspond to ``labels``.
    percentile : int or float
        Percent of features to keep, forwarded to ``SelectPercentile``.
        This is a percentage (0-100), not a fraction.
    n_components
        ``n_components`` argument forwarded to ``PCA``.
    results : bool, optional
        When falsy (default) return the selected feature names. When truthy
        return the selection statistics instead.

    Returns
    -------
    list
        ``['poi', <selected names...>]`` when ``results`` is falsy.
    tuple
        ``(f_stat, p_vals, expl_var)`` when ``results`` is truthy: ``f_stat``
        is a list of ``(name, F)`` pairs sorted by descending F, ``p_vals`` a
        list of ``(name, p)`` pairs sorted by ascending p, and ``expl_var``
        the PCA ``explained_variance_ratio_``.
    """
    from itertools import compress

    from sklearn.decomposition import PCA
    from sklearn.feature_selection import SelectPercentile, f_classif

    feature_names = reduced_features[1:]

    if results:
        # One ANOVA pass yields both the F statistics and the p-values.
        f_scores, p_scores = f_classif(clnd_features, labels)
        f_stat = sorted(zip(feature_names, f_scores),
                        key=lambda pair: pair[1], reverse=True)
        p_vals = sorted(zip(feature_names, p_scores),
                        key=lambda pair: pair[1])
        # PCA is unsupervised; it is only needed for this report.
        pca = PCA(n_components=n_components)
        pca.fit(clnd_features)
        return f_stat, p_vals, pca.explained_variance_ratio_

    selector = SelectPercentile(f_classif, percentile=percentile)
    selector.fit(clnd_features, labels)
    # Boolean mask of retained columns -> names of the retained features.
    features_list = list(compress(feature_names, selector.get_support()))
    # Put 'poi' back in the first position of the final features list.
    features_list.insert(0, 'poi')
    return features_list
def test_f_classif_constant_feature():
    """f_classif must emit a UserWarning when a feature is constant."""
    data, target = make_classification(n_samples=10, n_features=5)
    # Overwrite the first column with a single value so it never varies.
    data[:, 0] = 2.0
    with pytest.warns(UserWarning):
        f_classif(data, target)
def test_f_classif():
    """Check that the F test yields meaningful results.

    Runs on a simple simulated classification problem and verifies that
    dense and sparse inputs produce the same statistics.
    """
    X, y = make_classification(
        n_samples=200,
        n_features=20,
        n_informative=3,
        n_redundant=2,
        n_repeated=0,
        n_classes=8,
        n_clusters_per_class=1,
        flip_y=0.0,
        class_sep=10,
        shuffle=False,
        random_state=0,
    )

    F, pv = f_classif(X, y)
    F_sparse, pv_sparse = f_classif(sparse.csr_matrix(X), y)

    # Plain asserts, matching the sibling f_classif tests.
    assert (F > 0).all()
    assert (pv > 0).all()
    assert (pv < 1).all()
    # With shuffle=False the first five columns are the non-noise features.
    assert (pv[:5] < 0.05).all()
    assert (pv[5:] > 1.0e-4).all()
    assert_array_almost_equal(F_sparse, F)
    assert_array_almost_equal(pv_sparse, pv)
def test_f_classif():
    """Verify f_classif on a simulated 8-class problem.

    The statistics must be positive, the p-values must lie strictly inside
    (0, 1), and sparse input must give the same answer as dense input.
    """
    dataset_options = dict(
        n_samples=200, n_features=20, n_informative=3, n_redundant=2,
        n_repeated=0, n_classes=8, n_clusters_per_class=1, flip_y=0.0,
        class_sep=10, shuffle=False, random_state=0,
    )
    X, y = make_classification(**dataset_options)

    F, pv = f_classif(X, y)
    F_sparse, pv_sparse = f_classif(sparse.csr_matrix(X), y)

    leading, trailing = pv[:5], pv[5:]
    assert (F > 0).all()
    assert ((pv > 0) & (pv < 1)).all()
    assert (leading < 0.05).all()
    assert (trailing > 1.e-4).all()
    assert_array_almost_equal(F_sparse, F)
    assert_array_almost_equal(pv_sparse, pv)
def SignificanceMatrix(data):
    """Build a symmetric matrix of pairwise p-values between columns.

    The test used for a pair depends on the column kinds reported by
    ``check_type``: ``f_regression`` when both are "continuous",
    ``chisq_independence`` when both share another kind, and ``f_classif``
    (continuous column as the feature) when the kinds differ. Rows with
    missing values in either column are dropped per pair. Diagonal entries
    are 1 and remaining empty cells are filled with the string "NAN".
    """
    names = data.columns
    kinds = [check_type(dtype) for dtype in data.dtypes]
    n_cols = len(names)
    relationMatrix = pd.DataFrame(index=names, columns=names)

    for i in range(n_cols):
        first = names[i]
        relationMatrix.loc[first, first] = 1
        for j in range(i + 1, n_cols):
            second = names[j]
            # Remember to add a warning where missing data is removed.
            pair = data[[first, second]].dropna(axis=0)
            col1 = pair[first]
            col2 = pair[second].ravel()

            if kinds[i] != kinds[j]:
                # Mixed kinds: the continuous column is the feature and the
                # other column is the class label.
                if kinds[i] == "continuous":
                    stats = feature_selection.f_classif(pd.DataFrame(col1), col2)
                else:
                    stats = feature_selection.f_classif(pd.DataFrame(col2), col1)
                pval = np.round(stats[1][0], 3)
            elif kinds[i] == "continuous":
                stats = feature_selection.f_regression(pd.DataFrame(col1), col2)
                pval = np.round(stats[1][0], 3)
            else:
                pval = chisq_independence(pair[first], pair[second])

            relationMatrix.loc[first, second] = pval
            relationMatrix.loc[second, first] = pval

    return relationMatrix.fillna("NAN")
def compute_fscore(data_set_df, user_info_df, label='gender', min_not_nan=-1):
    """Score every row of the filtered data set with a one-way ANOVA F-test.

    Parameters
    ----------
    data_set_df, user_info_df
        Forwarded to ``pc.get_filtered_x_y`` together with ``label``.
    label : str
        Name of the target passed to ``pc.get_filtered_x_y``.
    min_not_nan : int
        When negative, missing values in a row are replaced by the row mean
        before testing. Otherwise missing values are dropped and rows with
        fewer than ``min_not_nan`` remaining values get NaN.

    Returns
    -------
    DataFrame
        One ``importance`` column indexed like the filtered data, sorted in
        descending order with NaN last. Infinite F-scores are stored as NaN.
        Rows whose test raises ``ValueError`` keep their initial 0.
    """
    df_filtered, y_v = pc.get_filtered_x_y(data_set_df, user_info_df, label)

    feature_fs = DataFrame(np.zeros(len(df_filtered)), index=df_filtered.index,
                           columns=['importance'])

    i = 0  # counts only the rows that were processed without a ValueError
    for index, values in df_filtered.iterrows():
        try:
            if min_not_nan < 0:
                # Convert to an ndarray before adding the feature axis:
                # multi-dimensional indexing on a Series is not supported
                # by current pandas.
                column = values.fillna(values.mean()).values[:, np.newaxis]
                f_score, p_val = f_classif(column, y_v)
                feature_fs.loc[index] = f_score if f_score != np.inf and f_score != -np.inf else np.nan
            else:
                nan_removed = values.dropna()
                if len(nan_removed) < min_not_nan:
                    feature_fs.loc[index] = np.nan
                else:
                    column = nan_removed.values[:, np.newaxis]
                    f_score, p_val = f_classif(column, y_v[nan_removed.index.astype(int)])
                    feature_fs.loc[index] = f_score if f_score != np.inf and f_score != -np.inf else np.nan
            if i % 10000 == 0 and i > 0:
                # Single-argument print() behaves the same on Python 2 and 3.
                print("\t\t\t%s features are done" % i)
            i += 1
        except ValueError:
            # Best effort: skip rows the test cannot handle.
            continue

    feature_fs.sort_values('importance', ascending=False, inplace=True, na_position='last')
    return feature_fs
def perform_ANOVA(blind=False):
    """Print a CSV table of ANOVA F-values and p-values per attribute.

    Uses the module-level ``res`` data set when ``blind == False`` and
    ``b_res`` otherwise. Attribute names are read from ``res.attributes``
    starting at offset 3.
    """
    # NOTE(review): names always come from `res`, even in blind mode;
    # presumably both data sets share the same attribute list - confirm.
    dataset = res if blind == False else b_res  # noqa: E712 (exact comparison kept)
    fval, pval = f_classif(dataset.data, dataset.target)

    print('Attribute,f-value,p-value')
    for position, (f_score, p_score) in enumerate(zip(fval, pval), start=3):
        print(res.attributes[position] + ',' + str(f_score) + ',' + str(p_score))
def FStat(vals, labels):
    """Return the ANOVA F statistic(s) of ``vals`` against ``labels``.

    A one-dimensional input is treated as a single feature and a scalar
    F value is returned; otherwise one F value per column is returned.
    """
    samples = np.array(vals)
    if samples.ndim != 1:
        scores, _ = fs.f_classif(samples, labels)
        return scores
    # Single feature: f_classif needs a column, so add the feature axis.
    scores, _ = fs.f_classif(samples.reshape(-1, 1), labels)
    return scores[0]
def main():
    """Report the neurons whose activations best separate book genres.

    Parses the command-line options, builds the data frame, then ranks the
    'neur' columns by ANOVA F-score, first per genre (one vs rest) and then
    across all genres at once.
    """
    # parse params:
    cli_options = (
        ('--model_path', dict(default='./Skipthoughts-2018_01_13-13_11_19-13.726/model.pt', type=str)),
        ('--books_path', dict(default='/Users/mike/GitRepos/potter/data/other/books_txt_full', type=str)),
        ('--tokenize', dict(action='store_true')),
        ('--dict_path', dict(default='./Skipthoughts-2018_01_13-13_11_19-13.726/model.dict.pt', type=str)),
        ('--gpu', dict(action='store_true')),
        ('--max_sent_len', dict(default=25, type=int)),
        ('--min_sent_len', dict(default=5, type=int)),
        ('--sents_per_book', dict(default=100, type=int)),
        ('--books_per_genre', dict(default=20, type=int)),
        ('--nb_top_features', dict(default=5, type=int)),
    )
    parser = argparse.ArgumentParser()
    for flag, spec in cli_options:
        parser.add_argument(flag, **spec)
    args = parser.parse_args()

    # load model and dict
    #model = u.load_model(args.model_path)
    #vocab_dict = u.load_model(args.dict_path)
    data = make_dataframe(args, model=None, vocab=None)
    X = data.filter(regex='neur')

    def report_top(scores):
        # Print the nb_top_features columns with the highest F-score.
        best = np.argsort(scores)[::-1][:args.nb_top_features]
        for name, score in zip(np.array(X.columns)[best], scores[best]):
            print(f' {name} -> {score:.2f} F-score')

    print('=======\n single-neuron binary classification (one vs rest):')
    for genre in set(data['genre']):
        print(f' Testing genre {genre}:')
        is_genre = [int(flag) for flag in data.genre == genre]
        # univariate feature selection with F-test for feature scoring
        F, pval = f_classif(X, is_genre)
        report_top(F)

    # categorical case:
    print('=======\n single-neuron binary genre classification:')
    F, pval = f_classif(X, data['genre'])
    report_top(F)

    print('=======\n all-neuron genre classification (5-fold CV):')
def pvalue(path):
    '''Rank the extracted features by ANOVA p-value for arousal and valence.

    Reads ``<path>/mpe/mpe_features.csv`` and ``<path>/label.csv``, keeps the
    feature columns from index 515 onward, writes the selected feature names
    to ``data/s_gsr_rcmpe_name`` and prints the best-ranked and the
    significant (p < 0.05) features for both label sets.
    '''
    # read extracted features
    amigos_data = np.loadtxt(os.path.join(path, 'mpe', 'mpe_features.csv'),
                             delimiter=',')
    # amigos_data = amigos_data[:, :500]  # EEG
    # amigos_data = amigos_data[:, 500:515]  # ECG
    amigos_data = amigos_data[:, 515:]  # GSR

    # read labels
    a_labels, v_labels = read_labels(os.path.join(path, 'label.csv'))

    # calculate p-value
    _, a_pvalues = f_classif(amigos_data, a_labels)
    _, v_pvalues = f_classif(amigos_data, v_labels)
    a_ranking = np.argsort(a_pvalues)
    v_ranking = np.argsort(v_pvalues)

    # arousal: the 20 lowest p-values
    a_saved_name = [FEATURE_NAMES[idx] for idx in a_ranking[:20]]
    # valence: NOTE(review) the [:0] slice selects nothing - confirm intent.
    v_saved_name = [FEATURE_NAMES[idx] for idx in v_ranking[:0]]

    with open('data/s_gsr_rcmpe_name', 'w') as f:
        for name in a_saved_name + v_saved_name:
            f.write("{}\n".format(name))

    for title, ranking, pvals in (('Arousal', a_ranking, a_pvalues),
                                  ('Valence', v_ranking, v_pvalues)):
        print(title)
        for idx in ranking[:3]:
            print(FEATURE_NAMES[idx], pvals[idx])

    for header, pvals in (('\nUse Arousal Labels', a_pvalues),
                          ('\nUse Valence Labels', v_pvalues)):
        print(header)
        significant = np.where(pvals < 0.05)[0]
        print("Number of features (p < 0.05): {}".format(significant.size))
        for idx in significant:
            print("Features: {}, Value: {:.4f}".format(
                FEATURE_NAMES[idx], pvals[idx]))
def featureSelection(reduced_features,labels,clnd_features,percentile,n_components,results=False):
    """Run SelectPercentile(f_classif) feature selection, or report its metrics.

    Parameters:
        reduced_features = python list of feature names; the first entry is
            skipped and the rest are matched, in order, to the columns of
            clnd_features.
        labels = ground truth labels for the data points.
        clnd_features = data point features (array-like) corresponding to
            the labels.
        percentile = percent of features to keep, as expected by
            SelectPercentile: a value between 0 and 100, not a fraction.
        n_components = the n_components for the pca.
        results = falsy (default) returns the python list of selected
            features. Truthy returns the feature-selection metrics instead.

    Output:
        results falsy: ['poi'] followed by the names of the retained features.
        results truthy: (f_stat, p_vals, expl_var) where f_stat is a list of
            (name, F-statistic) sorted from highest to lowest, p_vals is a
            list of (name, p-value) sorted from lowest to highest, and
            expl_var is the explained variance ratio of the pca components.
    """
    from itertools import compress

    from sklearn.decomposition import PCA
    from sklearn.feature_selection import SelectPercentile, f_classif

    names = reduced_features[1:]

    if not results:
        selector = SelectPercentile(f_classif, percentile=percentile)
        selector.fit(clnd_features, labels)
        ## boolean index of the retained features
        retained_features = selector.get_support()
        ## names of the features to be used for training
        features_list = list(compress(names, retained_features))
        ## add back in the 'poi' to the first position in the final features list
        features_list.insert(0, 'poi')
        return features_list

    ## a single ANOVA pass provides both the F-statistics and the p-values
    f_values, p_values = f_classif(clnd_features, labels)
    f_stat = sorted(zip(names, f_values), key=lambda item: item[1], reverse=True)
    p_vals = sorted(zip(names, p_values), key=lambda item: item[1])

    ## the pca is unsupervised and only needed for this report
    pca = PCA(n_components=n_components)
    pca.fit(clnd_features)
    expl_var = pca.explained_variance_ratio_
    return f_stat, p_vals, expl_var
def plot_f_test(all_ds, ft_names="MRIQC", save_path=None):
    """Draw grouped horizontal bar charts of ANOVA statistics per data set.

    Two figures are produced: one with the F-values and one with
    ``1 - p-value``. ``all_ds`` maps a data set name to a dict holding the
    features under "x" and the target under "y". A "site" column, if
    present, is encoded with ``encode_vals`` on a copy. When ``save_path``
    is given each figure is saved there; the file name uses the name of the
    last data set iterated.
    """
    size = len(all_ds)

    def draw(pick, xlabel, title, file_pattern):
        # One figure: a bar group per feature, one bar per data set.
        offset = 0
        plt.figure(figsize=(20, 15))
        for ds_name, data in all_ds.items():
            x, y = data["x"], data["y"]
            if "site" in x.columns:
                x = data["x"].copy()
                x["site"] = encode_vals(x["site"])
            fval, pval = f_classif(x.values, y.values)
            values = pick(fval, pval)
            ticks = np.array(range(0, size * len(values) + len(values), size + 1))
            plt.barh(ticks + offset, values, height=.50, label=ds_name)
            plt.yticks(ticks + offset, x.columns, rotation="horizontal")
            plt.ylabel("Feature")
            plt.xlabel(xlabel)
            offset += 1.0
        plt.title(title.format(ft_names))
        plt.legend()
        if save_path:
            plt.savefig(opj(save_path, file_pattern.format(ds_name, ft_names)))

    draw(lambda fval, pval: fval,
         "F-values",
         "F-value of the {} features for various datasets",
         "fvals_{}_{}.png")
    draw(lambda fval, pval: 1 - pval,
         "1-P-value",
         "P-values of the {} features for various datasets",
         "pvals_{}_{}.png")
def fit(self, X, y):
    """Store in ``self.index`` the feature indices selected by ``self.type``.

    "f_value", "p_value" and "chi2" select the features whose score is below
    ``self.threshold``; "mutual_info" selects those above it. Any other type
    leaves ``self.index`` untouched. Returns ``self``.
    """
    # NOTE(review): the comparison direction differs between the statistic
    # based types and "mutual_info" - confirm this is intended.
    kind = self.type
    if kind == "f_value":
        selected = f_classif(X, y)[0] < self.threshold
    elif kind == "p_value":
        selected = f_classif(X, y)[1] < self.threshold
    elif kind == "mutual_info":
        selected = mutual_info_classif(X, y) > self.threshold
    elif kind == "chi2":
        selected = chi2(X, y)[0] < self.threshold
    else:
        return self
    self.index = np.where(selected)[0]
    return self
def return_2allele_Pval_df(assoc_test_type, two_clusterPercent_df, two_clst_df, X_allez, y1):
    """Fill a copy of ``two_clusterPercent_df`` with association p-values.

    Every (row label, column label) pair is split on "_" into a prefix and a
    suffix. Pairs with different prefixes are scored through
    ``allele_cooccurence_pValue``; pairs with the same prefix are scored
    with ``chi2`` or ``f_classif`` (chosen by ``assoc_test_type``) on the
    labelled column of ``X_allez`` against ``y1``.

    NOTE(review): the nesting below was reconstructed from
    whitespace-collapsed source - confirm it against the original file.

    Returns the filled copy; cells not matched by any branch keep the value
    copied from ``two_clusterPercent_df``.
    """
    two_clusterPval_df = two_clusterPercent_df.copy()
    for cl1, row in two_clusterPercent_df.iterrows():
        for cl2 in row.index:
            # print cl1, cl2
            # print len(cl1.split("_")), len(cl2.split("_")[1])
            if cl1.split("_")[0] != cl2.split("_")[0]:
                # Different prefixes: which p-value is read depends on which
                # of the two labels has an empty suffix.
                if len(cl1.split("_")[1]) == 0:
                    if len(cl2.split("_")[1]) > 0:
                        allele_pVal_dict = allele_cooccurence_pValue(
                            cl1, cl2, y1, X_allez, assoc_test_type)
                        two_clusterPval_df.loc[
                            cl1, cl2] = allele_pVal_dict[cl2]["pVal"]
                elif len(cl2.split("_")[1]) == 0:
                    if len(cl1.split("_")[1]) > 0:
                        allele_pVal_dict = allele_cooccurence_pValue(
                            cl1, cl2, y1, X_allez, assoc_test_type)
                        two_clusterPval_df.loc[
                            cl1, cl2] = allele_pVal_dict[cl1]["pVal"]
                else:
                    # Both suffixes are non-empty: use the co-occurrence
                    # p-value unless the pair is marked "-" in two_clst_df.
                    if two_clst_df.loc[cl1, cl2] != "-":
                        allele_pVal_dict = allele_cooccurence_pValue(
                            cl1, cl2, y1, X_allez, assoc_test_type)
                        two_clusterPval_df.loc[
                            cl1, cl2] = allele_pVal_dict["cooccurence"]["pVal"]
            elif cl1.split("_")[0] == cl2.split("_")[0] and len(
                    cl2.split("_")[1]) > 0:
                # Same prefix: the column is selected twice, presumably to
                # keep the input two-dimensional; only the first p-value is
                # stored.
                # NOTE(review): any other assoc_test_type leaves pVal_single
                # unassigned here.
                if assoc_test_type == "chi2":
                    test_single, pVal_single = chi2(X_allez[[cl2, cl2]], y1)
                elif assoc_test_type == "f_classif":
                    test_single, pVal_single = f_classif(
                        X_allez[[cl2, cl2]], y1)
                two_clusterPval_df.loc[cl1, cl2] = pVal_single[0]
            elif cl1.split("_")[0] == cl2.split("_")[0] and len(
                    cl1.split("_")[1]) > 0:
                if assoc_test_type == "chi2":
                    test_single, pVal_single = chi2(X_allez[[cl1, cl1]], y1)
                elif assoc_test_type == "f_classif":
                    test_single, pVal_single = f_classif(
                        X_allez[[cl1, cl1]], y1)
                two_clusterPval_df.loc[cl1, cl2] = pVal_single[0]
    return two_clusterPval_df
def train_test(x_train, x_test, y_train, y_test):
    """Select features, plot the selection, then train and score two models.

    Keeps the top 75 percent of the features with ``SelectPercentile``,
    plots the per-feature ANOVA p-values and the selection mask, then fits a
    decision tree and a random forest on the selected training features and
    prints their accuracy on the selected test features.
    """
    select = SelectPercentile(percentile=75)
    select.fit(x_train, y_train)
    x_train_selected = select.transform(x_train)

    _, p = f_classif(x_train, y_train)
    plt.figure()
    plt.plot(p, 'o')
    plt.xlabel('Counts of Features')
    # The plotted values are the p-values, so the axis is labelled as such.
    plt.ylabel('p-value')
    plt.title('The sample distribution of F-Test')
    plt.show()

    mask = select.get_support()
    plt.matshow(mask.reshape(1, -1), cmap='gray_r')
    plt.title('Feature selection percentage')
    plt.show()

    x_test_selected = select.transform(x_test)

    clf = tree.DecisionTreeClassifier()
    clf = clf.fit(x_train_selected, y_train)
    tree_pre = clf.predict(x_test_selected)
    tree.plot_tree(clf)
    plt.title('Tree model plot')
    plt.show()
    print("Decision Tree Classifier Accuracy - " + str(100 * accuracy_score(tree_pre, y_test)) + "%")

    ra = RandomForestClassifier(n_estimators=500, n_jobs=5, max_depth=50, random_state=0)
    ra.fit(x_train_selected, y_train)
    ra_pre = ra.predict(x_test_selected)
    print("Random Forest Accuracy - " + str(100 * accuracy_score(ra_pre, y_test)) + "%")
def main(file, oligosList):
    """Compute ANOVA F-scores and p-values for the requested oligos.

    Reads a headerless tab-separated table whose first column holds the row
    names and whose last row holds the class of each sample. The rows named
    in ``oligosList`` are extracted, transposed so each oligo becomes a
    feature, scored with ``f_classif`` and handed to ``outputFile``.
    """
    table = pd.DataFrame(pd.read_csv(file, sep='\t', header=None))
    table = table.dropna(axis=1, how='all')

    # The last row, minus the name column, carries the class labels.
    classifier = table.tail(1)
    del classifier[0]
    classifier = classifier.unstack()

    selected = pd.DataFrame([])
    for oligo in oligosList:
        for row_name in table[0]:
            if row_name == oligo:
                matching = table.loc[table[0] == oligo]
                selected = pd.concat([selected, matching])

    samples = selected.transpose()
    #Removing oligo names from dataset
    samples = samples.drop(samples.index[0])

    #For selecting a number of best features:
    #result = SelectKBest(f_classif, k="all").fit_transform(data, classifier)
    Fval, Pval = f_classif(samples, classifier)
    result = pd.DataFrame(data=[Fval, Pval],
                          index=['F-score', 'p-value'],
                          columns=oligosList)
    #print(result)
    outputFile(result, file, oligosList)
    closeFunc()
def select(self, dataframe: 'pd.DataFrame', y_column: str) -> list:
    '''
        Backward elimination of the least significant columns
    :param dataframe: pandas DataFrame
        Data Frame on which the algorithm is applied
    :param y_column: str
        The column name of the value that we want to predict
    :return: list
        The list of features that are selected by the algorithm as the best ones
    '''
    # Every column except the predicted one is a candidate feature
    candidates = [name for name in dataframe.columns if name != y_column]

    # Per-column history of F and p-values, one entry per iteration
    self.F_history = {name: [] for name in candidates}
    self.p_value_history = {name: [] for name in candidates}

    # 1 means the column is still selected, 0 means it was eliminated
    feature_state = list(np.ones(len(candidates)))

    while True:
        self.iter += 1

        # Extracting the currently selected columns
        active = self.bin_to_cols(feature_state, candidates)
        X = dataframe[active].values
        y = dataframe[y_column].values

        # Different test depending on classification or regression
        scorer = f_classif if self.classification else f_regression
        F_vals, p_vals = scorer(X, y)

        # Record this iteration; eliminated columns are logged as -1
        position = 0
        for name in candidates:
            if name in active:
                self.F_history[name].append(float(F_vals[position]))
                self.p_value_history[name].append(float(p_vals[position]))
                position += 1
            else:
                self.F_history[name].append(-1)
                self.p_value_history[name].append(-1)

        # Stop once the largest p-value is within the significance level
        max_PValue = max(p_vals)
        if not max_PValue > self.significance_level:
            break

        # Otherwise eliminate every column that reaches that largest p-value
        for j, name in enumerate(active):
            if p_vals[j].astype(float) == max_PValue:
                feature_state[candidates.index(name)] = 0

    # Returning the chosen columns
    self.choosed_cols = self.bin_to_cols(feature_state, candidates)
    return self.choosed_cols
def _SelectKBest(self, X, y):
    """Reduce ``X`` to the ``self.k_features`` best features by ANOVA F-test.

    The positions of the selected features are also written as an image to
    '/tmp/best.nii.gz', placed inside the mask loaded from
    ``self.mask_non_brain``. The fitted selector is stored in
    ``self.feature_reduction_method`` and the reduced ``X`` is returned.
    """
    print('Selecting K Best from whole image')
    from sklearn.feature_selection import SelectKBest, f_classif

    # Classical univariate feature selection based on the F-test (ANOVA);
    # the number of features kept is self.k_features.
    feature_selection = SelectKBest(f_classif, k=self.k_features)
    feature_selection.fit(X, y)

    # Rebuild the k-best mask from the raw scores, using a stable sort.
    scores = f_classif(X, y)[0]
    best_positions = np.argsort(scores, kind="mergesort")[-self.k_features:]
    mask_k_best = np.zeros(scores.shape, dtype=bool)
    mask_k_best[best_positions] = 1

    import nibabel
    mask_brain_img = nibabel.load(self.mask_non_brain).get_data()
    mask_brain = mask_brain_img.flatten().astype(bool)

    # Scatter the selection into the mask voxels, then restore the shape.
    roi = np.zeros(mask_brain.shape)
    roi[mask_brain] = mask_k_best
    roi = roi.reshape(mask_brain_img.shape)
    nibabel.Nifti1Image(roi, np.eye(4)).to_filename('/tmp/best.nii.gz')

    print('SelectKBest data reduction from: %s' % str(X.shape))
    X = feature_selection.transform(X)
    print('SelectKBest data reduction to: %s' % str(X.shape))

    self.feature_reduction_method = feature_selection
    return X
def test_randomized_logistic():
    """Check randomized sparse logistic regression.

    The feature ranking must agree with the ANOVA F-test ranking, fitting
    must not modify X, and a nested list of C values must be rejected.
    """
    iris = load_iris()
    two_classes = iris.target != 2
    X = iris.data[:, [0, 2]][two_classes]
    y = iris.target[two_classes]

    F, _ = f_classif(X, y)
    expected_order = np.argsort(F)

    scaling = 0.3

    def build(c_values):
        return RandomizedLogisticRegression(verbose=False, C=c_values,
                                            random_state=42, scaling=scaling,
                                            n_resampling=50, tol=1e-3)

    X_orig = X.copy()
    feature_scores = build(1.).fit(X, y).scores_
    assert_array_equal(X, X_orig)   # fit does not modify X
    assert_array_equal(expected_order, np.argsort(feature_scores))

    feature_scores = build([1., 0.5]).fit(X, y).scores_
    assert_array_equal(expected_order, np.argsort(feature_scores))

    clf = RandomizedLogisticRegression(verbose=False, C=[[1., 0.5]])
    assert_raises(ValueError, clf.fit, X, y)
def compute_scoring_func(self, func):
    """Score the features with the scoring function named by ``func``.

    'variance' is computed on all instances and returns ``(variance, None)``.
    'f_classif', 'mutual_info_classif' and 'chi2' are computed on the
    annotated instances and return a ``(scores, p_values)`` pair, with
    ``None`` as p-values for mutual information. Any other name triggers an
    assertion failure.
    """
    if func == 'variance':
        features = self.instances.features.get_values()
        annotations = self.instances.annotations.get_labels()
        if isinstance(features, spmatrix):
            return mean_variance_axis(features, axis=0)[1], None
        return features.var(axis=0), None

    features = self.annotated_instances.features.get_values()
    annotations = self.annotated_instances.annotations.get_supervision(
        self.multiclass)

    if func == 'f_classif':
        return f_classif(features, annotations)

    if func == 'mutual_info_classif':
        if isinstance(features, spmatrix):
            discrete_indexes = True
        else:
            # Indices of the binary features, or False when there are none.
            features_types = self.instances.features.info.types
            discrete_indexes = [
                position for position, kind in enumerate(features_types)
                if kind == FeatureType.binary
            ] or False
        scores = mutual_info_classif(features, annotations,
                                     discrete_features=discrete_indexes)
        return (scores, None)

    if func == 'chi2':
        return chi2(features, annotations)

    assert (False)
def generateStopWordsText(new_sample_set, sub_num):
    """Generate the stop-word file and return its file name.

    Parameters
    ----------
    new_sample_set : iterable of str
        Lines of the form ``"<label>\\t<text>"``.
    sub_num : int
        Number used in the name of the output file.

    Returns
    -------
    str
        Path of the pickle file. It holds a dict mapping every stop word to
        1; a word is a stop word when its ANOVA F-score is <= 1.0 or its
        p-value is > 0.05.
    """
    stopWord_file = "model\\stopWordDoc_%d.pkl" % sub_num

    y, texts = [], []
    for eachline in new_sample_set:
        label, string = eachline.strip('\n').split('\t', 1)
        y.append(label)
        texts.append(' '.join(jieba.cut(string)))

    # String -> feature vector
    Tfidf_vectorizer = TfidfVectorizer()  # build the tf-idf feature generator
    Tfidf_vectorizer.fit(texts)  # fit (should be saved when modelling)
    X = Tfidf_vectorizer.transform(texts)

    # vocabulary_ maps word -> column index; order the words by column.
    words_dict = Tfidf_vectorizer.vocabulary_
    words_list = [word for word, _ in
                  sorted(words_dict.items(), key=lambda item: item[1])]

    f_score, p_val = f_classif(X, y)  # ANOVA
    stopword = {
        word: 1
        for word, f_value, p_value in zip(words_list, f_score, p_val)
        if f_value <= 1.0 or p_value > 0.05
    }

    # The context manager closes the file even if pickling fails.
    with open(stopWord_file, 'wb') as f:
        pickle.dump(stopword, f)
    return stopWord_file
def get_relevance(feat, y_class, relevance_func='mutual_info'):
    """Score how relevant each feature column is to the class labels.

    Parameters
    ----------
    feat : array-like
        Feature matrix; one score is produced per column.
    y_class : array-like
        Class label of every row of ``feat``.
    relevance_func : str or callable
        'f_classif', 'chi2', 'mutual_info' or 'kruskal', or a callable
        invoked as ``relevance_func(column, y_class)`` for each column.

    Returns
    -------
    numpy.ndarray
        One relevance score per feature. For 'kruskal', features for which
        the test fails get NaN.

    Raises
    ------
    ValueError
        If ``relevance_func`` is a string that names no supported method.
    """
    feat = np.array(feat)

    if not isinstance(relevance_func, str):
        relevance = np.zeros(feat.shape[1])
        for i in range(feat.shape[1]):
            relevance[i] = relevance_func(feat[:, i], y_class)
        return relevance

    # Imports are branch-local so a method only needs its own dependency.
    if relevance_func == 'f_classif':
        from sklearn.feature_selection import f_classif
        relevance, _ = f_classif(feat, y_class)
    elif relevance_func == 'chi2':
        from sklearn.feature_selection import chi2
        relevance, _ = chi2(feat, y_class)
    elif relevance_func == 'mutual_info':
        from sklearn.feature_selection import mutual_info_classif
        relevance = mutual_info_classif(feat, y_class)
    elif relevance_func == 'kruskal':
        from scipy.stats import kruskal
        # An ndarray is required for the element-wise class masks below.
        labels = np.asarray(y_class)
        classes = np.unique(labels)
        relevance = np.zeros(feat.shape[1])
        for i, ft in enumerate(feat.T):
            try:
                relevance[i], _ = kruskal(*[ft[labels == cls] for cls in classes])
            except Exception:
                # Best effort: a feature the test cannot handle scores NaN.
                relevance[i] = np.nan
    else:
        raise ValueError(
            "Unknown relevance_func %r; expected 'f_classif', 'chi2', "
            "'mutual_info', 'kruskal' or a callable" % (relevance_func,))
    return relevance
def two_way(self):
    """Append pairwise products of the first ``self.sz`` columns as features.

    Does nothing unless ``self.tWay == True``. In fit mode
    (``self.fit == True``) every pair (i, j) with i < j is multiplied and
    the product is kept when its ANOVA F-score against ``self.target`` is at
    least 1; kept pairs are recorded in ``self.two_way_list``. Otherwise the
    recorded pairs are replayed and appended to ``self.numData`` in order.
    """
    # The flags are compared with `== True` exactly as before, so non-bool
    # values keep their existing meaning.
    if not self.tWay == True:  # noqa: E712
        return

    sz = self.sz
    if self.fit == True:  # noqa: E712
        for i in range(0, sz - 1):
            for j in range(i + 1, sz):
                newCol = np.multiply(self.numData[:, i], self.numData[:, j])
                newCol[newCol == -0] = 0
                # f_classif needs one plain 2-D float column; np.matrix is
                # not accepted as input by current scikit-learn.
                column = np.asarray(newCol, dtype=float).reshape(-1, 1)
                f, _ = f_classif(column, self.target)
                if f[0] >= 1:
                    self.two_way_list.append((i, j))
                    self.numData = np.column_stack((self.numData, newCol))
    else:
        for first, second in self.two_way_list:
            newCol = np.multiply(self.numData[:, first],
                                 self.numData[:, second])
            self.numData = np.column_stack((self.numData, newCol))
def independance_feature(self):
    """Write a symmetric matrix of pairwise ANOVA p-values between features.

    For every feature but the last, the feature is used as the class label
    and ``f_classif`` is run against the remaining columns of ``self.df``.
    Processed features are then dropped from ``self.df``, so each pair is
    tested once and ``self.df`` is consumed by the call. The symmetrised
    matrix is stored in ``self.res2`` and saved as CSV under
    ``self.path_save_model``.
    """
    names = self.table_of_truth.features_name
    n_features = len(names)
    res = np.zeros((n_features, n_features))

    for i, _ in enumerate(tqdm(names)):
        if i >= n_features - 1:
            continue
        feature_name_ici = str(names[i])
        X = self.df.drop(feature_name_ici, axis=1)
        y = self.df[feature_name_ici]

        _, raw_p_values = f_classif(X, y)
        p_values = pd.Series(raw_p_values, index=X.columns)
        p_values.sort_values(ascending=False, inplace=True)

        # Store each p-value in the column of the feature it belongs to.
        for other_name, p_value in zip(p_values.index, p_values.values):
            res[i, int(names.index(other_name))] = p_value
        del X, y

        if len(self.df.columns) > 1:
            self.df = self.df.drop(feature_name_ici, axis=1)

    # Only cells right of the diagonal were filled; mirror them.
    self.res2 = res + res.T
    df2 = pd.DataFrame(self.res2, index=names, columns=names)
    df2.to_csv(self.path_save_model + "COMPARASION INTRA FEATURES XI 2.csv")
def feature_removal(model,X,y,n_fold=3,maxiter=10,verbose=True,seed=6):
    """Greedily drop features while the cross-validated score improves.

    Each iteration proposes two candidates: the feature with the lowest mean
    model importance and the feature with the lowest ANOVA F-value. Both
    removals are scored with ``kf_score_impf``; the better one is applied
    when it beats the current score by at least 0.0005, otherwise the search
    stops.

    Parameters
    ----------
    model, X, y, n_fold, seed
        Forwarded to ``kf_score_impf``; ``X`` is a DataFrame of features.
    maxiter : int
        Maximum number of features to remove.
    verbose : bool
        Print one line per removed feature.

    Returns
    -------
    pandas.Index
        The columns that remain after the removals.
    """
    # initial benchmark
    scoretr, scorecv, impf = kf_score_impf(model,X,y,n_fold,mask=None,seed=seed)
    cvscore_to_beat = np.mean(scorecv)

    for _ in range(maxiter):
        # mean feature importance, lowest first
        importances = pd.Series(np.mean(impf,axis=0),X.columns).sort_values()
        # ANOVA F-values, lowest first (lowest F = weakest class separation)
        f_values = pd.Series(f_classif(X,y)[0],X.columns).sort_values()

        # score the removal of the weakest feature under each methodology
        impf_candidate, fval_candidate = importances.index[0], f_values.index[0]
        _, scorecv_impf, impf_impf = kf_score_impf(
            model,X.drop(impf_candidate,axis=1),y,n_fold,mask=None,seed=seed)
        _, scorecv_fval, impf_fval = kf_score_impf(
            model,X.drop(fval_candidate,axis=1),y,n_fold,mask=None,seed=seed)
        scorecv_impf, scorecv_fval = np.mean(scorecv_impf), np.mean(scorecv_fval)
        best_cvscore = max(scorecv_impf, scorecv_fval)

        if (best_cvscore - cvscore_to_beat) < 0.0005:
            break

        use_impf = best_cvscore == scorecv_impf
        candidate = impf_candidate if use_impf else fval_candidate
        if verbose:
            print("removing '%s' | previous %.4f | new %.4f | improvement %.4f" % (
                candidate, cvscore_to_beat, best_cvscore, best_cvscore - cvscore_to_beat))
        cvscore_to_beat = best_cvscore
        X = X.drop(candidate,axis=1)
        # Carry over the importances measured without the removed feature,
        # so the next iteration ranks the remaining columns.
        impf = impf_impf if use_impf else impf_fval

    return X.columns
def anova_feature_selection(features, targets, pval_threshold=0.05, debug=False):
    """Select, for each target column, the features with a significant F-test.

    Rows where the target is missing are ignored. Features whose F statistic
    is NaN or infinite are never selected. Returns a dict mapping each
    target name to the index of feature names whose p-value is below
    ``pval_threshold``.
    """
    selected_features = {}
    for label in targets.columns:
        ## keep only the rows where this target is known
        has_target = targets[label].notna().values
        y = targets.loc[has_target, label]
        X = features.loc[has_target, :].copy()

        ## ANOVA feature-selection
        f, pval = f_classif(X, y)
        ## a non-finite F statistic invalidates its p-value
        pval[~np.isfinite(f)] = np.nan

        ## select features with p-val < pval_threshold
        chosen = X.columns[pval < pval_threshold]
        selected_features[label] = chosen

        if debug:
            print('target=', label)
            print('features dimension=', X.shape)
            print('#selected features=', chosen.size)
            print('#' * 40)
    return selected_features
def test_randomized_logistic_sparse():
    """Randomized logistic regression scores dense and sparse data alike."""
    iris = load_iris()
    two_classes = iris.target != 2
    X = iris.data[:, [0, 2]][two_classes]
    y = iris.target[two_classes]

    # center here because sparse matrices are usually not centered
    # labels should not be centered
    X, _, _, _, _ = _preprocess_data(X, y, True, True)

    X_sp = sparse.csr_matrix(X)

    F, _ = f_classif(X, y)

    scaling = 0.3
    settings = dict(verbose=False, C=1., random_state=42, scaling=scaling,
                    n_resampling=50, tol=1e-3)
    feature_scores = RandomizedLogisticRegression(**settings).fit(X, y).scores_
    feature_scores_sp = RandomizedLogisticRegression(**settings).fit(X_sp, y).scores_
    assert_array_equal(feature_scores, feature_scores_sp)
def _binning(self, X, y=None):
    """Compute the windowed Fourier coefficients of ``X`` and bin them.

    When ``self.anova`` is set and labels are given, the coefficients are
    first reduced to those with the best ANOVA scores (constant coefficients
    are ignored) and ``self.support`` / ``self.dft_length`` are updated.
    The result of the information-gain or default binning is returned.
    """
    num_windows_per_inst = math.ceil(self.series_length / self.window_size)
    per_instance = [
        self._mcb_dft(X[i, :], num_windows_per_inst)
        for i in range(self.n_instances)
    ]
    dft = np.array(per_instance).reshape(
        len(X) * num_windows_per_inst, self.dft_length)

    has_labels = y is not None
    if has_labels:
        # one label per window
        y = np.repeat(y, num_windows_per_inst)

    if self.anova and has_labels:
        non_constant = np.flatnonzero(~np.isclose(dft.var(axis=0), 0))

        # select word-length many indices with the lowest p-value
        if self.word_length <= non_constant.size:
            _, p = f_classif(dft[:, non_constant], y)
            self.support = non_constant[np.argsort(p)][:self.word_length]

        # sort remaining indices
        self.support = np.sort(self.support)

        # keep only the selected Fourier coefficients
        dft = dft[:, self.support]
        highest = np.max(self.support) + 1
        self.dft_length = highest + highest % 2  # even

    if self.binning_method == "information-gain":
        return self._igb(dft, y)
    return self._mcb(dft)
def find_pval(feat, tar):
    """Return the ANOVA p-value of every feature in ``feat`` as a Series."""
    f_stats, p_vals = f_classif(feat, ravel(tar))
    records = [dict(f_stat=f_stat, p_val=p_val)
               for f_stat, p_val in zip(f_stats, p_vals)]
    return DataFrame(records)['p_val']
def filter_methods_classification(X, y, feat_names, rotation=False):
    """Plot normalised F-test and mutual-information scores side by side.

    Both score vectors are divided by their maximum. ``feat_names`` labels
    the bars; ``rotation=True`` turns the labels by 90 degrees.
    """
    angle = 90 if rotation else 0

    # do calculations
    f_test, _ = f_classif(X, y)
    f_test = f_test / np.max(f_test)

    mi = mutual_info_classif(X, y)
    mi = mi / np.max(mi)

    # do some plotting
    positions = range(X.shape[1])
    panels = ((f_test, '$F-test$ score'), (mi, 'Mutual information score'))

    plt.figure(figsize=(20, 4))
    for slot, (scores, title) in enumerate(panels, start=1):
        plt.subplot(1, 2, slot)
        plt.bar(positions, scores, align="center")
        plt.xticks(positions, feat_names, rotation=angle)
        plt.xlabel('features')
        plt.ylabel('Ranking')
        plt.title(title)
    plt.show()
def fScore(self):
    """Univariate feature scoring with the one-way ANOVA F-test.

    Runs ``f_classif`` on ``self.X`` against ``self.y``. The null hypothesis
    is that all classes of ``self.y`` share the same population mean for a
    feature; a higher F-value indicates a stronger association.

    See: https://en.wikipedia.org/wiki/F-test#One-way_ANOVA_example

    Returns
    -------
    f_score : list of floats
        F-value of every feature; NaN and infinite values are replaced by 0.
    fpval_score : list of floats
        p-value of every feature; NaN and infinite values are replaced by 1.
    """
    # Single-argument print() behaves the same on Python 2 and 3.
    print("Compute F-test stats..." + str(time.now()))
    f_values, p_values = f_classif(self.X, self.y)

    # Undefined statistics are mapped to "no association".
    f_score = [0 if np.isnan(s) or np.isinf(s) else s for s in f_values]
    fpval_score = [1 if np.isnan(s) or np.isinf(s) else s for s in p_values]

    return f_score, fpval_score
def fit(self, x, y):
    """Fit a pipeline with the given data x and labels y

    The steps are: fit MPCA and project ``x`` to vectors, optionally keep
    the ``n_features`` best features by ANOVA F-score, optionally tune the
    classifier by grid search, then fit the classifier.

    Args:
        x (array-like tensor): input data, shape (n_samples, I_1, I_2, ..., I_N)
        y (array-like): data labels, shape (n_samples, )

    Returns:
        self
    """
    # NOTE(review): the nesting of the feature-slicing line and of the
    # "svc" check was reconstructed from whitespace-collapsed source -
    # confirm against the original file.

    # fit mpca
    self.mpca.fit(x)
    self.mpca.set_params(**{"return_vector": True})
    x_transformed = self.mpca.transform(x)

    # feature selection
    if self.n_features is None:
        self.n_features = x_transformed.shape[1]
        self.feature_order = self.mpca.idx_order
    else:
        f_score, p_val = f_classif(x_transformed, y)
        # Highest F-score first.
        self.feature_order = (-1 * f_score).argsort()
        x_transformed = x_transformed[:, self.feature_order][:, : self.n_features]

    # fit classifier
    if self.auto_classifier_param:
        # Add the sample-size dependent C once; without this check the grid
        # grows by one duplicate on every call to fit.
        auto_c = 1 / x.shape[0]
        if auto_c not in self.grid_search.param_grid["C"]:
            self.grid_search.param_grid["C"].append(auto_c)
        self.grid_search.fit(x_transformed, y)
        self.clf = self.grid_search.best_estimator_
    if self.classifier == "svc":
        self.clf.set_params(**{"probability": True})

    self.clf.fit(x_transformed, y)
    return self
def test_randomized_logistic():
    """Check randomized sparse logistic regression"""
    iris = load_iris()
    # Two features, and only the samples of the first two classes.
    keep = iris.target != 2
    X = iris.data[:, [0, 2]][keep]
    y = iris.target[keep]

    F, _ = f_classif(X, y)
    scaling = 0.3

    # The feature ranking must match the F-test ranking both for a scalar C
    # and for a list of C values.
    for C in (1., [1., 0.5]):
        clf = RandomizedLogisticRegression(verbose=False, C=C,
                                           random_state=42, scaling=scaling,
                                           n_resampling=50, tol=1e-3)
        feature_scores = clf.fit(X, y).scores_
        assert_array_equal(np.argsort(F), np.argsort(feature_scores))
def test_f_classif_multi_class():
    """
    Test whether the F test yields meaningful results
    on a simple simulated classification problem
    """
    problem = dict(
        n_samples=200,
        n_features=20,
        n_informative=3,
        n_redundant=2,
        n_repeated=0,
        n_classes=8,
        n_clusters_per_class=1,
        flip_y=0.0,
        class_sep=10,
        shuffle=False,
        random_state=0,
    )
    X, Y = make_classification(**problem)

    f_stats, p_values = f_classif(X, Y)

    # Every statistic is positive and every p-value lies strictly in (0, 1).
    assert (f_stats > 0).all()
    assert (p_values > 0).all()
    assert (p_values < 1).all()
    # With shuffle=False the first five columns carry the signal.
    assert (p_values[:5] < 0.05).all()
    assert (p_values[5:] > 1.0e-5).all()
def feature_importance_anova(X, y, threshold=0.001, correcting_multiple_hypotesis=True,
                             method='fdr_bh', alpha=0.1, sort_by='pval'):
    '''
    Provide significance for features in a dataset with ANOVA, optionally
    corrected for multiple hypothesis testing.

    :X: List of dict with key as feature names and values as features
    :y: Labels
    :threshold: Low-variance threshold used to eliminate low-variance features
    :correcting_multiple_hypotesis: correct the p-values with multiple hypothesis testing
    :method: method of multiple hypothesis testing
    :alpha: alpha of multiple hypothesis testing
    :sort_by: column ('pval' or 'F') the output is sorted by, ascending
    :return: DataFrame with one row per feature: the per-label mean values
             plus the columns 'F' and 'pval'
    '''
    records = pd.DataFrame.from_records(X)
    frame = variance_threshold_on_df(records, threshold=threshold)

    f_stats, p_values = f_classif(frame.values, y)
    if correcting_multiple_hypotesis:
        # The second element of the multipletests result replaces the raw p-values.
        p_values = multipletests(p_values, alpha=alpha, method=method)[1]

    # Per-label feature means, transposed so that features become rows.
    frame['labels'] = y
    summary = frame.groupby('labels').mean().T
    summary['F'] = f_stats
    summary['pval'] = p_values
    return summary.sort_values(sort_by, ascending=True)
def feature_extraction(x, y):
    """Score features with several methods and pick 40 indices per method.

    For every scorer the feature indices are sorted by ascending score and
    the first 40 are kept.

    Parameters:
        x = feature matrix (samples x features).
        y = target labels.

    Returns:
        dict mapping the scorer name ('p-value', 'LogReg', 'Lasso', 'svc',
        'tree') to an array of at most 40 feature indices.

    NOTE(review): ascending order selects the most significant features for
    'p-value', but the smallest coefficients / importances for the other
    scorers -- confirm that this is what callers expect.
    """
    scores = {}

    # using p-value to evaluate features
    # f_classif returns (F statistics, p-values); the p-values are the
    # second element.
    scores['p-value'] = f_classif(x, y)[1]

    # using Logistic Regression to evaluate features
    scaleX = scale(x, copy=True)
    # The L1 penalty needs a solver that supports it; 'liblinear' is named
    # explicitly so the call does not depend on the library's default solver.
    clf = LogisticRegression(penalty='l1', solver='liblinear').fit(scaleX, y)
    scores['LogReg'] = clf.coef_[0]

    # using Lasso to evaluate features
    clf = Lasso(0.005).fit(scaleX, y)
    scores['Lasso'] = clf.coef_

    # using LinearSVC
    clf = LinearSVC(penalty='l1', dual=False).fit(scaleX, y)
    scores['svc'] = clf.coef_[0]

    # using ensemble tree (fitted on the unscaled features)
    clf = ExtraTreesClassifier().fit(x, y)
    scores['tree'] = clf.feature_importances_

    return {tittle: score.argsort()[0:40] for tittle, score in scores.items()}
def compute_scoring_func(self, func):
    """Compute per-feature scores with the scoring function named `func`.

    'variance' uses all instances; 'f_classif', 'mutual_info_classif' and
    'chi2' use the annotated instances only.

    Returns a pair: `(scores, None)` for 'variance' and
    'mutual_info_classif', and whatever `f_classif` / `chi2` return for the
    other two. Any other name trips an assertion.
    """
    if func == 'variance':
        all_values = self.instances.features.get_values()
        # The labels are fetched but not used by the variance computation.
        annotations = self.instances.annotations.get_labels()
        return all_values.var(axis=0), None

    values = self.annotated_instances.features.get_values()
    labels = self.annotated_instances.annotations.get_labels()

    if func == 'f_classif':
        return f_classif(values, labels)

    if func == 'mutual_info_classif':
        kinds = self.instances.features.info.types
        binary_positions = [pos for pos, kind in enumerate(kinds)
                            if kind == FeatureType.binary]
        # No binary feature at all: tell the scorer nothing is discrete.
        discrete = binary_positions if binary_positions else False
        scores = mutual_info_classif(values, labels,
                                     discrete_features=discrete)
        return (scores, None)

    if func == 'chi2':
        return chi2(values, labels)

    assert (False)
def thresholds():
    """Print a table of metric thresholds for each Jureczko project.

    For every project a logistic regression is fitted on all columns but the
    last (the last column is the target). A metric is listed when its VARL
    value at p0=0.05 is positive and its ANOVA p-value is below 0.05; the
    reported threshold is the VARL value at p0=0.1.

    Returns:
        None; the tables are printed.

    NOTE(review): the filter uses p0=0.05 while the reported threshold uses
    p0=0.1 -- confirm that the two different values are intentional.
    """
    for name in ['ant', 'ivy', 'jedit', 'lucene', 'poi']:
        print("##", name)
        train, test = explore(dir='../Data/Jureczko/', name=name)
        data_DF = csv2DF(train, toBin=True)
        feature_cols = data_DF.columns[:-1]
        # Metric names are the column labels without their first character.
        metrics = [col[1:] for col in data_DF[feature_cols]]

        ubr = LogisticRegression()
        X = data_DF[feature_cols].values
        y = data_DF[data_DF.columns[-1]].values
        ubr.fit(X, y)
        inter, coef, pVal = ubr.intercept_[0], ubr.coef_[0], f_classif(X, y)[1]

        table = texttable.Texttable()
        table.set_cols_align(["l", "l", "l"])
        table.set_cols_valign(["m", "m", "m"])
        table.set_cols_dtype(['t', 't', 't'])
        table_rows = [["Metric", "Threshold", "P-Value"]]
        for metric, weight, p_value in zip(metrics, coef, pVal):
            if VARL(weight, inter, p0=0.05) > 0 and p_value < 0.05:
                thresh = "%0.2f" % VARL(weight, inter, p0=0.1)
                table_rows.append([metric, thresh, "%0.3f" % p_value])
        table.add_rows(table_rows)
        print(table.draw())

    # The leftover debugger breakpoint was removed so the function can run
    # unattended.
    return None
def feature_selection(data, labels, n_feat):
    """Keep the `n_feat` features with the best ANOVA F-score.

    Returns:
        f = F statistic of every input feature.
        X_new = `data` reduced to the selected columns.
        features = integer indices of the selected columns.
    """
    f = f_classif(data, labels)[0]

    k_best = SelectKBest(f_classif, k=n_feat)
    X_new = k_best.fit(data, labels).transform(data)
    features = k_best.get_support(indices=True)
    return f, X_new, features
def infoGain(X, y):
    """Return the information gain and ANOVA p-value of every feature.

    `X` is converted to a numpy array first. The gains come from
    `featureInfoGain`, called once per column index; the p-values come from
    `f_classif` on the whole matrix.

    Returns:
        (gains, pValues): a list with one gain per column, and the p-values.
    """
    X = np.array(X)
    gains = [featureInfoGain(X, y, column) for column in range(X.shape[1])]
    pValues = f_classif(X, y)[1]
    return gains, pValues
def feature_selection(X, y):
    """Compute ANOVA p-values per feature and print the feature ranking.

    Undefined (NaN) p-values are replaced by 1, i.e. treated as not
    significant. The printed indices are ordered from the smallest p-value
    (most significant) to the largest.

    Returns:
        The array of p-values, one per column of `X`.
    """
    var_imp = f_classif(X, y)[1]
    var_imp[np.isnan(var_imp)] = 1
    # Ascending p-value: the most significant feature comes first.
    imp_feature_idx = var_imp.argsort()
    print('Important feature indices: %s'%(imp_feature_idx))
    return var_imp
def feature_selection(self, mode='F'):
    """Keep the top 80 percent of the training features.

    Parameters:
        mode = scoring function, case-insensitive: 'M' (mutual information),
               'F' (ANOVA F-test) or 'C' (chi-squared).

    Side effects:
        `self.train` is replaced by a DataFrame of the selected columns,
        `self.test` is restricted to the same columns and
        `self.fs_features` holds the selected names as a numpy array.

    Returns:
        (X_new, feas): the reduced training matrix, and a DataFrame with a
        single 'feature' column listing the selected names.

    Raises:
        ValueError if `mode` is not one of the supported letters.
    """
    score_funcs = {'M': mutual_info_classif, 'F': f_classif, 'C': chi2}
    key = mode.upper()
    if key not in score_funcs:
        raise ValueError("mode must be one of 'M', 'F' or 'C', got %r" % (mode,))

    # Parenthesised single-argument prints work on Python 2 and 3 alike.
    print('Feature Selection...')
    print('Start:' + datetime.datetime.now().strftime('%Y-%m-%d %H:%M:%S'))

    X = self.train.copy()
    y = self.train_label['label'].values.copy()
    test = self.test.copy()

    # The selector computes the scores itself, so no separate scoring pass
    # is needed.
    selector = SelectPercentile(score_funcs[key], percentile=80)
    X_new = selector.fit_transform(X, y)
    selected = selector.get_support()

    # Filter the column names with the boolean support mask.
    fs_features = np.array(
        [name for name, keep in zip(self.train.columns, selected) if keep])

    self.train = pd.DataFrame(X_new, columns=fs_features.tolist())
    self.test = test[fs_features]
    self.fs_features = fs_features

    feas = pd.DataFrame()
    feas['feature'] = fs_features
    print('End:' + datetime.datetime.now().strftime('%Y-%m-%d %H:%M:%S'))
    return X_new, feas
def feature_selection(holeID):
    """List the features of one hole whose ANOVA p-value is exactly zero.

    Reads 'dats/<holeID>_cleandata.csv', indexes it by 'DEPTH' and uses
    "LABELS is present" as the binary target. The remaining columns (without
    'LABELS' and 'LABELS_ROCK_TYPE') are mean-centred and divided by their
    range, columns that are entirely NaN afterwards are dropped, and the
    other missing values are imputed with the column mean.

    Returns:
        The matching feature names, in the column order of the CSV file.
        Each match is also printed together with its p-value.

    NOTE(review): `pval == 0` is an exact float comparison -- presumably it
    targets p-values that underflow to zero; confirm.
    NOTE(review): `sklearn.preprocessing.Imputer` is not available in recent
    scikit-learn releases -- confirm the pinned version.
    """
    import pandas as pd
    import numpy as np
    import sklearn.preprocessing as pre
    from sklearn.feature_selection import f_classif

    cleaned = pd.read_csv('dats/%s_cleandata.csv' % holeID)
    cleaned.set_index('DEPTH', inplace=True)
    target = np.logical_not(cleaned.LABELS.isnull())

    cols = cleaned.columns.tolist()
    cols.remove('LABELS')
    cols.remove('LABELS_ROCK_TYPE')
    cleaned = cleaned[cols]

    # normalise column by column
    cleaned = (cleaned - cleaned.mean()) / (cleaned.max() - cleaned.min())

    # Drop columns that contain nothing but NaN; a list comprehension keeps
    # the original column order (a set difference would not).
    cols = [col for col in cols if not cleaned[col].isnull().all()]

    X, y = cleaned[cols], target
    imputer = pre.Imputer(missing_values='NaN', strategy='mean')
    X = imputer.fit_transform(X)

    # chi2 is not an option here: X contains negative values after centring.
    _, pval = f_classif(X, y)

    useful_feat = []
    for feat, p in zip(cols, pval):
        if p == 0:
            print("%s %s" % (feat, p))
            useful_feat.append(feat)
    return useful_feat
def compute_f_value(x_data, y_data):
    """Print the ANOVA F statistics and p-values of `x_data` against `y_data`."""
    scores = f_classif(x_data, y_data)
    print(scores[0], scores[1])


# train_data, validation_data, test_data, basic_users_info = get_data()
# label_encoder = {}
# train_x, train_y = get_exclude_ndf_x(train_data, basic_users_info, label_encoder)
# remove_features_with_low_variance(train_x)
# compute_f_value(train_x, train_y)
def test_f_classif(self):
    """The ModelFrame f_classif accessor matches a direct f_classif call."""
    dataset = datasets.load_diabetes()
    frame = pdml.ModelFrame(dataset)

    result = frame.feature_selection.f_classif()
    expected = fs.f_classif(dataset.data, dataset.target)

    # Two arrays are expected back, each equal to its reference counterpart.
    self.assertEqual(len(result), 2)
    for position in (0, 1):
        tm.assert_numpy_array_equal(result[position], expected[position])
def evalANOVA(individual, targets, toolbox):
    """Fitness of an individual: one minus its ANOVA p-value against `targets`.

    Returns a one-element tuple. The score is 0.0 when the compiled
    individual contains a non-finite value or when the score is NaN.
    """
    # Transform the tree expression in a callable function
    values = toolbox.compile(expr=individual)
    if not all(np.isfinite(values)):
        return 0.0,

    # scikit-learn needs a 2-D column, i.e. [[0], [1.34], ...]
    column = values.reshape(-1, 1)
    p_value = f_classif(column, targets)[1][0]

    # 1 - p so that greater values are better.
    score = 1 - p_value
    return (0.0 if np.isnan(score) else score),
def myselect_significant_features(entity, features, pred_varname, random_varname):
    """Return the features whose ANOVA p-value against the target is <= 0.05.

    Parameters:
        entity = DataFrame holding both the feature columns and the target.
        features = names of the candidate feature columns.
        pred_varname = name of the target column.
        random_varname = not used by this function.

    Returns:
        List of feature names, ordered by ascending p-value. The full
        p-value series is also pretty-printed.
    """
    from sklearn import feature_selection
    import pandas as pd
    import pprint

    pp = pprint.PrettyPrinter(indent=4)

    # `SparseDataFrame` does not exist in every pandas release; only densify
    # when the class is available and `entity` is an instance of it.
    sparse_frame_type = getattr(pd, 'SparseDataFrame', None)
    if sparse_frame_type is not None and isinstance(entity, sparse_frame_type):
        X = entity[features].to_dense().values
        y = entity[pred_varname].to_dense().values
    else:
        X = entity[features].values
        y = entity[pred_varname].values
    _, feat_p_vals = feature_selection.f_classif(X, y)

    # `sort_values` is the replacement for the removed `Series.order`.
    features_series = pd.Series(data=feat_p_vals, index=features).sort_values()
    print(" feature p-values:")
    pp.pprint(features_series)

    return list(features_series[features_series <= 0.05].index)
def anova_filter(_df, features):
    """Select the features whose ANOVA p-value against `Vote` is below `alpha`.

    Parameters:
        _df = DataFrame containing the feature columns and a `Vote` column.
        features = names of the candidate feature columns.

    Returns:
        Set of the selected feature names; each selection is also printed.

    `alpha` is not defined in this function and is read from the enclosing
    module.
    """
    X = _df[features].values
    Y = _df.Vote.values
    p_values = f_classif(X, Y)[1]

    ret_val = set()
    # p_values holds one entry per column of X, in the order of `features`.
    for c, p_value in zip(features, p_values):
        if p_value < alpha:
            print(c + " selected by anova with p-value: " + str(p_value))
            ret_val.add(c)
    return ret_val
def recursiveRanking(X, y):
    """Rank features by repeatedly fitting an entropy decision tree.

    In every round the feature with the highest tree importance is recorded
    with the round number as its rank, and its column is zeroed so the next
    round has to pick something else. A feature keeps the rank of the first
    round in which it was picked.

    Returns:
        new = array with 1 / (rank + 1) for every picked feature and 0 for
              features that were never picked.
        pValues = ANOVA p-values of the original, unmodified features.
    """
    X = np.array(X)
    featuresNum = X.shape[1]
    clf = DecisionTreeClassifier(criterion='entropy')

    # Work on a separate copy: columns are blanked out below, while the
    # final ANOVA test must still see the untouched features.
    tmp = X.copy()
    rank_of = {}
    for i in range(featuresNum):
        clf.fit(tmp, y)
        importances = clf.tree_.compute_feature_importances()
        best = np.argmax(importances)
        if best not in rank_of:
            rank_of[best] = i
            tmp[:, best] = 0

    new = np.zeros(featuresNum)
    for f, r in rank_of.items():
        new[f] = 1.0 / (r + 1)

    fValues, pValues = f_classif(X, y)
    return new, pValues
def levene_f_test(self, data):
    """Run an ANOVA F-test on absolute deviations from the per-class mean.

    Every feature value is replaced by |value - mean of that feature within
    the sample's class| and the transformed features are passed to
    `f_classif` together with the class labels.

    Parameters:
        data = DataFrame with a 'cat' column holding the class labels. All
               columns except the last one are treated as features. The
               frame itself is not modified.

    Returns:
        The result of `f_classif` on the deviations.
    """
    feature_columns = data.columns[:-1]
    categories = data['cat']
    unique_categories = np.unique(categories)

    # Transform a copy so the caller's frame keeps its original values.
    deviations = data[feature_columns].copy()
    for feature in feature_columns:
        # Mean of this feature within each class.
        class_means = {}
        for category in unique_categories:
            values = data[categories == category][feature]
            class_means[category] = float(sum(values) / len(values))

        # |sample - mean of the sample's own class|, for the whole column.
        per_row_mean = np.array([class_means[c] for c in categories])
        deviations[feature] = np.abs(data[feature].values - per_row_mean)

    return f_classif(deviations, np.ravel(categories))
def top_error_terms(self, truths, preds, X, data):
    """Print, per predicted label, the features most associated with errors.

    For each class of `self.clf` the samples wrongly predicted as that class
    are marked, features are scored with `f_classif` against that marker
    (scaled by the mask from `self.get_pos_mask`), and the five
    highest-scoring features are printed, each with up to three erroneous
    samples in which the feature is positive.
    """
    print('\n\nERROR ANALYSIS:\n')
    for label in self.clf.classes_:
        print('\nincorrectly labeled %s' % label)

        # 1 where the sample was wrongly predicted as `label`, else 0.
        wrong_rows = [pos for pos, (truth, guess) in enumerate(zip(truths, preds))
                      if truth != guess and guess == label]
        iserror = np.zeros(len(truths))
        iserror[wrong_rows] = 1

        corrs, _ = f_classif(X, iserror)
        pos_mask, pos_counts, neg_counts = self.get_pos_mask(X, iserror)
        corrs *= pos_mask

        top_five = np.argsort(corrs)[::-1][:5]
        for fidx in top_five:
            print('\n\t%s (%d incorrect, %d correct)' % (self.vectorizer.features[fidx],
                                                          pos_counts[fidx],
                                                          neg_counts[fidx]))
            matches = [midx for midx in range(X.shape[0])
                       if X[midx, fidx] > 0 and iserror[midx] == 1]
            for m in matches[:3]:
                print('\t\t' + str(self.vectorizer.extract_features(data[m])))
def f_score(X, y):
    """
    ANOVA F-value feature scoring (the existing scikit-learn method for
    classification), where
    f_score = sum((ni/(c-1))*(mean_i - mean)^2)/((1/(n - c))*sum((ni-1)*std_i^2))

    Input
    -----
    X: {numpy array}, shape (n_samples, n_features)
        input data
    y : {numpy array},shape (n_samples,)
        input class labels

    Output
    ------
    F: {numpy array}, shape (n_features,)
        f-score for each feature
    """
    # f_classif returns (F, p-values); only the F statistics are wanted.
    return f_classif(X, y)[0]
def test_randomized_logistic():
    """Check randomized sparse logistic regression"""
    iris = load_iris()
    # Two features, and only the samples of the first two classes.
    keep = iris.target != 2
    X = iris.data[:, [0, 2]][keep]
    y = iris.target[keep]

    F, _ = f_classif(X, y)
    scaling = 0.3

    # The feature ranking must match the F-test ranking both for a scalar C
    # and for a list of C values.
    for C in (1., [1., 0.5]):
        clf = RandomizedLogisticRegression(verbose=False, C=C,
                                           random_state=42, scaling=scaling,
                                           n_resampling=50, tol=1e-3)
        feature_scores = clf.fit(X, y).scores_
        assert_equal(np.argsort(F), np.argsort(feature_scores))
def select_sized_features(feature_size,fearure_vecotr,feature_indecies,y,feature_selection_measure):
    """Return the ids of the `feature_size` highest-scoring features.

    The score is chi-squared or the ANOVA F statistic when
    `feature_selection_measure` is `SelectionMeasure.chi_2` or
    `SelectionMeasure.f`, and mutual information for any other value.
    `feature_indecies[i]` is the id reported for column `i`. Fewer ids are
    returned when fewer features are available.
    """
    if feature_selection_measure == SelectionMeasure.chi_2:
        scorer = chi2
    elif feature_selection_measure == SelectionMeasure.f:
        scorer = f_classif
    else:
        scorer = None

    if scorer is None:
        # Any other measure falls back to mutual information.
        feature_values = mutual_info_classif(fearure_vecotr, y)
    else:
        feature_values, p_value = scorer(fearure_vecotr, y)

    # id -> score; a repeated id keeps the score of its last column.
    score_by_id = {}
    for position, value in enumerate(feature_values):
        score_by_id[feature_indecies[position]] = value

    ranked = sorted(score_by_id.items(), key=lambda pair: pair[1], reverse=True)
    # max(..., 0) keeps a negative size equivalent to "select nothing".
    return [feature_id for feature_id, _ in ranked[:max(feature_size, 0)]]
def get_reliable_features(X, y):
    """
    Purpose: relate the model features to the labels, ordered from the
             highest to the lowest ANOVA F-score
    Inputs:  X: dataframe of feature values, one column per feature
             y: labels for each feature vector; the rows with y == 0 and
                y == 1 form the two groups of the effect-size computation
    Output:  a dataframe indexed by feature with the columns
             'f_score'  - F statistic,
             'pval'     - boolean, True when the p-value is below the
                          Bonferroni threshold 0.05 / n_features,
             'cohens_d' - |mean_0 - mean_1| / sqrt(var_0 + var_1),
             sorted by descending 'f_score'

    NOTE(review): 'pval' holds the significance flag, not the p-value
    itself, and the effect size divides by the square root of the SUM of
    the two variances -- confirm both are intended.
    """
    f_stats, p_values = f_classif(X, y)
    columns = X.columns

    # Significance flag with Bonferroni correction.
    bonferroni_alpha = 0.05 / len(columns)
    significant = pd.Series(data=p_values < bonferroni_alpha, index=columns)
    f_by_feature = pd.Series(data=f_stats, index=columns)

    # Rows of each group, looked up once instead of once per feature.
    group_zero = X.loc[y[y == 0].index]
    group_one = X.loc[y[y == 1].index]
    effect_sizes = []
    for feat in columns:
        spread = group_zero[feat].var() + group_one[feat].var()
        gap = group_zero[feat].mean() - group_one[feat].mean()
        effect_sizes.append(np.abs(gap / np.sqrt(spread)))

    df = pd.DataFrame()
    df['f_score'] = f_by_feature
    df['pval'] = significant
    df['cohens_d'] = pd.Series(data=effect_sizes, index=columns)
    return df.sort_values('f_score', ascending=False)
def select_bestwords(D, y, nmax = 100, is_classif=True):
    """
    Select the nmax words of D (list of dicts) best correlated with goal = y

    The dicts are vectorised with a DictVectorizer and every resulting word
    is scored with `f_classif` (or `f_regression` when `is_classif` is
    false). Only words with a p-value below 0.05 are kept, ranked by
    descending F-value.

    Returns the set of selected words; an empty set when the vectoriser
    raises ValueError.
    """
    y = np.asarray(y)
    v = DictVectorizer(sparse=True)
    try:
        X = v.fit_transform(D)
    except ValueError:
        logger.warning("===Except*** in select_bestwords D:%d y:%d",len(D),len(y))
        return set()

    scorer = f_classif if is_classif else f_regression
    f_values, p_values = scorer(X, y)

    # `feature_names_` is the fitted attribute behind the accessor method
    # and is available regardless of the scikit-learn release.
    names = v.feature_names_

    # (F-value, p-value, word), significant entries only, best first
    a = sorted((entry for entry in zip(f_values, p_values, names)
                if entry[1] < 0.05),
               reverse=True)
    logger.debug("select_bestwords:%s",a[:16])
    return set(word for _, _, word in a[:nmax])