def interactive_pipeline(X, Y, pca_n_components, random_forest_n):
    # remove columns with missing values
    X.dropna(axis=1, inplace=True)
    # standardize X
    X = pd.DataFrame(preprocessing.MinMaxScaler().fit_transform(X))
    # cut off by variance (fit, then keep the supported columns so X stays a
    # DataFrame; calling fit_transform alone would not modify X)
    variance_threshold = 0.03
    variance_cutoff = VarianceThreshold(threshold=variance_threshold)
    variance_cutoff.fit(X)
    X = X.loc[:, variance_cutoff.get_support()]
    # cut off highly correlated features (to_drop holds column names)
    corr_matrix = X.corr().abs()
    upper = corr_matrix.where(np.triu(np.ones(corr_matrix.shape), k=1).astype(bool))
    to_drop = [column for column in upper.columns if any(upper[column] > 0.7)]
    X.drop(columns=to_drop, inplace=True)
    # random forest
    k_best_features = random_forest_n
    feature_importance = random_forest_selection.get_feature_importance(X, Y)
    processed_dataframe, X = random_forest_selection.get_k_most_important_features(
        X, Y, k_best_features, feature_importance)
    # PCA
    pca = PCA_Obj(X)
    X = pca.create_pca(pca_n_components)
    print("X.shape", X.shape)
    return X, Y

# feature_selection_pipeline_from_file()
def feature_selection(features, ideal_num=None):
    from sklearn.feature_selection import VarianceThreshold
    copy = np.copy(features)
    selected = []
    for i in range(8):
        sel = VarianceThreshold(threshold=(.8 * (1 - .8)))
        # keep the reduced block; the original discarded fit_transform's
        # result and returned the input unchanged
        selected.append(sel.fit_transform(copy[i]))
    return selected
def recursive_feature_selection(info_humans, info_bots, params, scale=False):
    X, y, features, scaler = get_Xy(info_humans, info_bots, scale=scale)

    print("first feature selection by variance test")
    skb = VarianceThreshold(threshold=(.8 * (1 - .8)))
    X_new = skb.fit_transform(X)
    features_1 = features[skb.get_support()]

    print("second feature selection by chi2 test")
    skb = SelectKBest(chi2, k=200)
    # skb = SelectFpr(chi2, alpha=0.005)
    X_new = skb.fit_transform(X_new, y)
    features_2 = features_1[skb.get_support()]

    # skb = PCA(n_components=250)
    # X_new = skb.fit_transform(X_new, y)

    print("third feature selection by recursive feature elimination (RFECV)")
    clf = LogisticRegression(penalty=params['penalty'], C=params['C'])
    # clf = SVC(kernel="linear")
    rfecv = RFECV(estimator=clf, step=1,
                  cv=cross_validation.StratifiedKFold(y, 5),
                  scoring='roc_auc', verbose=1)
    rfecv.fit(X_new, y)
    print("Optimal number of features : %d" % rfecv.n_features_)

    return skb, rfecv
def feature_selection_pipeline_from_file():
    # get data
    dataset = refactor_labels(get_data(path, 'Sheet1'), group_column)

    # all the visualizations
    auto_visualize_features(dataset.drop(subject_number_column, axis=1))

    # remove columns with missing values
    non_missing_values_threshold = len(dataset.index) * 0.99
    dataset.dropna(axis=1, thresh=non_missing_values_threshold, inplace=True)

    # impute missing values
    dataset.fillna(dataset.mean(), inplace=True)

    # set X
    X = dataset.drop([group_column, subject_number_column], axis=1)
    sbj = dataset[subject_number_column]
    Y = dataset[group_column]
    names = list(X)

    # standardize X
    X = pd.DataFrame(preprocessing.MinMaxScaler().fit_transform(X))
    X.columns = names
    print("p0", X.shape)

    # cut off by variance (keep the supported columns; fit_transform alone
    # would not modify X)
    variance_threshold = 0.05
    variance_cutoff = VarianceThreshold(threshold=variance_threshold)
    variance_cutoff.fit(X)
    X = X.loc[:, variance_cutoff.get_support()]
    print("p1", X.shape)

    # cut off highly correlated features
    corr_matrix = X.corr().abs()
    upper = corr_matrix.where(np.triu(np.ones(corr_matrix.shape), k=1).astype(bool))
    to_drop = [column for column in upper.columns if any(upper[column] > 0.7)]
    X.drop(to_drop, axis=1, inplace=True)
    print("p2", X.shape)

    # random forest
    k_best_features = 42
    feature_importance = random_forest_selection.get_feature_importance(X, Y)
    random_forest_selection.save_feature_importance(
        feature_importance_txt_path, feature_importance, list(X))
    processed_dataframe, X = random_forest_selection.get_k_most_important_features(
        X, Y, k_best_features, feature_importance)
    print("p3", processed_dataframe.shape)
    processed_dataframe.to_csv(processed_dataframe_path)

    # PCA
    pca = PCA_Obj(X)
    pca.explained_variance_graph(pca_explained_variance_graph_path)
    pca.print_components()
    n_components = 12
    X = pca.create_pca(n_components)
    pca.save_pca_data(features_after_pca, Y=Y)
    print("p4", X.shape)
def varianceSelection(self, df, threshold=.8):
    if not isinstance(df, pandas.core.frame.DataFrame):
        logger.error('[%s] : [ERROR] Variance selection only possible on Dataframe not %s',
                     datetime.fromtimestamp(time.time()).strftime('%Y-%m-%d %H:%M:%S'), type(df))
        sys.exit(1)
    sel = VarianceThreshold(threshold=(threshold * (1 - threshold)))
    sel.fit(df)
    return df[[c for (s, c) in zip(sel.get_support(), df.columns.values) if s]]
def print_variance(self, path=None):
    df = self.dataset.dropna(axis=1)
    df = df.drop('group', axis=1)
    scaler = preprocessing.MinMaxScaler()
    data = scaler.fit_transform(df)
    selector = VarianceThreshold()
    selector.fit(data)
    result = sorted(zip(list(df), selector.variances_), key=lambda x: x[1])
    print("Variance")
    print(*result, sep="\n")
    if path:
        with open(os.path.join(path, "variance.txt"), "w") as f:
            for i in result:
                f.write(str(i) + "\n")
def test_same_transform_with_threshold(self):
    local = VarianceThreshold(.03)
    dist = SparkVarianceThreshold(.03)

    X, X_rdd = self.generate_dataset()
    result_local = local.fit_transform(X)
    result_dist = np.vstack(dist.fit_transform(X_rdd).collect())
    assert_array_almost_equal(result_local, result_dist)

    X, X_rdd = self.generate_sparse_dataset()
    result_local = local.fit_transform(X)
    result_dist = sp.vstack(dist.fit_transform(X_rdd).collect())
    assert_array_almost_equal(result_local.toarray(), result_dist.toarray())
def feature_selection_pipeline_from_file():
    # get data
    dataset = refactor_labels(get_data(path, 'Sheet1'), group_column)

    # all the visualizations
    # auto_visualize_features(dataset.drop([subject_number_column], axis=1))

    # remove columns with missing values
    non_missing_values_threshold = len(dataset.index) * 0.99
    dataset.dropna(axis=1, thresh=non_missing_values_threshold, inplace=True)

    # impute missing values
    dataset.fillna(dataset.mean(), inplace=True)

    # set X
    X = dataset.drop([group_column, subject_number_column], axis=1)
    sbj = dataset[subject_number_column]
    Y = dataset[group_column]

    # standardize X
    X = pd.DataFrame(preprocessing.MinMaxScaler().fit_transform(X))

    # cut off by variance (keep the supported columns; fit_transform alone
    # would not modify X)
    variance_threshold = 0.03
    variance_cutoff = VarianceThreshold(threshold=variance_threshold)
    variance_cutoff.fit(X)
    X = X.loc[:, variance_cutoff.get_support()]
    print("p1", X.shape)

    # cut off highly correlated features (to_drop holds column names)
    corr_matrix = X.corr().abs()
    upper = corr_matrix.where(np.triu(np.ones(corr_matrix.shape), k=1).astype(bool))
    to_drop = [column for column in upper.columns if any(upper[column] > 0.8)]
    X.drop(columns=to_drop, inplace=True)
    print("p2", X.shape)

    # save new df
    processed_dataframe = pd.concat([X, Y, sbj], axis=1)
    processed_dataframe.to_csv(processed_dataframe_path)

    # random forest
    if random_forest:
        k_best_features = 31
        feature_importance = random_forest_selection.get_feature_importance(X, Y)
        random_forest_selection.save_feature_importance(
            feature_importance_txt_path, feature_importance)
        processed_dataframe, X = random_forest_selection.get_k_most_important_features(
            X, Y, k_best_features, feature_importance)
        processed_dataframe.to_csv(processed_dataframe_path)
        print("p4", processed_dataframe.shape)
def vectorize_EX(self, columns, variance_thresh=0, train_only=False):
    print('Start vectorizing')
    start_time = time.time()
    hasher = CountVectorizer(binary=True, tokenizer=LemmaTokenizer(), stop_words='english')

    train_dtm = hasher.fit_transform(
        self.ga_bm_train[columns].apply(lambda x: ','.join(x), axis=1))
    print(hasher.get_feature_names())
    print('dtm train shape: ', train_dtm.shape)

    selector = VarianceThreshold(variance_thresh)
    train_dtm = selector.fit_transform(train_dtm)
    print('dtm train shape after variance thresh: ', train_dtm.shape)

    if not train_only:
        test_dtm = hasher.transform(
            self.ga_bm_test[columns].apply(lambda x: ','.join(x), axis=1))
        print('dtm test shape: ', test_dtm.shape)
        test_dtm = selector.transform(test_dtm)
        print('dtm test shape after variance thresh: ', test_dtm.shape)

    print("Time: ", round(((time.time() - start_time) / 60), 2))
    print('Complete vectorizing')

    if train_only:
        return train_dtm
    else:
        return (train_dtm, test_dtm)
def variance_cutoff(X, cutoff=0.8):
    """Set variance cutoff for variables."""
    sel = VarianceThreshold(threshold=(cutoff * (1 - cutoff)))
    X = sel.fit_transform(X)
    return X
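# The cutoff * (1 - cutoff) expression above is the variance of a Bernoulli
# (boolean) feature in which a fraction `cutoff` of the samples share the same
# value, so the threshold drops near-constant boolean columns. A minimal
# sketch of that arithmetic on toy data (assumed here purely for illustration):
import numpy as np
from sklearn.feature_selection import VarianceThreshold

# column 0 is 90% zeros (var 0.09), column 1 is 50/50 (var 0.25)
X = np.array([[0, 0], [0, 1], [0, 0], [0, 1], [0, 1],
              [0, 0], [0, 1], [0, 0], [0, 1], [1, 0]])
sel = VarianceThreshold(threshold=0.8 * (1 - 0.8))  # 0.16
print(sel.fit_transform(X).shape)  # (10, 1): only the 50/50 column survives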
def main():
    parser = argparse.ArgumentParser(description='Normalize the feature values')
    required = parser.add_argument_group('required options')
    required.add_argument('-x', '--outlist', required=True, help='File containing feature values')
    required.add_argument('-y', '--execlist', required=True, help='File containing exec list')
    args = parser.parse_args()

    # X = np.loadtxt(args.outlist, skiprows=1)
    np.set_printoptions(precision=2)
    X = np.genfromtxt(args.outlist, skip_header=1)  # genfromtxt uses skip_header, not skiprows
    X = np.nan_to_num(X)
    Y = np.loadtxt(args.execlist, ndmin=2)
    # f = open("trainlist", "wb")
    # newResult = X/Y

    sel = VarianceThreshold(threshold=(.8 * (1 - .8)))
    result1 = sel.fit_transform(X)
    newResult = result1 / Y
    # result2 = sel.fit_transform(newResult)

    # feature collection for test programs
    if os.path.isfile('eventlist'):
        features = np.genfromtxt('eventlist', dtype='str')
        featureFromVariance = sel.get_support(indices=True)
        with open("variancefeatures.txt", "w") as text_file:
            for i in featureFromVariance:
                text_file.write(features[i])
                text_file.write("\n")

    np.savetxt('normfeaturelist', newResult, fmt='%.2f', delimiter='\t')
def main():
    args = getOptions()
    print(args)
    print("train file read")
    train_x, train_y = readfile_noid(args.train, 'train', 4)
    train_x_new, id = extractID(train_x)
    del train_x
    train_x_clean, contentdict = cityclean(train_x_new)
    del id, train_x_new

    # remove features with no distinction and less importance
    print("remove features with no distinction and less importance")
    sel = VarianceThreshold()
    train_x_uniq = sel.fit_transform(train_x_clean)
    del train_x_clean

    # normalization
    print("normalization")
    train_x_nor, mean, std = normalize(train_x_uniq)
    del train_x_uniq

    # feature selection and modeling
    print("feature selection and modeling")
    exclusivefs(train_x_nor, train_y)
def doFeatureSelection(self, features, target, k):
    features_int = np.array(features, dtype=float)
    target_int = np.array(target, dtype=float)
    sel = VarianceThreshold(threshold=(.8 * (1 - .8)))
    features_new = sel.fit_transform(features_int)
    # features_new = SelectKBest(chi2, k=10).fit_transform(features_int, target_int)
    return features_new
def main():
    args = getOptions()
    print(args)
    print("train file read")
    train_x, train_y = readfile_noid(args.train, 'train')
    train_x_new, id = extractID(train_x)
    del id
    print("test file read")
    test_x, test_y = readfile_noid(args.test, 'test')
    test_x_new, id = extractID(test_x)

    # remove features with no distinction and less importance
    print("remove features with no distinction and less importance")
    sel = VarianceThreshold()
    train_x_uniq = sel.fit_transform(train_x_new)
    test_x_uniq = sel.transform(test_x_new)

    # normalization
    print("normalization")
    train_x_nor, mean, std = normalize(train_x_uniq)
    test_x_nor, mean, std = normalize(test_x_uniq, mean, std)

    # feature selection
    print("feature selection")
    # Create the RFE object and compute a cross-validated score.
    svc = SVC(kernel="linear")
    # The "accuracy" scoring is proportional to the number of correct
    # classifications
    rfecv = RFECV(estimator=svc, step=1, cv=StratifiedKFold(train_y, 10),
                  scoring='accuracy')
    rfecv.fit(train_x_nor, train_y)
    print("Optimal number of features : %d" % rfecv.n_features_)
def main():
    args = getOptions()
    print(args)
    fn = "destreeSub.csv"
    print(fn)
    print("train file read")
    train_x, train_y = readfile_noid(args.train, 'train')
    train_x_new, id = extractID(train_x)
    del id
    print("test file read")
    test_x, test_y = readfile_noid(args.test, 'test')
    test_x_new, id = extractID(test_x)

    # remove features with no distinction and less importance
    print("remove features with no distinction and less importance")
    sel = VarianceThreshold()
    train_x_uniq = sel.fit_transform(train_x_new)
    test_x_uniq = sel.transform(test_x_new)
    # indices = [i for i in range(len(train_x[0]))]
    # frqIndex = trimfrq(train_x)
    # for i in frqIndex:
    #     indices.remove(i)
    # train_x_uniq = indexTodata(train_x, indices)
    # test_x_uniq = indexTodata(test_x, indices)

    # normalization
    print("normalization")
    train_x_nor, mean, std = normalize(train_x_uniq)
    test_x_nor, mean, std = normalize(test_x_uniq, mean, std)

    # feature selection
    print("feature selection")
    if args.fts == 'cor':
        train_x_sel, test_x_sel = correlationSelect(train_x_nor, train_y, test_x_nor)
    elif args.fts == 'extraTrees':
        train_x_sel, test_x_sel = ExtraTreesSelect(train_x_nor, train_y, test_x_nor)
    else:
        train_x_sel = copy.deepcopy(train_x_nor)
        test_x_sel = copy.deepcopy(test_x_nor)
    del train_x_nor, test_x_nor, train_x_uniq, test_x_uniq

    # modeling
    print("modeling")
    clf = ExtraTreesClassifier(max_depth=3, n_estimators=10, random_state=0,
                               class_weight='auto')
    clf.fit(train_x_sel, train_y)
    train_pdt = clf.predict(train_x_sel)
    MCC, Acc_p, Acc_n, Acc_all = get_Accs(train_y, train_pdt)
    print("MCC, Acc_p , Acc_n, Acc_all(train): ")
    print("%s,%s,%s,%s" % (str(MCC), str(Acc_p), str(Acc_n), str(Acc_all)))
    test_pdt = clf.predict_proba(test_x_sel)
    # MCC, Acc_p, Acc_n, Acc_all = get_Accs(test_y, test_pdt)
    # print("MCC, Acc_p , Acc_n, Acc_all(test): ")
    # print("%s,%s,%s,%s" % (str(MCC), str(Acc_p), str(Acc_n), str(Acc_all)))

    fout = open(fn, 'w')
    fout.write("ID,target\n")
    for index, eachline in enumerate(test_pdt):
        fout.write("%s,%s\n" % (str(int(id[index])), str(test_pdt[index][1])))
    fout.close()
def main():
    args = getOptions()
    fn = ("submission_cor_%s_%s_%s.csv" % (str(args.lrate).replace('.', 'dian'),
                                           str(args.nest), str(args.maxdepth)))
    print(fn)
    print("train file read")
    train_x, train_y = readfile_noid(args.train, 'train')
    train_x_new, id = extractID(train_x)
    del id
    print("test file read")
    test_x, test_y = readfile_noid(args.test, 'test')
    test_x_new, id = extractID(test_x)

    # remove features with no distinction and less importance
    print("remove features with no distinction and less importance")
    sel = VarianceThreshold()
    train_x_uniq = sel.fit_transform(train_x_new)
    test_x_uniq = sel.transform(test_x_new)
    # indices = [i for i in range(len(train_x[0]))]
    # frqIndex = trimfrq(train_x)
    # for i in frqIndex:
    #     indices.remove(i)
    # train_x_uniq = indexTodata(train_x, indices)
    # test_x_uniq = indexTodata(test_x, indices)

    # normalization
    print("normalization")
    train_x_nor, mean, std = normalize(train_x_uniq)
    test_x_nor, mean, std = normalize(test_x_uniq, mean, std)

    # feature selection
    print("feature selection")
    train_x_sel, test_x_sel = correlationSelect(train_x_nor, train_y, test_x_nor)
    # ftsel = correlationSel()
    # ftsel.dosel(train_x_nor, train_y)
    # train_x_sel = ftsel.transform(train_x_nor)
    # test_x_sel = ftsel.transform(test_x_nor)

    # modeling
    print("modeling")
    clf = GradientBoostingClassifier(loss='deviance', learning_rate=args.lrate,
                                     n_estimators=args.nest,
                                     max_depth=args.maxdepth, verbose=1)
    clf.fit(train_x_sel, train_y)
    train_pdt = clf.predict(train_x_sel)
    MCC, Acc_p, Acc_n, Acc_all = get_Accs(train_y, train_pdt)
    print("MCC, Acc_p , Acc_n, Acc_all(train): ")
    print("%s,%s,%s,%s" % (str(MCC), str(Acc_p), str(Acc_n), str(Acc_all)))
    test_pdt = clf.predict_proba(test_x_sel)
    # MCC, Acc_p, Acc_n, Acc_all = get_Accs(test_y, test_pdt)
    # print("MCC, Acc_p , Acc_n, Acc_all(test): ")
    # print("%s,%s,%s,%s" % (str(MCC), str(Acc_p), str(Acc_n), str(Acc_all)))

    fout = open(fn, 'w')
    fout.write("ID,target\n")
    for index, eachline in enumerate(test_pdt):
        fout.write("%s,%s\n" % (str(int(id[index])), str(test_pdt[index][1])))
    fout.close()
def featureSelectionVarianceThreshold(data, probability=0.8):
    dataRaw = data[:, 2:]
    sel = VarianceThreshold(threshold=(probability * (1 - probability)))
    dataNew = sel.fit_transform(dataRaw)
    with open('History.txt', 'a') as fd:
        history = ('Feature Selection: Variance Threshold' + '\n' +
                   'Selected Feature: ' + str(sel.get_support(True)) + '\n')
        fd.write(history)
    return np.c_[data[:, :2], dataNew]
def select_centroids(centroids):
    """
    :param centroids: learned centroids
    :return: new_centroids (centroids with variance < np.var(centroids) removed)
    """
    sel = VarianceThreshold(threshold=np.var(centroids))
    new_centroids = sel.fit_transform(centroids.T)
    new_centroids = new_centroids.T
    return new_centroids
def featureReduction(self, data, threshold_input=0.99):
    '''
    Feature reduction that keeps only the variables whose variance is
    greater than the threshold.
    '''
    selector = VarianceThreshold(threshold=threshold_input)
    data = selector.fit_transform(data)
    print('Feature Selected with threshold ', threshold_input, data.shape)
    return data
def test_same_transform_with_threshold(self):
    local = VarianceThreshold(.03)
    dist = SparkVarianceThreshold(.03)

    X_dense, X_dense_rdd = self.make_dense_rdd()
    X_sparse, X_sparse_rdd = self.make_sparse_rdd()
    Z_rdd = DictRDD([X_sparse_rdd, X_dense_rdd], columns=('X', 'Y'))

    result_local = local.fit_transform(X_dense)
    result_dist = np.vstack(dist.fit_transform(X_dense_rdd).collect())
    assert_array_almost_equal(result_local, result_dist)

    result_local = local.fit_transform(X_sparse)
    result_dist = sp.vstack(dist.fit_transform(X_sparse_rdd).collect())
    assert_array_almost_equal(result_local.toarray(), result_dist.toarray())

    result_dist = sp.vstack(dist.fit_transform(Z_rdd)[:, 'X'].collect())
    assert_array_almost_equal(result_local.toarray(), result_dist.toarray())
def conv2matrix(patients, target, sta_type='z-score', sel_type='var', threshold=.8, feature_k=30):
    '''
    Extract features from the patient list. None of these are raw features;
    they have been preprocessed by an expert.

    missing value: set to 0 when style is 'z-score', and to nan when style is 'min-max'
    binary feature: 0 means negative, 1 means positive
    category feature: e.g. sex; each category value becomes one feature
    real-valued feature: e.g. age; should be normalized
    rank feature: can be set from 1 to n

    The parameter sta_type is either 'z-score' or 'min-max'.
    The parameter threshold is between 0 and 1.

    return patients_matrix_std: the standardized patients matrix
    return features_dict: dictionary of the names and indexes of the features
        that will be fed to the classifier
    '''
    # process missing values and rank features
    for i in range(len(patients)):
        for key in patients[i]:
            if patients[i][key] == '':
                patients[i][key] = 0  # process missing value
            patients[i][key] = rank2int(patients[i][key])  # process rank feature

    # process category features and convert the patients' dictionaries to a matrix
    vec = DictVectorizer()
    patients_matrix = vec.fit_transform(patients).toarray()

    # feature selection
    if sel_type == 'var':
        sel = VarianceThreshold(threshold=(threshold * (1 - threshold)))
        patients_matrix_sel = sel.fit_transform(patients_matrix)
    elif sel_type == 'uni':
        sel = SelectKBest(chi2, feature_k)
        patients_matrix_sel = sel.fit_transform(patients_matrix, target)
    features_dict = get_features(vec, sel)
    # print(features_dict)

    # feature standardization (use the selected matrix in both branches so the
    # output shape matches features_dict; the original scaled the unselected
    # matrix in the 'z-score' branch)
    if sta_type == 'z-score':
        patients_matrix_std = preprocessing.scale(patients_matrix_sel)
    elif sta_type == 'min-max':
        min_max_scaler = preprocessing.MinMaxScaler()
        patients_matrix_std = min_max_scaler.fit_transform(patients_matrix_sel)

    return patients_matrix_std, features_dict, patients_matrix
def feature_selection_with_scikit():
    """
    1- VarianceThreshold is a simple baseline approach to feature selection.
       It removes all features whose variance doesn't meet some threshold.
       By default, it removes all zero-variance features, i.e. features that
       have the same value in all samples.
    2- Univariate feature selection works by selecting the best features based
       on univariate statistical tests. It can be seen as a preprocessing step
       to an estimator.
    """
    p = 0.8
    selector = VarianceThreshold(threshold=(p * (1 - p)))
    c = selector.fit_transform(X)
    print("Number of the attributes before: ", X.shape[1])
    print("Number of the attributes after: ", c.shape[1])

    # selecting the k best attributes; instead of chi2, f_classif can also be used
    skb = SelectKBest(chi2, k=10)
    X_new = skb.fit_transform(X, y)
    attr = np.where(skb._get_support_mask(), attributeNames, '-1')
    print("Best attributes chosen with SelectKBest: ")
    i = 1
    for att in attr:
        if att != '-1':
            print(i, ": ", att)
            i += 1

    # using ExtraTreesClassifier
    print("Using feature importance...")
    etc = ExtraTreesClassifier()
    etc.fit(X, y).transform(X)
    print(etc.feature_importances_)
    print(etc.max_features)
    print(etc.max_depth)

    print("Recursive feature selection : ")
    from sklearn.svm import SVC
    import sklearn.linear_model as lm
    from sklearn.cross_validation import StratifiedKFold
    from sklearn.feature_selection import RFECV
    # Create the RFE object and compute a cross-validated score.
    estim = lm.LinearRegression()
    # The "accuracy" scoring is proportional to the number of correct
    # classifications
    rfecv = RFECV(estimator=estim, step=1, cv=StratifiedKFold(y, 2),
                  scoring='accuracy')
    rfecv.fit(X, y)
    print("Optimal number of features : %d" % rfecv.n_features_)

    # Plot number of features VS. cross-validation scores
    plt.figure()
    plt.xlabel("Number of features selected")
    plt.ylabel("Cross validation score (nb of correct classifications)")
    plt.plot(range(1, len(rfecv.grid_scores_) + 1), rfecv.grid_scores_)
    plt.show()
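# The function above depends on module-level X, y and attributeNames. A
# self-contained sketch of its first two steps (variance cut, then univariate
# chi2 selection), using the iris data purely as an assumed stand-in:
import numpy as np
from sklearn.datasets import load_iris
from sklearn.feature_selection import VarianceThreshold, SelectKBest, chi2

X, y = load_iris(return_X_y=True)
p = 0.8
variance_sel = VarianceThreshold(threshold=p * (1 - p))
X_var = variance_sel.fit_transform(X)
print("before:", X.shape[1], "after variance cut:", X_var.shape[1])

# univariate chi2 test on the surviving columns
skb = SelectKBest(chi2, k=2)
X_best = skb.fit_transform(X_var, y)
print("after SelectKBest:", X_best.shape[1])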
def run_pca_fct(X_file, data_str):
    from sklearn.decomposition import PCA
    import numpy as np
    import pylab as plt
    import os
    from sklearn.feature_selection import VarianceThreshold
    from sklearn.grid_search import GridSearchCV
    from sklearn.preprocessing import MinMaxScaler
    from sklearn.preprocessing import Imputer

    X = np.load(X_file)

    # fixme pipe
    imp = Imputer()
    X = imp.fit_transform(X)

    # fixme? pipe
    # remove low variance features
    var_thr = VarianceThreshold()
    X = var_thr.fit_transform(X)

    # fixme pipe normalize
    normalize = MinMaxScaler()
    X = normalize.fit_transform(X)

    # fixme?
    # remove low variance features
    var_thr = VarianceThreshold()
    X = var_thr.fit_transform(X)

    pca = PCA()
    p = pca.fit_transform(X)

    explained_var = sum(pca.explained_variance_ratio_)
    plt.plot(range(1, len(pca.explained_variance_ratio_) + 1),
             np.cumsum(pca.explained_variance_ratio_))
    plt.title('explained var: %s' % explained_var)
    plt.xlabel('n_components')
    plt.tight_layout()
    out_fig = os.path.join(os.getcwd(), 'pca_var_%s.pdf' % data_str)
    plt.savefig(out_fig)
    return out_fig
def feat_selec(tra_val_data, testing_data, thred=0.8):
    """Feature selection."""
    num_tv = tra_val_data.shape[0]
    total_data = np.vstack((tra_val_data, testing_data))
    selec = VarianceThreshold(threshold=thred)
    total_selected_data = selec.fit_transform(total_data)
    return total_selected_data[:num_tv, :], total_selected_data[num_tv:, :]
def FeatureSelection(self):
    """Main feature selection method"""
    if 'Variance' in self.FeatureSelectionMethod:
        selector = VarianceThreshold(threshold=0.0001)
        self.Features = selector.fit_transform(self.Features)
        # pyplot.figure(), pyplot.hist(numpy.var(features, axis=0), bins=64), pyplot.show()
    elif 'Trees' in self.FeatureSelectionMethod:
        forestFeatures = ExtraTreesClassifier(n_estimators=512, random_state=32)
        forestFeaturesFit = forestFeatures.fit(self.Features, self.Classes)
        featureImportance = 0.001
        featureBool = (forestFeaturesFit.feature_importances_ > featureImportance)
        self.Features = self.Features[:, featureBool]
def test_variancethreshold_vs_sklearn():
    trajectories = AlanineDipeptide().get_cached().trajectories
    fs = FeatureSelector(FEATS)
    vt = VarianceThreshold(0.1)
    vtr = VarianceThresholdR(0.1)
    y = fs.partial_transform(trajectories[0])
    z1 = vt.fit_transform([y])[0]
    z_ref1 = vtr.fit_transform(y)
    np.testing.assert_array_almost_equal(z_ref1, z1)
def main():
    args = getOptions()
    print("train file read")
    train_x, train_y = readfile_noid(args.train, 'train')
    train_x_new, id = extractID(train_x)
    del id
    print("test file read")
    test_x, test_y = readfile_noid(args.test, 'test')
    test_x_new, id = extractID(test_x)

    # remove features with no distinction and less importance
    print("remove features with no distinction and less importance")
    sel = VarianceThreshold()
    train_x_uniq = sel.fit_transform(train_x_new)
    test_x_uniq = sel.transform(test_x_new)

    # normalization
    print("normalization")
    train_x_nor, mean, std = normalize(train_x_uniq)
    test_x_nor, mean, std = normalize(test_x_uniq, mean, std)

    # feature selection
    print("feature selection")
    ftsel = ExtraTreesClassifier()
    ftsel.fit(train_x_nor, train_y)
    # importances = ftsel.feature_importances_
    # indices_test = np.argsort(importances)[::-1]
    # indices_test = indices_test.tolist()
    train_x_trans = ftsel.transform(train_x_nor)
    test_x_trans = ftsel.transform(test_x_nor)

    # modeling
    print("modeling")
    train = xgb.DMatrix(train_x_trans, label=train_y)
    test = xgb.DMatrix(test_x_trans, label=test_y)
    gbm = xgb.train({'max_depth': 3, 'n_estimators': 1500, 'learning_rate': 0.1,
                     'objective': 'binary:logistic', 'eval_metric': 'auc'}, train)
    train_pdt = gbm.predict(train)
    MCC, Acc_p, Acc_n, Acc_all = get_Accs(train_y, train_pdt)
    print("MCC, Acc_p , Acc_n, Acc_all(train): ")
    print("%s,%s,%s,%s" % (str(MCC), str(Acc_p), str(Acc_n), str(Acc_all)))
    test_pdt = gbm.predict(test)
    MCC, Acc_p, Acc_n, Acc_all = get_Accs(test_y, test_pdt)
    print("MCC, Acc_p , Acc_n, Acc_all(test): ")
    print("%s,%s,%s,%s" % (str(MCC), str(Acc_p), str(Acc_n), str(Acc_all)))

    fout = open("submission_xgbtrain.csv", 'w')
    fout.write("ID,target\n")
    for index, eachline in enumerate(test_pdt):
        fout.write("%s,%s\n" % (str(int(id[index])), str(test_pdt[index])))
    fout.close()
def test_same_transform_with_threshold(self):
    local = VarianceThreshold(.03)
    dist = SparkVarianceThreshold(.03)

    X_dense, X_dense_rdd = self.make_dense_rdd()
    X_sparse, X_sparse_rdd = self.make_sparse_rdd()
    Z_rdd = DictRDD([X_sparse_rdd, X_dense_rdd], columns=('X', 'Y'))

    result_local = local.fit_transform(X_dense)
    result_dist = dist.fit_transform(X_dense_rdd)
    assert_true(check_rdd_dtype(result_dist, (np.ndarray,)))
    assert_array_almost_equal(result_local, result_dist.toarray())

    result_local = local.fit_transform(X_sparse)
    result_dist = dist.fit_transform(X_sparse_rdd)
    assert_true(check_rdd_dtype(result_dist, (sp.spmatrix,)))
    assert_array_almost_equal(result_local.toarray(), result_dist.toarray())

    result_dist = dist.fit_transform(Z_rdd)[:, 'X']
    assert_true(check_rdd_dtype(result_dist, (sp.spmatrix,)))
    assert_array_almost_equal(result_local.toarray(), result_dist.toarray())
def getDataRisk(filepath, description, remove2gram):
    dictByUsers = load_object(filepath, "smoking_1_analytic_data_mapreduce.pkl")
    transformer = TfidfTransformer(use_idf=False)
    if dictByUsers[0] is not None:
        varSelector1 = VarianceThreshold(threshold=0.001)
        if remove2gram:
            mattmp0, mattmp1, tmp = rmv2gram(dictByUsers)
            sparseArrayVarianceFilter1 = varSelector1.fit_transform(mattmp0)
            sparseArrayVarianceFilter4 = varSelector1.fit_transform(mattmp1)
        else:
            sparseArrayVarianceFilter1 = varSelector1.fit_transform(dictByUsers[0])
            sparseArrayVarianceFilter4 = varSelector1.fit_transform(dictByUsers[1])
        transformer = TfidfTransformer(use_idf=False)
        sparseArrayRowNorm1 = transformer.fit_transform(sparseArrayVarianceFilter1)
        sparseArrayRowNorm4 = transformer.fit_transform(sparseArrayVarianceFilter4)
        y_all = np.array(dictByUsers[4])
        sparseArrayRowNorm = [hstack([sparseArrayRowNorm1, sparseArrayRowNorm4], format='csr')]

    if description:
        dictByUsers2 = load_object(filepath + "description_data/",
                                   "smoking_1_analytic_data_mapreduce.pkl")
        if dictByUsers2[0] is not None:
            varSelector1 = VarianceThreshold(threshold=0.001)
            if remove2gram:
                mattmp0, mattmp1, tmp = rmv2gram(dictByUsers2)
                mat1 = matchDict(dictByUsers[0], mattmp0, dictByUsers[3], dictByUsers2[3])
                mat4 = matchDict(dictByUsers[1], mattmp1, dictByUsers[3], dictByUsers2[3])
                del mattmp0
                del mattmp1
            else:
                mat1 = matchDict(dictByUsers[0], dictByUsers2[0], dictByUsers[3], dictByUsers2[3])
                mat4 = matchDict(dictByUsers[1], dictByUsers2[1], dictByUsers[3], dictByUsers2[3])
            sparseArrayVarianceFilter1 = varSelector1.fit_transform(mat1)
            sparseArrayVarianceFilter4 = varSelector1.fit_transform(mat4)
            sparseArrayRowNorm1_2 = transformer.fit_transform(sparseArrayVarianceFilter1)
            sparseArrayRowNorm4_2 = transformer.fit_transform(sparseArrayVarianceFilter4)
            sparseArrayRowNorm = sparseArrayRowNorm + [sparseArrayRowNorm1_2, sparseArrayRowNorm4_2]

    return processYX(y_all, sparseArrayRowNorm)
def test_variancethreshold_vs_sklearn():
    dataset = fetch_data()
    trajectories = dataset["trajectories"]
    fs = FeatureSelector(FEATS)
    vt = VarianceThreshold(0.1)
    vtr = VarianceThresholdR(0.1)
    y = fs.partial_transform(trajectories[0])
    z1 = vt.fit_transform([y])[0]
    z_ref1 = vtr.fit_transform(y)
    np.testing.assert_array_almost_equal(z_ref1, z1)
def problem3_3_2(data):
    selector = VarianceThreshold(threshold=(.8 * (1 - .8)))
    selector.fit(data)
    newdata = data.loc[:, selector.get_support()]
    return newdata.columns, newdata
PDXC.drop_duplicates(keep='last')
PDXC = pd.DataFrame.transpose(PDXC)
PDXC = PDXC.loc[:, ~PDXC.columns.duplicated()]

GDSCM = pd.read_csv("GDSC_mutations.Gemcitabine.tsv", sep="\t", index_col=0, decimal=",")
GDSCM = pd.DataFrame.transpose(GDSCM)

GDSCC = pd.read_csv("GDSC_CNV.Gemcitabine.tsv", sep="\t", index_col=0, decimal=",")
GDSCC.drop_duplicates(keep='last')
GDSCC = pd.DataFrame.transpose(GDSCC)

selector = VarianceThreshold(0.05)
selector.fit(GDSCE)
GDSCE = GDSCE[GDSCE.columns[selector.get_support(indices=True)]]

PDXC = PDXC.fillna(0)
PDXC[PDXC != 0.0] = 1
PDXM = PDXM.fillna(0)
PDXM[PDXM != 0.0] = 1
GDSCM = GDSCM.fillna(0)
GDSCM[GDSCM != 0.0] = 1
GDSCC = GDSCC.fillna(0)
GDSCC[GDSCC != 0.0] = 1

ls = GDSCE.columns.intersection(GDSCM.columns)
ls = ls.intersection(GDSCC.columns)
ls = ls.intersection(PDXE.columns)
ls = ls.intersection(PDXM.columns)
from sklearn.metrics import accuracy_score, roc_auc_score
from sklearn.feature_selection import VarianceThreshold

data = pd.read_csv(open("E:/ML/feature_selection _methods/santander-train.csv", 'rb'), nrows=20000)
# data.head
X = data.iloc[:, :-1]
y = data.iloc[:, -1]

c_f = VarianceThreshold(threshold=0.01)
X_c_f = c_f.fit_transform(X)

# drop duplicated features: transpose, mark duplicate rows, transpose back
X_c_f_T = X_c_f.T
X_c_f_T = pd.DataFrame(X_c_f_T)
a = X_c_f_T.duplicated().sum()
duplicated_features = X_c_f_T.duplicated()
features_to_keep = [not index for index in duplicated_features]
X_c_f_u = X_c_f_T[features_to_keep].T

# =============================================================================
# now calculating roc and auc score
# i += 1
# plt.savefig(graphsDir + 'HFCR Feature Selection - VarianceThreshold')

original_data = data.copy()
labels_t = []
values = []
count = 0
for t in threshold_list:
    subDir = graphsDir + 'Threshold = ' + str(t) + '/'
    if not os.path.exists(subDir):
        os.makedirs(subDir)

    labels_t.append(t)
    sel = VarianceThreshold(threshold=t)
    sel.fit(original_data.values)
    f_to_accept = sel.get_support()
    new_features = []
    for i in range(len(f_to_accept)):
        if f_to_accept[i]:
            new_features.append(i)
    print('t = ' + str(t) + ' / n_features = ' + str(len(new_features)))
    features_file.write('t = ' + str(t) + ": " + str(new_features) + "\n")
    values.append(len(new_features))

    data = original_data.copy()[new_features]
    variables = data.columns.values
    eixo_x = 0
    eixo_y = 4
    eixo_z = 7
# In this section, we will be removing columns that have a low variance using VarianceThreshold().
# Example of applying the variance threshold. Note that our data is all numerical: a variance of
# 2 would be reasonable for a categorical variable, but unreasonable for a numerical one!
from pandas import read_csv
from sklearn.feature_selection import VarianceThreshold
# define the location of the dataset
path = 'https://raw.githubusercontent.com/jbrownlee/Datasets/master/oil-spill.csv'
# load the dataset
df = read_csv(path, header=None)
# split data into inputs and outputs
data = df.values
X = data[:, :-1]
y = data[:, -1]
print(X.shape, y.shape)
# Here we are defining the VarianceThreshold
transform = VarianceThreshold()
# We apply it to X; one zero-variance column is removed.
X_sel = transform.fit_transform(X)
print(X_sel.shape)
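# The default threshold only drops exactly-constant columns. Before committing
# to a cutoff, it can help to sweep a range of thresholds and count the
# surviving features at each. A minimal sketch continuing from the variables
# above (the threshold grid is an illustrative choice, not from the original):
from numpy import arange

for t in arange(0.0, 0.55, 0.05):
    transform = VarianceThreshold(threshold=t)
    X_sel = transform.fit_transform(X)
    print('threshold=%.2f, features=%d' % (t, X_sel.shape[1]))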
    min_array = np.append(min_array, v_min)
    return max_array, min_array

files = [i for i in os.listdir("../data/mipas_pd")]
files = files[19:24]

for file in files:
    # SVM classifier only PC
    df_data = pd.read_hdf(os.path.join('../data/mipas_pd', file), 'df_btd')

    new_file = h5py.File("../data/csdb_new/csdb_complete.h5", "r")
    btd_csdb = new_file["btd_complete"][:]
    labels = new_file["labels"][:]
    new_file.close()

    selector = VarianceThreshold(threshold=100)
    mipas_var_sel = selector.fit_transform(df_data.iloc[:, 0:10011])
    ind = selector.get_support()
    df_mipas_var_sel = pd.DataFrame(mipas_var_sel)
    csdb_var_sel = btd_csdb[:, ind]
    df_csdb_var_sel = pd.DataFrame(csdb_var_sel)
    var_mipas = df_mipas_var_sel.var()
    var_csdb = df_csdb_var_sel.var()
    mean_mipas = df_mipas_var_sel.mean()
    mean_csdb = df_csdb_var_sel.mean()

    # instantiate an SVM and fit it
    clf = svm.SVC()
    clf.fit(csdb_var_sel, labels.ravel())

files = [i for i in os.listdir("../data/mipas_pd")]
# -*- coding: utf-8 -*-

# Load the data
from sklearn.datasets import load_iris
iris = load_iris()
print("iris feature names\n", iris.feature_names)
print("iris feature matrix\n", iris.data)

# Feature selection -- variance threshold method
from sklearn.feature_selection import VarianceThreshold
vt = VarianceThreshold(threshold=1)  # threshold is the variance cutoff
selected = vt.fit_transform(iris.data)  # returns the data after feature selection
print("features selected by the variance threshold method\n", selected)
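# For reference, the (population) variances of the four iris features are
# roughly 0.68, 0.19, 3.09 and 0.58, so with threshold=1 only petal length
# survives and the output has a single column. A quick check, using the
# fitted `vt` object from above:
print(vt.variances_)     # per-feature variances recorded during fit
print(vt.get_support())  # expected: [False False  True False]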
def varianceSelection(X, THRESHOLD=10):
    from sklearn.feature_selection import VarianceThreshold
    sel = VarianceThreshold(threshold=THRESHOLD)
    sel.fit(X)
    return X[[c for (s, c) in zip(sel.get_support(), X.columns.values) if s]]
def example():
    """
    ========
    Ensemble methods
    ========
    """
    # bagging
    # uniform sampling with equal weight per sample; predictions are averaged
    from sklearn.ensemble import BaggingClassifier
    from sklearn.neighbors import KNeighborsClassifier
    bagging = BaggingClassifier(KNeighborsClassifier(), max_samples=0.5, max_features=0.5)

    # random forest (bagging + decision trees on randomly sampled training data)
    from sklearn.ensemble import RandomForestClassifier
    X = [[0, 0], [1, 1]]
    Y = [0, 1]
    clf = RandomForestClassifier(n_estimators=10)
    clf = clf.fit(X, Y)

    # extremely randomized trees (each tree uses all samples, but the split
    # threshold of each candidate feature is drawn completely at random)
    from sklearn.model_selection import cross_val_score
    from sklearn.datasets import make_blobs
    from sklearn.ensemble import RandomForestClassifier
    from sklearn.ensemble import ExtraTreesClassifier
    from sklearn.tree import DecisionTreeClassifier

    X, y = make_blobs(n_samples=10000, n_features=10, centers=100, random_state=0)

    clf = DecisionTreeClassifier(max_depth=None, min_samples_split=2, random_state=0)
    scores = cross_val_score(clf, X, y, cv=5)
    print(scores.mean())

    clf = RandomForestClassifier(n_estimators=10, max_depth=None,
                                 min_samples_split=2, random_state=0)
    scores = cross_val_score(clf, X, y, cv=5)
    print(scores.mean())

    clf = ExtraTreesClassifier(n_estimators=10, max_depth=None,
                               min_samples_split=2, random_state=0)
    scores = cross_val_score(clf, X, y, cv=5)
    print(scores.mean())

    # #######################################################
    # boosting (usually an ensemble of weak classifiers; not parallelizable)
    # part of the data forms the first training set; the misclassified samples
    # plus the remaining data form the next training set; iterate, giving
    # well-performing classifiers larger weights
    # AdaBoost
    from sklearn.model_selection import cross_val_score
    from sklearn.datasets import load_iris
    from sklearn.ensemble import AdaBoostClassifier

    iris = load_iris()
    clf = AdaBoostClassifier(n_estimators=100)
    scores = cross_val_score(clf, iris.data, iris.target, cv=5)
    print(scores.mean())

    # GradientBoosting (usually an ensemble of weak classifiers)
    # example: https://www.cnblogs.com/peizhe123/p/5086128.html
    from sklearn.datasets import make_hastie_10_2
    from sklearn.ensemble import GradientBoostingClassifier

    X, y = make_hastie_10_2(random_state=0)
    X_train, X_test = X[:2000], X[2000:]
    y_train, y_test = y[:2000], y[2000:]

    clf = GradientBoostingClassifier(n_estimators=100, learning_rate=1.0,
                                     max_depth=1, random_state=0).fit(X_train, y_train)
    print(clf.score(X_test, y_test))

    """
    =============
    Multi-label and multi-class
    =============
    """
    # multi-label format
    from sklearn.preprocessing import MultiLabelBinarizer
    y = [[2, 3, 4], [2], [0, 1, 3], [0, 1, 2, 3, 4], [0, 1, 2]]
    print(MultiLabelBinarizer().fit_transform(y))

    # multi-class via OvR and OvO
    from sklearn import datasets
    from sklearn.multiclass import OneVsRestClassifier
    from sklearn.multiclass import OneVsOneClassifier
    from sklearn.svm import LinearSVC

    iris = datasets.load_iris()
    X, y = iris.data, iris.target
    clf_ovr = OneVsRestClassifier(LinearSVC())
    print(clf_ovr.fit(X, y).predict(X))

    # OneVsRestClassifier also supports multilabel classification.
    # To use this feature, feed the classifier an indicator matrix, in which
    # cell [i, j] indicates the presence of label j in sample i.
    clf_ovo = OneVsOneClassifier(LinearSVC())
    print(clf_ovo.fit(X, y).predict(X))

    """
    =============
    Feature selection
    =============
    """
    # 1. removing features with low variance
    from sklearn.feature_selection import VarianceThreshold
    X = [[0, 0, 1], [0, 1, 0], [1, 0, 0], [0, 1, 1], [0, 1, 0], [0, 1, 1]]
    sel = VarianceThreshold(threshold=(.8 * (1 - .8)))
    print(sel.fit_transform(X))

    # 2. univariate feature selection
    # SelectKBest
    # SelectPercentile
    # SelectFpr, SelectFdr, SelectFwe
    from sklearn.datasets import load_iris
    from sklearn.feature_selection import SelectKBest
    from sklearn.feature_selection import chi2

    iris = load_iris()
    X, y = iris.data, iris.target
    print(X.shape)
    X_new = SelectKBest(chi2, k=2).fit_transform(X, y)
    print(X_new.shape)

    # scores_ and pvalues_
    # the smaller the p-value, the stronger the evidence against the null
    # hypothesis that the feature is unrelated to y
    skb = SelectKBest(chi2, k=2).fit(X, y)
    print(skb.scores_)
    print(skb.pvalues_)

    # scoring functions
    """
    For regression: f_regression, mutual_info_regression
    For classification: chi2, f_classif, mutual_info_classif
    """

    # 3. recursive feature elimination
    from sklearn.svm import SVC
    from sklearn.datasets import load_digits
    from sklearn.feature_selection import RFE
    import matplotlib.pyplot as plt

    # Load the digits dataset
    digits = load_digits()
    X = digits.images.reshape((len(digits.images), -1))
    y = digits.target

    # Create the RFE object and rank each pixel
    svc = SVC(kernel="linear", C=1)
    rfe = RFE(estimator=svc, n_features_to_select=1, step=1)
    rfe.fit(X, y)
    ranking = rfe.ranking_.reshape(digits.images[0].shape)

    # Plot pixel ranking
    plt.matshow(ranking, cmap=plt.cm.Blues)
    plt.colorbar()
    plt.title("Ranking of pixels with RFE")
    plt.show()

    # 4.1 SelectFromModel
    from sklearn.svm import LinearSVC
    from sklearn.datasets import load_iris
    from sklearn.feature_selection import SelectFromModel

    iris = load_iris()
    X, y = iris.data, iris.target
    print(X.shape)
    lsvc = LinearSVC(C=0.01, penalty="l1", dual=False).fit(X, y)
    model = SelectFromModel(lsvc, prefit=True)
    X_new = model.transform(X)
    print(X_new.shape)

    # 4.2 tree-based feature_importances_
    from sklearn.ensemble import ExtraTreesClassifier
    from sklearn.datasets import load_iris
    from sklearn.feature_selection import SelectFromModel

    iris = load_iris()
    X, y = iris.data, iris.target
    print(X.shape)
    clf = ExtraTreesClassifier(n_estimators=50)
    clf = clf.fit(X, y)
    print(clf.feature_importances_)
    model = SelectFromModel(clf, prefit=True)
    X_new = model.transform(X)
    print(X_new.shape)

    """
    =============
    Neural networks
    =============
    """
    # neural network; parameter walkthrough
    from sklearn.neural_network import MLPClassifier
    X = [[0., 0.], [1., 1.]]
    y = [0, 1]
    clf = MLPClassifier(solver='lbfgs', alpha=1e-5,
                        hidden_layer_sizes=(5, 2), random_state=1)
    print(clf.fit(X, y))
    print(clf.predict([[2., 2.], [-1., -2.]]))
    print(clf.coefs_)
    print(clf.intercepts_)
    print(clf.loss_)
    for row in readCSV:
        X.append(list(map(float, row[2:])))
        Y.append(float(row[1]))
    csvfile.close()

with open('test_vale.csv') as csvfile:
    readCSV = csv.reader(csvfile, delimiter=',')
    next(readCSV, None)  # skip the header
    for row in readCSV:
        X_test.append(list(map(float, row[1:])))
    csvfile.close()

print("files loaded")

# simple variance based feature selection
sel = VarianceThreshold(threshold=(.8 * (1 - .8)))
Xselected = sel.fit_transform(X)
# reuse the selector fitted on the training data; refitting on the test set
# could select a different set of columns
Xselected_test = sel.transform(X_test)

# do SVM with degree d
clf = svm.SVC(kernel="poly", degree=3, cache_size=200)  # linear, poly, rbf, sigmoid, precomputed
clf.fit(Xselected, Y)
Y_test_predict = clf.predict(Xselected_test)

f = open('output_degree' + '.csv', 'w')
f.write("Id,y\n")
q = 2000
for v in Y_test_predict:
    st = str(q) + "," + str(int(round(v))) + "\n"
    # print(st)
    q += 1
# Display new class counts
df_downsampled.click_out.value_counts()

######## rachel copy ends here

# Determine which columns are part of the data and which is the prediction.
# Removed the user_id and session_id.
x = data.loc[:, 'mobile':'duration_sec']
y = data.loc[:, 'click_out']

# number of features before feature selection
len(x)
np.size(x, 1)

selector = VarianceThreshold(threshold=0.9)
x = selector.fit_transform(x)
x

# split data sets into training and testing
x = preprocessing.scale(x)
test_size = 0.5
x_train, x_test, y_train, y_test = train_test_split(x, y, test_size=0.5, random_state=1)

# number of features after feature selection
len(x_train)
np.size(x_train, 1)

# choosing a K value
# GDSC_exprs_B = pd.DataFrame.transpose(GDSC_exprs_B)
# GSE55145_exprs_B = pd.DataFrame.transpose(GSE55145_exprs_B)
# GSE9782_exprs_B = pd.DataFrame.transpose(GSE9782_exprs_B)
GDSC_exprs_z = pd.DataFrame.transpose(GDSC_exprs_z)
GSE1_exprs_z = pd.DataFrame.transpose(GSE1_exprs_z)
GSE2_exprs_z = pd.DataFrame.transpose(GSE2_exprs_z)
GSE3_exprs_z = pd.DataFrame.transpose(GSE3_exprs_z)
GSE4_exprs_z = pd.DataFrame.transpose(GSE4_exprs_z)
TCGA_exprs_z = pd.DataFrame.transpose(TCGA_exprs_z)

# Remove genes with low signal (i.e. below the variance threshold) from expression data
selector = VarianceThreshold(0.05)
selector.fit(GDSC_exprs_z)
GDSC_exprs_z = GDSC_exprs_z[GDSC_exprs_z.columns[selector.get_support(indices=True)]]
ls = GSE1_exprs_z.columns.intersection(GDSC_exprs_z.columns)
ls = ls.intersection(GSE2_exprs_z.columns)
ls = ls.intersection(GSE3_exprs_z.columns)
ls = ls.intersection(GSE4_exprs_z.columns)
ls = ls.intersection(TCGA_exprs_z.columns)
GSE1_exprs_z = GSE1_exprs_z.loc[:, ls]
GSE2_exprs_z = GSE2_exprs_z.loc[:, ls]
GSE3_exprs_z = GSE3_exprs_z.loc[:, ls]
GSE4_exprs_z = GSE4_exprs_z.loc[:, ls]
TCGA_exprs_z = TCGA_exprs_z.loc[:, ls]

# Obtain selected genes
GDSC_exprs_z_genes = list(GDSC_exprs_z.columns.values)
    seq[i] = replacement

train_data = np.loadtxt('Train_Data.csv', dtype=np.float32, delimiter=',')
train_labels = np.loadtxt('Train_Labels.csv', dtype=np.int32, delimiter=',')
test_data = np.loadtxt('Test_Data.csv', dtype=np.float32, delimiter=',')
test_labels = np.loadtxt('Test_Labels.csv', dtype=np.int32, delimiter=',')
class_names = ['1', '2', '3']

# Feature Selection
all_data = np.vstack((train_data, test_data))
all_data_labels = np.hstack((train_labels, test_labels))
sel = VarianceThreshold(threshold=0.90 * (1 - 0.90))
all_data = sel.fit_transform(all_data)
all_data_size, _ = all_data.shape
_, feature_size = all_data.shape

clustering = AffinityPropagation(preference=-1200, damping=0.92).fit(all_data)
tmp = clustering.labels_
replace_all(tmp, 0, 10)
replace_all(tmp, 1, 20)
replace_all(tmp, 2, 30)
replace_all(tmp, 10, 1)
replace_all(tmp, 20, 3)
# COMMAND ----------

# MAGIC %md Create an instance of the `VarianceThreshold` class and store it in an object `selector`. Set threshold to the default (`0`).

# COMMAND ----------

selector = VarianceThreshold()

# COMMAND ----------

# MAGIC %md Fit the transformer class to the sample data array `X` and return a transformed version of `X`. The `fit` method records the variance of each feature of `X`. The `transform` method returns the selected features from `X`.

# COMMAND ----------

selector.fit_transform(X)

# COMMAND ----------

# MAGIC %md With the default setting for threshold, the two column features with variance above 0 are selected.

# COMMAND ----------

# MAGIC %md The transformer class `VarianceThreshold` has the attribute `variances_`. Use it to see the variances of the individual features of `X`. They are the same as in the output of `np.var(X, axis=0)`.

# COMMAND ----------

selector.variances_

# COMMAND ----------
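# MAGIC %md As a sanity check, the recorded variances can be compared against NumPy directly. This is a minimal sketch; the sample array `X` is assumed from the earlier cells.

# COMMAND ----------

import numpy as np

# the two arrays should match element-for-element
print(selector.variances_)
print(np.var(X, axis=0))

# COMMAND ----------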
# Load libraries
from sklearn import datasets
from sklearn.feature_selection import VarianceThreshold

# Load demo data
iris = datasets.load_iris()

# Create features and target
features = iris.data
target = iris.target

# Create the thresholder
thresholder = VarianceThreshold(threshold=.5)

# Create the high-variance feature matrix
features_high_variance = thresholder.fit_transform(features)

# View the high-variance feature matrix
features_high_variance[0:3]

##########

# View the variances
thresholder.fit(features).variances_

##########

# Load library
from sklearn.preprocessing import StandardScaler

# Standardize the feature matrix
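# A quick check of which columns survive the cut above: the iris features have
# (population) variances of roughly 0.68, 0.19, 3.09 and 0.58, so with
# threshold=.5 three columns are kept and sepal width (~0.19) is dropped.
# Note this check only makes sense on unstandardized data, which is why the
# variances are inspected before the StandardScaler step that follows.
print(thresholder.get_support())  # expected: [ True False  True  True]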
"""
Created on Wed Dec 4 03:54:27 2019

@author: 43884
"""
from sklearn.model_selection import train_test_split
import pandas as pd
from sklearn.feature_selection import VarianceThreshold
from imblearn.over_sampling import SMOTE

df = pd.read_excel('./processed_data2.xlsx').values
x = df[:, :-1]
labels = df[:, -1]

sel = VarianceThreshold(threshold=0.2)
x_transform = sel.fit_transform(x)
x_transform.shape
sel.get_support(indices=False)

x_train, x_rest, y_train, y_rest = train_test_split(df[:, :-1], df[:, -1], test_size=0.4)
x_val, x_test, y_val, y_test = train_test_split(x_rest, y_rest, test_size=0.5)

# before oversampling
print(pd.Series(y_train).value_counts() / len(y_train))

smo = SMOTE(random_state=42)
x_smo, y_smo = smo.fit_sample(x_train, y_train)

# after oversampling
print(pd.Series(y_smo).value_counts() / len(y_smo))
print("accuracy for test set is: ", accuracy_score(y_test, xgb_r_test)) xgb_imp_r = xgb_r.feature_importances_ plt.hist(xgb_imp_r, bins='auto') plt.title("Histogram with 'feature_importances'") plt.xscale('log') plt.xlabel("Impotance") plt.ylabel("Counts of Features") plt.savefig('his_feature_importance.png') plt.show() plt.bar(range(len(xgb_imp_r)), xgb_imp_r, 2) plt.yscale('symlog') plt.show() from sklearn.feature_selection import VarianceThreshold # delete features that have same values in all class sel = VarianceThreshold() # can specify: 'threshold=(.8 * (1 - .8))' x = sel.fit_transform(x_train_2) x.shape # Calculate pearson correlation Coefficient pd_train = pd.DataFrame(x_train_2) pd_test = pd.DataFrame(x_test_2) pd_Test = pd.DataFrame(X_test_2) # for final prediction, or 'X_test' pd_Train = pd.DataFrame(X_train_2) # for final prediction, or 'X_train' pd_train_pear = pd_train.corr(method="pearson") plt.figure(figsize=(30, 30)) sb.set(font_scale=0.7) sb.heatmap(abs(pd_train_pear), cmap="YlGn", annot=False) plt.title("A Quick Look at the Correlations among Predictors", fontsize=20) plt.savefig('heatmap_pearson.pdf') plt.show() pd_train_1 = pd_train.copy() pd_test_1 = pd_test.copy()
# Load library
from sklearn.feature_selection import VarianceThreshold

# Create feature matrix with:
# Feature 0: 80% class 0
# Feature 1: 80% class 1
# Feature 2: 60% class 0, 40% class 1
features = [[0, 1, 0],
            [0, 1, 1],
            [0, 1, 0],
            [0, 1, 1],
            [1, 0, 0]]

# Run threshold by variance
thresholder = VarianceThreshold(threshold=(.75 * (1 - .75)))
thresholder.fit_transform(features)
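# The arithmetic behind the result, spelled out (a sanity-check sketch, not
# part of the original snippet): a Bernoulli feature with minority-class
# fraction p has variance p * (1 - p). Features 0 and 1 each have p = 0.2, so
# var = 0.2 * 0.8 = 0.16; feature 2 has p = 0.4, so var = 0.4 * 0.6 = 0.24.
# With threshold .75 * (1 - .75) = 0.1875, only feature 2 survives:
print(thresholder.variances_)     # [0.16 0.16 0.24]
print(thresholder.get_support())  # [False False  True]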
rus = RandomUnderSampler(random_state=seeds, replacement=True)
X_train, y_train = rus.fit_sample(X_train, y_train)

radioFeat_train = copy.deepcopy(X_train[:, :1692])
clinical_semanticFeat_train = copy.deepcopy(X_train[:, 1692:])
radioFeat_test = copy.deepcopy(X_test.iloc[:, :1692])
clinical_semanticFeat_test = copy.deepcopy(X_test.iloc[:, 1692:])

print('------------------ start feature selection ---------------------')
print('number of raw radiomics features: {}'.format(radioFeat_train.shape[1]))
print('number of raw clinical_semantic features: {}'.format(
    clinical_semanticFeat_train.shape[1]))

################## variance-based feature selection ################
from sklearn.feature_selection import VarianceThreshold  # import the relevant module
# drop features whose variance is below the threshold (removing features with low variance)
vad = VarianceThreshold(threshold=0.01)
radioFeat_train = vad.fit_transform(radioFeat_train)  # returns the selected feature matrix
radioFeat_test = vad.transform(radioFeat_test)
print('train_test_split_seed={} number of radiomics features kept by variance selection: {}'.format(
    seeds, radioFeat_train.shape[1]))

###################### scale features to [-1, 1] #####################
# max_abs_scaler = preprocessing.MaxAbsScaler()
# max_abs_scaler.fit(xmantrain)
# xabstrain = max_abs_scaler.transform(xmantrain)
# xabstest = max_abs_scaler.transform(xmantest)

################## variance-based feature selection ################
# from sklearn.feature_selection import VarianceThreshold
# sel = VarianceThreshold(threshold=0.01)
# ss = sel.fit(xmantrain)
# xvartrain = sel.transform(xmantrain)
# xvartest = sel.transform(xmantest)
Feature selection methods:
- Filter Method
- Wrapper Method
- Embedded Method

Filter methods fall into three broad groups:
- based on the values of a single feature
- based on the correlation between features
- based on statistical test metrics

# Single-feature criteria
- Zero variance => every value is identical => drop the feature

from sklearn.feature_selection import VarianceThreshold

X = desc_df.values
select = VarianceThreshold()
X_new = select.fit_transform(X)
np.array(descs)[select.get_support() == False]  # check the number of features after reduction

- Near-zero variance => inspect the data closely before deciding whether to drop
- A feature that is exactly identical to another feature

# Correlation between features
Benefits:
- Dropping one of each pair of highly correlated features lowers the dimensionality of the feature space with little impact on accuracy
- It improves the interpretability of linear models
Pearson correlation coefficient (the ordinary correlation coefficient)
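A minimal sketch of the correlation-based filter described above, assuming a pandas DataFrame `desc_df` of numeric descriptors as in the snippet (the 0.9 cutoff is an illustrative choice):

import numpy as np
import pandas as pd

corr = desc_df.corr().abs()  # pairwise Pearson correlations
# look only at the upper triangle so each pair is considered once
upper = corr.where(np.triu(np.ones(corr.shape), k=1).astype(bool))
# drop one column from every pair correlated above the cutoff
to_drop = [col for col in upper.columns if (upper[col] > 0.9).any()]
reduced = desc_df.drop(columns=to_drop)
print(len(desc_df.columns), '->', len(reduced.columns))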
print('First white-matter anatomy image (3D) is located at: %s' %
      oasis_dataset.white_matter_maps[0])  # 3D data

#############################################################################
# Preprocess data
# ----------------
nifti_masker = NiftiMasker(standardize=False, smoothing_fwhm=2,
                           memory='nilearn_cache')  # cache options
gm_maps_masked = nifti_masker.fit_transform(gm_imgs_train)

# The features with too low between-subject variance are removed using
# :class:`sklearn.feature_selection.VarianceThreshold`.
from sklearn.feature_selection import VarianceThreshold
variance_threshold = VarianceThreshold(threshold=.01)
gm_maps_thresholded = variance_threshold.fit_transform(gm_maps_masked)
gm_maps_masked = variance_threshold.inverse_transform(gm_maps_thresholded)

# Then we convert the data back to the mask image in order to use it for
# the decoding process
mask = nifti_masker.inverse_transform(variance_threshold.get_support())

############################################################################
# Prediction pipeline with ANOVA and SVR using
# :class:`nilearn.decoding.DecoderRegressor` Object

# In nilearn we can benefit from the built-in DecoderRegressor object to
# do ANOVA with SVR instead of manually defining the whole pipeline.
# This estimator also uses Cross Validation to select best models and ensemble
# them. Furthermore, you can pass n_jobs=<some_high_value> to the
# DecoderRegressor class to take advantage of a multi-core system.
def feature_selection_with_plots(X_train, y_train):
    # initial point
    print("Current N of features:", len(X_train.columns))
    print(" ")

    # removing features with zero variance
    print("\033[1m" + "Remove features with zero variance" + "\033[0m")
    selector = VarianceThreshold()
    selector.fit(X_train)
    selected_columns = X_train.columns[(selector.get_support())]
    print("N of dropped columns:", len(set(X_train.columns) - set(selected_columns)))
    X_train = X_train[selected_columns]
    print("Current N of features:", len(X_train.columns))

    # Tree-based feature selection
    clf = ExtraTreesClassifier(n_estimators=50, random_state=333)
    clf = clf.fit(X_train, y_train)

    # feature importance
    feature_importance = clf.feature_importances_.ravel()
    feature_names = X_train.columns
    data_tuples = list(zip(feature_names, feature_importance))
    features = pd.DataFrame(data_tuples, columns=["feature_names", "feature_importance"])

    # plot top n features sorted by feature importance
    n = 30
    fe = features.sort_values(["feature_importance"], ascending=False).reset_index(drop=True)
    fe = fe.head(n)
    fe = fe.sort_values(["feature_importance"], ascending=True).reset_index(drop=True)

    fig = plt.figure(figsize=[12, 7])
    ax = fig.add_axes([0, 0, 1, 1])
    data = fe["feature_importance"].values
    names = fe["feature_names"].values
    y_pos = np.arange(len(names))
    plt.barh(y_pos, data, color="darkgreen")
    plt.yticks(y_pos, names)
    plt.title("Top " + str(n) + " features")
    plt.xlabel("feature importance")
    plt.ylabel("column name")
    plt.savefig("figures/Top" + str(n) + "features.png", bbox_inches="tight")
    plt.show()

    print("\033[1m" + "Tree-based feature selection" + "\033[0m")
    selector = SelectFromModel(clf, prefit=True)
    selected_columns = X_train.columns[(selector.get_support())]
    print("N of dropped columns:", len(set(X_train.columns) - set(selected_columns)))
    X_train = X_train[selected_columns]
    print("Current N of features:", len(X_train.columns))

    corr = abs(X_train.corr())
    plt.figure(figsize=(12, 12))
    sns.heatmap(corr, square=True)
    plt.title("Correlation Matrix after tree-based feature selection", fontsize=15)
    plt.savefig("figures/cm_after_1stFS.png", bbox_inches="tight")
    plt.show()

    # drop columns highly correlated between each other, keeping the one with
    # higher feature importance
    print("\033[1m" + "Drop highly correlated features" + "\033[0m")
    correlations = []
    feature_tuples = []
    for col in X_train.columns:
        for row in X_train.columns:
            correlation = corr.loc[row, col]
            if row == col:
                pass
            elif (col, row) in feature_tuples:
                pass
            elif correlation >= 0.7:
                correlations.append(correlation)
                feature_tuples.append((row, col))

    drop = []
    for tup in feature_tuples:
        f0 = tup[0]
        f1 = tup[1]
        imp_f0 = features[features["feature_names"] == f0]["feature_importance"].values
        imp_f1 = features[features["feature_names"] == f1]["feature_importance"].values
        if imp_f0 <= imp_f1:
            drop.append(f0)
        else:
            drop.append(f1)
    drop = set(drop)
    print("N of dropped features:", len(drop))
    selected_columns = list(set(X_train.columns) - set(drop))
    X_train = X_train[selected_columns]
    print("Current N of features:", len(X_train.columns))

    corr = abs(X_train.corr())
    plt.figure(figsize=(12, 12))
    sns.heatmap(corr, square=True, annot=True, fmt=".2")
    plt.title("Final Correlation Matrix", fontsize=15)
    plt.savefig("figures/cm_after_2ndFS.png", bbox_inches="tight")
    plt.show()

    return X_train
def VarianceThreshold_demo():
    X = [[0, 0, 1],
         [0, 1, 0],
         [1, 0, 0],
         [0, 1, 1],
         [0, 1, 0],
         [0, 1, 1]]
    sel = VarianceThreshold(threshold=(0.2))
    y = sel.fit_transform(X)
    print(y)
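# For this toy matrix the column variances are roughly 0.139, 0.222 and 0.25
# (each column is Bernoulli with variance p * (1 - p)), so the 0.2 threshold
# drops the first column and the demo prints a 6 x 2 array. A quick way to
# verify the variances directly (a sketch, not part of the original demo):
import numpy as np
X = [[0, 0, 1], [0, 1, 0], [1, 0, 0], [0, 1, 1], [0, 1, 0], [0, 1, 1]]
print(np.var(X, axis=0))  # ~[0.139 0.222 0.25 ]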
def main():
    args = getOptions()
    print(args)
    if args.model == 'gBoosting':
        fn = ("submissionv4_%s_gBoosting_%s_%s_%s_%s_%s.csv" %
              (args.fts, args.loss, str(args.minsamplessplit),
               str(args.lrate).replace('.', 'dian'), str(args.nest),
               str(args.maxdepth)))
    elif args.model == 'randomForest':
        fn = ("submissionv4_%s_randomForest_%s.csv" % (args.fts, args.nest))
    print(fn)
    print("train file read")
    train_x, train_y = readfile_noid(args.train, 'train')
    train_x_new, id = extractID(train_x)
    train_x_clean, contentdict = cityclean(train_x_new)
    del id
    print("test file read")
    test_x, test_y = readfile_noid(args.test, 'test')
    test_x_new, id = extractID(test_x)
    test_x_clean, contentdict = cityclean(test_x_new, contentdict)
    del contentdict

    # remove features with no distinction and less importance
    print("remove features with no distinction and less importance")
    sel = VarianceThreshold()
    train_x_uniq = sel.fit_transform(train_x_clean)
    test_x_uniq = sel.transform(test_x_clean)
    # indices = [i for i in range(len(train_x[0]))]
    # frqIndex = trimfrq(train_x)
    # for i in frqIndex:
    #     indices.remove(i)
    # train_x_uniq = indexTodata(train_x, indices)
    # test_x_uniq = indexTodata(test_x, indices)

    # normalization
    print("normalization")
    train_x_nor, mean, std = normalize(train_x_uniq)
    test_x_nor, mean, std = normalize(test_x_uniq, mean, std)

    # feature selection
    print("feature selection")
    if args.fts == 'cor':
        train_x_sel, test_x_sel = correlationSelect(train_x_nor, train_y, test_x_nor)
    elif args.fts == 'extraTrees':
        train_x_sel, test_x_sel = ExtraTreesSelect(train_x_nor, train_y, test_x_nor)
    elif args.fts == 'randomTree':
        train_x_sel, test_x_sel = randomTreesSelect(train_x_nor, train_y, test_x_nor)
    else:
        train_x_sel = copy.deepcopy(train_x_nor)
        test_x_sel = copy.deepcopy(test_x_nor)
    print(len(train_x_nor[0]))
    print(len(train_x_sel[0]))
    del train_x_nor, test_x_nor, train_x_uniq, test_x_uniq

    # modeling
    print("modeling")
    if args.model == 'gBoosting':
        clf = GradientBoostingClassifier(loss=args.loss,
                                         learning_rate=args.lrate,
                                         n_estimators=args.nest,
                                         max_depth=args.maxdepth,
                                         min_samples_split=args.minsamplessplit,
                                         verbose=1)
    elif args.model == 'randomForest':
        clf = RandomForestClassifier(n_estimators=args.nest, class_weight='auto')
    clf.fit(train_x_sel, train_y)
    train_pdt = clf.predict(train_x_sel)
    MCC, Acc_p, Acc_n, Acc_all = get_Accs(train_y, train_pdt)
    print("MCC, Acc_p , Acc_n, Acc_all(train): ")
    print("%s,%s,%s,%s" % (str(MCC), str(Acc_p), str(Acc_n), str(Acc_all)))
    test_pdt = clf.predict_proba(test_x_sel)
    # MCC, Acc_p, Acc_n, Acc_all = get_Accs(test_y, test_pdt)
    # print("MCC, Acc_p , Acc_n, Acc_all(test): ")
    # print("%s,%s,%s,%s" % (str(MCC), str(Acc_p), str(Acc_n), str(Acc_all)))

    fout = open(fn, 'w')
    fout.write("ID,target\n")
    for index, eachline in enumerate(test_pdt):
        fout.write("%s,%s\n" % (str(int(id[index])), str(test_pdt[index][1])))
    fout.close()
def var(self, threshold):
    sel = VarianceThreshold(threshold)
    self.X = sel.fit_transform(self.x)
    return self.X
def randomForestClassifier(self, train_cols, test_cols, targets,
                           feature_selction_var, min_abundance_threshold,
                           shuffle=False):
    """Run random forest classification (assumes numpy imported as np).

    Note: min_abundance_threshold is not used in this code path.
    """
    from sklearn.ensemble import RandomForestClassifier
    from sklearn.feature_selection import VarianceThreshold
    from treeinterpreter import treeinterpreter as ti

    # work only on the unmasked rows; rows are species, columns are samples
    unmasked = self.abundance_df[self.abundance_df['masked'] == False]
    train = unmasked.loc[:, train_cols]
    test = unmasked.loc[:, test_cols]
    names = list(unmasked.loc[:, 'species'])

    # feature selection by variance
    sel = VarianceThreshold(threshold=(0.999 * (1 - 0.999)))
    if feature_selction_var:
        # fit on train only and reuse the same support for test,
        # so both keep the same species
        train = sel.fit_transform(np.transpose(train.values))
        test = sel.transform(np.transpose(test.values))
        names = list(unmasked.loc[:, 'species'].values[sel.get_support()])
    else:
        train = np.transpose(train.values)
        test = np.transpose(test.values)
    # samples of group 0 (rows after the transpose), same features as train
    ds1 = np.asarray(train)[[i for i, j in enumerate(targets) if j == 0], :]

    if shuffle == 'index':
        from random import shuffle as shuffle_fn  # don't shadow the parameter
        shuffle_fn(names)

    target = targets
    # targets are ordered group-0 first, then group-1
    group1 = list(unmasked.loc[:, train_cols].columns[:target.count(0)])
    group2 = list(unmasked.loc[:, train_cols].columns[target.count(0):])

    rf = RandomForestClassifier(n_estimators=1000)
    rf.fit(train, target)

    # decompose predictions into bias + per-feature contributions
    prediction, bias, contributions = ti.predict(rf, np.array(train))
    totalc = np.mean(contributions, axis=0)

    # mean class-0 contribution of each feature over the group-0 samples
    prediction1, bias1, contributions1 = ti.predict(rf, np.array(ds1))
    mean_contri = [0 for i in range(len(names))]
    for s in range(len(ds1)):
        for i in range(len(names)):
            mean_contri[i] += contributions1[s][i][0]
    mean_contri = [x / len(ds1) for x in mean_contri]

    names_list = []
    for c, org in sorted(zip(mean_contri, names), reverse=True):
        if c != 0:
            idx = unmasked[unmasked['species'] == org].index.tolist()[0]
            if shuffle:
                # names were shuffled, so map back through their position
                idx = list(unmasked.index)[names.index(org)]
            maximum = max(unmasked.loc[idx, group1 + group2])
            names_list.append([round(c, 3), org, round(maximum, 3)])
    return names_list
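# treeinterpreter decomposes every forest prediction into a bias term plus
# per-feature contributions. A minimal sketch on synthetic data (all names
# and values illustrative) verifying that identity:
import numpy as np
from sklearn.ensemble import RandomForestClassifier
from treeinterpreter import treeinterpreter as ti

rng = np.random.RandomState(0)
X = rng.rand(60, 5)
y = (X[:, 0] + X[:, 1] > 1).astype(int)

rf = RandomForestClassifier(n_estimators=50, random_state=0).fit(X, y)
prediction, bias, contributions = ti.predict(rf, X)
# for each sample and class: prediction == bias + sum over features
assert np.allclose(prediction, bias + contributions.sum(axis=1))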
from collections import Counter
from pprint import pprint

import pandas as pd
import matplotlib.pyplot as plt
from mpl_toolkits.mplot3d import Axes3D  # noqa: F401 -- enables 3-D axes
from sklearn.feature_selection import (SelectKBest, chi2, VarianceThreshold,
                                       SelectFromModel)
from sklearn.preprocessing import StandardScaler
from sklearn.ensemble import RandomForestClassifier
from sklearn.decomposition import PCA, KernelPCA
from sklearn.discriminant_analysis import LinearDiscriminantAnalysis
from sklearn.model_selection import KFold, cross_val_score
from sklearn.svm import SVC
from sklearn.neighbors import KNeighborsClassifier
from sklearn.naive_bayes import GaussianNB
from imblearn.combine import SMOTEENN
from imblearn.over_sampling import SMOTE


def do_gleason(t_data, filenames, mode):
    # FEATURE SELECTION
    # Select k best features, scale, apply VarianceThreshold, then select
    # from an RF model
    kbest = SelectKBest(score_func=chi2, k=15000)
    scaler = StandardScaler()
    thresholding = VarianceThreshold()
    fs_data = []
    for i, d in enumerate(t_data):
        print("\nFILENAME: {}".format(filenames[i]))
        t_rows = list(d.index)
        t_columns = d.columns[:-3]
        # k best
        selector = kbest.fit(d.iloc[:, :-3], d.iloc[:, -2])
        t_columns = t_columns[selector.get_support()]
        fs_data.append(pd.DataFrame(selector.transform(t_data[i].iloc[:, :-3]),
                                    columns=t_columns, index=t_rows))
        if mode == 'show':
            print("Selecting k best features -\n", fs_data[i].head())
        # scale
        t_columns = fs_data[i].columns
        fs_data[i] = pd.DataFrame(scaler.fit_transform(fs_data[i]),
                                  columns=t_columns, index=t_rows)
        if mode == 'show':
            print("Scaling data -\n", fs_data[i].head())
        # variance threshold (keep only the surviving column labels)
        fs_data[i] = pd.DataFrame(thresholding.fit_transform(fs_data[i]),
                                  columns=t_columns[thresholding.get_support()],
                                  index=t_rows)
        t_columns = fs_data[i].columns
        if mode == 'show':
            print("After variance thresholding -\n", fs_data[i].head())
        # select from RF
        classifier = RandomForestClassifier(n_estimators=1)
        classifier = classifier.fit(fs_data[i], d['Gleason'])
        selector = SelectFromModel(classifier, prefit=True)
        t_columns = t_columns[selector.get_support()]
        fs_data[i] = pd.DataFrame(selector.transform(fs_data[i]),
                                  columns=t_columns, index=t_rows)
        fs_data[i]['Gleason'] = d['Gleason']
        if mode == 'show':
            print("Selecting data from RF model -\n", fs_data[i].head())
            print("Shape after feature selection: {}".format(fs_data[i].shape),
                  end="\n\n")

    # RESAMPLING - SMOTEENN
    # note: fs_data still carries the 'Gleason' column, so the target is
    # resampled along with the features and remains in the feature matrix
    balanced_data = [[] for _ in range(2)]
    for i, d in enumerate(fs_data):
        sme = SMOTEENN(random_state=42,
                       smote=SMOTE(random_state=42, k_neighbors=1))
        # x holds the features, y the targets
        x, y = sme.fit_resample(fs_data[i], t_data[i]['Gleason'])
        balanced_data[i].append(x)
        balanced_data[i].append(y)
        if mode == 'show':
            print("FILENAME: {}".format(filenames[i]),
                  Counter(balanced_data[i][1]))

    # DIMENSIONALITY REDUCTION - kernel PCA and LDA (can be toggled on or off)
    pca = False
    pca_dim = 31
    lda = True
    lda_dim = 3
    if pca or lda:
        dr_data = []
        for i in range(len(filenames)):
            print("\nFILENAME: {}".format(filenames[i]))
            if pca:
                decomposer = KernelPCA(n_components=pca_dim, kernel='rbf',
                                       gamma=0.05, degree=7)
                dr_data.append(decomposer.fit_transform(balanced_data[i][0]))
                print("Shape and type after PCA: ",
                      dr_data[i].shape, type(dr_data[i]))
            else:
                dr_data.append(balanced_data[i][0])
            if lda:
                decomposer = LinearDiscriminantAnalysis(n_components=lda_dim)
                dr_data[i] = decomposer.fit_transform(dr_data[i],
                                                      balanced_data[i][1])
                print("Shape and type after LDA: ",
                      dr_data[i].shape, type(dr_data[i]))
    else:
        dr_data = [balanced_data[0][0], balanced_data[1][0]]

    # CLASSIFICATION
    splits = 10
    seed = 7
    kfold = KFold(n_splits=splits, random_state=seed, shuffle=True)
    results = {'SVM': [], 'RF': [], 'KNN': [], 'NB': []}
    for i, d in enumerate(dr_data):
        # SVM
        classifier = SVC(gamma='auto')
        results['SVM'].append(cross_val_score(classifier, pd.DataFrame(dr_data[i]),
                                              balanced_data[i][1], cv=kfold))
        results['SVM'][i] = results['SVM'][i].mean()
        # RF
        classifier = RandomForestClassifier(n_estimators=100, max_depth=10,
                                            random_state=7, max_features='auto',
                                            criterion='gini')
        results['RF'].append(cross_val_score(classifier, pd.DataFrame(dr_data[i]),
                                             balanced_data[i][1], cv=kfold))
        results['RF'][i] = results['RF'][i].mean()
        # KNN: keep the best mean score over k = 1..15
        k_scores = []
        for n in range(1, 16):
            knn = KNeighborsClassifier(n_neighbors=n)
            scores = cross_val_score(knn, pd.DataFrame(dr_data[i]),
                                     balanced_data[i][1], cv=kfold)
            k_scores.append(scores.mean())
        results['KNN'].append(max(k_scores))
        # NB
        nb = GaussianNB()
        results['NB'].append(cross_val_score(nb, pd.DataFrame(dr_data[i]),
                                             balanced_data[i][1], cv=kfold))
        results['NB'][i] = results['NB'][i].mean()
    print("\nFinal Results for datasets: {0}, {1} -".format(filenames[0],
                                                            filenames[1]))
    pprint(results)

    # PLOTTING
    colors = {6: 'y', 7: 'g', 8: 'b', 9: 'r', 10: 'm'}
    # PCA
    pca = PCA(n_components=3)
    x_pca = pca.fit_transform(balanced_data[0][0])
    fig = plt.figure(figsize=(13, 7))
    plt.suptitle("3-D plot for resampled data using dimensionality reduction "
                 "(Gleason score)\n\n")
    ax = fig.add_subplot(121, projection='3d')
    ax.set_title("PCA\n\n")
    ax.view_init(elev=177, azim=-96)
    handles = {}
    for i in range(len(balanced_data[0][1])):
        score = balanced_data[0][1][i]
        handles[score] = ax.scatter(x_pca[i][0], x_pca[i][1], x_pca[i][2],
                                    c=colors[score], label=score)
    plt.legend([handles[s] for s in sorted(handles)],
               [str(s) for s in sorted(handles)],
               scatterpoints=1, loc='upper right', ncol=1, fontsize=10)
    # PCA + LDA
    pca = PCA(n_components=10)
    x_pca = pca.fit_transform(balanced_data[0][0])
    lda = LinearDiscriminantAnalysis(n_components=3)
    x_lda = lda.fit_transform(x_pca, balanced_data[0][1])
    ax = fig.add_subplot(122, projection='3d')
    plt.title("PCA & LDA\n\n")
    ax.view_init(elev=10, azim=-112)
    handles = {}
    for i in range(len(balanced_data[0][1])):
        score = balanced_data[0][1][i]
        handles[score] = ax.scatter(x_lda[i][0], x_lda[i][1], x_lda[i][2],
                                    c=colors[score], label=score)
    plt.legend([handles[s] for s in sorted(handles)],
               [str(s) for s in sorted(handles)],
               scatterpoints=1, loc='upper right', ncol=1, fontsize=10)
    # plt.show()
    return results
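# SMOTEENN combines SMOTE oversampling with edited-nearest-neighbours
# cleaning. A minimal sketch on synthetic imbalanced data (all values
# illustrative) showing how the class counts change:
from collections import Counter
from sklearn.datasets import make_classification
from imblearn.combine import SMOTEENN

X, y = make_classification(n_samples=300, weights=[0.9, 0.1], random_state=42)
print("before:", Counter(y))
X_res, y_res = SMOTEENN(random_state=42).fit_resample(X, y)
print("after: ", Counter(y_res))  # minority boosted, noisy samples removed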
from datetime import datetime

import numpy as np
from sklearn.datasets import load_iris
from sklearn.feature_selection import VarianceThreshold

import algorithm_1

if __name__ == '__main__':
    startTime = datetime.now()
    data = load_iris()
    X = data.data
    y = data.target
    for kNumOfNeighbors in range(15, 16):
        weightMat = algorithm_1.buildWeightMat(X, [], kNumOfNeighbors)
        diagleMat = algorithm_1.buildDiagleMat(weightMat)
        laplaMat = algorithm_1.buildLaplacianMat(weightMat, diagleMat)
        featureRowMat = np.transpose(X)
        listLaplacianScore = [
            algorithm_1.computeLaplacianScore(np.transpose([featureVec]),
                                              laplaMat, diagleMat)
            for featureVec in featureRowMat
        ]
        algorithm_1.saveSortedLaplaFeatureIndexes(listLaplacianScore,
                                                  filename="aaa")
    # variance-based selection: keep features whose variance exceeds the
    # Bernoulli cutoff p(1 - p) with p = 0.9
    sel = VarianceThreshold(threshold=(.9 * (1 - .9)))
    new_X = sel.fit_transform(X)
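# A hypothetical follow-up using the `sel` and `data` objects above: map
# the support mask back to the iris feature names to see what survived.
print(sel.variances_)  # per-feature variances computed during fit
print(np.array(data.feature_names)[sel.get_support()])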
def variance(obs, num_obs):
    # rank features by variance and return the indices of the num_obs most
    # variable ones; obs is features x observations, so fit on the transpose
    selector = VarianceThreshold()
    selector.fit(obs.transpose())
    indices = np.argsort(selector.variances_)[-num_obs:]
    return indices
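# A hypothetical usage sketch of the helper above on random data:
import numpy as np
from sklearn.feature_selection import VarianceThreshold

rng = np.random.RandomState(0)
obs = rng.rand(20, 100)   # 20 features x 100 observations
print(variance(obs, 5))   # indices of the 5 highest-variance features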
fmri_data, df_data = utils.groupby_average(fmri_data,
                                           df_data.reset_index(),
                                           groupby=['id'])
df_data = df_data.reset_index()
# kept for defining the cross-validation method
BOLD = fmri_data.copy()
targets = np.array([label_map[item] for item in df_data['targets'].values])
groups = df_data['words'].values

# remove low-variance voxels and standardize the BOLD signal
from sklearn.feature_selection import VarianceThreshold
from sklearn.preprocessing import StandardScaler
variance_threshold = VarianceThreshold()
BOLD = variance_threshold.fit_transform(BOLD)
scaler = StandardScaler()
BOLD = scaler.fit_transform(BOLD)

# word embedding: convert the words into embedding features
for word2vec_vec, word2vec_name in zip(word2vec_vecs, word2vec_names):
    csv_filename = os.path.join(
        saving_dir,
        '{} {} {} {} {} {}.csv'.format(experiment, here, sub_name,
                                       roi_name, condition, word2vec_name))
    processed = glob(os.path.join(saving_dir, '*.csv'))
    if csv_filename in processed:
        # skip files that have already been processed
        print(csv_filename)
        continue
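# Caveat: fitting VarianceThreshold and StandardScaler on the full BOLD
# array before cross-validation lets test-fold statistics leak into the
# preprocessing. A leak-free sketch (the classifier is illustrative, not
# part of the original pipeline):
from sklearn.pipeline import make_pipeline
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import cross_val_score, GroupKFold

pipe = make_pipeline(VarianceThreshold(), StandardScaler(),
                     LogisticRegression(max_iter=1000))
# both preprocessing steps are re-fit on each training fold only
scores = cross_val_score(pipe, fmri_data, targets,
                         groups=groups, cv=GroupKFold(n_splits=5))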