def variance_threshold(features_train, features_valid): """Return the initial dataframes after dropping some features according to variance threshold Parameters: ---------- features_train: pd.DataFrame features of training set features_valid: pd.DataFrame features of validation set Output: ------ features_train: pd.DataFrame features_valid: pd.DataFrame """ from sklearn.feature_selection import VarianceThreshold threshold=0.01 selector = VarianceThreshold(threshold=threshold) selector.fit(features_train) ## Instead of using the transform() method, we look at which columns have been dropped, to be able to drop in both training and validation set the same features. This way, we keep the column names to make interpretation easier variances = selector.variances_ dropped_features = features_train.columns.values[variances < threshold] #name of features to drop features_train.drop(dropped_features, axis=1, inplace=True) features_valid.drop(dropped_features, axis=1, inplace=True) return features_train, features_valid
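# A minimal usage sketch (not from the original source) for the helper above: the toy
# DataFrames and the expected output are illustrative assumptions. It shows that the
# near-constant column "b" is dropped from both the training and the validation frame,
# while the column names are preserved.
import pandas as pd

train = pd.DataFrame({"a": [1.0, 2.0, 3.0, 4.0],
                      "b": [0.0, 0.0, 0.0, 0.001]})   # variance far below 0.01
valid = pd.DataFrame({"a": [5.0, 6.0], "b": [0.0, 0.0]})

train_sel, valid_sel = variance_threshold(train, valid)
print(list(train_sel.columns), list(valid_sel.columns))  # expected: ['a'] ['a']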
def vectorize_EX(self, columns, variance_thresh=0, train_only=False): print('Start vectorizing') start_time = time.time() hasher = CountVectorizer(binary=True, tokenizer=LemmaTokenizer(), stop_words='english') train_dtm = hasher.fit_transform( self.ga_bm_train[columns].apply(lambda x: ','.join(x), axis=1)) print(hasher.get_feature_names()) print('dtm train shape: ', train_dtm.shape) selector = VarianceThreshold(variance_thresh) train_dtm = selector.fit_transform(train_dtm) print('dtm train shape after variance thresh: ', train_dtm.shape) if not train_only: test_dtm = hasher.transform( self.ga_bm_test[columns].apply(lambda x: ','.join(x), axis=1)) print('dtm test shape: ', test_dtm.shape) test_dtm = selector.transform(test_dtm) print('dtm test shape after variance thresh: ', test_dtm.shape) print("Time: ", round(((time.time() - start_time)/60), 2)) print('Complete vectorizing') if train_only: return train_dtm else: return (train_dtm, test_dtm)
def feature_select(word,instance_dic,feature_dic, thre_hold=0.01, num_feature=100): instances_list = instance_dic[word] feature_words=feature_dic[word] feature_xs = [] labels = [] for instance in instances_list: label = ' '.join(instance.senseid) feature_x_dic = feature_vector(instance,feature_words) feature_vals=[] for word in feature_words: feature_vals.append(feature_x_dic[word]) feature_xs.append(feature_vals) labels.append(label) # 1st round feature selection by removing low variance features sel_lowvr = VarianceThreshold(threshold=(thre_hold)) feature_xs_selected = sel_lowvr.fit(feature_xs) lowvr_index = feature_xs_selected.get_support(indices=True).tolist() feature_xs_selected = feature_xs_selected.transform(feature_xs).tolist() # 2nd round feature selection using sklearn's SelectKBest() if num_feature < len(feature_xs_selected[0]): sel_chi2 = SelectKBest(chi2, k= num_feature).fit(feature_xs_selected, labels) chi2_index= sel_chi2.get_support(indices=True).tolist() #feature_xs_selected = sel_chi2.transform(feature_xs_selected).tolist()# transform from numpy array back to lis return lowvr_index, chi2_index else: print str(word) + ": chi2 selection not executed due to low # of features" return lowvr_index, [i for i in range(len(lowvr_index))]
def main(): args = getOptions() print args print "train file read" train_x, train_y = readfile_noid(args.train,'train',4) train_x_new, id = extractID(train_x) del train_x train_x_clean, contentdict = cityclean(train_x_new) del id, train_x_new #remove feature with no distinction and less important print "remove feature with no distinction and less important" sel = VarianceThreshold() train_x_uniq = sel.fit_transform(train_x_clean) del train_x_clean #normalization print "normalization" train_x_nor, mean, std = normalize(train_x_uniq) del train_x_uniq #feature selection and modeling print "feature selection and modeling" exclusivefs(train_x_nor, train_y)
def interactive_pipeline(X, Y, pca_n_components, random_forest_n):
    # Remove columns that contain missing values
    X.dropna(axis=1, inplace=True)

    # Standardize X to the [0, 1] range
    X = pd.DataFrame(preprocessing.MinMaxScaler().fit_transform(X))

    # Cut off low-variance features (the result must be kept, otherwise the filter has no effect)
    variance_threshold = 0.03
    variance_cutoff = VarianceThreshold(threshold=variance_threshold)
    variance_cutoff.fit(X)
    X = X.loc[:, variance_cutoff.get_support()]

    # Cut off highly correlated features
    corr_matrix = X.corr().abs()
    upper = corr_matrix.where(np.triu(np.ones(corr_matrix.shape), k=1).astype(bool))
    to_drop = [column for column in upper.columns if any(upper[column] > 0.7)]
    X.drop(columns=to_drop, inplace=True)

    # Random forest feature importance
    k_best_features = random_forest_n
    feature_importance = random_forest_selection.get_feature_importance(X, Y)
    processed_dataframe, X = random_forest_selection.get_k_most_important_features(
        X, Y, k_best_features, feature_importance)

    # PCA
    pca = PCA_Obj(X)
    X = pca.create_pca(pca_n_components)
    print("X.shape", X.shape)
    return X, Y

#feature_selection_pipeline_from_file()
def select_features(x, y, methods=('variance', 'correlation', 'l1', 'forest')):
    '''
    methods = ('variance', 'correlation', 'l1', 'forest')
    - variance: use a variance threshold to discard features that are mostly 0 or 1
    - correlation: use a univariate test (f_regression) to drop the least relevant features
    - l1: use an l1 penalty to remove features that a sparse solution does not need
    - forest: use a tree ensemble to rank feature importances and keep the important ones
    Each selected method yields a set of column indices; their intersection is returned.
    '''
    idx_list = []
    if 'variance' in methods:
        vt = VT(threshold=(0.99 * (1 - 0.99)))
        vt.fit(x)
        idx_list.append(set(vt.get_support(indices=True)))
    if 'correlation' in methods:
        cr = SP(f_regression, percentile=80)
        # univariate scores need a 1-D target; use the first output column if y has several
        y_uni = y.values[:, 0] if y.values.ndim > 1 else y.values
        cr.fit(x.values, y_uni)
        idx_list.append(set(cr.get_support(indices=True)))
    if 'l1' in methods:
        rgr = MultiTaskLassoCV(cv=5, n_jobs=-1)
        m = SFM(rgr)
        m.fit(x.values, y.values)
        idx_list.append(set(m.get_support(indices=True)))
    if 'forest' in methods:
        clf = RandomForestRegressor(n_estimators=300, max_features=0.7, n_jobs=-1).fit(x, y)
        m = SFM(clf)
        m.fit(x.values, y.values)
        idx_list.append(set(m.get_support(indices=True)))

    # keep only the features selected by every requested method
    x_indices = set(range(x.shape[1]))
    for indices in idx_list:
        x_indices = x_indices & indices
    print 'All: %s' % len(x_indices)
    return list(x_indices)
def _variance_threshold(self, input_df, threshold): """Uses Scikit-learn's VarianceThreshold feature selection to learn the subset of features that pass the threshold Parameters ---------- input_df: pandas.DataFrame {n_samples, n_features+['class', 'group', 'guess']} Input DataFrame to perform feature selection on threshold: float The variance threshold that removes features that fall under the threshold Returns ------- subsetted_df: pandas.DataFrame {n_samples, n_filtered_features + ['guess', 'group', 'class']} Returns a DataFrame containing the features that are above the variance threshold """ training_features = input_df.loc[input_df['group'] == 'training'].drop(['class', 'group', 'guess'], axis=1) selector = VarianceThreshold(threshold=threshold) try: selector.fit(training_features) except ValueError: # None features are above the variance threshold return input_df[['guess', 'class', 'group']].copy() mask = selector.get_support(True) mask_cols = list(training_features.iloc[:, mask].columns) + ['guess', 'class', 'group'] return input_df[mask_cols].copy()
def doFeatureSelection(self,features,target,k): features_int = np.array(features,dtype=float) target_int = np.array(target,dtype=float) sel = VarianceThreshold(threshold=(.8 * (1 - .8))) features_new = sel.fit_transform(features_int) #features_new = SelectKBest(chi2,k=10).fit_transform(features_int,target_int) return features_new
def variance_cutoff(X,cutoff=0.8): """ Set variance cutoff for variables """ sel = VarianceThreshold(threshold=(cutoff * (1 - cutoff))) X = sel.fit_transform(X) return X
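# Why the cutoff*(1-cutoff) form used above: for a Boolean (0/1) feature the variance
# is p*(1-p), where p is the fraction of ones, so cutoff=0.8 gives a threshold of 0.16
# and removes columns that take the same value in more than roughly 80% of the samples.
# A small self-contained check on made-up data (not part of the original code):
import numpy as np
from sklearn.feature_selection import VarianceThreshold

X = np.array([[0, 0, 0], [0, 0, 1], [0, 0, 0], [0, 0, 1], [0, 0, 0],
              [0, 0, 1], [0, 0, 0], [0, 0, 1], [0, 0, 0], [0, 1, 1]])
# column variances: 0.0 (constant), 0.09 (p = 0.1), 0.25 (p = 0.5)
sel = VarianceThreshold(threshold=0.8 * (1 - 0.8))
X_sel = sel.fit_transform(X)
print(sel.variances_)   # approximately [0.0, 0.09, 0.25]
print(X_sel.shape)      # (10, 1): only the balanced column clears the 0.16 threshold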
def test_same_variances(self): local = VarianceThreshold() dist = SparkVarianceThreshold() shapes = [((10, 5), None), ((1e3, 20), None), ((1e3, 20), 100), ((1e4, 100), None), ((1e4, 100), 600)] for shape, block_size in shapes: X_dense, X_dense_rdd = self.make_dense_rdd() X_sparse, X_sparse_rdd = self.make_sparse_rdd() Z = DictRDD([X_sparse_rdd, X_dense_rdd], columns=('X', 'Y')) local.fit(X_dense) dist.fit(X_dense_rdd) assert_array_almost_equal(local.variances_, dist.variances_) local.fit(X_sparse) dist.fit(X_sparse_rdd) assert_array_almost_equal(local.variances_, dist.variances_) dist.fit(Z) assert_array_almost_equal(local.variances_, dist.variances_)
def main(): args = getOptions() print args print "train file read" train_x, train_y = readfile_noid(args.train,'train') train_x_new, id = extractID(train_x) del id print "test file read" test_x, test_y = readfile_noid(args.test,'test') test_x_new, id = extractID(test_x) #remove feature with no distinction and less important print "remove feature with no distinction and less important" sel = VarianceThreshold() train_x_uniq = sel.fit_transform(train_x_new) test_x_uniq = sel.transform(test_x_new) #normalization print "normalization" train_x_nor, mean, std = normalize(train_x_uniq) test_x_nor, mean, std = normalize(test_x_uniq, mean, std) #feature selection print "feature selection" # Create the RFE object and compute a cross-validated score. svc = SVC(kernel="linear") # The "accuracy" scoring is proportional to the number of correct # classifications rfecv = RFECV(estimator=svc, step=1, cv=StratifiedKFold(train_y, 10), scoring='accuracy') rfecv.fit(train_x_nor, train_y) print("Optimal number of features : %d" % rfecv.n_features_)
def main(): parser = argparse.ArgumentParser(description='Normalize the feature values') required = parser.add_argument_group('required options') required.add_argument('-x', '--outlist', required=True, help='File containing feature values') required.add_argument('-y', '--execlist', required=True, help='File containing exec list') args = parser.parse_args() #X = np.loadtxt(args.outlist, skiprows=1) np.set_printoptions(precision=2) X = np.genfromtxt(args.outlist, skiprows=1) X=np.nan_to_num(X) Y = np.loadtxt(args.execlist, ndmin=2) #f = open("trainlist","wb") #newResult = X/Y #sel = VarianceThreshold(threshold=(.8*(1-.8))) sel = VarianceThreshold(threshold=(.8*(1-.8))) result1 = sel.fit_transform(X) newResult = result1/Y #result2 = sel.fit_transform(newResult) #feature collection for test programs if os.path.isfile('eventlist'): features = np.genfromtxt('eventlist',dtype='str') featureFromVariance = sel.get_support(indices=True) text_file = open("variancefeatures.txt","w") for i in featureFromVariance: text_file.write(features[i]) text_file.write("\n") text_file.close() np.savetxt('normfeaturelist', newResult, fmt='%.2f', delimiter='\t')
def remove_feat_constants(data_frame):
    # Remove feature vectors containing one unique value,
    # because such features do not have predictive value.
    print("")
    print("Deleting zero variance features...")
    # Let's get the zero variance features by fitting VarianceThreshold
    # selector to the data, but let's not transform the data with
    # the selector because it will also transform our Pandas data frame into
    # NumPy array and we would like to keep the Pandas data frame. Therefore,
    # let's delete the zero variance features manually.
    n_features_originally = data_frame.shape[1]
    selector = VarianceThreshold()
    selector.fit(data_frame)
    # Get the indices of zero variance feats
    feat_ix_keep = selector.get_support(indices=True)
    orig_feat_ix = np.arange(data_frame.columns.size)
    feat_ix_delete = np.delete(orig_feat_ix, feat_ix_keep)
    # Delete zero variance feats from the original pandas data frame
    data_frame = data_frame.drop(labels=data_frame.columns[feat_ix_delete],
                                 axis=1)
    # Print info (use the built-in float; np.float is removed in recent NumPy)
    n_features_deleted = feat_ix_delete.size
    print("  - Deleted %s / %s features (~= %.1f %%)" % (
        n_features_deleted, n_features_originally,
        100.0 * (float(n_features_deleted) / n_features_originally)))
    return data_frame
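# A short, hypothetical usage sketch for remove_feat_constants (toy data, not from the
# original source); it assumes numpy/pandas/VarianceThreshold are imported in the same
# module. The constant column "c" should be dropped while the result stays a DataFrame
# with its original column names.
import pandas as pd

df = pd.DataFrame({"a": [1.0, 2.0, 3.0],
                   "b": [0.5, 0.1, 0.9],
                   "c": [7.0, 7.0, 7.0]})   # zero variance
df_reduced = remove_feat_constants(df)
print(list(df_reduced.columns))             # expected: ['a', 'b']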
def feature_selection(features, ideal_num=None):
    from sklearn.feature_selection import VarianceThreshold
    copy = np.copy(features)
    # Keep the reduced arrays: the original call discarded the fit_transform
    # result, so `copy` was returned unchanged.
    selected = []
    for i in range(8):
        sel = VarianceThreshold(threshold=(.8 * (1 - .8)))
        selected.append(sel.fit_transform(copy[i]))
    return selected
def feature_selection_pipeline_from_file(): #get data dataset = refactor_labels(get_data(path, 'Sheet1'), group_column) # all the visualizations auto_visualize_features(dataset.drop(subject_number_column, axis = 1)) #remove missing values columns non_missing_values_treshold = len(dataset.index) * 0.99 dataset.dropna(axis=1, thresh=non_missing_values_treshold, inplace=True) #impute missing values dataset.fillna(dataset.mean(), inplace=True) #set X X = dataset.drop([group_column, subject_number_column], 1) sbj = dataset[subject_number_column] Y = dataset[group_column] names = list(X) # standartize X X = pd.DataFrame(preprocessing.MinMaxScaler().fit_transform(X)) X.columns = names print("p0", X.shape) #cutoff by variance variance_threshold = 0.05 variance_cutoff = VarianceThreshold(threshold=variance_threshold) variance_cutoff.fit_transform(X) print("p1", X.shape) #cutoff high correlation corr_matrix = X.corr().abs() upper = corr_matrix.where(np.triu(np.ones(corr_matrix.shape), k=1).astype(np.bool)) to_drop = [column for column in upper.columns if any(upper[column] > 0.7)] X.drop(to_drop, axis = 1, inplace=True) print("p2",X.shape) #random forest k_best_features = 42 feature_importance = random_forest_selection.get_feature_importance(X,Y) random_forest_selection.save_feature_importance(feature_importance_txt_path, feature_importance, list(X)) processed_dataframe, X = random_forest_selection.get_k_most_important_features(X,Y,k_best_features,feature_importance) print("p3", processed_dataframe.shape) processed_dataframe.to_csv(processed_dataframe_path) #PCA pca = PCA_Obj(X) pca.explained_variance_graph(pca_explained_variance_graph_path) pca.print_components() n_components = 12 X = pca.create_pca(n_components) pca.save_pca_data(features_after_pca, Y=Y) print("p4", X.shape)
def varianceSelection(self, df, threshold=.8):
    if not isinstance(df, pandas.core.frame.DataFrame):
        logger.error('[%s] : [ERROR] Variance selection only possible on Dataframe not %s',
                     datetime.fromtimestamp(time.time()).strftime('%Y-%m-%d %H:%M:%S'), type(df))
        sys.exit(1)
    sel = VarianceThreshold(threshold=(threshold * (1 - threshold)))
    sel.fit(df)
    return df[[c for (s, c) in zip(sel.get_support(), df.columns.values) if s]]
def main(): args = getOptions() fn = ("submission_cor_%s_%s_%s.csv" % (str(args.lrate).replace('.','dian'),str(args.nest),str(args.maxdepth))) print fn print "train file read" train_x, train_y = readfile_noid(args.train,'train') train_x_new, id = extractID(train_x) del id print "test file read" test_x, test_y = readfile_noid(args.test,'test') test_x_new, id = extractID(test_x) #remove feature with no distinction and less important print "remove feature with no distinction and less important" sel = VarianceThreshold() train_x_uniq = sel.fit_transform(train_x_new) test_x_uniq = sel.transform(test_x_new) # indices = [i for i in range(len(train_x[0]))] # frqIndex = trimfrq(train_x) # for i in frqIndex: # indices.remove(i) # train_x_uniq = indexTodata(train_x, indices) # test_x_uniq = indexTodata(test_x, indices) #normalization print "normalization" train_x_nor, mean, std = normalize(train_x_uniq) test_x_nor, mean, std = normalize(test_x_uniq, mean, std) #feature selection print "feature selection" train_x_sel, test_x_sel = correlationSelect(train_x_nor, train_y, test_x_nor) # ftsel = correlationSel() # ftsel.dosel(train_x_nor,train_y) # train_x_sel = ftsel.transform(train_x_nor) # test_x_sel = ftsel.transform(test_x_nor) print "modelsing" clf = GradientBoostingClassifier(loss='deviance', learning_rate=args.lrate, n_estimators=args.nest, max_depth=args.maxdepth, verbose=1) clf.fit(train_x_sel, train_y) train_pdt = clf.predict(train_x_sel) MCC, Acc_p , Acc_n, Acc_all = get_Accs(train_y, train_pdt) print "MCC, Acc_p , Acc_n, Acc_all(train): " print "%s,%s,%s,%s" % (str(MCC), str(Acc_p) , str(Acc_n), str(Acc_all)) test_pdt = clf.predict_proba(test_x_sel) # MCC, Acc_p , Acc_n, Acc_all = get_Accs(test_y, test_pdt) # print "MCC, Acc_p , Acc_n, Acc_all(test): " # print "%s,%s,%s,%s" % (str(MCC), str(Acc_p) , str(Acc_n), str(Acc_all)) fout=open(fn,'w') fout.write("ID,target\n") for index, eachline in enumerate(test_pdt): fout.write("%s,%s\n" % (str(int(id[index])),str(test_pdt[index][1]))) fout.close()
def main(): args = getOptions() print args fn = "destreeSub.csv" print fn print "train file read" train_x, train_y = readfile_noid(args.train,'train') train_x_new, id = extractID(train_x) del id print "test file read" test_x, test_y = readfile_noid(args.test,'test') test_x_new, id = extractID(test_x) #remove feature with no distinction and less important print "remove feature with no distinction and less important" sel = VarianceThreshold() train_x_uniq = sel.fit_transform(train_x_new) test_x_uniq = sel.transform(test_x_new) # indices = [i for i in range(len(train_x[0]))] # frqIndex = trimfrq(train_x) # for i in frqIndex: # indices.remove(i) # train_x_uniq = indexTodata(train_x, indices) # test_x_uniq = indexTodata(test_x, indices) #normalization print "normalization" train_x_nor, mean, std = normalize(train_x_uniq) test_x_nor, mean, std = normalize(test_x_uniq, mean, std) #feature selection print "feature selection" if args.fts == 'cor': train_x_sel, test_x_sel = correlationSelect(train_x_nor, train_y, test_x_nor) elif args.fts == 'extraTrees': train_x_sel, test_x_sel = ExtraTreesSelect(train_x_nor, train_y, test_x_nor) else: train_x_sel = copy.deepcopy(train_x_nor) test_x_sel = copy.deepcopy(test_x_nor) del train_x_nor, test_x_nor, train_x_uniq, test_x_uniq print "modelsing" clf = ExtraTreesClassifier(max_depth=3, n_estimators=10, random_state=0, class_weight='auto') clf.fit(train_x_sel, train_y) train_pdt = clf.predict(train_x_sel) MCC, Acc_p , Acc_n, Acc_all = get_Accs(train_y, train_pdt) print "MCC, Acc_p , Acc_n, Acc_all(train): " print "%s,%s,%s,%s" % (str(MCC), str(Acc_p) , str(Acc_n), str(Acc_all)) test_pdt = clf.predict_proba(test_x_sel) # MCC, Acc_p , Acc_n, Acc_all = get_Accs(test_y, test_pdt) # print "MCC, Acc_p , Acc_n, Acc_all(test): " # print "%s,%s,%s,%s" % (str(MCC), str(Acc_p) , str(Acc_n), str(Acc_all)) fout=open(fn,'w') fout.write("ID,target\n") for index, eachline in enumerate(test_pdt): fout.write("%s,%s\n" % (str(int(id[index])),str(test_pdt[index][1]))) fout.close()
def featureSelectionVarianceThreshold(data, probability = 0.8): dataRaw = data[:, 2:] sel = VarianceThreshold(threshold=(probability*(1 - probability))) dataNew = sel.fit_transform(dataRaw) fd = open('History.txt','a') history = 'Feature Selection: Variance Threshold' + '\n' + 'Selected Feature: ' + str(sel.get_support(True)) + '\n' fd.write(history) fd.close() return np.c_[data[:, :2], dataNew]
def test_zero_variance(): """Test VarianceThreshold with default setting, zero variance.""" for X in [data, csr_matrix(data), csc_matrix(data), bsr_matrix(data)]: sel = VarianceThreshold().fit(X) assert_array_equal([0, 1, 3, 4], sel.get_support(indices=True)) assert_raises(ValueError, VarianceThreshold().fit, [0, 1, 2, 3]) assert_raises(ValueError, VarianceThreshold().fit, [[0, 1], [0, 1]])
def select_centroids(centroids):
    """
    :param centroids: learned centroids
    :return: new_centroids (without the centroids whose variance is below the
             average variance of the centroids)
    """
    # Use the mean per-centroid variance as the cutoff, as described in the docstring
    # (np.var over the whole array would also include the between-centroid spread).
    avg_variance = np.mean(np.var(centroids, axis=1))
    sel = VarianceThreshold(threshold=avg_variance)
    # Transpose so each centroid becomes a column, filter columns, transpose back.
    new_centroids = sel.fit_transform(centroids.T).T
    return new_centroids
def featureReduction(self, data, threshold_input=0.99):
    '''
    Feature reduction: keep only the variables whose variance is greater
    than the given threshold.
    '''
    selector = VarianceThreshold(threshold=threshold_input)
    data = selector.fit_transform(data)
    print 'Feature Selected with threshold ', threshold_input, data.shape
    return data
def variance_threshold(self, dframe=None, columns=None, skip_columns=None, thresh=0.0, autoremove=False): """ Wrapper for sklearn variance threshold to for pandas dataframe :param dframe: :param columns: :param skip_columns: :param thresh: :param autoremove: :return: """ logging.debug("Finding low-variance features") removed_features=[] try: all_columns = dframe.columns # remove the skip columns remaining_cols = all_columns.drop(skip_columns) # get length of new index. max_index = len(remaining_cols) - 1 skipped_idx = [all_columns.get_loc(column) for column in skip_columns] for idx, item in enumerate(skipped_idx): if item > max_index: diff = item - max_index skipped_idx[idx] -= diff if item == max_index: diff = item - len(skip_columns) skipped_idx[idx] -= diff if idx == 0: skipped_idx[idx] = item skipped_values = dframe.iloc[:skipped_idx].values X = dframe.loc[:, remaining_cols].values vt = VarianceThreshold(threshold=thresh) vt.fit(X) feature_indices = vt.get_support(indices=True) feature_names = [remaining_cols[idx] for idx, _ in enumerate(remaining_cols) if idx in feature_indices] removed_features = list(np.setdiff1d(remaining_cols, feature_names)) logging.debug("Found %d low - variance columns " % len(removed_features)) except Exception as e: logging.error(e) logging.error("Could not remove low variance features, some thing went wrong") print(e) pass return dframe, removed_features
def test_variance_threshold(): tpot_obj = TPOT() non_feature_columns = ['class', 'group', 'guess'] training_features = training_testing_data.loc[training_testing_data['group'] == 'training'].drop(non_feature_columns, axis=1) selector = VarianceThreshold(threshold=0) selector.fit(training_features) mask = selector.get_support(True) mask_cols = list(training_features.iloc[:, mask].columns) + non_feature_columns assert np.array_equal(tpot_obj._variance_threshold(training_testing_data, 0), training_testing_data[mask_cols])
def feature_selection_with_scikit(): """ 1-VarianceThreshold is a simple baseline approach to feature selection. It removes all features whose variance doesn’t meet some threshold. By default, it removes all zero-variance features, i.e. features that have the same value in all samples. 2-Univariate feature selection works by selecting the best features based on univariate statistical tests. It can be seen as a preprocessing step to an estimator """ p=0.8 selector = VarianceThreshold(threshold=(p * (1 - p))) c=selector.fit_transform(X) print "Number of the attribute before: ",X.shape[1] print "number of the attribute after:",c.shape[1] # selecting k best attribute instead of chi2, f_classif can also be used skb=SelectKBest(chi2, k=10) X_new=skb.fit_transform(X, y) attr=np.where(skb._get_support_mask(),attributeNames,'-1') print "Best attribute choosen with SelectKBest: " i=1 for att in attr: if att!='-1': print i, ": ",att i+=1 #using ExtraTreesClassifier print "Using feature importance..." etc=ExtraTreesClassifier() etc.fit(X,y).transform(X) print etc.feature_importances_ print etc.max_features print etc.max_depth print "Recursive feature selection : " from sklearn.svm import SVC import sklearn.linear_model as lm from sklearn.cross_validation import StratifiedKFold from sklearn.feature_selection import RFECV # Create the RFE object and compute a cross-validated score. estim=lm.LinearRegression() # The "accuracy" scoring is proportional to the number of correct # classifications rfecv = RFECV(estimator=estim, step=1, cv=StratifiedKFold(y, 2), scoring='accuracy') rfecv.fit(X, y) print("Optimal number of features : %d" % rfecv.n_features_) # Plot number of features VS. cross-validation scores plt.figure() plt.xlabel("Number of features selected") plt.ylabel("Cross validation score (nb of correct classifications)") plt.plot(range(1, len(rfecv.grid_scores_) + 1), rfecv.grid_scores_) plt.show()
def feature_selection(train_instances): logging.basicConfig(level=logging.INFO) logger = logging.getLogger(__name__) logger.info('Crossvalidation started... ') selector = VarianceThreshold() selector.fit(train_instances) logger.info('Number of features used... ' + str(Counter(selector.get_support())[True])) logger.info('Number of features ignored... ' + str(Counter(selector.get_support())[False])) return selector
def feat_selec(tra_val_data, testing_data, thred=0.8): """ Feature selection. """ num_tv = tra_val_data.shape[0] total_data = np.vstack((tra_val_data, testing_data)) selec = VarianceThreshold(threshold=thred) total_selected_data = selec.fit_transform(total_data) return total_selected_data[:num_tv, :], total_selected_data[num_tv:, :]
def FeatureSelection( self ): """Main feature selection method""" if 'Variance' in self.FeatureSelectionMethod: selector = VarianceThreshold(threshold=0.0001) self.Features = selector.fit_transform(self.Features) # pyplot.figure(), pyplot.hist(numpy.var(features, axis = 0), bins = 64), pyplot.show() elif 'Trees' in self.FeatureSelectionMethod: forestFeatures = ExtraTreesClassifier(n_estimators = 512, random_state = 32) forestFeaturesFit = forestFeatures.fit(self.Features, self.Classes) featureImportance = 0.001 featureBool = (forestFeaturesFit.feature_importances_ > featureImportance) self.Features = self.Features[:,featureBool]
def test_variancethreshold_vs_sklearn(): trajectories = AlanineDipeptide().get_cached().trajectories fs = FeatureSelector(FEATS) vt = VarianceThreshold(0.1) vtr = VarianceThresholdR(0.1) y = fs.partial_transform(trajectories[0]) z1 = vt.fit_transform([y])[0] z_ref1 = vtr.fit_transform(y) np.testing.assert_array_almost_equal(z_ref1, z1)
def main(): args = getOptions() print "train file read" train_x, train_y = readfile_noid(args.train,'train') train_x_new, id = extractID(train_x) del id print "test file read" test_x, test_y = readfile_noid(args.test,'test') test_x_new, id = extractID(test_x) #remove feature with no distinction and less important print "remove feature with no distinction and less important" sel = VarianceThreshold() train_x_uniq = sel.fit_transform(train_x_new) test_x_uniq = sel.transform(test_x_new) #normalization print "normalization" train_x_nor, mean, std = normalize(train_x_uniq) test_x_nor, mean, std = normalize(test_x_uniq, mean, std) #feature selection print "feature selection" ftsel = ExtraTreesClassifier() ftsel.fit(train_x_nor, train_y) # importances = ftsel.feature_importances_ # indices_test = np.argsort(importances)[::-1] # indices_test = indices_test.tolist() train_x_trans = ftsel.transform(train_x_nor) test_x_trans = ftsel.transform(test_x_nor) #modelsing print "modelsing" train = xgb.DMatrix(train_x_trans,label=train_y) test = xgb.DMatrix(test_x_trans,label=test_y) gbm = xgb.train({'max_depth':3, 'n_estimators':1500, 'learning_rate':0.1 ,'objective':'binary:logistic','eval_metric':'auc'},train) train_pdt = gbm.predict(train) MCC, Acc_p , Acc_n, Acc_all = get_Accs(train_y, train_pdt) print "MCC, Acc_p , Acc_n, Acc_all(train): " print "%s,%s,%s,%s" % (str(MCC), str(Acc_p) , str(Acc_n), str(Acc_all)) test_pdt = gbm.predict(test) MCC, Acc_p , Acc_n, Acc_all = get_Accs(test_y, test_pdt) print "MCC, Acc_p , Acc_n, Acc_all(test): " print "%s,%s,%s,%s" % (str(MCC), str(Acc_p) , str(Acc_n), str(Acc_all)) fout=open("submission_xgbtrain.csv",'w') fout.write("ID,target\n") for index, eachline in enumerate(test_pdt): fout.write("%s,%s\n" % (str(int(id[index])),str(test_pdt[index]))) fout.close()
# In[28]:
mutual_info = mutual_info_classif(x, y)
mutual_data = pd.Series(mutual_info, index=x.columns)
mutual_data.sort_values(ascending=False)

# # Variance threshold
# Removes every feature whose variance does not exceed the chosen threshold.

# In[29]:
from sklearn.feature_selection import VarianceThreshold
vt = VarianceThreshold(threshold=1)
vt.fit(x)

# In[30]:
x.columns[vt.get_support()]

# In[31]:
# columns that do not pass the variance threshold
zero_therhold = [i for i in x.columns if i not in x.columns[vt.get_support()]]
print(len(dup_list)) train_df = train_df.drop(dup_list, axis=1) print(train_df.shape) test_df = test_df.drop(dup_list, axis=1) print(test_df.shape) X = train_df.iloc[:, 1:-1].values y = train_df['TARGET'].values X_test = test_df.iloc[:, 1:].values print("X shape", X.shape) print("X_test shape", X_test.shape) # remove constant features selector = VarianceThreshold(threshold = 0.001) X = selector.fit_transform(X) X_test = selector.transform(X_test) print("After removing low variance features") print("X shape:", X.shape) print("X_test shape:", X_test.shape) import xgboost as xgb from sklearn.ensemble import BaggingClassifier dtrain = xgb.DMatrix(X, label=y) dtest = xgb.DMatrix(X_test) evallist = [(dtrain,'train')]
from sklearn.feature_selection import VarianceThreshold

data = [[0, 2, 0, 3],
        [0, 1, 4, 3],
        [0, 1, 1, 3]]

vt = VarianceThreshold()  # by default, features with zero variance are filtered out
result = vt.fit_transform(data)
print(result)
def test_variance_threshold(): # Test VarianceThreshold with custom variance. for X in [data, csr_matrix(data)]: X = VarianceThreshold(threshold=.4).fit_transform(X) assert (len(data), 1) == X.shape
from sklearn import datasets from sklearn.feature_selection import VarianceThreshold # Load iris data iris = datasets.load_iris() # Create features and target X = iris.data y = iris.target # Create VarianceThreshold object with a variance with a threshold of 0.5 thresholder = VarianceThreshold(threshold=.5) # Conduct variance thresholding X_high_variance = thresholder.fit_transform(X) # View first five rows with features with variances above threshold X_high_variance[0:5]
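# Continuing the iris snippet above: inspecting the fitted variances shows which
# features survive the 0.5 threshold (the numbers below are approximate, computed
# with ddof=0 as sklearn does).
print(thresholder.variances_)   # roughly [0.68, 0.19, 3.10, 0.58]
print(X_high_variance.shape)    # (150, 3): only sepal width falls below 0.5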
def removeAtipicalData(X, umbral, exp_min_gen): sel = VarianceThreshold(threshold=(umbral * (1 - umbral))) sel.fit_transform(X) return X[X.columns[sel.get_support(indices=True)]]
def perform_variance_threshold(self, v_threshold): selector = VarianceThreshold(v_threshold) self.train_x = selector.fit_transform(self.train_x, self.train_y) self.test_x = selector.transform(self.test_x)
delimiter=',') train_labels = np.loadtxt('TinyMNIST/trainLabels.csv', dtype=np.int32, delimiter=',') test_data = np.loadtxt('TinyMNIST/testData.csv', dtype=np.float32, delimiter=',') test_labels = np.loadtxt('TinyMNIST/testLabels.csv', dtype=np.int32, delimiter=',') class_names = ['0', '1', '2', '3', '4', '5', '6', '7', '8', '9'] # Feature Selection tr_samples_size, _ = train_data.shape all_data = np.vstack((train_data, test_data)) sel = VarianceThreshold(threshold=0.90 * (1 - 0.90)) all_data = sel.fit_transform(all_data) train_data = all_data[:tr_samples_size] test_data = all_data[tr_samples_size:] tr_samples_size, feature_size = train_data.shape te_samples_size, _ = test_data.shape print('Train Data Samples:', tr_samples_size, ', Test Data Samples', te_samples_size, ', Feature Size(after feature-selection):', feature_size) # In[4]: types = [] for i in range(10): types.append([])
def variance(self, X, threshold): from sklearn.feature_selection import VarianceThreshold sel = VarianceThreshold(threshold=(threshold * (1 - threshold))) sel_var = sel.fit_transform(X) X = self.X[X.columns[sel.get_support(indices=True)]] return X
PDXC = PDXC.loc[:, ~PDXC.columns.duplicated()] GDSCM = pd.read_csv("GDSC_mutations.Paclitaxelv2.tsv", sep="\t", index_col=0, decimal=".") GDSCM = pd.DataFrame.transpose(GDSCM) GDSCC = pd.read_csv("GDSC_CNA.Paclitaxelv2.tsv", sep="\t", index_col=0, decimal=".") GDSCC.drop_duplicates(keep='last') GDSCC = pd.DataFrame.transpose(GDSCC) selector = VarianceThreshold(0.05) selector.fit_transform(GDSCE) GDSCE = GDSCE[GDSCE.columns[selector.get_support(indices=True)]] PDXC = PDXC.fillna(0) PDXC[PDXC != 0.0] = 1 PDXM = PDXM.fillna(0) PDXM[PDXM != 0.0] = 1 GDSCM = GDSCM.fillna(0) GDSCM[GDSCM != 0.0] = 1 GDSCC = GDSCC.fillna(0) GDSCC[GDSCC != 0.0] = 1 ls = GDSCE.columns.intersection(GDSCM.columns) ls = ls.intersection(GDSCC.columns) ls = ls.intersection(PDXE.columns)
from sklearn.feature_selection import VarianceThreshold, chi2, f_classif, mutual_info_classif, SelectKBest, SelectFromModel, RFE
from sklearn.model_selection import cross_val_score as cvs
from sklearn.ensemble import RandomForestClassifier
import numpy as np
import pandas as pd
import time
from matplotlib import pyplot as plt

data = pd.read_csv(r'../data/digit_recognizor.csv')

'''
Variance filtering: drop the features whose variance falls below a given threshold.
It does not look at the label.
It is most useful for models that have to scan every feature (kNN, SVM, logistic
regression, regression, ...), mainly to delete useless features and cut running time.
It has little effect on random forests, because they already sample a random subset of
features; in sklearn, a single decision tree also splits nodes on a random feature subset.
'''
threshold = 0.1
vt_filter = VarianceThreshold(threshold=threshold)
vt_filter.fit(data)
remove_features = [
    i for i, j in zip(data.columns, vt_filter.variances_) if j <= threshold
]
print(remove_features)
data_new = vt_filter.transform(data.iloc[10000:, :])

'''
Statistic-based filtering:
chi2: chi-square test; only applicable to discrete, non-negative features.
f_classif: F-test for classification features.
mutual_info_classif: mutual-information feature selection.
All of these scores measure how strongly a feature is related to the label.
'''
SelectKBest(score_func=chi2, k=400).fit_transform(X=data_new[:, 1:],
def variance_filter(self, x_train, x_test):
    # After tuning, a threshold of 6e-5 worked best.
    selector = VarianceThreshold(6e-5)
    # Return the variance-filtered training and test sets.
    return selector.fit_transform(x_train), selector.transform(x_test)
import numpy as np import pandas as pd from sklearn.feature_selection import VarianceThreshold from sklearn.model_selection import train_test_split from sklearn.naive_bayes import BernoulliNB from sklearn.pipeline import make_pipeline # NOTE: Make sure that the class is labeled 'target' in the data file tpot_data = pd.read_csv('PATH/TO/DATA/FILE', sep='COLUMN_SEPARATOR', dtype=np.float64) features = tpot_data.drop('target', axis=1).values training_features, testing_features, training_target, testing_target = \ train_test_split(features, tpot_data['target'].values, random_state=42) # Score on the training set was:0.9800000000000001 exported_pipeline = make_pipeline( VarianceThreshold(threshold=0.1), BernoulliNB(alpha=1.0, fit_prior=True) ) exported_pipeline.fit(training_features, training_target) results = exported_pipeline.predict(testing_features)
# -*- coding: utf-8 -*-
import sklearn
from sklearn.datasets import load_iris  # load the IRIS dataset

iris = load_iris()

from sklearn.feature_selection import VarianceThreshold
print VarianceThreshold(threshold=3).fit_transform(iris.data)

# feature matrix
iris.data
# target vector
print iris.target
import numpy as np
import pandas as pd
from sklearn.feature_selection import SelectFwe, VarianceThreshold, f_classif
from sklearn.model_selection import train_test_split
from sklearn.naive_bayes import MultinomialNB
from sklearn.neural_network import MLPClassifier
from sklearn.pipeline import make_pipeline, make_union
from sklearn.preprocessing import Binarizer, MinMaxScaler
from tpot.builtins import StackingEstimator

# NOTE: Make sure that the outcome column is labeled 'target' in the data file
tpot_data = pd.read_csv('PATH/TO/DATA/FILE', sep='COLUMN_SEPARATOR', dtype=np.float64)
features = tpot_data.drop('target', axis=1)
training_features, testing_features, training_target, testing_target = \
    train_test_split(features, tpot_data['target'], random_state=None)

# Average CV score on the training set was: 0.8285375806358702
exported_pipeline = make_pipeline(
    make_union(
        MinMaxScaler(),
        make_pipeline(SelectFwe(score_func=f_classif, alpha=0.048),
                      Binarizer(threshold=0.05))),
    StackingEstimator(
        estimator=MLPClassifier(alpha=0.01, learning_rate_init=0.001)),
    VarianceThreshold(threshold=0.0001),
    MultinomialNB(alpha=0.1, fit_prior=True))

exported_pipeline.fit(training_features, training_target)
results = exported_pipeline.predict(testing_features)
import numpy as np from sklearn.cross_validation import train_test_split from sklearn.ensemble import ExtraTreesClassifier, VotingClassifier from sklearn.feature_selection import VarianceThreshold from sklearn.pipeline import make_pipeline, make_union from sklearn.preprocessing import FunctionTransformer # NOTE: Make sure that the class is labeled 'class' in the data file tpot_data = np.recfromcsv('PATH/TO/DATA/FILE', delimiter='COLUMN_SEPARATOR', dtype=np.float64) features = np.delete(tpot_data.view(np.float64).reshape(tpot_data.size, -1), tpot_data.dtype.names.index('class'), axis=1) training_features, testing_features, training_classes, testing_classes = \ train_test_split(features, tpot_data['class'], random_state=42) exported_pipeline = make_pipeline( VarianceThreshold(threshold=0.24), ExtraTreesClassifier(criterion="entropy", max_features=0.16, n_estimators=500)) exported_pipeline.fit(training_features, training_classes) results = exported_pipeline.predict(testing_features)
# Missing values or little variance
# Use a minimal variance threshold.
# Normalize before the selection (divide each value by the column mean) and then fit;
# this puts the variances on a comparable scale (and makes them smaller).
# Also drop features that contain a lot of missing values.
################################################
# Finding a good variance threshold
normalized_df = head_df / np.mean(head_df)  # normalized data
normalized_df.boxplot()
plt.show()
print(normalized_df.var())  # variance of the normalized data
# The two features with the lowest variance should be removed.

# Successfully remove the 2 low-variance features
from sklearn.feature_selection import VarianceThreshold
sel = VarianceThreshold(threshold=0.001)  # create a VarianceThreshold feature selector
sel.fit(head_df / head_df.mean())         # fit the selector to normalized head_df
mask = sel.get_support()                  # create a boolean mask
reduced_df = head_df.loc[:, mask]         # apply the mask to create a reduced dataframe
# (a self-contained sketch of this normalize-then-threshold idea follows this snippet)

# Removing features with many missing values
df.isna().sum()                 # count missing values per column
df.isna().sum() / len(df)       # ratio of missing values per column
mask = df.isna().sum() / len(df) < 0.3    # note: sum() must be called, not referenced
print(mask)                     # True or False per column
reduced_df = df.loc[:, mask]    # create a reduced dataset
reduced_df.head()
################################################
# Pairwise correlation
# Measure the strength of the correlation
################################################
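# Self-contained sketch of the normalize-then-threshold idea from the snippet above,
# using made-up data instead of head_df (which is defined elsewhere in the course
# material): dividing each column by its mean puts features on a comparable scale
# before comparing their variances against a single threshold.
import numpy as np
import pandas as pd
from sklearn.feature_selection import VarianceThreshold

rng = np.random.default_rng(0)
df = pd.DataFrame({
    "height_m": rng.normal(1.7, 0.1, 200),   # small absolute variance
    "weight_kg": rng.normal(70, 10, 200),    # large absolute variance
    "shoe_eu": np.full(200, 42.0),           # constant column
})
sel = VarianceThreshold(threshold=0.001)
sel.fit(df / df.mean())                      # fit on mean-normalized data
reduced = df.loc[:, sel.get_support()]       # keep the original (unnormalized) columns
print(reduced.columns.tolist())              # expected: ['height_m', 'weight_kg']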
df = pd.get_dummies(data=df, columns=cols_with_categories)
print("Size of the dataset after recoding the variables: {}".format(df.shape))

input("\n--- Press a key to continue ---\n")

#https://stackoverflow.com/questions/44867219/pandas-filling-na-values-to-be-filled-based-on-distribution-of-existing-values

# Build the training and test sets
X, y = df[df.columns.difference(['income'])], df['income']
X, y = shuffle(X, y, random_state=SEED)
train_x, test_x, train_y, test_y = train_test_split(X, y, test_size=0.3, stratify=y)

# Build the preprocessing pipeline
preproc = [
    ("var", VarianceThreshold(0.01)),
    ("standardize", StandardScaler()),
    ("lasso", SelectFromModel(estimator=LassoCV(tol=0.01))),
]
p = Pipeline(preproc)
x_train_prep = p.fit_transform(train_x, train_y)

print("Shape of the data before and after preprocessing")
print("Before: {}".format(train_x.shape))
print("After: {}".format(x_train_prep.shape))

input("\n--- Press a key to continue ---\n")

# Linear model
def variance_threshold_selector(data, threshold = 0.5): selector = VarianceThreshold(threshold) selector.fit(data) return data[data.columns[selector.get_support(indices = True)]]
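# Hypothetical usage of the helper above (toy data, not from the original source): the
# near-constant column is filtered out while the result stays a DataFrame that keeps
# its original column names.
import pandas as pd

df = pd.DataFrame({"x1": [1.0, 2.0, 3.0, 4.0],
                   "x2": [0.0, 0.0, 0.1, 0.0]})   # variance well below 0.5
print(variance_threshold_selector(df, threshold=0.5).columns.tolist())  # expected: ['x1']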
import numpy as np

X = np.arange(0, 30).reshape(10, 3)  # arange() is short for "array range" and works like range() to fill an array
# print(X)  # X can be seen as a dataset of 10 instances, each with 3 features
X[:, 1] = -1  # set every element of the second column to -1
# print(X)  # the second feature is now constant, so features 1 and 3 have much larger variance than feature 2

from sklearn.feature_selection import VarianceThreshold
vt = VarianceThreshold()  # create a selector that keeps only features whose variance is large enough
Xt = vt.fit_transform(X)
# print(Xt)  # fit_transform() discarded feature 2
# print(vt.variances_)  # variances_ holds the variance of each feature: [74.25, 0., 74.25]
# Before analysing the data we should drop zero-variance features, otherwise the whole process is slowed down for no benefit.
# Separate dataset for validation data_submit = data_cl[unknown_mask] # Separate dataset for training X = data_cl[~unknown_mask] Y = target[~unknown_mask] # ### Variance Threshold # Find all features with more than 90% variance in values. # In[ ]: threshold = 0.90 vt = VarianceThreshold().fit(X) # Find feature names feat_var_threshold = data_cl.columns[vt.variances_ > threshold * (1-threshold)] feat_var_threshold # ### Top 20 most important features # According to `RandomForestClassifier` # In[ ]: model = RandomForestClassifier() model.fit(X, Y)
Y = wins.copy().reset_index(drop=True) X = pd.concat([means, stds], axis=1).reset_index(drop=True) # fill in missing values X = X.fillna(method="bfill").fillna(method="ffill") # split the data into training and testing np.random.seed(1) test_idx = np.random.choice(a=X.index.values, size=int(X.shape[0] / 5), replace=False) train_idx = np.array(list(set(X.index.values) - set(test_idx))) # set up a machine learning pipeline pipeline = Pipeline([ ('var', VarianceThreshold()), ('scale', MinMaxScaler()), # ('model', LassoCV(eps=1e-9, n_alphas=16, n_jobs=-1)), # ('model', BayesianRidge()), ('model', RandomForestRegressor(n_estimators=50, max_depth=8, min_samples_leaf=1, n_jobs=-1, random_state=42)), # ('model', MLPRegressor(max_iter=200, hidden_layer_sizes=(128, 128), learning_rate_init=0.001, batch_size=32, activation="relu", solver="adam", learning_rate="adaptive", random_state=42)), ]) # train the model pipeline.fit(X.iloc[train_idx, :], Y.iloc[train_idx, :])
def main(): """Función principal. Ejecuta el proyecto paso a paso. NOTA: Por motivos de unificar el código, todos los clasificadores considerados son un Pipeline, cuyo último paso es el clasificador en sí, con nombre 'clf'.""" # Inicio de medición de tiempo start = default_timer() # Ignorar warnings de convergencia os.environ["PYTHONWARNINGS"] = "ignore::UserWarning" # Semilla aleatoria para reproducibilidad np.random.seed(SEED) # Número de decimales fijo para salida de vectores np.set_printoptions(formatter={'float': lambda x: "{:0.3f}".format(x)}) print( "------- PROYECTO FINAL: AJUSTE DE MODELOS DE CLASIFICACIÓN -------\n") # # LECTURA DE DATOS # # Cargamos los datos de entrenamiento, validación y test (división 50-20-30) print("Leyendo datos de " + DATASET_NAME + "... ", end="", flush=True) X, y, attr_names = read_data(PATH + DATASET_NAME) X_train, X_val, X_test, y_train, y_val, y_test = \ split_data(X, y, val_size = 0.2, test_size = 0.3) X_train_full = np.vstack((X_train, X_val)) y_train_full = np.concatenate((y_train, y_val)) print("Hecho.\n") # # INSPECCIÓN DE LOS DATOS # if SHOW != Show.NONE: print("--- VISUALIZACIÓN DE LOS DATOS ---\n") # Mostramos distribución de clases en training y test print("Mostrando gráfica de distribución de clases...") vs.plot_class_distribution(y_train_full, y_test, N_CLASSES, SAVE_FIGURES, IMG_PATH) # Visualizamos la importancia de las características según RF print("Mostrando gráfica de importancia de características...") pipe = Pipeline([("var", VarianceThreshold()), ("std", StandardScaler())]) X_train_full_pre = pipe.fit_transform(X_train_full) rf = RandomForestClassifier(200, random_state=SEED, max_depth=20, n_jobs=-1) rf.fit(X_train_full_pre, y_train_full) vs.plot_feature_importance(rf.feature_importances_, n=X_train_full_pre.shape[1], pca=False, save_figures=SAVE_FIGURES, img_path=IMG_PATH) # Mostramos gráficas de preprocesado print( "Mostrando matrices de correlación antes y después de cada preprocesado..." ) preprocess_graphs(X_train_full) if SHOW == Show.ALL: # Visualizamos el conjunto de entrenamiento en 2 dimensiones print( "Mostrando proyección del conjunto de entrenamiento en dos dimensiones..." ) vs.plot_tsne(X_train_full, y_train_full, SAVE_FIGURES, IMG_PATH) if DO_MODEL_SELECTION: clfs = fit_model_selection(X_train, X_val, y_train, y_val) else: clfs = fit_models(X_train_full, y_train_full) # # COMPARACIÓN DE MODELOS # print("--- COMPARACIÓN DE LOS MEJORES MODELOS ---\n") compare(clfs, X_train_full, X_test, y_train_full, y_test) # Imprimimos tiempo total de ejecución elapsed = default_timer() - start print("Tiempo total de ejecución: {:.3f} min".format(elapsed / 60.0))
data_minmax['V1'] = data_minmax['V1'].apply(lambda x: math.exp(x)) data_minmax['V6'] = data_minmax['V6'].apply(lambda x: math.exp(x)) data_minmax['V30'] = np.log1p(data_minmax['V30']) X_scaled = pd.DataFrame(preprocessing.scale(data_minmax), columns=data_minmax.columns) train_x = X_scaled.ix[0:len(df_train) - 1] test = X_scaled.ix[len(df_train):] Y = df_train['target'] ## feature selection---Through the variance threshold from sklearn.feature_selection import VarianceThreshold from sklearn.feature_selection import SelectKBest from sklearn.feature_selection import f_regression threshold = 0.85 vt = VarianceThreshold().fit(train_x) feat_var_threshold = train_x.columns[vt.variances_ > threshold * (1 - threshold)] train_x = train_x[feat_var_threshold] test = test[feat_var_threshold] ## single feature---Select features according to the k highest scores. # see detail -- https://scikit-learn.org/stable/modules/generated/sklearn.feature_selection.SelectKBest.html X_scored = SelectKBest(score_func=f_regression, k=10).fit( train_x, Y) # F-value between label/feature for regression tasks. print(X_scored) feature_scoring = pd.DataFrame({ 'feature': train_x.columns, 'score': X_scored.scores_ }) head_feature_num = 18
for elem in dictionary: target_vec.append(dictionary[elem]["SIN"]) del dictionary[elem]["SIN"] print("Vectorizando...") wordindex = list([word for word in dictionary]) # guardamos los indices feat_dict = [dictionary[word] for word in dictionary.keys()] dv = DictVectorizer(sparse=False) word_vectors = dv.fit_transform(feat_dict) print("Normalizando...") vec_sums = word_vectors.sum(axis=1) word_vectors = word_vectors / vec_sums[:, numpy.newaxis] print("Reduciendo dimensionalidad...") selector = VarianceThreshold(threshold=0.000000001) new_word_vecs = selector.fit_transform(word_vectors) #selected = SelectPercentile(chi2, percentile = 10) #word_vecs_new=selected.fit_transform(new_word_vecs,target_vec) if mode: selected = SelectKBest(chi2, k=800) else: selected = SelectKBest(chi2, k=5000) word_vecs_new = selected.fit_transform(new_word_vecs, target_vec) print("Kmeans...") kmwv = numpy.array(word_vecs_new) clusters_size = 45 #CANTIDAD DE CLUSTERS A USAR kmeans = KMeans(clusters_size, max_iter=500, random_state=0).fit(kmwv)
# -*- coding: utf-8 -*- ########################################################################## # Project: COMP6004 - Machine learning pipeline for data analysis # File: 03-featureExtraction.py # Author: Diego Bueno - [email protected] # Date: 20/04/2021 # Description: Applying feature extraction to step03 of ML pipeline. # ########################################################################## # Maintenance # Author: # Date: # Description: A # ##########################################################################> import numpy as np import pandas as pd from functions import openfile from functions import savefile from functions import convert from sklearn.feature_selection import VarianceThreshold from sklearn.model_selection import train_test_split from sklearn.linear_model import LinearRegression from sklearn.neighbors import LocalOutlierFactor from sklearn.metrics import mean_absolute_error #calling the function to Load data pre-reading on task 1 print("\nReading the step02 file\n") db = openfile('data/step02.csv') print("\nChecking the current shape of the data:") rows, columns = db.shape
def run_main(): """ 主函数 """ if is_first_run: # 1. 分割数据集 print('分割数据集') all_gender_age = pd.read_csv( os.path.join(dataset_path, gender_age_filename)) df_train, df_test = split_train_test(all_gender_age) # 查看训练集测试集基本信息 print('训练集中各类的数据个数:', df_train.groupby('group').size()) print('测试集中各类的数据个数:', df_test.groupby('group').size()) # 保存分割的数据集 df_train.to_csv(os.path.join(dataset_path, train_gender_age_filename), index=False) df_test.to_csv(os.path.join(dataset_path, test_gender_age_filename), index=False) # 2. 加载数据 print('加载数据') # 加载数据 gender_age_train = pd.read_csv(os.path.join(dataset_path, train_gender_age_filename), index_col='device_id') gender_age_test = pd.read_csv(os.path.join(dataset_path, test_gender_age_filename), index_col='device_id') # 选取部分数据用于实验 percent = 0.1 gender_age_train = get_part_data(gender_age_train, percent=percent) gender_age_test = get_part_data(gender_age_test, percent=percent) phone_brand_device_model = pd.read_csv( os.path.join(dataset_path, phone_brand_device_model_filename)) # 去掉重复数据 phone_brand_device_model = phone_brand_device_model.drop_duplicates( 'device_id').set_index('device_id') events = pd.read_csv(os.path.join(dataset_path, events_filename), usecols=['device_id', 'event_id'], index_col='event_id') app_events = pd.read_csv(os.path.join(dataset_path, app_events_filename), usecols=['event_id', 'app_id']) # app_labels = pd.read_csv(os.path.join(dataset_path, app_labels_filename)) # 3. 特征工程 # 3.1 手机品牌特征 # 使用LabelEncoder将类别转换为数字 brand_label_encoder = LabelEncoder() brand_label_encoder.fit(phone_brand_device_model['phone_brand'].values) phone_brand_device_model['brand_label_code'] = \ brand_label_encoder.transform(phone_brand_device_model['phone_brand'].values) gender_age_train['brand_label_code'] = phone_brand_device_model[ 'brand_label_code'] gender_age_test['brand_label_code'] = phone_brand_device_model[ 'brand_label_code'] # 使用OneHotEncoder将数字转换为OneHot码 brand_onehot_encoder = OneHotEncoder() brand_onehot_encoder.fit( phone_brand_device_model['brand_label_code'].values.reshape(-1, 1)) tr_brand_feat = brand_onehot_encoder.transform( gender_age_train['brand_label_code'].values.reshape(-1, 1)) te_brand_feat = brand_onehot_encoder.transform( gender_age_test['brand_label_code'].values.reshape(-1, 1)) print('[手机品牌]特征维度:', tr_brand_feat.shape[1]) # 3.2 手机型号特征 # 合并手机品牌与型号字符串 phone_brand_device_model['brand_model'] = \ phone_brand_device_model['phone_brand'].str.cat(phone_brand_device_model['device_model']) # 使用LabelEncoder将类别转换为数字 model_label_encoder = LabelEncoder() model_label_encoder.fit(phone_brand_device_model['brand_model'].values) phone_brand_device_model['brand_model_label_code'] = \ model_label_encoder.transform(phone_brand_device_model['brand_model'].values) gender_age_train['brand_model_label_code'] = phone_brand_device_model[ 'brand_model_label_code'] gender_age_test['brand_model_label_code'] = phone_brand_device_model[ 'brand_model_label_code'] # 使用OneHotEncoder将数字转换为OneHot码 model_onehot_encoder = OneHotEncoder() model_onehot_encoder.fit( phone_brand_device_model['brand_model_label_code'].values.reshape( -1, 1)) tr_model_feat = model_onehot_encoder.transform( gender_age_train['brand_model_label_code'].values.reshape(-1, 1)) te_model_feat = model_onehot_encoder.transform( gender_age_test['brand_model_label_code'].values.reshape(-1, 1)) print('[手机型号]特征维度:', tr_model_feat.shape[1]) # 3.3 安装app特征 device_app = app_events.merge(events, how='left', left_on='event_id', right_index=True) # 运行app的总次数 n_run_s = 
device_app['app_id'].groupby(device_app['device_id']).size() # 运行app的个数 n_app_s = device_app['app_id'].groupby(device_app['device_id']).nunique() gender_age_train['n_run'] = n_run_s gender_age_train['n_app'] = n_app_s # 填充缺失数据 gender_age_train['n_run'].fillna(0, inplace=True) gender_age_train['n_app'].fillna(0, inplace=True) gender_age_test['n_run'] = n_run_s gender_age_test['n_app'] = n_app_s # 填充缺失数据 gender_age_test['n_run'].fillna(0, inplace=True) gender_age_test['n_app'].fillna(0, inplace=True) tr_run_feat = gender_age_train['n_run'].values.reshape(-1, 1) tr_app_feat = gender_age_train['n_app'].values.reshape(-1, 1) te_run_feat = gender_age_test['n_run'].values.reshape(-1, 1) te_app_feat = gender_age_test['n_app'].values.reshape(-1, 1) # 3.4 合并所有特征 tr_feat = np.hstack((tr_brand_feat.toarray(), tr_model_feat.toarray(), tr_run_feat, tr_app_feat)) te_feat = np.hstack((te_brand_feat.toarray(), te_model_feat.toarray(), te_run_feat, te_app_feat)) print('特征提取结束') print('每个样本特征维度:', tr_feat.shape[1]) # 3.5 特征范围归一化 scaler = StandardScaler() tr_feat_scaled = scaler.fit_transform(tr_feat) te_feat_scaled = scaler.transform(te_feat) # 3.6 特征选择 sel = VarianceThreshold(threshold=(.8 * (1 - .8))) tr_feat_scaled_sel = sel.fit_transform(tr_feat_scaled) te_feat_scaled_sel = sel.transform(te_feat_scaled) # 3.7 PCA降维操作 pca = PCA(n_components=0.95) # 保留95%共享率的特征向量 tr_feat_scaled_sel_pca = pca.fit_transform(tr_feat_scaled_sel) te_feat_scaled_sel_pca = pca.transform(te_feat_scaled_sel) print('特征处理结束') print('处理后每个样本特征维度:', tr_feat_scaled_sel_pca.shape[1]) # 4 为数据添加标签 group_label_encoder = LabelEncoder() group_label_encoder.fit(gender_age_train['group'].values) y_train = group_label_encoder.transform(gender_age_train['group'].values) y_test = group_label_encoder.transform(gender_age_test['group'].values) # 5. 训练模型 # 5.1 逻辑回归模型 print('训练逻辑回归模型...') lr_param_grid = [{'C': [1e-3, 1e-2, 1e-1, 1, 10, 100]}] lr_model = LogisticRegression() best_lr_model = get_best_model(lr_model, tr_feat_scaled_sel_pca, y_train, lr_param_grid, cv=3) y_pred_lr = best_lr_model.predict_proba(te_feat_scaled_sel_pca) # 5.2 SVM print('训练SVM模型...') svm_param_grid = [ { 'C': [1e-2, 1e-1, 1, 10, 100], 'gamma': [0.001, 0.0001], 'kernel': ['rbf'] }, ] # 设置probability=True用于输出预测概率 svm_model = svm.SVC(probability=True) best_svm_model = get_best_model(svm_model, tr_feat_scaled_sel_pca, y_train, svm_param_grid, cv=3) y_pred_svm = best_svm_model.predict_proba(te_feat_scaled_sel_pca) # 6. 查看结果 print('逻辑回归模型 logloss:', log_loss(y_test, y_pred_lr)) print('SVM logloss:', log_loss(y_test, y_pred_svm))
""" # 方差选择法 先要计算各个特征的方差,然后根据阈值,选择方差大于阈值的特征。 使用feature_selection库的VarianceThreshold类来选择特征 代码如下: """ from sklearn.feature_selection import VarianceThreshold from sklearn.datasets import load_iris iris = load_iris() print(iris.data[0:5]) """ # 方差选择法,返回值为特征选择后的数据 # 参数threshold为方差的阈值 """ # fit数据 selector = VarianceThreshold(threshold=3).fit(iris.data, iris.target) # 转换数据 data = selector.transform(iris.data) print(data[0:5]) print(selector.variances_)
f"var_thresh={var_thresh}-train_size={train_size}-n_components={n_components}" f"-max_iter={max_iter}-with_std={with_std}-seed={seed}") output_dir = output_dir / global_params if not os.path.isdir(output_dir): print(f"{output_dir} is not a directory... creating.") os.mkdir(output_dir) os.mkdir(output_dir / "data") os.mkdir(output_dir / "models") #%% sequencing_df, annotation_df = load_scRNAseq(fillna=True) #%% throw out some genes with low variance X = sequencing_df.values.copy() var_thresh = VarianceThreshold(threshold=var_thresh) X = var_thresh.fit_transform(X) gene_index = sequencing_df.columns original_n_genes = len(gene_index) gene_index = gene_index[var_thresh.get_support()] sequencing_df = sequencing_df[gene_index] new_n_genes = len(gene_index) print(f"Number of genes removed: {original_n_genes - new_n_genes} " f"out of {original_n_genes}") #%% np.random.seed(seed) neuron_index = sequencing_df.index y = sequencing_df.index.get_level_values(level="Neuron_type").values
def __init__(self): #pca_boi = PCA(n_components=8) filtering = VarianceThreshold(threshold=4) #self.transformer = Pipeline([("first", filtering),("second", pca_boi)]) #self.transformer = pca_boi self.transformer = filtering