def featureSelection(reduced_features, labels, clnd_features, percentile, n_components, results=False):
    """Select features with an ANOVA F-test, or report the selector statistics.

    Parameters:
        reduced_features = python list of feature names. The first entry is
            skipped; ``reduced_features[1:]`` is paired with the columns of
            ``clnd_features``.
        labels = ground truth labels for the data points.
        clnd_features = data point features corresponding to the labels.
        percentile = percent of features to keep, handed to SelectPercentile
            (scikit-learn documents this on a 0-100 scale).
        n_components = the n_components for the PCA (only used when
            ``results`` is true).
        results = False returns the python list of selected features. True
            returns the feature-selector metrics and the PCA variance ratios.

    Output:
        results false: ``['poi']`` followed by the names retained by
            SelectPercentile.
        results true: tuple ``(f_stat, p_vals, expl_var)`` where ``f_stat`` is
            a list of (name, F-statistic) sorted descending, ``p_vals`` a list
            of (name, p-value) sorted ascending, and ``expl_var`` the PCA
            ``explained_variance_ratio_``.
    """
    from sklearn.feature_selection import SelectPercentile, f_classif
    from sklearn.decomposition import PCA
    from itertools import compress

    feature_names = reduced_features[1:]

    selector = SelectPercentile(f_classif, percentile=percentile)
    selector.fit(clnd_features, labels)

    if results:
        # The fitted selector already holds the f_classif output, so there is
        # no need to run the F-test again for each statistic.
        f_stat = sorted(zip(feature_names, selector.scores_),
                        key=lambda pair: pair[1], reverse=True)
        p_vals = sorted(zip(feature_names, selector.pvalues_),
                        key=lambda pair: pair[1])
        # PCA is only needed for the report, so it is fitted only here.
        pca = PCA(n_components=n_components)
        pca.fit(clnd_features)
        return f_stat, p_vals, pca.explained_variance_ratio_

    # Boolean mask of the retained features, applied to the feature names.
    retained_features = selector.get_support()
    features_list = list(compress(feature_names, retained_features))
    # Add 'poi' back in the first position of the final features list.
    features_list.insert(0, 'poi')
    return features_list
def percentile_k_features(df, k=20):
    """Return the names of the top ``k`` percent of predictors of 'SalePrice'.

    The original body read a global called ``data`` instead of the ``df``
    argument and removed the target column from it in place; this version
    works on ``df`` and leaves it unmodified.

    Parameters:
        df: DataFrame containing a 'SalePrice' column plus predictor columns.
        k: percentile passed to SelectPercentile.

    Returns:
        list of column names, highest f_regression score first, truncated to
        the number of features the selector kept.
    """
    y = df['SalePrice']
    X = df.drop('SalePrice', axis=1)

    sel_feat = SelectPercentile(score_func=f_regression, percentile=k)
    sel_feat.fit(X, y)

    n = sel_feat.get_support(indices=True).size
    by_score_desc = np.argsort(sel_feat.scores_)[::-1]
    return list(X.columns[by_score_desc][:n])
def reduce_features(self, X, y, percentile=10):
    """Return the columns of ``X`` kept by a chi-squared percentile filter.

    ``X`` is indexed with ``.loc``, so it is expected to be a pandas
    DataFrame; the result is the same frame restricted to the selected
    columns.
    """
    selector = SelectPercentile(chi2, percentile=percentile)
    selector.fit_transform(X, y)
    # Boolean mask (one entry per column) of the features that survived.
    keep = list(selector.get_support())
    return X.loc[:, keep]
class multiple_classifiers1(abstract_classifier):
    """Ensemble of 1-NN, AdaBoost and a perceptron, each behind its own filter.

    Every base classifier gets its own SelectPercentile (default score
    function) with a different percentile; predictions are averaged and
    rounded in ``classify``.
    """

    def __init__(self, data, labels):
        """Fit the three feature selectors and the three classifiers."""
        self.ada = AdaBoostClassifier()
        self.knn = KNeighborsClassifier(n_neighbors=1)
        self.perceptron = Perceptron(tol=1e-3)

        self.sp_knn = SelectPercentile(percentile=24)
        self.sp_ada = SelectPercentile(percentile=85)
        self.sp_percep = SelectPercentile(percentile=35)

        data_knn = self.sp_knn.fit_transform(data, labels)
        data_ada = self.sp_ada.fit_transform(data, labels)
        data_percep = self.sp_percep.fit_transform(data, labels)

        self.knn.fit(data_knn, labels)
        self.ada.fit(data_ada, labels)
        self.perceptron.fit(data_percep, labels)

    def classify(self, features):
        """Classify one sample and return the rounded mean vote as a bool."""
        features_mat = features.reshape((1, -1))

        features_knn = self.sp_knn.transform(features_mat)
        features_ada = self.sp_ada.transform(features_mat)
        features_percep = self.sp_percep.transform(features_mat)

        p1 = int(self.knn.predict(features_knn)[0])
        p2 = int(self.ada.predict(features_ada)[0])
        p3 = int(self.perceptron.predict(features_percep)[0])

        # Divide by a float: with integer operands Python 2 would floor the
        # mean (e.g. 2/3 -> 0) and turn a 2-of-3 vote into False.
        avg = (p1 + p2 + p3) / 3.0
        return bool(np.round(avg))
def percentile_k_features(df, k=20):
    """Return the predictors in the top ``k`` percent by f_regression score.

    The last column of ``df`` is the target; all other columns are
    predictors.  The original returned a fixed 7 names regardless of ``k``;
    the count now follows the number of features SelectPercentile actually
    keeps.

    Returns:
        list of column names, highest score first.
    """
    X = df.iloc[:, :-1]
    y = df.iloc[:, -1]
    features = X.columns

    sp = SelectPercentile(f_regression, percentile=k)
    sp.fit(X, y)

    n_selected = int(sp.get_support().sum())
    by_score_desc = np.argsort(sp.scores_)[::-1][:n_selected]
    return [features[i] for i in by_score_desc]
def percentile_k_features(X, y, k=50):
    """List the columns of ``X`` kept by an f_regression percentile filter.

    Names are ordered by descending score; equal scores are ordered by name
    ascending.
    """
    selector = SelectPercentile(f_regression, percentile=k)
    selector.fit_transform(X, y)

    kept = selector.get_support()
    name_score_pairs = list(zip(X.columns.values[kept], selector.scores_[kept]))
    table = pd.DataFrame(name_score_pairs, columns=['Name', 'Score'])

    ordered = table.sort_values(['Score', 'Name'], ascending=[False, True])
    return ordered['Name'].tolist()
def percentile_k_features(x_train, y_train, k=50):
    """Return the selected column names of ``x_train``, best score first.

    Features are chosen by SelectPercentile with f_regression at percentile
    ``k`` and sorted by their score in descending order.
    """
    picker = SelectPercentile(f_regression, percentile=k)
    picker.fit_transform(x_train, y_train)

    mask = picker.get_support()
    pairs = list(zip(x_train.columns.values[mask], picker.scores_[mask]))

    ranking = pd.DataFrame(pairs, columns=['Features', 'Scores'])
    ranking = ranking.sort_values('Scores', ascending=False)
    return list(ranking['Features'])
def fit(self):
    """Run an ANOVA F-test percentile selection on ``self.X`` / ``self.Y``.

    Stores the p-values in ``self.pvalues``, the indices of the retained
    features in ``self.indx`` and a normalised importance score in
    ``self.scores``.

    Returns:
        tuple ``(self.pvalues, self.indx)``.
    """
    # Keep the top `self.percent` percent of the variables (the original note
    # mentions 50).  `percentile` is passed by keyword because recent
    # scikit-learn releases reject it as a positional argument.
    selector = SelectPercentile(f_classif, percentile=self.percent)
    selector.fit_transform(self.X, self.Y)

    self.pvalues = selector.pvalues_
    self.indx = np.argwhere(selector.get_support())[:, 0]

    # Importance of each variable: negative log10 of its p-value, divided by
    # the largest such value.
    scores = -np.log10(self.pvalues)
    scores /= scores.max()
    self.scores = scores

    return self.pvalues, self.indx
def select_features_from_model(self, x, y, percentile=10):
    """Pick the best columns of ``x`` by chi-squared score and subset ``x``.

    The chosen column names are stored in ``self.best_features``; the return
    value is whatever ``self.select_features_in_test_set(x)`` produces.
    """
    selector = SelectPercentile(score_func=chi2, percentile=percentile)
    selector.fit_transform(x, y)

    kept_positions = selector.get_support(indices=True)
    self.best_features = list(x.columns[kept_positions])

    return self.select_features_in_test_set(x)
def featureSelection(reduced_features,labels,clnd_features,percentile,n_components,results=False):
    """Select features with an ANOVA F-test, or report the selector statistics.

    Parameters:
        reduced_features = python list of feature names. The first entry is
            skipped; ``reduced_features[1:]`` is paired with the columns of
            ``clnd_features``.
        labels = ground truth labels for the data points.
        clnd_features = data point features corresponding to the labels.
        percentile = percent of features to keep, handed to SelectPercentile
            (scikit-learn documents this on a 0-100 scale).
        n_components = the n_components for the PCA (only used when
            ``results`` is true).
        results = False returns the python list of selected features. True
            returns the feature-selector metrics and the PCA variance ratios.

    Output:
        results false: ``['poi']`` followed by the names retained by
            SelectPercentile.
        results true: tuple ``(f_stat, p_vals, expl_var)`` where ``f_stat`` is
            a list of (name, F-statistic) sorted descending, ``p_vals`` a list
            of (name, p-value) sorted ascending, and ``expl_var`` the PCA
            ``explained_variance_ratio_``.
    """
    from sklearn.feature_selection import SelectPercentile, f_classif
    from sklearn.decomposition import PCA
    from itertools import compress

    feature_names = reduced_features[1:]

    selector = SelectPercentile(f_classif, percentile=percentile)
    selector.fit(clnd_features, labels)

    if results:
        # Reuse the statistics stored on the fitted selector instead of
        # running f_classif two more times.
        f_stat = sorted(zip(feature_names, selector.scores_),
                        key=lambda pair: pair[1], reverse=True)
        p_vals = sorted(zip(feature_names, selector.pvalues_),
                        key=lambda pair: pair[1])
        # PCA only feeds the report, so fit it only on this branch.
        pca = PCA(n_components=n_components)
        pca.fit(clnd_features)
        return f_stat, p_vals, pca.explained_variance_ratio_

    # Boolean mask of the retained features, applied to the feature names.
    retained_features = selector.get_support()
    features_list = list(compress(feature_names, retained_features))
    # Add 'poi' back in the first position of the final features list.
    features_list.insert(0, 'poi')
    return features_list
def percentile_k_features(X, y, k=50):
    """Return names of the features kept at percentile ``k``, best first.

    Scores come from f_regression; ties in score are broken by feature name
    in ascending order.
    """
    chooser = SelectPercentile(f_regression, percentile=k)
    chooser.fit_transform(X, y)

    support = chooser.get_support()
    kept_names = X.columns.values[support]
    kept_scores = chooser.scores_[support]

    frame = pd.DataFrame(data=list(zip(kept_names, kept_scores)),
                         columns=['Feat_names', 'F_Scores'])
    frame = frame.sort_values(['F_Scores', 'Feat_names'],
                              ascending=[False, True])
    return frame['Feat_names'].tolist()
def percentile_k_features(df, k=20):
    """Names of the selected predictors of ``df``, highest score first.

    The last column of ``df`` is used as the target.  Selected
    (score, name) pairs are sorted in reverse tuple order, so equal scores
    fall back to descending name order.
    """
    X = df.iloc[:, :-1]
    y = df.iloc[:, -1]

    selector = SelectPercentile(f_regression, percentile=k)
    selector.fit_transform(X, y)

    mask = selector.get_support()
    kept_scores = selector.scores_[mask].tolist()
    kept_names = X.columns.values[mask].tolist()

    ranked = sorted(zip(kept_scores, kept_names), reverse=True)
    return [name for _, name in ranked]
def percentile_k_features(df, k=20):
    """Rank the predictors of 'SalePrice' retained at percentile ``k``.

    Returns the retained column names sorted by f_regression score
    (descending), then by name (ascending).
    """
    y = df['SalePrice']
    X = df.drop(['SalePrice'], axis=1)

    sp = SelectPercentile(f_regression, percentile=k)
    sp.fit_transform(X, y)

    keep = sp.get_support()
    rows = list(zip(X.columns.values[keep], sp.scores_[keep]))
    table = pd.DataFrame(data=rows, columns=['Feat_names', 'F_Scores'])

    table = table.sort_values(['F_Scores', 'Feat_names'],
                              ascending=[False, True])
    return table['Feat_names'].tolist()
def univariate_feature_selection(mode, predictors, target):
    """Return the per-feature p-values of a univariate test.

    Parameters:
        mode: one of 'f_regression', 'f_classif' or 'chi2'.
        predictors: feature matrix.
        target: target vector.

    Returns:
        the ``pvalues_`` array of a SelectPercentile fitted with
        ``percentile=100`` (i.e. no feature is dropped).

    Raises:
        ValueError: if ``mode`` is not one of the supported names (the
            original fell through to an unbound-variable error).
    """
    score_funcs = {
        'f_regression': f_regression,
        'f_classif': f_classif,
        'chi2': chi2,
    }
    if mode not in score_funcs:
        raise ValueError(
            "mode must be one of {}, got {!r}".format(sorted(score_funcs), mode))

    # `percentile` is keyword-only in recent scikit-learn releases.
    fselect = SelectPercentile(score_funcs[mode], percentile=100)
    fselect.fit(predictors, target)
    return fselect.pvalues_
def get_semmed_features(semmed_X, features, y):
    """Assemble the requested feature groups from ``semmed_X``.

    Parameters:
        semmed_X: DataFrame whose column names encode the feature group
            (e.g. ``SUBJECT_CUI=...``, ``PREDICATE=...``).
        features: collection of feature-group names.  ``"all"`` returns
            ``semmed_X`` itself.
        y: unused; it was only referenced by the unreachable code that
            followed each ``raise`` and is kept for signature compatibility.

    Returns:
        DataFrame with the columns of every requested group, concatenated in
        the fixed order of ``feature_specs`` below.

    Raises:
        NotImplementedError: for 'cui2_feature', 'dist2_feature' and
            'novelty2_feature'.
    """
    print("Extracting features '{}'".format(features))

    # (group name, column regex).  A regex of None marks a group that is not
    # implemented; the order matches the original chain of `if` statements.
    feature_specs = [
        ("cui_feature", r'(SU|O)BJECT_CUI=.*'),
        ("cui2_feature", None),
        ("dist_feature", r'(SU|O)BJECT_DIST'),
        ("dist2_feature", None),
        ("pred_feature", r'PREDICATE=.*'),
        ("ind_feature", r'INDICATOR_TYPE=.*'),
        ("novelty_feature", r'(SU|O)BJECT_NOVELTY'),
        ("novelty2_feature", None),
    ]

    if "all" in features:
        X = semmed_X
    else:
        # Use DataFrame to preserve feature names.
        X = pd.DataFrame(index=range(semmed_X.shape[0]))
        for name, pattern in feature_specs:
            if name not in features:
                continue
            if pattern is None:
                raise NotImplementedError(name)
            X = pd.concat([X, semmed_X.filter(regex=pattern)], axis=1)

    print(X.shape)
    return X
def feature_selection_percentile(self):
    """Score every feature against every target column with f_regression.

    The original loop ran once per *row* of ``self.y`` while indexing its
    *columns*, and threw away everything it computed.  This version loops
    over the columns and returns the results.

    Returns:
        tuple ``(totalscore, rankings)``: per target column, the array of
        feature scores and the feature names ordered by descending score.
    """
    # NOTE(review): assumes the columns of self.X follow this order — confirm.
    feature_names = [
        'teff', 'logg', 'feh', 'alpha',
        'teff**2', 'logg**2', 'feh**2', 'alpha**2',
        'teff*logg', 'teff*feh', 'logg*feh',
        'teff*alpha', 'alpha*feh', 'logg*alpha',
    ]
    selector = SelectPercentile(f_regression, percentile=20)
    y = self.y.values

    totalscore = []
    rankings = []
    for col in range(y.shape[1]):
        selector.fit(self.X, y[:, col])
        by_score_desc = np.argsort(selector.scores_)[::-1]
        rankings.append([feature_names[j] for j in by_score_desc])
        totalscore.append(selector.scores_)
    return totalscore, rankings
def percentile_k_features(df, K=20):
    """Return the selected predictor names of ``df``, best score first.

    All columns but the last are predictors; the last column is the target.
    Features are scored with f_regression and kept at percentile ``K``.
    """
    predictors = df.iloc[:, :-1]
    target = df.iloc[:, -1]

    selector = SelectPercentile(f_regression, percentile=K)
    selector.fit_transform(predictors, target)

    # One row per predictor: whether it was kept, and its score.
    summary = pd.DataFrame(
        {'support': selector.get_support(), 'values': selector.scores_},
        index=predictors.columns,
    )
    ranked = summary.sort_values('values', ascending=False)

    # Keep only the rows flagged as selected, in ranked order.
    return list(ranked[ranked.support].index)
def percentile_k_features(X, y, k=50):
    """Return the names of the selected columns of ``X`` by descending score.

    Selection uses SelectPercentile with f_regression at percentile ``k``.
    """
    selector = SelectPercentile(f_regression, percentile=k)
    selector.fit_transform(X, y)

    mask = selector.get_support()
    name_score = list(zip(X.columns.values[mask], selector.scores_[mask]))

    ranked = sorted(name_score, key=lambda pair: pair[1], reverse=True)
    return [name for name, _ in ranked]
def test_select_percentile_chi2(self):
    """The torch conversion of a chi2 SelectPercentile matches scikit-learn."""
    features, target = load_digits(return_X_y=True)

    sk_selector = SelectPercentile(chi2, percentile=15)
    sk_selector.fit_transform(features, target)

    features_tensor = torch.from_numpy(features)
    converted = hummingbird.ml.convert(sk_selector, "torch")
    self.assertIsNotNone(converted)

    expected = sk_selector.transform(features)
    actual = converted.transform(features_tensor)
    np.testing.assert_allclose(expected, actual, rtol=1e-06, atol=1e-06)
def feature_select(self):
    """Apply an ANOVA F-test percentile filter and store the result.

    Reads the label and feature data from ``self.results`` using the keys
    held by ``self.task`` and stores a ``TransformResult`` wrapping the
    reduced data under ``self.task.uuid``.
    """
    # The original read `task.percentile`, a name that is not defined in this
    # method; every other access goes through `self.task`.
    selector = SelectPercentile(f_classif, percentile=self.task.percentile)

    y = np.array(self.results[self.task.label].data)
    X = np.array(self.results[self.task.features].data)

    data = pd.DataFrame(selector.fit_transform(X, y))
    result = TransformResult(self.task, 1.0, data)
    self.results[self.task.uuid] = result
def build_data(percentile_of_features):
    """Load the pickled e-mail corpus and return TF-IDF train/test features.

    The data is split 90/10, vectorised with TF-IDF, then reduced to the top
    ``percentile_of_features`` percent of features; both the vectoriser and
    the selector are fitted on the training part only.

    Returns:
        list ``[train_features, test_features, train_labels, test_labels]``
        where the feature entries are dense arrays.
    """
    # NOTE(review): pickle.load must only be used on trusted files.
    with open("data/email_authors.pkl", 'rb') as authors_file, \
            open("data/word_data.pkl", 'rb') as word_file:
        email_authors = pickle.load(authors_file)
        word_data = pickle.load(word_file)

    # Split into training and test sets.
    split = train_test_split(word_data, email_authors,
                             test_size=0.1, random_state=42)
    features_train, features_test, labels_train, labels_test = split

    # Tokenise the e-mails.
    vectorizer = TfidfVectorizer(sublinear_tf=True, max_df=0.5,
                                 stop_words='english')
    train_tfidf = vectorizer.fit_transform(features_train)
    test_tfidf = vectorizer.transform(features_test)

    # Keep only the requested percentile of features.
    selector = SelectPercentile(percentile=percentile_of_features)
    train_selected = selector.fit_transform(train_tfidf, labels_train).toarray()
    test_selected = selector.transform(test_tfidf).toarray()

    return [train_selected, test_selected, labels_train, labels_test]
def percentile_k_features(data, k=20):
    """Return the feature matrix reduced to the top ``k`` percent of predictors.

    'SalePrice' is the target; all other columns are scored with
    f_regression.  The return value is the output of ``fit_transform`` (the
    reduced data), not a list of names.
    """
    target = data['SalePrice']
    predictors = data.drop(['SalePrice'], axis=1)

    selector = SelectPercentile(f_regression, percentile=k)
    return selector.fit_transform(predictors, target)
def get_features_by_score(self):
    """Store all training column names ordered by descending selector score.

    A SelectPercentile with default settings is fitted on ``self.X_train`` /
    ``self.Y_train``; the ranking of every column (not just the selected
    ones) is written to ``self.allFeatureByScore``.
    """
    selector = SelectPercentile()
    selector.fit_transform(X=self.X_train, y=self.Y_train)

    column_names = self.X_train.columns.tolist()
    descending = np.argsort(selector.scores_)[::-1]
    self.allFeatureByScore = [column_names[pos] for pos in descending]
def percentile_k_features(df, k=20):
    """Return the names of the top ``k`` percent of predictors, best first.

    The last column of ``df`` is the target.  Fixes relative to the
    original: it read a global ``data`` instead of ``df``, called ``.sort``
    on the result of ``zip`` (not a list on Python 3), and failed while
    unzipping when nothing was selected.

    Returns:
        list of column names ordered by descending f_regression score;
        features with equal scores keep their column order.
    """
    X = df.iloc[:, :-1]
    y = df.iloc[:, -1]

    feature_sel = SelectPercentile(f_regression, percentile=k)
    feature_sel.fit(X, y)

    feature_ind = feature_sel.get_support(indices=True)
    feature_score = feature_sel.scores_[feature_ind]

    # Sort on the score only; sorted() is stable, like list.sort().
    ranked = sorted(zip(feature_score, feature_ind),
                    key=lambda pair: pair[0], reverse=True)

    column = df.columns
    return [column[idx] for _, idx in ranked]
def selectTrainEvaluate(self, trainData, trainTarget, testData, classifier):
    """Search over feature percentiles, then predict the test data.

    For each percentile in a fixed grid the training data goes through a
    mutual-information SelectPercentile and a PCA keeping 99% of the
    variance, and ``self.trainClassifier`` is called on the result.  The
    configuration with the highest ``result["score"]`` is used to transform
    ``testData`` and predict.

    Returns:
        tuple ``(pred, bestSvc, bestSelector, bestPca)``.
    """
    feature_grid = [10, 20, 30, 40, 50, 60, 70]
    scoreMax = 0
    # None instead of the original ' ' placeholders: the first candidate is
    # always accepted, so a run whose scores never exceed 0 no longer ends in
    # an AttributeError on a string.
    bestSelector = None
    bestPca = None
    bestSvc = None
    bestPerc = 0

    for item in feature_grid:
        self.logger.log("INFO", "TRYING FEATURE PERCENTILE " + str(item))

        selector = SelectPercentile(mutual_info_classif, percentile=item)
        trainDataSel = selector.fit_transform(trainData, trainTarget)

        pca = PCA(n_components=0.99, svd_solver='full')
        trainDataSel = pca.fit_transform(trainDataSel)

        result = self.trainClassifier(trainDataSel, trainTarget, classifier)
        self.logger.log(
            "INFO", "DONE FEATURE PERCENTILE " + str(item) + " SCORE: " +
            str(result["score"]))

        if bestSelector is None or result["score"] > scoreMax:
            bestPerc = item
            scoreMax = result["score"]
            bestSelector = selector
            bestPca = pca
            bestSvc = result["svc"]

    self.logger.log("INFO", "DONE FEATURE CROSS VALIDATION")
    self.logger.log("INFO",
                    "BEST RESULT WITH " + str(bestPerc) + " PERCENTILE")

    testDataSel = bestSelector.transform(testData)
    testData = bestPca.transform(testDataSel)
    pred = bestSvc.predict(testData)
    return (pred, bestSvc, bestSelector, bestPca)
def ridge_make_submission():
    """Train a ridge model on selected features and write the predictions.

    Features are reduced with an f_regression SelectPercentile fitted on the
    training set and applied to the test set.  Predictions are written to
    ``path_test_out`` with an 'Id' index label and a 'Pred' column.
    """
    train, test = read_csv()
    x_train, y_train = make_train_set(train)
    x_test, y_test = make_train_set(test)
    y_train = y_train['Y']

    # Feature selection; `percentile` is keyword-only in recent scikit-learn.
    sel = SelectPercentile(f_regression, percentile=70)
    x_train = sel.fit_transform(x_train, y_train)
    x_test = sel.transform(x_test)

    # NOTE(review): `normalize=` was removed from Ridge in newer scikit-learn
    # releases — confirm the targeted version.
    model = linear_model.Ridge(normalize=True)
    model.fit(x_train, y_train)
    preds = model.predict(x_test)

    y_test['Y'] = preds
    print(y_test['Y'].var())

    y_test.columns = ['TERMINALNO', 'Pred']
    y_test.set_index('TERMINALNO', inplace=True)
    y_test.to_csv(path_test_out, columns=['Pred'], index=True,
                  index_label=['Id'])
def preprocess(word_data, targets):
    """Vectorise, select features, report the top tokens and split the data.

    Parameters:
        word_data: iterable of documents.
        targets: labels aligned with ``word_data``.

    Returns:
        tuple ``(features_train, features_test, labels_train, labels_test)``
        from a stratified 80/20 split.
    """
    print("\n### PREPROCESSING DATA ###")

    # Vectorise (stop-word removal was tried and is disabled).
    print("-- Vectorization")
    vectorizer = TfidfVectorizer(sublinear_tf=True)
    data_transformed = vectorizer.fit_transform(word_data)

    # Feature selection.
    print("-- Feature Selection")
    selector = SelectPercentile(percentile=5)
    data_selected = selector.fit_transform(data_transformed, targets)
    if data_selected.shape[1] == 0:
        # Nothing survived the filter: fall back to the full matrix.
        data_selected = data_transformed
    else:
        print("Top {} features were selected".format(data_selected.shape[1]))

    # Print the top features.
    nr_features = 30
    i = selector.scores_.argsort()[::-1][:nr_features]
    # get_feature_names() was removed from newer scikit-learn releases in
    # favour of get_feature_names_out(); support both.
    get_names = getattr(vectorizer, "get_feature_names_out", None)
    if get_names is None:
        get_names = vectorizer.get_feature_names
    top_features = np.column_stack((np.asarray(get_names())[i],
                                    selector.scores_[i],
                                    selector.pvalues_[i]))
    print("\nTop %i Features:" % nr_features)
    print(pd.DataFrame(top_features, columns=["token", "score", "p-val"]), "\n")

    features_train, features_test, labels_train, labels_test = \
        train_test_split(data_selected, targets, test_size=0.2,
                         stratify=targets)
    return features_train, features_test, labels_train, labels_test
def main():
    """Write correlation stats and the feature-selected train/test data.

    Reads ``../data/train.csv`` and ``../data/test.csv`` and writes:
    ``../presentationDocs/corrs.csv`` (per-variable variance and correlation
    with TARGET), ``../data/main_data.csv`` (top 25% of features by
    f_classif), ``../data/target.csv`` and ``../data/test_transform.csv``.
    """
    main_data = pd.read_csv('../data/train.csv', index_col='ID')

    output = []
    for x in main_data.columns:
        # `.ix` was removed from pandas; `.loc` is the label-based equivalent.
        column = main_data.loc[:, x]
        corr = round(column.corr(main_data.TARGET), 4)
        output.append({
            'variable': x,
            'variance': column.var(),
            'corr_w_target': corr,
            'abs_corr': abs(corr),
        })

    # Print csv for later in the presentation docs.
    variable_selector = pd.DataFrame(output)
    variable_selector = variable_selector.set_index('variable')
    variable_selector = variable_selector.drop('TARGET')
    variable_selector.sort_values('abs_corr', ascending=False).to_csv(
        '../presentationDocs/corrs.csv')

    selector = SelectPercentile(f_classif, percentile=25)
    subset = pd.DataFrame(selector.fit_transform(
        main_data.drop('TARGET', axis=1), main_data['TARGET']))
    subset.to_csv('../data/main_data.csv', index=False)
    # `cols=` is no longer accepted by to_csv; the keyword is `columns=`.
    main_data[['TARGET']].to_csv('../data/target.csv', columns=['TARGET'],
                                 index=False)

    # Print transformed test data to csv.
    test_data = pd.read_csv('../data/test.csv', index_col='ID')
    test_data = pd.DataFrame(selector.transform(test_data),
                             index=test_data.index)
    test_data.to_csv('../data/test_transform.csv', index=True,
                     index_label='ID')
def buildVectorizer(classes, examples, parameters):
    """Vectorise the examples and optionally trim features with chi-squared.

    Parameters:
        classes: class labels for the examples.
        examples: the training examples handed to ``Vectorizer``.
        parameters: mapping of string options; recognised keys are
            'featureChoice', 'doFeatureSelection' ("True" enables it),
            'featureSelectPerc' (default 10) and 'tfidf' ("True" enables it).

    Returns:
        tuple ``(vectorsTrimmed, vectorizer, featureSelector)``;
        ``featureSelector`` is None when feature selection is disabled.
    """
    featureChoice = None
    doFeatureSelection = False
    tfidf = False
    featureSelectPerc = 10

    if "featureChoice" in parameters:
        featureChoice = parameters["featureChoice"]
    if "doFeatureSelection" in parameters and parameters["doFeatureSelection"] == "True":
        doFeatureSelection = True
    if "featureSelectPerc" in parameters:
        featureSelectPerc = int(parameters["featureSelectPerc"])
    if "tfidf" in parameters and parameters["tfidf"] == "True":
        tfidf = True

    # Single-argument print(...) calls produce the same output on Python 2
    # and Python 3.
    print("Starting vectorizer...")
    vectorizer = Vectorizer(classes, examples, featureChoice, tfidf)
    vectors = vectorizer.getTrainingVectors()
    print("Vectors of size: %s" % (vectors.shape,))

    if doFeatureSelection:
        print("Trimming training vectors...")
        from sklearn.feature_selection import SelectKBest, SelectPercentile, chi2
        # `percentile` is keyword-only in recent scikit-learn releases.
        featureSelector = SelectPercentile(chi2, percentile=featureSelectPerc)
        vectorsTrimmed = featureSelector.fit_transform(vectors, classes)
        vectorsTrimmed = coo_matrix(vectorsTrimmed)
        print("Trimmed training vectors of size: %s" % (vectorsTrimmed.shape,))
    else:
        vectorsTrimmed = vectors
        featureSelector = None

    return vectorsTrimmed, vectorizer, featureSelector
def select_features(filename, column, percentile, features_outfile=None):
    '''
    Selects the top <percentile> features from the dataset.

    :param str filename: Training data file (CSV with a "T/F" label column).
    :param str column: Column in the CSV to use. Not referenced by this
                       function body.
    :param int percentile: Percentile top features to choose. Values of 100
                           or more skip selection entirely.
    :param str features_outfile: Optional path; when given and selection is
                                 run, the kept features are written one per
                                 line as "name :: score :: p-value", sorted
                                 by descending score.
    :returns: Training data with top percentile features. Labels.
              Names of selected features.
    :rtype: 3-tuple
    '''
    train = pd.read_csv(filename, sep=',', compression="infer")
    targets = LabelBinarizer().fit_transform(train["T/F"])
    targets = np.ravel(targets)
    train.drop(["T/F"], axis=1, inplace=True)

    # Encase the feature names in quotes to ensure proper parsing later.
    feature_names = np.array(
        ['"{}"'.format(f) for f in np.array(train.columns)])
    train = train.values.astype('double')

    if percentile < 100:
        feature_finder = SelectPercentile(f_classif, percentile=percentile)
        train = feature_finder.fit_transform(train, targets)
        support = feature_finder.get_support()
        scores = feature_finder.scores_
        pvals = feature_finder.pvalues_
        feature_names = feature_names[support]

        if features_outfile is not None:
            # The original zipped an undefined local `features` here; the
            # names that line up with the filtered scores are `feature_names`.
            ranked = sorted(
                zip(feature_names, scores[support], pvals[support]),
                key=lambda x: x[1], reverse=True)
            with open(features_outfile, 'w') as outF:
                for feat in ranked:
                    outF.write("{} :: {:g} :: {:g}\n".format(*feat))

    return train, targets, feature_names
def final_feature_set_reduction(reduced_feature_file_name, final_file_name, train_label_file):
    """Keep the top 10% chi-squared features of a feature file.

    Reads ``data/<reduced_feature_file_name>`` (first column is skipped as
    non-feature, the 'file_name' column is carried over) and integer labels,
    one per line, from ``data/<train_label_file>``.  Writes the reduced
    table to ``data/<final_file_name>``.
    """
    sorted_train_data = pd.read_csv('data/' + reduced_feature_file_name)
    X = sorted_train_data.iloc[:, 1:]

    # Read the labels with a context manager so the file is always closed.
    with open('data/' + train_label_file) as fip:
        y = [int(line.rstrip()) for line in fip]

    print("Final feature reduction: {:s}".format(reduced_feature_file_name))
    print("Training labels length: {:d}".format(len(y)))
    print("X Feature set dimensionality: {:d} {:d}".format(X.shape[0], X.shape[1]))
    print("In Feature set dimensionality: {:d} {:d}".format(sorted_train_data.shape[0], sorted_train_data.shape[1]))

    # Find the top 10 percent of features by chi-squared score;
    # `percentile` is keyword-only in recent scikit-learn releases.
    fsp = SelectPercentile(chi2, percentile=10)
    X_new_10 = fsp.fit_transform(X, y)
    print("Final 10 Percent Dimensions: {:d} {:d}".format(X_new_10.shape[0], X_new_10.shape[1]))

    # Selected indices are relative to X, which starts at column 1 of the
    # full frame, hence the shift by one.
    selected_names = fsp.get_support(indices=True)
    selected_names = selected_names + 1

    data_trimmed = sorted_train_data.iloc[:, selected_names]
    data_fnames = pd.DataFrame(sorted_train_data['file_name'])
    data_reduced = data_fnames.join(data_trimmed)

    data_reduced.to_csv('data/' + final_file_name, index=False)
    print("Completed reduction in {:s}".format(final_file_name))
    return
def selectionPercentile(X, y, paramlist):
    """Run a chi-squared percentile selection.

    ``paramlist['percentile']`` supplies the percentile.

    Returns:
        list ``[reduced X, indices of the selected features, all scores]``.
    """
    selector = SelectPercentile(chi2, percentile=paramlist['percentile'])
    reduced = selector.fit_transform(X, y)
    kept_indices = selector.get_support(indices=True)
    return [reduced, kept_indices, selector.scores_]
def final_feature_set_reduction(reduced_feature_file_name, final_file_name, train_label_file):
    """Keep the top 10% chi-squared features of a feature file.

    Reads ``data/<reduced_feature_file_name>`` (first column is skipped as
    non-feature, the 'filename' column is carried over); labels come from
    ``get_training_labels``.  Writes the reduced table to
    ``data/<final_file_name>``.
    """
    sorted_train_data = pd.read_csv('data/' + reduced_feature_file_name)
    y = get_training_labels('data/' + reduced_feature_file_name, train_label_file)
    X = sorted_train_data.iloc[:, 1:]

    print("Final feature reduction: {:s}".format(reduced_feature_file_name))
    print("Training labels length: {:d}".format(len(y)))
    print("X Feature set dimensionality: {:d} {:d}".format(X.shape[0], X.shape[1]))
    print("In Feature set dimensionality: {:d} {:d}".format(sorted_train_data.shape[0], sorted_train_data.shape[1]))

    # Find the top 10 percent of features by chi-squared score;
    # `percentile` is keyword-only in recent scikit-learn releases.
    fsp = SelectPercentile(chi2, percentile=10)
    X_new_10 = fsp.fit_transform(X, y)
    print("Final 10 Percent Dimensions: {:d} {:d}".format(X_new_10.shape[0], X_new_10.shape[1]))

    # Selected indices are relative to X, which starts at column 1 of the
    # full frame, hence the shift by one.
    selected_names = fsp.get_support(indices=True)
    selected_names = selected_names + 1

    data_trimmed = sorted_train_data.iloc[:, selected_names]
    data_fnames = pd.DataFrame(sorted_train_data['filename'])
    data_reduced = data_fnames.join(data_trimmed)

    data_reduced.to_csv('data/' + final_file_name, index=False)
    print("Completed reduction in {:s}".format(final_file_name))
    return
def main(path):
    """Train a linear SVM text classifier and dump it under ``training/``.

    The data returned by ``get_data(path)`` is vectorised with TF-IDF,
    reduced to the top 20% of features by f_classif and used to fit a
    LinearSVC.  The classifier, vectoriser and selector are saved together
    in ``training/<unix timestamp>.pkl``.
    """
    datatrain = get_data(path)

    vectorizer = TfidfVectorizer(sublinear_tf=True,
                                 max_df=0.5,
                                 stop_words='english',
                                 max_features=6000,
                                 strip_accents='unicode')
    # Calculating weights.
    data_weighted = vectorizer.fit_transform(datatrain.data)

    # Build feature selection.
    feature_selection = SelectPercentile(f_classif, percentile=20)
    data_weighted = feature_selection.fit_transform(data_weighted,
                                                    datatrain['values'])

    # Train with known data.  'l2' as a loss name was deprecated and later
    # removed from LinearSVC; 'squared_hinge' is the replacement name.
    clf = LinearSVC(loss='squared_hinge', penalty='l2', dual=False, tol=1e-3)
    clf.fit(data_weighted, datatrain['values'])

    # Save training model.
    if not os.path.exists('training'):
        os.mkdir('training')
    filename = 'training/{0}.pkl'.format(int(time.time()))
    joblib.dump(
        {
            'clf': clf,
            'vectorizer': vectorizer,
            'feature_selection': feature_selection
        },
        filename,
        compress=9)
def main():
    """Command-line entry point: select the top-percentile features.

    Loads the feature matrix (``-x``) and the targets (``-y``), keeps the
    ``-z`` percent best features by f_classif and writes the reduced matrix
    to ``featurelist``.  When ``variancefeatures.txt`` exists, the names of
    the selected features are written to ``featuresToTestPrograms`` and
    ``variancefeatures.txt`` is deleted.
    """
    parser = argparse.ArgumentParser(description='Feature Selection')
    required = parser.add_argument_group('required options')
    required.add_argument('-x', '--scaledfeaturelist', required=True,
                          help='File containing feature values')
    required.add_argument('-y', '--targetdata', required=True,
                          help='File containiing target data')
    required.add_argument('-z', '--fetpercentile', required=True, type=int,
                          help='Percentile to select highest scoring percentage of features')
    args = parser.parse_args()

    X = np.loadtxt(args.scaledfeaturelist)
    Y = np.genfromtxt(args.targetdata, dtype='str')

    sel = SelectPercentile(f_classif, percentile=args.fetpercentile)
    result = sel.fit_transform(X, Y)

    # Selecting features for test programs.
    if os.path.isfile('variancefeatures.txt'):
        varianceFeature = np.genfromtxt("variancefeatures.txt", dtype='str')
        featureFromSelectPercentile = sel.get_support(indices=True)
        with open("featuresToTestPrograms", "w") as featureFile:
            for i in featureFromSelectPercentile:
                featureFile.write(varianceFeature[i])
                featureFile.write("\n")
        # Remove variancefeatures.txt as it is not needed any more.  The
        # original had a bare shell `rm` line here, which is not valid Python.
        os.remove('variancefeatures.txt')

    np.savetxt('featurelist', result, fmt='%.2f', delimiter='\t')
def build_linear_model(X, y, analyzerType):
    """Fit vectoriser -> chi2 percentile filter -> linear SVC on ``X``, ``y``.

    Returns:
        tuple ``(fitted classifier, fitted vectoriser, fitted selector)``.
    """
    text_vectorizer = vectorizer(analyzerType)
    chi2_filter = SelectPercentile(score_func=chi2, percentile=15)
    svm = SVC(C=12.0, kernel='linear')

    vectors = text_vectorizer.fit_transform(X)
    reduced = chi2_filter.fit_transform(vectors, y)
    fitted = svm.fit(reduced, y)

    return (fitted, text_vectorizer, chi2_filter)
def test_selectpercentile_tiebreaking():
    """Test if SelectPercentile selects the right n_features in case of ties.
    """
    def dummy_score(X, y):
        # Score every feature by its own value in the single sample.
        return (X[0], X[0])

    samples = [[0, 1, 1], [0, 0, 1], [1, 0, 0], [1, 1, 0]]
    y = [1]

    # (percentile, expected number of selected features), in the order the
    # original checked them.
    cases = ((34, 1), (67, 2))

    for sample in samples:
        with warnings.catch_warnings(record=True):
            for percentile, expected_width in cases:
                sel = SelectPercentile(dummy_score, percentile=percentile)
                reduced = sel.fit_transform([sample], y)
                assert_equal(reduced.shape[1], expected_width)
                assert_best_scores_kept(sel)
def feature_selection(self,mode='F'):
    """Keep the top 80% of the training features and trim train/test to match.

    Parameters:
        mode: 'M' (mutual information), 'F' (ANOVA F-test) or 'C'
            (chi-squared); case-insensitive.

    Side effects:
        ``self.train`` is replaced by a DataFrame of the selected columns,
        ``self.test`` is restricted to the same columns and
        ``self.fs_features`` holds the selected names as a numpy array.

    Returns:
        tuple ``(X_new, feas)``: the reduced training matrix and a DataFrame
        with a single 'feature' column of selected names.

    Raises:
        ValueError: if ``mode`` is not one of the supported letters.
    """
    # Single-argument print(...) behaves the same on Python 2 and Python 3.
    print('Feature Selection...')
    print('Start:' + datetime.datetime.now().strftime('%Y-%m-%d %H:%M:%S'))

    score_funcs = {'M': mutual_info_classif, 'F': f_classif, 'C': chi2}
    key = mode.upper()
    if key not in score_funcs:
        raise ValueError("mode must be 'M', 'F' or 'C', got {!r}".format(mode))

    X = self.train.copy()
    y = self.train_label['label'].values.copy()
    test = self.test.copy()

    # The original first scored the data through undefined names (`train`,
    # `train_label`) and never used those scores; the selector computes the
    # same statistics itself.
    selector = SelectPercentile(score_funcs[key], percentile=80)
    X_new = selector.fit_transform(X, y)
    selected = selector.get_support()

    # Index the column names with the boolean mask instead of removing the
    # rejected names one by one.
    fs_features = np.array(self.train.columns[selected].tolist())

    self.train = pd.DataFrame(X_new, columns=fs_features.tolist())
    self.test = test[fs_features]
    self.fs_features = fs_features

    feas = pd.DataFrame()
    feas['feature'] = fs_features

    print('End:' + datetime.datetime.now().strftime('%Y-%m-%d %H:%M:%S'))
    return X_new, feas
def main(path,filename):
    """Report how univariate and tree-based importance spread over batches.

    Every batch in ``batchs`` is loaded into ``X`` with ``load_batch``; the
    row width after each load is recorded, so ``lens`` holds cumulative
    feature counts and ``total`` the per-batch counts.  The function prints
    how many chi2-selected features fall in each batch and the
    ExtraTreesClassifier importance accumulated per batch, each also
    normalised by the batch size.
    """
    # Other batches previously tried: histogramaByN, histogramaColor,
    # patrones2x2ByN, patrones3x3ByN.
    batchs = ['patronesCirculaesByN_2_5','patronesCirculaesByN_2_9','patronesCirculaesByN_3_9','patronesCirculaesByN_5_9','patronesCirculaesByN_3_5','patronesCirculaesByN_6_12','patronesCirculaesByN_8_12','patronesCirculaesByN_10_12']
    percentil = 20

    X = []
    y = []
    lens = []
    load_batch(y,path,'clases',filename)
    y = [j for i in y for j in i]
    for batch in batchs:
        load_batch(X,path,batch,filename)
        lens.append(len(X[0]))

    total = [lens[0]]
    for i in range(1,len(lens)):
        total.append(lens[i]-lens[i-1])
    print('Cantidad de atributos por barch')
    print(total)

    def batch_of(index):
        # Batch j owns the 0-based feature indices [lens[j-1], lens[j]); the
        # original used `<=`, which credited the first feature of each batch
        # to the previous one.
        for j, upper in enumerate(lens):
            if index < upper:
                return j
        return None

    # `percentile` is keyword-only in recent scikit-learn releases.
    sp = SelectPercentile(chi2, percentile=percentil)
    X_new = sp.fit_transform(X, y)
    sup = sp.get_support(True)

    res = [0] * len(batchs)
    for i in sup:
        j = batch_of(i)
        if j is not None:
            res[j] += 1
    porcentajes = [(1.0*res[i])/total[i] for i in range(len(lens))]
    print('Cantidad de variables seleccionas en el'+str(percentil)+'percentil univariado')
    print(res)
    print('Porcentaje de variables seleccionas en el'+str(percentil)+'percentil univariado')
    print(porcentajes)

    clf = ExtraTreesClassifier()
    clf = clf.fit(X, y)
    fi = clf.feature_importances_
    res2 = [0] * len(batchs)
    for i in range(len(fi)):
        j = batch_of(i)
        if j is not None:
            res2[j] += fi[i]
    print('Importancia porcentual acumulada de la seleccion multivariada')
    print(res2)
    porcentajes2 = [(1.0*res2[i])/total[i] for i in range(len(lens))]
    print('Importancia porcentual promedio por variable de la seleccion multivariada')
    print(porcentajes2)
def fTestFeatureSelection(train_files, train_labels, test_files, test_labels):
    """Evaluate logistic regression on shrinking F-test feature subsets.

    The training files are vectorised once; then, for 100%, 90%, ..., 10% of
    the features (ranked by f_regression), a LogisticRegression is fitted
    and evaluated with ``test`` on the test files restricted to the selected
    features.
    """
    design_matrix, features, _ = vectorizeTrain(train_files, None, 0, False, 0, None)
    classifier = LogisticRegression()

    for p in range(10):
        percentile = 100 - p * 10
        # Single-argument print(...) behaves the same on Python 2 and 3.
        print('Selecting {0}% of features'.format(percentile))

        # `percentile` is keyword-only in recent scikit-learn releases.
        feat_sel = SelectPercentile(f_regression, percentile=percentile)
        X_sel = feat_sel.fit_transform(design_matrix, train_labels)
        f_inds = feat_sel.get_support(indices=True)
        print('Using {0} features'.format(len(f_inds)))

        classifier.fit(X_sel, train_labels)
        test(test_files, test_labels, classifier,
             [features[d] for d in f_inds], None, 0, False, 0, None)
def cross_val_score(clf, data, target, k):
    """Score *clf* on *k* random train/test splits of a text data set.

    Each round reshuffles the sample indices and holds out the last
    ``len(data) // k`` of them for testing, so the test sets of different
    rounds may overlap (repeated random sub-sampling rather than a disjoint
    k-fold partition).  Inside a round the vectoriser, the term-frequency
    transformer and the chi-squared selector (top 10%) are fitted on the
    training part only and then applied to the held-out part.

    Parameters:
        clf: estimator exposing ``fit`` and ``predict``.
        data: indexable collection of raw samples accepted by ``data_process``.
        target: labels, indexable in parallel with *data*.
        k: number of rounds; also fixes the held-out share (1/k).

    Returns:
        A list with one ``GetPrecisionRecallF1`` result per round.
    """
    size = len(data)
    shuffle_arr = list(range(size))
    # Floor division keeps the slice bounds integral under true division.
    test_size = size // k
    scores = []
    for _ in range(k):
        # separate shuffled train and test dataset
        random.shuffle(shuffle_arr)
        shuffle_train = shuffle_arr[:size - test_size]
        shuffle_test = shuffle_arr[size - test_size:]
        # Index the `data` argument itself; the earlier code read an
        # unrelated name (`data_total`) that this function never receives.
        data_train_raw = [data[j] for j in shuffle_train]
        target_train = [target[j] for j in shuffle_train]
        data_test_raw = [data[r] for r in shuffle_test]
        target_test = [target[r] for r in shuffle_test]
        data_train = data_process(data_train_raw)
        data_test = data_process(data_test_raw)

        # transform array of string to counts
        count_vect = CountVectorizer()
        X_train_counts = count_vect.fit_transform(data_train)
        # transform counts to frequencies
        tf_transformer = TfidfTransformer(use_idf=False).fit(X_train_counts)
        X_train_tf = tf_transformer.transform(X_train_counts)
        # feature selection
        select = SelectPercentile(chi2, percentile=10)
        X_train_fs = select.fit_transform(X_train_tf, target_train)
        # train the model
        clf_train = clf.fit(X_train_fs, target_train)

        # test the model
        X_new_counts = count_vect.transform(data_test)
        X_new_tfidf = tf_transformer.transform(X_new_counts)
        X_new_fs = select.transform(X_new_tfidf)
        test_result = clf_train.predict(X_new_fs)
        scores.append(GetPrecisionRecallF1(test_result, target_test))
    return scores
def featureSelection(self,X,y):
    '''
    Univariate feature selection: keep the top 10% of the columns of X
    ranked by their chi-squared score against y.  The shape of X is printed
    before selecting.
    :param:
        a. X the training matrix.
        b. y the labels column corresponding to X.
    :return:
        a. The transformed (column-reduced) training matrix.
        b. The boolean mask of the retained features.
    '''
    print(np.shape(X))
    chi2_filter = SelectPercentile(chi2, percentile=10)
    reduced = chi2_filter.fit_transform(X, y)
    kept_mask = chi2_filter.get_support()
    return reduced, kept_mask
def feature_reduction_percent(percentage, train_data_df, train_labels_df,
                              out_file='data/sorted-train-malware-features-asm-50percent.csv'):
    """Write the best *percentage* percent of features (chi-squared) to CSV.

    Column 0 of *train_data_df* is skipped when building the feature matrix
    and the ``'filename'`` column is re-attached in front of the selected
    feature columns before writing.

    Parameters:
        percentage: percentile of features to keep, passed to
            ``SelectPercentile``.
        train_data_df: DataFrame with a ``'filename'`` column; features are
            taken from column 1 onwards.
        train_labels_df: DataFrame whose column 1 holds the labels.
        out_file: destination CSV path.  The default is the file name that
            was previously hard-coded (it says "50percent" regardless of
            *percentage*).

    Returns:
        None; the result is the CSV written to *out_file*.
    """
    X = train_data_df.iloc[:, 1:]
    y = np.array(train_labels_df.iloc[:, 1])

    # `percentile` is keyword-only in recent scikit-learn releases.
    fsp = SelectPercentile(chi2, percentile=percentage)
    fsp.fit(X, y)
    # Selected indices are relative to X, which starts at column 1 of the
    # frame, so shift by one to address the original columns.
    selected_names = fsp.get_support(indices=True) + 1

    # Slice the frame that was passed in; the earlier code read
    # `sorted_train_data`, a name this function never defines.
    data_trimmed = train_data_df.iloc[:, selected_names]
    data_fnames = pd.DataFrame(train_data_df['filename'])
    data_reduced = data_fnames.join(data_trimmed)
    data_reduced.to_csv(out_file, index=False)
    return
def do_feature_selection(train_instances, test_instances, folds, clf, param_grid, dense, outfile):
    """Plot test-set ROC curves for several chi-squared percentiles.

    Only the feature groups present in both the first training and the first
    test instance are used.  The 200 features with the smallest chi-squared
    p-values are printed (indices, then names).  For each percentile in
    1, 3, 5, 7, 9 a model is tuned with ``get_optimal_model`` on the selected
    training features, scored on the test set, and its ROC curve is added to
    the figure saved as ``feature_selection.png``.

    Parameters:
        train_instances, test_instances: instance collections whose first
            element exposes ``feature_groups``.
        folds, clf, param_grid: forwarded to ``get_optimal_model``.
        dense: forwarded to the matrix builders.
        outfile: not used by this function.
    """
    groups = set(train_instances[0].feature_groups.keys()).intersection(
        test_instances[0].feature_groups.keys())
    X_train, y_train, feature_space = pipe.instances_to_matrix(
        train_instances, dense=dense, groups=groups)
    X_test, y_test = test_instances_to_matrix(feature_space, test_instances, dense=dense)

    _, pval = chi2(X_train, y_train)
    # Feature indices ordered by ascending p-value.
    feature_indices = [i[0] for i in sorted(enumerate(pval), key=lambda x: x[1])]
    index_to_name = {v: k for k, v in feature_space.items()}
    feature_names = [index_to_name[i] for i in feature_indices]
    print(feature_indices[0:200])
    print(feature_names[0:200])

    for percentile in range(1, 10, 2):
        t0 = time()
        ch2 = SelectPercentile(chi2, percentile=percentile)
        X_train_trans = ch2.fit_transform(X_train, y_train)
        print("done in %fs" % (time() - t0))
        model = get_optimal_model(X_train_trans, y_train, folds, clf, param_grid, 'roc_auc')
        X_test_trans = ch2.transform(X_test)
        scores = get_scores(model, X_test_trans)
        fpr, tpr, _ = roc_curve(y_test, scores)
        roc_auc = auc(fpr, tpr)
        plt.plot(fpr, tpr, lw=1, label='%d (area = %0.4f)' % (percentile, roc_auc))
        print("\n" * 4)

    plt.plot([0, 1], [0, 1], '--', color=(0.6, 0.6, 0.6), label='Luck')
    plt.xlim([-0.05, 1.05])
    plt.ylim([-0.05, 1.05])
    plt.xlabel('False Positive Rate')
    plt.ylabel('True Positive Rate')
    plt.title('Receiver operating characteristic example')
    plt.legend(loc="lower right")
    plt.savefig('feature_selection.png')
    # Print a blank line; a bare `print()` prints "()" when `print` is a
    # statement (Python 2 without the print_function import).
    print("")
def fit_predict(trn, dev, args, emb):
    """Fit an SVM on the marked tokens of *trn* and predict on *trn* and *dev*.

    A token takes part when its entry in ``sent['ii']`` is truthy.  Every such
    token gets a context vector from ``emb.get_context``; when
    ``args['feats']`` is non-empty, ``Feat`` features vectorised by a dense
    ``DictVectorizer`` are appended, optionally reduced to the top
    ``args['percentile']`` percent by chi-squared score.

    Parameters:
        trn, dev: iterables of sentence dicts with keys ``'ii'``, ``'ws'``
            and ``'ls'``.
        args: dict read for ``'embs'``, ``'e_context'``, ``'feats'``,
            ``'percentile'``, ``'cweights'``, ``'kerntype'``, ``'C'``,
            ``'kerngamma'`` and ``'kerndegree'``.
        emb: object exposing ``get_context``.

    Returns:
        Tuple ``(predictions on trn, predictions on dev)``.
    """
    def _marked(sents):
        # Yield (token position, sentence) for every marked token.
        for sent in sents:
            for flag, (pos, _word) in zip(sent['ii'], enumerate(sent['ws'])):
                if flag:
                    yield pos, sent

    def _marked_labels(sents):
        # Labels of the marked tokens, in the same order as _marked().
        return np.array([lbl for sent in sents
                         for flag, lbl in zip(sent['ii'], sent['ls']) if flag])

    rng = np.random.RandomState(7)
    Xtrn = np.array([emb.get_context(pos, sent, args['embs'], args['e_context'])
                     for pos, sent in _marked(trn)])
    Xdev = np.array([emb.get_context(pos, sent, args['embs'], args['e_context'])
                     for pos, sent in _marked(dev)])
    ytrn = _marked_labels(trn)
    ydev = _marked_labels(dev)
    logging.debug('embs: Xtrn_emb:{} Xdev_emb: {}'.format(Xtrn.shape, Xdev.shape))

    if len(args['feats']):
        feat = Feat(trn, args)
        dvec = DictVectorizer(sparse=False)
        Xtrn_feat = dvec.fit_transform(
            feat.get_features(pos, sent) for pos, sent in _marked(trn))
        Xdev_feat = dvec.transform(
            feat.get_features(pos, sent) for pos, sent in _marked(dev))
        logging.debug('Xtrn_feat shape:{}'.format(Xtrn_feat.shape))
        logging.debug('Xdev_feat shape:{}'.format(Xdev_feat.shape))
        # No embedding column of the training matrix may be constant.
        assert (Xtrn.std(axis=0)==0).sum() == 0

        if args['percentile'] < 100:
            from sklearn.feature_selection import chi2, f_classif, SelectPercentile
            sel = SelectPercentile(chi2, percentile=args['percentile'])
            Xtrn_feat = sel.fit_transform(Xtrn_feat, ytrn)
            Xdev_feat = sel.transform(Xdev_feat)
            logging.debug('after sel: Xtrnf:{} Xdevf: {}'.format(Xtrn_feat.shape, Xdev_feat.shape))
            # Log the 100 highest-scoring features as (name, score) pairs.
            ranked = reversed(sorted(zip(sel.scores_, dvec.feature_names_)))
            logging.debug([(fea, score) for score, fea in islice(ranked, 100)])

        Xtrn = np.hstack((Xtrn, Xtrn_feat))
        Xdev = np.hstack((Xdev, Xdev_feat))
    logging.debug('after feats: Xtrn:{} Xdev: {}'.format(Xtrn.shape, Xdev.shape))

    if args['cweights']:
        cweights = 'balanced'
    else:
        cweights = {0:1,1:1}

    if args['kerntype'] == 'lin':
        clf = LinearSVC(C=args['C'], class_weight=cweights, random_state=rng)
    else:
        clf = SVC(class_weight=cweights, C=args['C'], kernel=args['kerntype'],
                  gamma=args['kerngamma'], degree=args['kerndegree'], random_state=rng)
    clf.fit(Xtrn, ytrn)
    return clf.predict(Xtrn), clf.predict(Xdev)
def call_GridParamSearch_featfilt(X, y) :
    '''
    Filter the features of X in several ways, print the resulting matrix
    shapes, then run GridParamSearch on the L1-SVM-filtered matrix.

    Shapes are printed for: the original X, X after chi-squared
    SelectPercentile (top 20%), X after the L1-penalised LinearSVC, and the
    chi-squared output passed through that LinearSVC again.  The final
    parameter search uses the module-level Tree_param_dist and clf_EXT.
    '''
    print("SPARSE (L1) EXT gridparam scores:")
    # clf = Pipeline([
    #   ('feature_selection', LinearSVC(penalty="l1", loss='l1',dual=False, class_weight='auto')),
    #   ('classification', ExtraTreesClassifier(n_jobs=3)
    #   )])

    # Sparse; L1 penalized features selection prior to RF fitting/prediction
    l1_svm = LinearSVC(penalty="l1", loss='l2', dual=False, class_weight='auto')
    l1_logit = LogisticRegression(penalty="l1", dual=False, class_weight='auto')

    # http://scikit-learn.org/0.13/auto_examples/plot_feature_selection.html
    print('Original features matrix:')
    print(X.shape)

    # Univariate feature selection keeping the 20% best-scoring features.
    # selector = SelectPercentile(f_classif, percentile=20)
    univariate = SelectPercentile(chi2, percentile=20)
    X_univariate = univariate.fit_transform(X, y)
    print( 'New (2 f_classif) Using statistical feature selection: features matrix is:')
    print(X_univariate.shape)

    # lda = LDA(n_components=10)
    # X_lda = lda.fit_transform(X, y)
    # print('New LDA filtered features matrix:')
    # print(X_lda.shape)

    # Sparse feature selection through the L1-penalised SVM.
    X_sparse = l1_svm.fit_transform(X, y)
    # print(clf.feature_importances_ )
    print('New sparse (SVM filtered) features matrix:')
    print(X_sparse.shape)

    print("Res of SVM fitting of (F scores filtered =2) for more feature selection:")
    X_twice_filtered = l1_svm.fit_transform(X_univariate, y)
    print(X_twice_filtered.shape)

    print("param search on sparse features matrix")
    GridParamSearch(param_dist=Tree_param_dist, clf=clf_EXT, X=X_sparse, y=y)
def featureSelectionProcess(X,Y,featureSelection):
    """Apply the feature-selection strategy named by *featureSelection*.

    Supported names:

    * ``"linearSVM"``: L1-penalised ``LinearSVC`` (C=0.01);
    * ``"SelectKBest"``: 6 best features by chi-squared score;
    * ``"SelectKPercentile"``: top 30% by ``f_classif`` score;
    * ``"TreeBased"``: ``ExtraTreesClassifier`` used as a transformer;
    * ``"Recursive"``: RFE down to 5 features with a linear ``SVC``.

    The shape of the first row is printed before and after selecting.

    Parameters:
        X: feature matrix; ``X[0].shape`` must be available.
        Y: labels aligned with the rows of X.
        featureSelection: one of the names above.

    Returns:
        The reduced matrix, or X unchanged when the name is not recognised.
        (The reduced matrix used to be computed and then discarded.)
    """
    print("feature selection process: "+str(featureSelection))
    print("before feature selection. shape of X"+str(X[0].shape))

    if featureSelection == "linearSVM":
        X_new = LinearSVC(C=0.01, penalty="l1", dual=False).fit_transform(X, Y)
    elif featureSelection == "SelectKBest":
        X_new = SelectKBest(chi2, k=6).fit_transform(X, Y)
    elif featureSelection == "SelectKPercentile":
        selector = SelectPercentile(f_classif, percentile=30)
        X_new = selector.fit_transform(X, Y)
    elif featureSelection == "TreeBased":
        clf = ExtraTreesClassifier()
        X_new = clf.fit(X, Y).transform(X)
    elif featureSelection == "Recursive":
        svc = SVC(kernel="linear", C=1)
        rfe = RFE(estimator=svc, n_features_to_select=5, step=1)
        X_new = rfe.fit(X, Y).transform(X)
    else:
        # Unknown strategy: nothing is selected and nothing more is printed.
        return X

    print("after feature selection. shape of X"+str(X_new[0].shape))
    return X_new
class AnovaPercentileStep(SklearnStep):
    """Pipeline step that keeps the top percentile of features by ANOVA F-score.

    Wraps ``SelectPercentile(f_classif)`` and reads/writes svmlight files
    through ``load_svmlight`` / ``save_svmlight``.
    """

    def __init__(self, percentile):
        """Store *percentile*, the share of features to keep."""
        super(AnovaPercentileStep, self).__init__()
        self._percentile = percentile

    def fit_transform(self):
        """Fit the selector on the input file and save the reduced data."""
        # `percentile` is keyword-only in recent scikit-learn releases.
        self._model = SelectPercentile(f_classif, percentile=self._percentile)
        # NOTE(review): this reads `input_path` without the leading underscore
        # used by the other path attributes; confirm the base class
        # provides it.
        x, y = load_svmlight(self.input_path)
        x = self._model.fit_transform(x, y)
        save_svmlight(x, y, self._output_path)

    def transform(self, x=None):
        """Apply the fitted selector.

        With ``x=None`` the test input file is loaded, reduced and saved to
        the test output path, and nothing is returned.  Otherwise the reduced
        version of *x* is returned and no file is touched.
        """
        if x is None:
            x, y = load_svmlight(self._test_input_path)
            x = self._model.transform(x)
            save_svmlight(x, y, self._test_output_path)
        else:
            transformed_x = self._model.transform(x)
            return transformed_x

    def get_param(self):
        """Return the step configuration as ``{'percentile': value}``."""
        return {'percentile': self._percentile}
def load_data(self):
    """Load, scale and feature-select the train and test CSV files.

    Train rows are shuffled; the first column is stored as ``self.ids``, the
    last column is label-encoded into ``self.y_true`` and the columns in
    between become the features.  Features are scaled with ``MinMaxScaler``
    and then reduced to the top 20% by ``f_classif`` score.  The scaler and
    selector fitted on the training data are re-applied to the test file,
    whose first column is stored as ``self.idx``.

    Sets ``self.ids``, ``self.encoder``, ``self.y_true``, ``self.train``,
    ``self.idx``, ``self.test``, ``self.num_class``, ``self.n_samples`` and a
    zero-filled ``self.y_pred`` of shape (n_samples, num_class).
    """
    # preprocessing train data
    df = pd.read_csv(BASE_DIR + INPUT_TRAIN)
    X = df.values.copy()
    np.random.shuffle(X)
    self.ids, X, labels = X[:, 0], X[:, 1:-1].astype(np.float32), X[:, -1]
    self.encoder = LabelEncoder()
    self.y_true = self.encoder.fit_transform(labels).astype(np.int32)
    scaler = MinMaxScaler()
    self.train = scaler.fit_transform(X)
    selector = SelectPercentile(f_classif, percentile=20)
    # Select on the SCALED matrix; selecting on the raw X threw the scaling
    # away and left self.train unscaled.
    self.train = selector.fit_transform(self.train, self.y_true)

    # preprocessing test data
    df = pd.read_csv(BASE_DIR + INPUT_TEST)
    X = df.values.copy()
    X, self.idx = X[:, 1:].astype(np.float32), X[:, 0].astype(str)
    self.test = scaler.transform(X)
    # Same fix for the test matrix: keep the scaling, then select.
    self.test = selector.transform(self.test)

    self.num_class = len(self.encoder.classes_)
    self.n_samples = len(self.y_true)
    self.y_pred = np.zeros((self.n_samples, self.num_class))
    return None
def main(path):
    """Train a TF-IDF + ANOVA + LinearSVC text model and pickle it.

    The corpus returned by ``get_data(path)`` is vectorised, reduced to the
    top 20% of features by ``f_classif`` score and used to fit a
    ``LinearSVC``.  The classifier, vectoriser and selector are dumped
    together to ``training/<unix timestamp>.pkl``; the ``training`` directory
    is created when missing.
    """
    corpus = get_data(path)
    targets = corpus['values']

    # Calculating weights
    tfidf_options = dict(sublinear_tf=True, max_df=0.5, stop_words='english',
                         max_features=6000, strip_accents='unicode')
    vectorizer = TfidfVectorizer(**tfidf_options)
    weighted = vectorizer.fit_transform(corpus.data)

    # Build feature selection
    selector = SelectPercentile(f_classif, percentile=20)
    weighted = selector.fit_transform(weighted, targets)

    # Train with known data
    classifier = LinearSVC(loss='l2', penalty='l2', dual=False, tol=1e-3)
    classifier.fit(weighted, corpus['values'])

    # Save training model
    if not os.path.exists('training'):
        os.mkdir('training')
    bundle = {
        'clf': classifier,
        'vectorizer': vectorizer,
        'feature_selection': selector,
    }
    target_file = 'training/{0}.pkl'.format(int(time.time()))
    joblib.dump(bundle, target_file, compress=9)
# Sweep SelectPercentile(f_classif) over 5%, 10%, ..., 100% and, for each
# setting, collect the names of the best-scoring features.
# NOTE(review): the nesting below was reconstructed from a whitespace-mangled
# source; confirm the loop structure against the original file.
print " % | Precision | Recall | Features"
print "="*38;
features_score = { } # feature: score
best_results = { 'precision' : 0.0, 'recall' : 0.0, 'features' : [], 'percent' : 0.0 }
for percent in range(0, 101, 5):
    # A percentile of 0 would select nothing, so skip it.
    if percent == 0:
        continue
    fs = SelectPercentile(f_classif, percentile=percent)
    features_transformed = fs.fit_transform(features, labels)
    best_features = []
    counter = 0
    # Walk the features from highest to lowest score.
    for idx, score in sorted(enumerate(fs.scores_), key=lambda score: score[1], reverse=True):
        # Record each feature's score until the dict holds as many entries
        # as features_list.
        if len(features_score.keys()) < len(features_list):
            # idx + 1: names are looked up one position after the score index
            # (presumably features_list[0] is the label column; verify).
            features_score[features_list[idx+1]] = score
        counter = counter + 1
        # Only the first len(features_transformed[0]) entries (the number of
        # columns the selector kept) are added to best_features.
        if counter > len(features_transformed[0]):
            continue
        best_features.append(features_list[idx+1])
    ## DECISION TREE
    if len(best_features) < 2:
        continue; # can't have less than 2 features for a decision tree
testfeaturevectors = varthresh.transform(testfeaturevectors) """ #classifiers = [BernoulliNB(), DecisionTreeClassifier(), LogisticRegression(), OneVsRestClassifier(LinearSVC(random_state=0)) ] #classifier_names = ["BernoulliNB", "DecisionTreeClassifier", "MaxEnt", "SVM"] #classifiers = [LogisticRegression(), OneVsRestClassifier(LinearSVC(random_state=0)) ] #classifier_names = ["MaxEnt", "SVM"] classifiers = [LogisticRegression(), OneVsRestClassifier(LinearSVC(random_state=0)) ] classifier_names = ["MaxEnt", "SVM"] allscores = list() for keep in range(3, 30, 3): selector = SelectPercentile(chi2, keep) selected_trainfeaturevectors = selector.fit_transform(trainfeaturevectors, trainLabels) selected_testfeaturevectors = selector.transform(testfeaturevectors) scores = list() for c in range(len(classifiers)): classifier = classifiers[c] classifier.fit(selected_trainfeaturevectors, trainLabels) scores.append((classifier.score(selected_testfeaturevectors, testLabels), keep, classifier_names[c], c)) print(str(keep)+"\t"+"\t".join(str(s[0]) for s in scores)) allscores += scores print("saving the best classifier:") bestsettings = max(allscores)
def _sort_by_column(frame, column):
    """Sort *frame* by *column* on both old and new pandas releases."""
    if hasattr(frame, 'sort_values'):
        return frame.sort_values(column)
    # Older pandas only offers DataFrame.sort.
    return frame.sort(column)


def reduce_feature_set(feature_set_file, train_label_file, token_file, reduced_set_file, temp_train_labels):
    """Reduce a wide feature CSV to the 10% best features, slice by slice.

    The feature matrix is too large to process at once, so the columns are
    cut into ten consecutive slices.  Each slice is loaded, sorted by
    ``file_name``, reduced with a chi-squared ``SelectPercentile`` (10%) and
    written to ``data/pe-header-temp-<slice>-10perc.csv``.  The class labels
    matched to the sorted sample names are written to
    ``data/<temp_train_labels>`` for later validation.

    Parameters:
        feature_set_file: CSV whose column 0 is ``file_name`` followed by the
            feature columns.
        train_label_file: CSV under ``data/`` with a ``file_name`` column;
            column index 4 is read as the class label.
        token_file: CSV under ``data/`` with a ``token_name`` column; its
            length determines the slice width.
        reduced_set_file: not used by this function.
        temp_train_labels: file name under ``data/`` for the label dump.

    Returns:
        None; the results are the files written.
    """
    # Open PE header token file and get a list of token names.
    # Do not do NaN filtering or we will get floats instead of text.
    hdr_pd = pd.read_csv('data/' + token_file, na_filter=False)
    # Clamp the token name length to 32 characters.
    token_list = [token[:32] for token in hdr_pd['token_name']]
    token_list_len = len(token_list)

    # Load training labels
    sorted_train_labels = pd.read_csv("data/" + train_label_file)

    # Load and sort the malware sample names.
    sample_names = pd.read_csv(feature_set_file, usecols=[0], na_filter=False)
    sorted_sample_names = _sort_by_column(sample_names, 'file_name')

    # Map each file name to its label once instead of rescanning the label
    # table for every sample.  setdefault keeps the FIRST occurrence, which
    # is what the former linear scan returned.
    label_by_name = {}
    for name, label in zip(sorted_train_labels['file_name'],
                           sorted_train_labels.iloc[:, 4]):
        label_by_name.setdefault(name, label)

    # Now get the labels of the PE malware samples from the label set.
    y = []
    for counter, fname in enumerate(sorted_sample_names['file_name'], 1):
        if counter % 10000 == 1:
            print("Appending {:d} -> {:s}".format(counter, fname))
        # Samples without a label are skipped, as before.
        if fname in label_by_name:
            y.append(label_by_name[fname])

    # Write out the PE/COFF sample train labels for later use and validation.
    with open('data/' + temp_train_labels, 'w') as fop:
        fop.write("\n".join(str(x) for x in y))

    # Load column subset and sort, then
    # perform chi2 test to get 10% best features.
    onetenth = int(token_list_len / 10)
    startidx = 1  # skip the filename column
    endidx = onetenth
    # NOTE(review): the ten slices cover columns 1 .. 10*onetenth - 1, so any
    # columns past that index are never read; confirm this is intended.
    for idx in range(0, 10):
        print("Processing column set {:d} -> {:d}".format(startidx, endidx))
        column_numbers = [0] + list(range(startidx, endidx, 1))
        feature_subset = pd.read_csv(feature_set_file, usecols=column_numbers)
        # Sort the feature subset on file_name column.
        sorted_feature_subset = _sort_by_column(feature_subset, 'file_name')
        X = sorted_feature_subset.iloc[:, 1:]  # skip the filename column
        print("Sorted feature subset - slice {:d} of 10.".format(idx))
        print("Subset shape: {:d} {:d}".format(X.shape[0], X.shape[1]))
        print("Length of y: {:d}".format(len(y)))

        # Now select the 10% best features for this feature subset.
        # `percentile` is keyword-only in recent scikit-learn releases.
        fsp = SelectPercentile(chi2, percentile=10)
        fsp.fit(X, y)
        # Indices are relative to X (no filename column), so add 1 to
        # address columns of the sorted subset.
        selected_names = fsp.get_support(indices=True) + 1
        data_trimmed = sorted_feature_subset.iloc[:, selected_names]
        data_fnames = pd.DataFrame(sorted_feature_subset['file_name'])
        data_reduced = data_fnames.join(data_trimmed)

        # Write to file as we do not have enough memory.
        filename = "data/pe-header-temp-" + str(idx) + "-10perc.csv"
        data_reduced.to_csv(filename, index=False)
        print("Writing reduced feature file: {:s}".format(filename))

        startidx = endidx
        endidx += onetenth
    return
def main():
    """Tune chi-squared selection + logistic regression and write Solution.csv.

    Column 2 of ``train.tsv`` (after outlier removal) and of ``test.tsv`` is
    vectorised jointly with TF-IDF.  A 5-fold stratified search over the
    chi-squared percentile and the logistic-regression ``tol`` accumulates
    ROC AUC per combination.  The best combination is refitted on all
    training rows, a 20-fold CV score is printed, and the predicted
    probabilities for the test rows are written to ``Solution.csv``.
    """
    print('loading data')
    alltext = []
    traindata = p.read_table('train.tsv').replace('?',0)
    traindata['alchemy_category'] = traindata.groupby('alchemy_category').grouper.group_info[0]
    traindata['alchemy_category_score'] = traindata['alchemy_category_score'].astype(float)
    traindata = outlier(np.array(traindata),24)
    print('no of rows after outlier removal = %d' % len(traindata))

    # Parse the test file once and slice both columns from the same array.
    testframe = np.array(p.read_table('test.tsv'))
    testlabels = list(testframe[:,1])
    testdata = list(testframe[:,2])
    trainlabels = traindata[:,-1]
    traindata = list(traindata[:,2])
    alltext.extend(traindata)
    alltext.extend(testdata)
    trainlabels = np.array(trainlabels).astype(int)
    testlabels = np.array(testlabels)
    alltext = np.array(alltext)

    print('fitting pipeline and transforming data')
    vect = TfidfVectorizer(stop_words='english',min_df=3,max_df=1.0,
                           strip_accents='unicode',analyzer='word',ngram_range=(1,2),
                           use_idf=1,smooth_idf=1,sublinear_tf=1,tokenizer=LemmaTokenizer())
    alltextMatrix = vect.fit_transform(alltext)
    # Train rows come first in alltext, so split the matrix at that boundary.
    traintext = alltextMatrix[:len(trainlabels)]
    testtext = alltextMatrix[len(trainlabels):]

    print('applying chi test')
    kf = StratifiedKFold(trainlabels, n_folds=5, indices=True)
    kToTest = [1,3,5,8,10,15,20]
    alphaToTest = [0.0001,0.001,0.01,0.1,0.5,1.0]
    # results[i][j] accumulates the AUC over folds for percentile i, tol j.
    results = np.zeros((len(kToTest),len(alphaToTest)))
    for train,test in kf:
        X_train, X_cv = traintext[train], traintext[test]
        y_train, y_cv = trainlabels[train], trainlabels[test]
        for i, percentile in enumerate(kToTest):
            FS = SelectPercentile(score_func=chi2,percentile=percentile)
            X_FS_train = FS.fit_transform(X_train,y_train)
            X_FS_cv = FS.transform(X_cv)
            for j, tol in enumerate(alphaToTest):
                model = lm.LogisticRegression(penalty='l2', dual=True, tol=tol,
                                              C=1, fit_intercept=True, intercept_scaling=1.0,
                                              class_weight=None, random_state=None)
                model.fit(X_FS_train,y_train)
                results[i][j] += metrics.roc_auc_score(y_cv,model.predict_proba(X_FS_cv)[:,1])

    # Row/column positions of the best accumulated AUC; the first hit wins.
    k,alpha = np.nonzero(results == results.max())
    FS = SelectPercentile(score_func=chi2,percentile=kToTest[k[0]])
    X_train = FS.fit_transform(traintext,trainlabels)
    X_test = FS.transform(testtext)
    model = lm.LogisticRegression(penalty='l2', dual=True, tol=alphaToTest[alpha[0]],
                                  C=1, fit_intercept=True, intercept_scaling=1.0,
                                  class_weight=None, random_state=None)
    cv_score = np.mean(cross_validation.cross_val_score(
        model, X_train, trainlabels, cv=20, scoring='roc_auc'))
    # Two spaces reproduce the output of `print "label: ", value`.
    print("20 Fold CV Score:  %s" % cv_score)

    model.fit(X_train,trainlabels)
    outputs = model.predict_proba(X_test)[:,1]
    final = scipy.vstack((testlabels.T.astype(int),outputs.T.astype(float))).T
    # Close the file deterministically so every row is flushed to disk.
    # "wb" is the csv convention of the Python 2 runtime this script targets.
    with open('Solution.csv', "wb") as out_file:
        file_object = csv.writer(out_file)
        file_object.writerow(['urlid','label'])
        for i in final:
            file_object.writerow(i)
def buildClassifierWithSplit(classes, examples, featureChoice=None):
    """Evaluate a linear SVM on a 70/30 split, then refit on all examples.

    The split is stratified: 70% of the example indices of every class are
    sampled (with ``random.seed(10)``) for training and the rest is used for
    testing.  A confusion matrix, F1, precision and recall are printed for
    the held-out part, after which a fresh SVM is fitted on the training and
    testing vectors together.

    Parameters:
        classes: sequence of integer class labels; class 1 is weighted by 10.
        examples: sequence of examples aligned with *classes*.
        featureChoice: forwarded to ``Vectorizer``.

    Returns:
        Tuple ``(vectorizer, featureSelector, clf)``; ``featureSelector`` is
        ``None`` because trimming is switched off (``doTrim = False``).

    Raises:
        AssertionError: when a class is missing from either side of the split.
    """
    print("Generating training and test set")
    random.seed(10)

    exampleIDsGroupedByClass = defaultdict(list)
    for i, c in enumerate(classes):
        exampleIDsGroupedByClass[c].append(i)
    allClasses = sorted(list(set(classes)))

    trainingIndices = []
    for c in allClasses:
        trainingSetSize = int(0.7 * len(exampleIDsGroupedByClass[c]))
        trainingIndices = trainingIndices + random.sample(exampleIDsGroupedByClass[c], trainingSetSize)
    # Set membership keeps the complement linear instead of quadratic.
    trainingIndexSet = set(trainingIndices)
    testingIndices = [i for i in range(len(classes)) if i not in trainingIndexSet]

    trainingClasses = [classes[i] for i in trainingIndices]
    trainingExamples = [examples[i] for i in trainingIndices]
    testingClasses = [classes[i] for i in testingIndices]
    testingExamples = [examples[i] for i in testingIndices]
    assert len(trainingClasses) == len(trainingExamples)
    assert len(testingClasses) == len(testingExamples)

    print("-"*30)
    print("%d %d" % (len(trainingClasses), len(testingClasses)))
    for c in allClasses:
        trainingCount = sum([i == c for i in trainingClasses])
        testingCount = sum([i == c for i in testingClasses])
        print("%s %s %s" % (c, trainingCount, testingCount))
    for c in allClasses:
        assert c in trainingClasses, 'Class %d should be represented in training set (but is not). Need more data!' % c
        assert c in testingClasses, 'Class %d should be represented in testing set (but is not). Need more data!' % c

    # The positive count is the label sum, so this assumes 0/1 labels.
    print("Training set: %d examples (%d positive and %d negative)" % (len(trainingClasses),sum(trainingClasses),len(trainingClasses)-sum(trainingClasses)))
    print("Testing set: %d examples (%d positive and %d negative)" % (len(testingClasses),sum(testingClasses),len(testingClasses)-sum(testingClasses)))

    print("Starting vectorizer...")
    vectorizer = Vectorizer(trainingClasses,trainingExamples, featureChoice)
    trainingVectors = vectorizer.getTrainingVectors()
    print("Training vectors of size: %s" % (trainingVectors.shape,))

    perc = 10
    weight = 10
    doTrim = False
    if doTrim:
        print("Trimming training vectors...")
        from sklearn.feature_selection import SelectKBest,SelectPercentile,chi2
        #featureSelector = SelectKBest(chi2, k=100)
        # `percentile` is keyword-only in recent scikit-learn releases.
        featureSelector = SelectPercentile(chi2, percentile=perc)
        trainingVectorsTrimmed = featureSelector.fit_transform(trainingVectors, trainingClasses)
        print("Trimmed training vectors of size: %s" % (trainingVectorsTrimmed.shape,))
    else:
        trainingVectorsTrimmed = trainingVectors
        featureSelector = None

    print("Creating SVM...")
    clf = svm.SVC(kernel='linear', class_weight={1: weight})
    #print "Creating MultinomialNB..."
    #from sklearn.naive_bayes import MultinomialNB
    #clf = MultinomialNB(fit_prior=True)

    print("Fitting classifier to training data...")
    clf.fit(trainingVectorsTrimmed, trainingClasses)

    print("Vectorizing test data...")
    testingVectors = vectorizer.vectorize(testingExamples)
    if doTrim:
        print("Trimming test data...")
        testingVectorsTrimmed = featureSelector.transform(testingVectors)
    else:
        testingVectorsTrimmed = testingVectors

    print("Making predictions on test data...")
    predictions = clf.predict(testingVectorsTrimmed)
    print("perc=%d, weight=%d, doTrim=%s" % (perc,weight,doTrim))

    from sklearn.metrics import confusion_matrix
    print(confusion_matrix(testingClasses, predictions))
    from sklearn.metrics import f1_score,precision_score,recall_score,classification_report
    # Two spaces reproduce the output of `print "label: ", value`.
    print("f1_score:  %s" % f1_score(testingClasses, predictions))
    print("precision_score:  %s" % precision_score(testingClasses, predictions))
    print("recall_score:  %s" % recall_score(testingClasses, predictions))
    #print classification_report(testingClasses,predictions)

    print("Fitting to entire training dataset")
    clf = svm.SVC(kernel='linear', class_weight={1: weight})
    clf.fit(vstack([trainingVectorsTrimmed,testingVectorsTrimmed]), trainingClasses+testingClasses)
    return vectorizer,featureSelector,clf
def build_linear_model(X, y):
    """Fit a linear SVC on the top 20% of features ranked by chi-squared.

    Returns a tuple ``(fitted classifier, fitted selector)``; the selector is
    needed to reduce new data in the same way before predicting.
    """
    chi2_selector = SelectPercentile(score_func=chi2, percentile=20)
    svm_model = SVC(C=10.0, kernel='linear', probability=True)
    reduced = chi2_selector.fit_transform(X, y)
    fitted = svm_model.fit(reduced, y)
    return (fitted, chi2_selector)
from sklearn.feature_selection import SelectPercentile, f_classif #################################### # load data from from kaggle files - csv np_train = np.genfromtxt('data/train.csv', delimiter=',', skip_header= True, dtype='uint8') #np_test = np.genfromtxt('data/test.csv', delimiter=',', skip_header= True, dtype='uint8') n=6 skf = StratifiedKFold(np_train[:,0].ravel(), n_folds=3, random_state=3476) predictions=np.zeros_like(np_train[:,0]) for train_index, test_index in skf: print time.ctime() # a bit of feature selection fscore = SelectPercentile(f_classif, percentile = 50) Xtrain = np.copy(fscore.fit_transform(np_train[train_index, 1:], np_train[train_index, 0])) Xtest = np.copy(fscore.transform(np_train[test_index, 1:])) # define knn knn_clf = KNeighborsClassifier(n_neighbors=6, weights='distance', metric='cosine', algorithm='brute') knn_clf.fit(Xtrain, np_train[train_index, 0]) print('here') # fitting - this takes long - cca 15min predictions[test_index] = knn_clf.predict(Xtest) accu = accuracy_score(np_train[test_index, 0], predictions[test_index]) print("the accuracy of kNN is : %f" % accu) accu = accuracy_score(np_train[:, 0], predictions) print("the total valudation accuracy of kNN is : %f" % accu)
fileName = r'\trainingSetFeatures.csv' # filePath = r'E:\Dropbox\Dropbox\BioInformatics Lab\AA_Information\CODE\Feature_Extract\test_seq\Chap' filePath = str(input('Input DIRRectory containing TrainingData csv ')) ## features, labels, lb_encoder,featureNames = load_data(filename, 'file') features, labels, lb_encoder,featureNames = load_data(filePath+fileName, 'file') X, y = features, labels print('len(set(y)',len(set(y))) print(X.shape,"X = samples, features") scale = StandardScaler(copy=False) X = scale.fit_transform(X) FD = SelectFdr(alpha=0.0005) FD_K = SelectPercentile(percentile=70) X = FD.fit_transform(X,y) print(X.shape,"X post FDR alpha filter") X_FD = FD_K.fit_transform(X,y) print(X_FD.shape,"X post FDR+K-best alpha filter") print("\n BASE X models: \n") ModelParam_GridSearch(X,y,cv=Kcv) ''' pca = PCA(n_components='mle') X_PCA = pca.fit_transform(X) print(X_PCA.shape,"X - PCA,mle") ModelParam_GridSearch(X_PCA,y,cv=Kcv) '''