from itertools import chain

from sklearn.decomposition import TruncatedSVD
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.feature_selection import SelectKBest, f_classif
from sklearn.preprocessing import Normalizer


class TfIdf(Feature):
    def __init__(self):
        self.kbest = None
        self.vect = None
        self.truncated = None
        self.normalizer = None

    def train(self, reviews, labels):
        # word uni- and bigram tf-idf over the flattened review text
        self.vect = TfidfVectorizer(analyzer='word', ngram_range=(1, 2), stop_words='english')
        reviews_text = [' '.join(list(chain.from_iterable(review))) for review in reviews]
        tfidf_matrix = self.vect.fit_transform(reviews_text).toarray()
        # reduce to 50 latent components, normalize, then keep the 5 best
        self.truncated = TruncatedSVD(n_components=50)
        self.truncated.fit(tfidf_matrix, labels)
        trunc = self.truncated.transform(tfidf_matrix)
        self.normalizer = Normalizer()
        self.normalizer.fit(trunc)
        self.kbest = SelectKBest(f_classif, k=5)
        self.kbest.fit(self.normalizer.transform(trunc), labels)

    def score(self, data):
        reviews_text = ' '.join(list(chain.from_iterable(data)))
        tfidf_matrix = self.vect.transform([reviews_text]).toarray()
        trunc = self.truncated.transform(tfidf_matrix)
        return tuple(self.kbest.transform(self.normalizer.transform(trunc))[0, :])
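# Usage sketch (assumptions: the `Feature` base class referenced above is
# importable, and a synthetic corpus stands in for real review data; each
# review is a list of token-list sentences, matching the
# chain.from_iterable() calls in train()/score()).
import random

random.seed(0)
demo_vocab = ['word%d' % i for i in range(200)]
demo_reviews = [[[random.choice(demo_vocab) for _ in range(8)] for _ in range(3)]
                for _ in range(60)]
demo_labels = [i % 2 for i in range(60)]

feat = TfIdf()
feat.train(demo_reviews, demo_labels)
print(feat.score(demo_reviews[0]))  # 5-tuple of selected, normalized SVD components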
def anova_best_features(X, y, top=30):
    kbest = SelectKBest(f_classif)
    kbest.fit(X, y)
    feat_imp = pd.Series(kbest.scores_, index=X.columns)
    feat_imp.sort_values(inplace=True)
    ax = feat_imp.tail(top).plot(kind='barh', figsize=(10, 7),
                                 title='Feature importance (f_classif)')
    return feat_imp
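# Quick demonstration on synthetic data (an assumption, not part of the
# original snippet): X must be a DataFrame, since the scores are indexed by
# X.columns, and SelectKBest's default k=10 requires at least 10 columns.
import pandas as pd
import matplotlib.pyplot as plt
from sklearn.datasets import make_classification
from sklearn.feature_selection import SelectKBest, f_classif

X_demo, y_demo = make_classification(n_samples=200, n_features=12,
                                     n_informative=4, random_state=0)
X_demo = pd.DataFrame(X_demo, columns=['f%d' % i for i in range(12)])
feat_imp_demo = anova_best_features(X_demo, y_demo, top=12)
plt.show()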
def SelectFeatures(TrainFVs, TrainLabels, FeatuesToSelect, FSAlgo,
                   DispSelectedFeatures=False, Vocab=None):
    SelectedFeaturesTransformer = SelectKBest(FSAlgo, k=FeatuesToSelect)
    SelectedFeaturesTransformer.fit(TrainFVs, TrainLabels)
    FeautureImportances = np.array(SelectedFeaturesTransformer.scores_)
    print('total # of features:', len(FeautureImportances))
    input()  # pause for inspection
    TopFeatureIndices = FeautureImportances.argsort()[-FeatuesToSelect:][::-1]
    print(FeautureImportances)
    # print(SelectedFeaturesTransformer.pvalues_)
    # TrainFVs = SelectedFeaturesTransformer.fit_transform(TrainFVs, TrainLabels)
    # TestFVs = SelectedFeaturesTransformer.transform(TestFVs)
    # logger.debug("after feature selection the shape of "
    #              "training and test arrays %s %s", TrainFVs.shape, TestFVs.shape)
    if DispSelectedFeatures:
        # SelectedFeatures = SelectedFeaturesTransformer.get_support(indices=True)
        # for F in SelectedFeatures:
        #     print(AllFeatures[F])
        for FIndex in TopFeatureIndices:
            print(Vocab[FIndex])
    return [Vocab[FIndex] for FIndex in TopFeatureIndices]
def GetKFeatures(filename, method='RFE', kbest=30, alpha=0.01, reduceMatrix=True):
    '''
    Gets the best features using the chosen method ('K-best', 'RFE', 'RFECV',
    'L1' (RandomizedLogisticRegression), 'Tree' (ExtraTreesClassifier), mrmr),
    then prints the top K features' names (from featNames).
    If reduceMatrix == True, also returns X reduced to the K best features.
    Effectively, any scikit-learn selector could be used here if imported correctly.

    Gets the K best features (filtered by FDR, then ranked by t-test;
    more advanced options can be implemented). Saves the data matrix with
    the kept features to a new output file, "REDUCED_Feat.csv".
    '''
    features, labels, lb_encoder, featureNames = load_data(filename)
    X, y = features, labels
    # map the int-encoded labels back to strings
    class_names = lb_encoder.inverse_transform(y)
    print("Data and labels imported. PreFilter feature matrix shape:")
    print(X.shape)
    selectK = SelectKBest(k=kbest)
    selectK.fit(X, y)
    selectK_mask = selectK.get_support()
    K_featnames = featureNames[selectK_mask]
    print('X after K filter:', X.shape)
    print("K_featnames: %s" % K_featnames)
    if reduceMatrix:
        Reduced_df = pd.read_csv(filename, index_col=0)
        Reduced_df = Reduced_df[Reduced_df.columns[selectK_mask]]
        Reduced_df.to_csv('REDUCED_Feat.csv')
        print('Saved to REDUCED_Feat.csv')
        return Reduced_df
def get_k_best(dictionary, features_list, k):
    """ runs scikit-learn's SelectKBest feature selection
        returning: {feature: score}
    """
    data = featureFormat(dictionary, features_list)
    labels, features = targetFeatureSplit(data)

    k_best = SelectKBest(k=k)
    k_best.fit(features, labels)
    scores = k_best.scores_
    pairs = list(zip(features_list[1:], scores))
    # combine scores and features into a pandas DataFrame, then sort
    k_best_features = pd.DataFrame(pairs, columns=['feature', 'score'])
    k_best_features = k_best_features.sort_values('score', ascending=False)
    # merge with null counts
    df_nan_counts = get_nan_counts(dictionary)
    k_best_features = pd.merge(k_best_features, df_nan_counts, on='feature')
    # eliminate infinite values
    k_best_features = k_best_features[~np.isinf(k_best_features.score)]
    print('Feature Selection by k_best_features\n')
    print("{0} best features in descending order: {1}\n".format(
        k, k_best_features.feature.values[:k]))
    print('{0}\n'.format(k_best_features[:k]))

    return k_best_features[:k]
def splitIntoTrainingAndValidation(A, B):
    data1 = shuffle(sourceSets[A])  # Note this is a random shuffle, that's
    data2 = shuffle(sourceSets[B])  # why we need many iterations
    freqM = np.minimum(freqs[A], freqs[B])
    freq1tr = np.round(freqM * 0.8)  # Randomly select 80% for the training set,
    freq1va = freqM - freq1tr        # and the remaining 20% for the validation set
    freq2tr = np.copy(freq1tr)
    freq2va = np.copy(freq1va)
    trainingSetSize = int(sum(freq1tr))  # actually half the size
    validatnSetSize = int(sum(freq1va))
    testSet1size = len(data1) - trainingSetSize - validatnSetSize
    testSet2size = len(data2) - trainingSetSize - validatnSetSize
    X = np.zeros((trainingSetSize * 2, numFeatures))
    Xv = np.zeros((validatnSetSize * 2, numFeatures))
    Xt = np.zeros((testSet1size + testSet2size, numFeatures))
    y = np.ravel([([0] * trainingSetSize) + ([1] * trainingSetSize)])
    yv = np.ravel([([0] * validatnSetSize) + ([1] * validatnSetSize)])
    yt = np.ravel([([0] * testSet1size) + ([1] * testSet2size)])
    trnIdx = vldIdx = tstIdx = 0
    for item in data1:
        year = item[0]
        if freq1tr[year] > 0:
            X[trnIdx], trnIdx, freq1tr[year] = item[1:], trnIdx + 1, freq1tr[year] - 1
        elif freq1va[year] > 0:
            Xv[vldIdx], vldIdx, freq1va[year] = item[1:], vldIdx + 1, freq1va[year] - 1
        else:
            Xt[tstIdx], tstIdx = item[1:], tstIdx + 1
    assert trnIdx == trainingSetSize and vldIdx == validatnSetSize and tstIdx == testSet1size
    for item in data2:
        year = item[0]
        if freq2tr[year] > 0:
            X[trnIdx], trnIdx, freq2tr[year] = item[1:], trnIdx + 1, freq2tr[year] - 1
        elif freq2va[year] > 0:
            Xv[vldIdx], vldIdx, freq2va[year] = item[1:], vldIdx + 1, freq2va[year] - 1
        else:
            Xt[tstIdx], tstIdx = item[1:], tstIdx + 1
    assert trnIdx == trainingSetSize * 2 and vldIdx == validatnSetSize * 2 \
        and tstIdx == testSet1size + testSet2size
    X, y = shuffle(X, y)  # Just in case... perhaps no reason to shuffle again here?
    fs = SelectKBest(f_classif, k=numFeatures)  # TODO: try other feature selection methods?
    fs.fit(np.concatenate((X, Xv)), np.concatenate((y, yv)))
    return X, y, Xv, yv, Xt, yt, testSet1size, testSet2size, fs.scores_
def _SelectKBest(self, X, y):
    print('Selecting K Best from whole image')
    from sklearn.feature_selection import SelectKBest, f_classif
    # ### Define the dimension reduction to be used.
    # Here we use a classical univariate feature selection based on the
    # F-test (ANOVA). The number of features to keep is self.k_features.
    feature_selection = SelectKBest(f_classif, k=self.k_features)
    feature_selection.fit(X, y)

    scores = f_classif(X, y)[0]
    mask_k_best = np.zeros(scores.shape, dtype=bool)
    mask_k_best[np.argsort(scores, kind="mergesort")[-self.k_features:]] = 1
    import nibabel
    mask_brain_img = nibabel.load(self.mask_non_brain).get_fdata()
    mask_brain = mask_brain_img.flatten().astype(bool)
    roi = np.zeros(mask_brain.flatten().shape)
    roi[mask_brain] = mask_k_best
    roi = roi.reshape(mask_brain_img.shape)
    img = nibabel.Nifti1Image(roi, np.eye(4))
    img.to_filename('/tmp/best.nii.gz')

    print('SelectKBest data reduction from: %s' % str(X.shape))
    X = feature_selection.transform(X)
    print('SelectKBest data reduction to: %s' % str(X.shape))
    self.feature_reduction_method = feature_selection
    return X
def selectBestFeatures(data_dict, features_list, k, print_result):
    '''
    Using SelectKBest, find the k best features.
    param:
        data_dict : data set
        features_list : list of features
        k : number of features to keep
    return:
        best_features : list of selected features
    '''
    best_features = {}
    #data = featureFormat(data_dict, features_list)
    #labels, features = targetFeatureSplit(data)
    labels, features = getFeaturesAndLabels(data_dict, features_list)
    k_best = SelectKBest(k=k)
    k_best.fit(features, labels)
    unsorted_pair_list = zip(features_list[1:], k_best.scores_)
    sorted_pair_list = sorted(unsorted_pair_list, key=lambda x: x[1], reverse=True)
    k_features = [pair[0] for pair in sorted_pair_list]
    k_scores = [pair[1] for pair in sorted_pair_list]
    best_features['feature'] = k_features[:k]
    best_features['score'] = k_scores[:k]
    if print_result:
        # print the final result
        print("--- Select K Best Scores ---")
        print(pd.DataFrame(best_features))
    return best_features['feature']
def fit(self, X, y):
    support = list(range(X.shape[1]))
    X0 = X
    while X.shape[1] > self.n_features:
        new_size = max(int(X.shape[1] * self.step), self.n_features)
        kbest = SelectKBest(f_regression, k=new_size)
        kbest.fit(X, y)
        score1 = kbest.scores_
        self.estimator.fit(X, y)
        score2 = abs(self.estimator.coef_)
        # blend normalized univariate F-scores with squared, normalized coefficients
        score1 = score1 / max(score1)
        score2 = (score2 / max(score2)) ** 2
        score = (1 - self.p) * score1 + self.p * score2
        coefs = sorted(zip(score, support), key=lambda pair: pair[0], reverse=True)
        support = [idx for (_, idx) in coefs[:new_size]]
        X = X0[:, support]
    self.estimator.fit(X, y)
    self.support = support
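# Hypothetical harness (the enclosing class is not shown above, so the names
# n_features / step / p / estimator are assumptions inferred from fit()):
from sklearn.datasets import make_regression
from sklearn.feature_selection import SelectKBest, f_regression
from sklearn.linear_model import Ridge

class HybridEliminator(object):
    def __init__(self, n_features=10, step=0.5, p=0.5, estimator=None):
        self.n_features = n_features   # final number of columns to keep
        self.step = step               # fraction of columns surviving each round
        self.p = p                     # blend weight: F-scores vs. |coef_|
        self.estimator = estimator if estimator is not None else Ridge()
    fit = fit  # reuse the method defined above

X_demo, y_demo = make_regression(n_samples=100, n_features=40, noise=5.0,
                                 random_state=0)
sel = HybridEliminator(n_features=10, step=0.5)
sel.fit(X_demo, y_demo)
print(sel.support)  # indices of the 10 surviving columns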
def PerformFeatureSelection(adult_train, features, Output):
    selector = SelectKBest(f_classif, k=5)
    selector.fit(adult_train[features], adult_train[Output])
    scores = -numpy.log10(selector.pvalues_)
    plt.bar(range(len(features)), scores)
    plt.xticks(range(len(features)), features, rotation='vertical')
    plt.show()
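# Follow-up sketch (hypothetical helper, not in the original snippet): to keep
# the selected columns rather than only plotting scores, the fitted selector's
# boolean mask can be mapped back to column names.
def selected_feature_names(selector, feature_names):
    """Return the column names kept by a fitted SelectKBest/SelectPercentile."""
    return [name for name, keep in zip(feature_names, selector.get_support()) if keep]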
def select_parameter():
    selector = SelectKBest(f_classif, k=5)
    selector.fit(titanic[predictors], titanic["Survived"])
    scores = -numpy.log10(selector.pvalues_)
    plt.bar(range(len(predictors)), scores)
    plt.xticks(range(len(predictors)), predictors, rotation='vertical')
    plt.show()
def test_boundary_case_ch2():
    # Test boundary case, and always aim to select 1 feature.
    X = np.array([[10, 20], [20, 20], [20, 30]])
    y = np.array([[1], [0], [0]])
    scores, pvalues = chi2(X, y)
    assert_array_almost_equal(scores, np.array([4.0, 0.71428571]))
    assert_array_almost_equal(pvalues, np.array([0.04550026, 0.39802472]))

    filter_fdr = SelectFdr(chi2, alpha=0.1)
    filter_fdr.fit(X, y)
    support_fdr = filter_fdr.get_support()
    assert_array_equal(support_fdr, np.array([True, False]))

    filter_kbest = SelectKBest(chi2, k=1)
    filter_kbest.fit(X, y)
    support_kbest = filter_kbest.get_support()
    assert_array_equal(support_kbest, np.array([True, False]))

    filter_percentile = SelectPercentile(chi2, percentile=50)
    filter_percentile.fit(X, y)
    support_percentile = filter_percentile.get_support()
    assert_array_equal(support_percentile, np.array([True, False]))

    filter_fpr = SelectFpr(chi2, alpha=0.1)
    filter_fpr.fit(X, y)
    support_fpr = filter_fpr.get_support()
    assert_array_equal(support_fpr, np.array([True, False]))

    filter_fwe = SelectFwe(chi2, alpha=0.1)
    filter_fwe.fit(X, y)
    support_fwe = filter_fwe.get_support()
    assert_array_equal(support_fwe, np.array([True, False]))
def discriminatory_features():
    print('Finding most discriminatory features...')
    NUM_FEATURES = 10
    all_points = class1_song_points + class2_song_points
    true_labels = [0] * len(class1_song_points) + [1] * len(class2_song_points)
    feature_indices = []
    for i in range(NUM_FEATURES):
        selector = SelectKBest(chi2, k=i + 1)
        selector.fit(all_points, true_labels)
        new_indices = selector.get_support(indices=True)
        for index in new_indices:
            if index not in feature_indices:
                feature_indices.append(index)
    feature_descriptions = []
    for index in feature_indices:
        feature = feature_names[index]
        if feature.lower() in wsj_mapping:
            key = wsj_mapping[feature.lower()]
            description = key + ': ' + wsj_to_description[key]
        elif feature in word_vocab:
            description = 'The word: ' + feature
        else:
            description = feature
        feature_descriptions.append(description)
    return jsonify(features=feature_descriptions)
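# Aside (sketch, not in the original): chi2 scores do not depend on k, so the
# incremental loop above is equivalent, up to tie-breaking, to ranking once
# and taking the top NUM_FEATURES indices. A self-contained helper:
import numpy as np
from sklearn.feature_selection import chi2

def top_chi2_indices(points, labels, n):
    """Return indices of the n highest-scoring features under chi2."""
    scores, _ = chi2(points, labels)
    return list(np.argsort(scores)[::-1][:n])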
def get_k_best(df, features_list, k):
    """ runs scikit-learn's SelectKBest feature selection
        returns dict where keys=features, values=scores
    """
    # feature, label = feature_format_scale(data_dict, features_list)
    from poi_dataprocess import *
    from feature_format import featureFormat, targetFeatureSplit

    data_dict_new = df[features_list].T.to_dict()
    data = featureFormat(data_dict_new, features_list, sort_keys=True)
    labels, features = targetFeatureSplit(data)

    # df = df[features_list]
    # features = df.drop('poi', axis=1)#.astype(float)
    # labels = df['poi']

    from sklearn import preprocessing
    scaler = preprocessing.MinMaxScaler()
    features = scaler.fit_transform(features)

    from sklearn.feature_selection import SelectKBest
    k_best = SelectKBest(k=k)
    k_best.fit(features, labels)
    scores = k_best.scores_
    unsorted_pairs = zip(features_list[1:], scores)
    sorted_pairs = list(reversed(sorted(unsorted_pairs, key=lambda x: x[1])))
    k_best_features = dict(sorted_pairs[:k])

    return k_best_features
def __anovaImportance(self, features, labels, features_list):
    sel = SelectKBest(f_classif, k=2)
    sel.fit(features, labels)
    sortIndexes = sel.scores_.argsort()[::-1]
    features_rank = np.array(features_list[1:])[sortIndexes]
    print("ANOVA F-test importance rank:", features_rank)
    return features_rank
def get_k_best_features(data_dict, features_list, k):
    """ runs scikit-learn's SelectKBest feature selection to get the k best features

    Args:
        data_dict: data dictionary for Enron
        features_list: a list of features, with the target label first
        k: number of best features to select

    Returns:
        a list of the k best features, and a list of [feature, score] pairs
    """
    data = featureFormat(data_dict, features_list)
    labels, features = targetFeatureSplit(data)
    k_best = SelectKBest(k=k)
    k_best.fit(features, labels)
    scores = k_best.scores_
    unsorted_pairs = zip(features_list[1:], scores)
    sorted_pairs = list(reversed(sorted(unsorted_pairs, key=lambda x: x[1])))
    k_best_features = dict(sorted_pairs[:k])
    return list(k_best_features.keys()), [list(pair) for pair in sorted_pairs]
def k_best_feature_selection(labels, features, features_list):
    """ Identifies the best features using SelectKBest feature selection

    labels = target list as returned by the targetFeatureSplit script
    features = features list as returned by the targetFeatureSplit script
    features_list = list of the features to be assessed
    """
    from sklearn.feature_selection import SelectKBest

    k_best = SelectKBest(k=10)
    k_best.fit(features, labels)
    scores = k_best.scores_
    features_list = features_list[1:]
    feature_scores = sorted(zip(features_list, scores),
                            key=lambda x: x[1], reverse=True)
    print(feature_scores)
    print("Top 10 features identified using SelectKBest:")
    for i, pair in enumerate(feature_scores[:10], start=1):
        print("  ", i, "-", pair)
def test_mutual_info_classif():
    X, y = make_classification(
        n_samples=100,
        n_features=5,
        n_informative=1,
        n_redundant=1,
        n_repeated=0,
        n_classes=2,
        n_clusters_per_class=1,
        flip_y=0.0,
        class_sep=10,
        shuffle=False,
        random_state=0,
    )

    # Test in KBest mode.
    univariate_filter = SelectKBest(mutual_info_classif, k=2)
    X_r = univariate_filter.fit(X, y).transform(X)
    X_r2 = GenericUnivariateSelect(
        mutual_info_classif, mode="k_best", param=2).fit(X, y).transform(X)
    assert_array_equal(X_r, X_r2)
    support = univariate_filter.get_support()
    gtruth = np.zeros(5)
    gtruth[:2] = 1
    assert_array_equal(support, gtruth)

    # Test in Percentile mode.
    univariate_filter = SelectPercentile(mutual_info_classif, percentile=40)
    X_r = univariate_filter.fit(X, y).transform(X)
    X_r2 = GenericUnivariateSelect(
        mutual_info_classif, mode="percentile", param=40).fit(X, y).transform(X)
    assert_array_equal(X_r, X_r2)
    support = univariate_filter.get_support()
    gtruth = np.zeros(5)
    gtruth[:2] = 1
    assert_array_equal(support, gtruth)
def select_k_best(data_dict, features_list, k):
    # Create dataset from feature list
    data = featureFormat(data_dict, features_list)
    # Split dataset into labels and features
    labels, features = targetFeatureSplit(data)
    # Create Min/Max scaler
    scaler = preprocessing.MinMaxScaler()
    # Scale features
    features = scaler.fit_transform(features)
    # Create k_best feature selection
    k_best = SelectKBest(k=k)
    # Fit k_best
    k_best.fit(features, labels)
    # Get k_best scores
    scores = k_best.scores_
    # Create list with features and scores
    unsorted_pairs = zip(features_list[1:], scores)
    # Sort list
    sorted_pairs = list(reversed(sorted(unsorted_pairs, key=lambda x: x[1])))
    # Create dict
    if k == "all":
        k_best_features = dict(sorted_pairs)
    else:
        k_best_features = dict(sorted_pairs[:k])
    return k_best_features
def choseFeature(TrainX, TrainY, TestX):
    cF = SelectKBest(chi2, k=100)
    cF.fit(TrainX, TrainY)
    check = cF.get_support()
    newTrainX = cF.transform(TrainX)
    newTestX = cF.transform(TestX)
    return (newTrainX, newTestX)
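# Usage sketch (synthetic count data as an assumption: chi2 needs non-negative
# inputs, and k=100 requires at least 100 columns):
import numpy as np
from sklearn.feature_selection import SelectKBest, chi2

rng = np.random.RandomState(0)
TrainX_demo = rng.poisson(1.0, size=(200, 150))
TrainY_demo = rng.randint(0, 2, size=200)
TestX_demo = rng.poisson(1.0, size=(50, 150))
newTrain, newTest = choseFeature(TrainX_demo, TrainY_demo, TestX_demo)
print(newTrain.shape, newTest.shape)  # (200, 100) (50, 100)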
def features_importance(features_train, labels_train, feature_list):
    X = SelectKBest()
    X.fit(features_train, labels_train)
    Scores = X.scores_
    Pvalues = X.pvalues_
    index = feature_list[1:]
    return pd.DataFrame({'Scores': Scores, 'Pvalues': Pvalues}, index=index)
from sklearn.base import BaseEstimator, TransformerMixin
from sklearn.feature_selection import SelectKBest, SelectPercentile
import pandas as pd


class FeatureSelector(BaseEstimator, TransformerMixin):
    def __init__(self, score_func=None, percentile=None, k_best=None):
        self.score_func = score_func
        self.percentile = percentile
        self.k_best = k_best
        self.selector = None

    def fit(self, x, y):
        if self.k_best is None and self.percentile is not None:
            self.selector = SelectPercentile(score_func=self.score_func,
                                             percentile=self.percentile)
            self.selector.fit(x, y)
            return self
        elif self.k_best is not None and self.percentile is None:
            self.selector = SelectKBest(score_func=self.score_func, k=self.k_best)
            self.selector.fit(x, y)
            return self
        else:
            raise ValueError("Specify exactly one of percentile or k_best")

    def transform(self, x):
        # print("# Features reduced from {} to {}".format(
        #     x.columns.shape[0], x.columns[self.selector.get_support()].values.shape[0]))
        x_transformed = pd.DataFrame(data=self.selector.transform(x),
                                     columns=x.columns[self.selector.get_support()],
                                     index=x.index)
        return x_transformed
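# Usage sketch for the transformer above (an assumption, not part of the
# original: input must be a DataFrame, since transform() indexes x.columns):
from sklearn.datasets import make_classification
from sklearn.feature_selection import f_classif

X_fs, y_fs = make_classification(n_samples=100, n_features=6, random_state=1)
X_fs = pd.DataFrame(X_fs, columns=['f%d' % i for i in range(6)])
fs = FeatureSelector(score_func=f_classif, k_best=3)
print(fs.fit(X_fs, y_fs).transform(X_fs).columns.tolist())  # the 3 kept columns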
def test_mutual_info_regression():
    X, y = make_regression(n_samples=100, n_features=10, n_informative=2,
                           shuffle=False, random_state=0, noise=10)

    # Test in KBest mode.
    univariate_filter = SelectKBest(mutual_info_regression, k=2)
    X_r = univariate_filter.fit(X, y).transform(X)
    assert_best_scores_kept(univariate_filter)
    X_r2 = GenericUnivariateSelect(
        mutual_info_regression, mode='k_best', param=2).fit(X, y).transform(X)
    assert_array_equal(X_r, X_r2)
    support = univariate_filter.get_support()
    gtruth = np.zeros(10)
    gtruth[:2] = 1
    assert_array_equal(support, gtruth)

    # Test in Percentile mode.
    univariate_filter = SelectPercentile(mutual_info_regression, percentile=20)
    X_r = univariate_filter.fit(X, y).transform(X)
    X_r2 = GenericUnivariateSelect(mutual_info_regression, mode='percentile',
                                   param=20).fit(X, y).transform(X)
    assert_array_equal(X_r, X_r2)
    support = univariate_filter.get_support()
    gtruth = np.zeros(10)
    gtruth[:2] = 1
    assert_array_equal(support, gtruth)
def pred_pH(train, val, test, all_vars, loop):
    data = (val, test, train)
    # variable selection
    pH_lassoed_vars = lass_varselect(train, all_vars, 'pH', .00000001)
    univ_selector = SelectKBest(score_func=f_regression, k=1200)
    univ_selector.fit(train[all_vars], train['pH'])
    pvals = univ_selector.get_support()
    chosen = []
    for x in range(0, len(all_vars)):
        if pH_lassoed_vars[x] | pvals[x]:
            chosen.append(all_vars[x])
    lass_only = []
    for x in range(0, len(all_vars)):
        if pH_lassoed_vars[x]:
            lass_only.append(all_vars[x])
    # random forest
    neigh = RandomForestRegressor(n_estimators=100)
    neigh.fit(train.loc[:, chosen], train['pH'])
    for dset in data:
        dset['pH_for_prds'] = neigh.predict(dset.loc[:, chosen])
    # lasso
    lass = Lasso(alpha=.000000275, positive=True)
    lass.fit(train[all_vars], train['pH'])
    for dset in data:
        dset['pH_las_prds'] = lass.predict(dset[all_vars])
    # ridge
    pH_ridge = RidgeCV(np.array([.6]), normalize=True)
    pH_ridge.fit(train[all_vars], train['pH'])
    for dset in data:
        dset['pH_rdg_prds'] = pH_ridge.predict(dset[all_vars])
    # combination
    models = ['pH_rdg_prds', 'pH_las_prds', 'pH_for_prds', 'pH_for_prds']
    name = 'pH_prds' + str(loop)
    write_preds(models, name, train, val, test, 'pH')
def select_k_best_features(dataset, features_list, k):
    """
    For the E+F dataset, select the k best features based on SelectKBest
    from sklearn.feature_selection

    Input:
    dataset: data in dictionary format
    features_list: the full list of features to select from
    k: the number of features to keep

    Return:
    a list of length k+1, with 'poi' as the first element followed by
    the k best features
    """
    labels_train, __, features_train, __ = \
        test_training_stratified_split(dataset, features_list)
    k_best = SelectKBest(k=k)
    k_best.fit(features_train, labels_train)
    impt_unsorted = zip(features_list[1:], k_best.scores_)
    impt_sorted = list(sorted(impt_unsorted, key=lambda x: x[1], reverse=True))
    k_best_features = [elem[0] for elem in impt_sorted][:k]
    print(k, "best features:")
    print(k_best_features)
    return ['poi'] + k_best_features
def find_features(dataset, features, target):
    selector = SelectKBest(f_classif, k=5)
    selector.fit(dataset[features], dataset[target[0]])
    scores = -np.log10(selector.pvalues_)
    plt.bar(range(len(features)), scores)
    plt.xticks(range(len(features)), features, rotation="vertical")
    plt.show()
def preprocess(article_file, label_file, k):
    features = pickle.load(open(article_file, 'rb'))
    features = np.array(features)

    # transform non-numerical labels (as long as they are hashable and
    # comparable) to numerical labels
    labels = pickle.load(open(label_file, 'rb'))
    le = preprocessing.LabelEncoder()
    le.fit(labels)
    labels = le.transform(labels)
    # print(le.inverse_transform([0]))

    ### text vectorization--go from strings to lists of numbers
    vectorizer = TfidfVectorizer(sublinear_tf=True, max_df=0.5,
                                 min_df=1, stop_words='english')
    features_train_transformed = vectorizer.fit_transform(features)

    # selector: SelectPercentile
    # selector = SelectPercentile(f_classif, percentile=30)
    # selector.fit(features_train_transformed, labels)

    # selector: SelectKBest
    selector = SelectKBest(k=k)
    selector.fit(features_train_transformed, labels)

    # selector: chi2
    # selector = SelectPercentile(score_func=chi2)
    # selector.fit(features_train_transformed, labels)

    features_train_transformed = selector.transform(features_train_transformed).toarray()

    return features_train_transformed, labels, vectorizer, selector, le, features
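# Companion sketch (an addition, not part of the original snippet): the fitted
# vectorizer and selector returned by preprocess() should also be applied,
# without refitting, to any held-out articles.
def transform_new(articles, vectorizer, selector):
    counts = vectorizer.transform(articles)       # reuse the fitted vocabulary/idf
    return selector.transform(counts).toarray()   # reuse the fitted k-best mask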
def __init__(self, master, x_train, y_train, x_test, y_test, evaluator, df, console):
    Tk.Frame.__init__(self, master)
    self.x_train = x_train
    self.y_train = y_train
    self.x_test = x_test
    self.y_test = y_test
    self.evaluator = evaluator
    self.df = df
    self.console = console
    frame_train = Tk.Frame(self)
    frame_train.pack(fill=Tk.BOTH, expand=1, padx=15, pady=15)
    plt.figure(figsize=(12, 20))
    plt.subplot(111)
    # k best features' names
    plt.figure(figsize=(12, 8))
    plt.subplot(111)
    selection = SelectKBest(f_classif, k=3)
    selection.fit(self.x_train, self.y_train)
    feature_scores = selection.scores_
    feature_names = df.columns.values
    feature_names = feature_names[feature_names != "NSP"]
    kbest_feature_indexes = selection.get_support()
    kbest_feature_names = feature_names[kbest_feature_indexes]
    # save as a DataFrame
    rec = list(zip(feature_scores, feature_names))
    data = pd.DataFrame(rec, columns=["Score", "Feature"])
    sns.barplot(x="Feature", y="Score", data=data)
    plt.xticks(rotation=-90)
    plt.title("Cardiotocography Feature Scores Ranking")
    self.attach_figure(plt.gcf(), frame_train)
def data_yj(params):
    Ntrn = params['Ntrn']
    Ntst = params['Ntst']
    num_feat = params['num_feat']
    lowd = params['lowd']
    highd = params['highd']
    seed = params['seed']
    # Run Yousef/Jianping RNA Synthetic
    currdir = path.abspath('.')
    synloc = path.expanduser('~/GSP/research/samc/synthetic/rnaseq')

    YJparams = param_template.format(**params)

    try:
        os.chdir(synloc)
        fid, fname = tempfile.mkstemp(dir='params')
        fname = path.basename(fname)
        fid = os.fdopen(fid, 'w')
        fid.write(YJparams)
        fid.close()
        inspec = 'gen -i params/%s -c 0.05 -l %f -h %f -s %d' % \
            (fname, lowd, highd, seed)
        spec = path.join(synloc, inspec).split()
        sb.check_call(spec)
    except Exception as e:
        print("ERROR in data_yj: " + str(e))
    finally:
        os.chdir(currdir)

    try:
        trn_path = path.join(synloc, 'out', '%s_trn.txt' % fname)
        tst_path = path.join(synloc, 'out', '%s_tst.txt' % fname)
        raw_trn_data = np.loadtxt(trn_path, delimiter=',', skiprows=1)
        selector = SelectKBest(f_classif, k=num_feat)
        trn_labels = np.hstack((np.zeros(Ntrn), np.ones(Ntrn)))
        selector.fit(raw_trn_data, trn_labels)
        raw_tst_data = np.loadtxt(tst_path, delimiter=',', skiprows=1)
    except Exception as e:
        print("ERROR in data_yj: " + str(e))
    finally:
        os.remove(trn_path)
        os.remove(tst_path)

    trn0, trn1, tst0, tst1 = gen_labels(Ntrn, Ntrn, Ntst, Ntst)
    rawdata = np.vstack((raw_trn_data, raw_tst_data))
    pvind = selector.pvalues_.argsort()
    np.random.shuffle(pvind)
    feats = np.zeros(rawdata.shape[1], dtype=bool)
    feats[pvind[:num_feat]] = True
    calib = ~feats
    return rawdata, trn0, trn1, tst0, tst1, feats, calib
def feature_selection():
    with open(CLF_PICKLE_FILENAME, "rb") as classifier_infile:
        classifier = pickle.load(classifier_infile)
    dataset = load_dataset()
    features_list = load_featurelist(FEATURE_LIST_FILENAME)
    data = featureFormat(dataset, features_list, sort_keys=True)
    labels, features = targetFeatureSplit(data)
    k_best = SelectKBest(score_func=f_classif, k='all')
    k_best.fit(features, labels)
    scores = k_best.scores_
    unsorted_pairs = zip(features_list[1:], scores)
    sorted_pairs = list(reversed(sorted(unsorted_pairs, key=lambda x: x[1])))
    print(sorted_pairs)
    k_best_features = dict(sorted_pairs)
    k_features = list(k_best_features.keys())
    accuracy_list = []
    precision_list = []
    recall_list = []
    for k in range(1, len(k_features) + 1):
        k_best_feature_list = k_features[0:k]
        k_best_feature_list.insert(0, 'poi')
        [accuracy, precision, recall] = tester('name', classifier, dataset,
                                               k_best_feature_list, folds=500)
        accuracy_list.append(accuracy)
        precision_list.append(precision)
        recall_list.append(recall)
    """
    x = np.linspace(1, len(k_best_feature_list) - 1, len(k_best_feature_list) - 1)
    plt.plot(x, recall_list, label="recall")
    plt.plot(x, precision_list, label="precision")
    plt.legend(loc="lower right")
    plt.xlabel('k best features')
    plt.ylabel('score')
    plt.title('Precision and Recall vs. # of Features')
    plt.savefig('score_function_k.png')
    """
    # best # of features = 18
    k_best_feature_list = k_features[0:18]
    k_best_feature_list.insert(0, 'poi')
    """
    # Using SelectPercentile
    selector = SelectPercentile(percentile=50)
    selector.fit(features, labels)
    print(selector.scores_)
    indices = selector.get_support(indices=False)
    best_features = []
    for elem in zip(indices, features_list[1:]):
        if elem[0] == True:
            best_features.append(elem[1])
    best_features.insert(0, 'poi')
    """
    with open(FEATURE_LIST_FILENAME, "wb") as featurelist_outfile:
        pickle.dump(k_best_feature_list, featurelist_outfile)
features_list += [
    'fraction_from_poi', 'fraction_to_poi', 'fraction_total_stock_value'
]
print("Total number of features:", len(features_list))
my_dataset = data_dict

### Extract features and labels from dataset for local testing
data = featureFormat(my_dataset, features_list, sort_keys=True)
labels, features = targetFeatureSplit(data)

### Feature ranking
best_features = SelectKBest()
best_features.fit(features, labels)
list_best_features = []
for n, i in enumerate(features_list[1:]):
    list_best_features.append({
        "feature": i,
        "score": best_features.scores_[n]
    })
newlist = sorted(list_best_features, key=lambda k: k['score'], reverse=True)
#print(newlist)
features_list_new = []
for i in newlist:
    features_list_new.append(i["feature"])
features_list_new.insert(0, 'poi')
# Feature Extraction with Univariate Statistical Tests (chi-squared for classification)
import numpy as np
import pandas as pd
from pandas.plotting import scatter_matrix
import matplotlib.pyplot as plt
from sklearn.feature_selection import SelectKBest
from sklearn.feature_selection import chi2

# load data
url = "/Users/HP/Desktop/S4/Machine Learning/Dataset/insurance.csv"
names = ['age', 'sex', 'bmi', 'children', 'smoker', 'region', 'charges']
df = pd.read_csv(url, names=names)
df = df.apply(pd.to_numeric, errors='coerce')
#print(df[['sex', 'smoker', 'region']].describe())
array = df.values
X = array[:, 2:3]
Y = array[:, 6]

# feature extraction
# (X holds a single column here, so k can be at most 1; the original k=4
# would make SelectKBest raise a ValueError)
test = SelectKBest(score_func=chi2, k=1)
fit = test.fit(X, Y)

# summarize scores
np.set_printoptions(precision=3)
print(fit.scores_)
features = fit.transform(X)

# summarize selected features
print(features[0:5, :])

# scatter_matrix(df[['age', 'bmi', 'children', 'charges']])
# plt.show()
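# Hedged aside (not in the original script): chi2 expects non-negative
# features and a categorical target, while 'charges' is continuous, so
# f_regression is the more natural score function for this dataset.
from sklearn.feature_selection import f_regression

test_reg = SelectKBest(score_func=f_regression, k=1)  # k must be <= X's column count
fit_reg = test_reg.fit(X, Y)
print(fit_reg.scores_)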
plt.figure()
plt.grid(axis='both')

# Features in the dataset
col = [
    'meanNN', 'STD_NN', 'HR', 'Lfnu', 'Hfnu', 'LF/HF', 'APEN', 'CD', 'PTT',
    'PTT_SD'
]

# Synthetic Minority Oversampling Technique to balance the dataset
oversample = SMOTE()
X, y = oversample.fit_resample(X, y)

# Feature Selection
feature_scores = SelectKBest(score_func=chi2, k=10)
fit = feature_scores.fit(X, y)
dfscores = pd.DataFrame(fit.scores_)
dfcol = pd.DataFrame(X.columns)
visual = pd.concat([dfcol, dfscores], axis=1)
visual.columns = ['Specs', 'Score']

# Weight each feature by its normalized chi2 score
# (a vectorized equivalent is sketched after this block)
for i in range(len(fit.scores_)):
    X.iloc[:, i] *= fit.scores_[i] / max(fit.scores_)

# Split the dataset into train and test set
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(X, y, train_size=0.8, random_state=5)
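# Aside (not in the original script): multiplying columns by normalized chi2
# scores is a soft alternative to hard selection. The weighting loop above
# has a one-line vectorized equivalent; run one or the other, not both:
#   X = X * (fit.scores_ / fit.scores_.max())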
def feature_selection(X, Y):
    Selector = SelectKBest(chi2, k=70)
    Selector.fit(X, Y)
    return Selector.transform(X), Y, Selector.get_support(True)
print(newdf_test['label'].value_counts())

X_U2R = newdf.drop('label', axis=1)
Y_U2R = newdf.label
X_U2R_test = newdf_test.drop('label', axis=1)
Y_U2R_test = newdf_test.label

colNames = list(X_U2R)

from sklearn.feature_selection import SelectKBest
from sklearn.feature_selection import f_classif

np.seterr(divide='ignore', invalid='ignore')
# iterate k from 1 to 120; the maximum accuracy comes at k=2
fclass = SelectKBest(f_classif, k=2)
fclass.fit(X_U2R, Y_U2R)
true = fclass.get_support()
fclasscolindex_U2R = [i for i, x in enumerate(true) if x]
fclasscolname_U2R = list(colNames[i] for i in fclasscolindex_U2R)
print('Features selected:', fclasscolname_U2R)

features = newdf[fclasscolname_U2R].astype(float)
features1 = newdf_test[fclasscolname_U2R].astype(float)
lab = newdf['label']
lab1 = newdf_test['label']

from sklearn.ensemble import AdaBoostClassifier
clf = AdaBoostClassifier()
t0 = time()
clf.fit(features, lab)
tt = time() - t0
sst = StandardScaler()
Xtrain2 = sst.fit_transform(Xtrain)
Xtest2 = sst.transform(Xtest)
model2 = LinearRegression()
model2.fit(Xtrain2, ytrain)

#3. PCA + LinearRegression()
tPCA = PCA(n_components=1)
Xtrain3 = tPCA.fit_transform(Xtrain)
Xtest3 = tPCA.transform(Xtest)
model3 = LinearRegression()
model3.fit(Xtrain3, ytrain)

#4. SelectKBest + LinearRegression()
fsk = SelectKBest(score_func=f_regression, k=1)
fsk.fit(Xtrain, ytrain)
Xtrain4 = fsk.transform(Xtrain)
Xtest4 = fsk.transform(Xtest)
model4 = LinearRegression()
model4.fit(Xtrain4, ytrain)

#5. Select by coef_ + LinearRegression()
Xtrain5 = Xtrain.iloc[:, 0].values.reshape(-1, 1)
Xtest5 = Xtest.iloc[:, 0].values.reshape(-1, 1)
model5 = LinearRegression()
model5.fit(Xtrain5, ytrain)

print('Score1:' + str(model1.score(Xtest, ytest) * 100) +
      '\nScore2:' + str(model2.score(Xtest2, ytest) * 100) +
      '\nScore3:' + str(model3.score(Xtest3, ytest) * 100) +
      '\nScore4:' + str(model4.score(Xtest4, ytest) * 100) +
      '\nScore5:' + str(model5.score(Xtest5, ytest) * 100))
#r = f.readlines()
for i in embeddings_index.keys():
    if i not in stop_words:
        glove.append(i)
#f.close()

# tfidf
transformer = TfidfTransformer(smooth_idf=True)
count_vectorizer = CountVectorizer(ngram_range=(2, 3), vocabulary=glove)
counts = count_vectorizer.fit_transform(df['text'].values)
tfidf = transformer.fit_transform(counts)

target = df['label'].values.astype('int')
selector = SelectKBest(chi2, k=1000)
selector.fit(tfidf, target)
top_words = selector.get_support().nonzero()

# Pick only the most informative columns in the data.
chi_matrix = tfidf[:, top_words[0]]

# Our list of functions to apply.
transform_functions = [
    lambda x: x.count(" ") / len(x.split()),
    lambda x: x.count(".") / len(x.split()),
    lambda x: x.count("!") / len(x.split()),
    lambda x: x.count("?") / len(x.split()),
    lambda x: x.count("-") / len(x.split()),
    lambda x: x.count(",") / len(x.split()),
def discriminator(tweet_list, tweet_list_y, count_fake, count_total):
    list_words = ['http', 'https', 'twitter', 'com', 'www']
    count_vectorizer = CountVectorizer(stop_words=list_words)
    count_train = count_vectorizer.fit_transform(X_train)
    count_test = count_vectorizer.transform(tweet_list)

    tfidf_vectorizer = TfidfVectorizer(stop_words=list_words, max_df=0.7)
    tfidf_train = tfidf_vectorizer.fit_transform(X_train)
    tfidf_test = tfidf_vectorizer.transform(X_test)

    # keep the 1000 most informative columns under mutual information
    clf = SelectKBest(score_func=mutual_info_classif, k=1000)
    fit = clf.fit(count_train, y_train)
    count_x_train_ft = fit.transform(count_train)
    count_x_test_ft = fit.transform(count_test)

    clf = SelectKBest(score_func=mutual_info_classif, k=1000)
    fit = clf.fit(tfidf_train, y_train)
    tfidf_x_train_ft = fit.transform(tfidf_train)
    tfidf_x_test_ft = fit.transform(tfidf_test)

    print("MultinomialNB CountVectorizer")
    mn_count_clf = MultinomialNB()
    mn_count_clf.fit(count_x_train_ft, y_train)
    pred = mn_count_clf.predict(count_x_test_ft)
    score = metrics.accuracy_score(tweet_list_y, pred)
    print("accuracy: %0.3f" % score)
    print(" ")
    final_score = (score * np.shape(tweet_list)[0] + count_fake) / (np.shape(tweet_list)[0] + count_total)
    print("final_acc: ", final_score)

    print("MultinomialNB TfidfVectorizer")
    mn_tfidf_clf = MultinomialNB()
    mn_tfidf_clf.fit(tfidf_x_train_ft, y_train)
    pred = mn_tfidf_clf.predict(tfidf_x_test_ft)
    score = metrics.accuracy_score(y_test, pred)
    print("accuracy: %0.3f" % score)
    print(" ")
    final_score = (score * np.shape(tweet_list)[0] + count_fake) / (np.shape(tweet_list)[0] + count_total)
    print("final_acc: ", final_score)

    print("PassiveAggressiveClassifier: C = 0.01")
    pa_tfidf_clf = PassiveAggressiveClassifier(max_iter=50, C=0.01)
    pa_tfidf_clf.fit(count_x_train_ft, y_train)
    pred = pa_tfidf_clf.predict(count_x_test_ft)
    score = metrics.accuracy_score(tweet_list_y, pred)
    print("accuracy: %0.3f" % score)
    print(" ")
    final_score = (score * np.shape(tweet_list)[0] + count_fake) / (np.shape(tweet_list)[0] + count_total)
    print("final_acc: ", final_score)

    print("LinearSVC (default C)")
    svc_tfidf_clf = LinearSVC()
    svc_tfidf_clf.fit(count_x_train_ft, y_train)
    pred = svc_tfidf_clf.predict(count_x_test_ft)
    score = metrics.accuracy_score(tweet_list_y, pred)
    print("accuracy: %0.3f" % score)
    print(" ")
    final_score = (score * np.shape(tweet_list)[0] + count_fake) / (np.shape(tweet_list)[0] + count_total)
    print("final_acc: ", final_score)

    print("SGDClassifier")
    sgd_tfidf_clf = SGDClassifier(max_iter=50)
    sgd_tfidf_clf.fit(count_x_train_ft, y_train)
    pred = sgd_tfidf_clf.predict(count_x_test_ft)
    score = metrics.accuracy_score(tweet_list_y, pred)
    print("accuracy: %0.3f" % score)
    print(" ")
    final_score = (score * np.shape(tweet_list)[0] + count_fake) / (np.shape(tweet_list)[0] + count_total)
    print("final_acc: ", final_score)