Example no. 1
class TfIdf(Feature):
    def __init__(self):
        self.kbest = None
        self.vect = None
        self.truncated = None
        self.normalizer = None

    def train(self, reviews, labels):
        self.vect = TfidfVectorizer(analyzer='word', ngram_range=(1, 2), stop_words='english')

        reviews_text = [' '.join(list(chain.from_iterable(review))) for review in reviews]
        tfidf_matrix = self.vect.fit_transform(reviews_text).toarray()

        self.truncated = TruncatedSVD(n_components=50)
        self.truncated.fit(tfidf_matrix, labels)

        trunc = self.truncated.transform(tfidf_matrix)
        self.normalizer = Normalizer()
        self.normalizer.fit(trunc)

        self.kbest = SelectKBest(f_classif, k=5)
        self.kbest.fit(self.normalizer.transform(trunc), labels)

    def score(self, data):
        reviews_text = ' '.join(list(chain.from_iterable(data)))
        tfidf_matrix = self.vect.transform([reviews_text]).toarray()

        trunc = self.truncated.transform(tfidf_matrix)

        return tuple(self.kbest.transform(self.normalizer.transform(trunc))[0, :])
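The four fitted objects above can equally be chained with scikit-learn's Pipeline, which keeps the fitted state of every stage together. A minimal sketch of an equivalent setup (the list of review strings and the labels are assumed to be available):

from sklearn.pipeline import Pipeline
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.decomposition import TruncatedSVD
from sklearn.preprocessing import Normalizer
from sklearn.feature_selection import SelectKBest, f_classif

# Same stages as TfIdf.train(); transform() then replays them in order on new text.
tfidf_lsa_kbest = Pipeline([
    ('tfidf', TfidfVectorizer(analyzer='word', ngram_range=(1, 2), stop_words='english')),
    ('svd', TruncatedSVD(n_components=50)),
    ('norm', Normalizer()),
    ('kbest', SelectKBest(f_classif, k=5)),
])
# features = tfidf_lsa_kbest.fit_transform(review_texts, labels)  # review_texts: list of str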
Example no. 2
def anova_best_features(X, y, top = 30):
    kbest = SelectKBest(f_classif)
    kbest.fit(X, y)
    feat_imp = pd.Series(kbest.scores_, index=X.columns)
    feat_imp.sort_values(inplace=True)
    ax = feat_imp.tail(top).plot(kind='barh', figsize=(10,7), title='Feature importance (f_classif)')
    return feat_imp
def SelectFeatures(TrainFVs,
                   TrainLabels,
                   FeaturesToSelect,
                   FSAlgo,
                   DispSelectedFeatures=False,
                   Vocab=None):
    SelectedFeaturesTransformer = SelectKBest(FSAlgo, k=FeaturesToSelect)
    SelectedFeaturesTransformer.fit(TrainFVs, TrainLabels)
    FeatureImportances = np.array(SelectedFeaturesTransformer.scores_)

    print('total # of features: ', len(FeatureImportances))
    input()  # pause so the feature count can be inspected
    TopFeatureIndices = FeatureImportances.argsort()[-FeaturesToSelect:][::-1]
    print(FeatureImportances)
    # print(SelectedFeaturesTransformer.pvalues_)
    # TrainFVs = SelectedFeaturesTransformer.fit_transform(TrainFVs, TrainLabels)
    # TestFVs = SelectedFeaturesTransformer.transform(TestFVs)
    # logger.debug("after feature selection the shape of "
    #              "training and test arrays %s %s", TrainFVs.shape, TestFVs.shape)

    if DispSelectedFeatures:
        # SelectedFeatures = SelectedFeaturesTransformer.get_support(indices=True)
        # for F in SelectedFeatures:
        #     print(AllFeatures[F])
        for FIndex in TopFeatureIndices:
            print(Vocab[FIndex])

    return [Vocab[FIndex] for FIndex in TopFeatureIndices]
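The commented-out lines above already hint at get_support; a minimal sketch (same variable names as the function above) of the more direct route to the selected feature names:

# get_support(indices=True) returns the selected column indices directly,
# in column order rather than ranked by score.
selector = SelectKBest(FSAlgo, k=FeaturesToSelect)
selector.fit(TrainFVs, TrainLabels)
selected_names = [Vocab[i] for i in selector.get_support(indices=True)]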
Example no. 4
def GetKFeatures(filename, method='RFE',kbest=30,alpha=0.01, reduceMatrix = True):
    '''
    Gets best features using chosen method
    (K-best, RFE, RFECV,'L1' (RandomizedLogisticRegression),'Tree' (ExtraTreesClassifier), mrmr),
    then prints top K features' names (from featNames).
    If reduceMatrix =  True, then also returns X reduced to the K best features.

    Available methods' names are: 'RFE','RFECV','RandomizedLogisticRegression','K-best','ExtraTreesClassifier'..
    Note that, effectively, any scikit-learn selection method could be used, if correctly imported.
    '''
    #est = method()
    '''
    Gets the K-best features (filtered by FDR, then select best ranked by t-test , more advanced options can be implemented).
    Save the data/matrix with the resulting/kept features to a new output file, "REDUCED_Feat.csv"
    '''
    features, labels, lb_encoder,featureNames = load_data(filename)
    X, y = features, labels

    # change the names as ints back to strings
    class_names=lb_encoder.inverse_transform(y)
    print("Data and labels imported. PreFilter Feature matrix shape:")
    print(X.shape)

    selectK = SelectKBest(k=kbest)
    selectK.fit(X,y)
    selectK_mask=selectK.get_support()
    K_featnames = featureNames[selectK_mask]
    print('X After K filter:',X.shape)
    print("K_featnames: %s" %(K_featnames))
    if reduceMatrix:
        Reduced_df = pd.read_csv(filename, index_col=0)
        Reduced_df = Reduced_df[Reduced_df.columns[selectK_mask]]
        Reduced_df.to_csv('REDUCED_Feat.csv')
        print('Saved to REDUCED_Feat.csv')
        return Reduced_df
def get_k_best(dictionary, features_list, k):
    """ runs scikit-learn's SelectKBest feature selection returning:
    {feature:score}
    """
    data = featureFormat(dictionary, features_list)
    labels, features = targetFeatureSplit(data)

    k_best = SelectKBest(k=k)
    k_best.fit(features, labels)
    scores = k_best.scores_
    pairs = list(zip(features_list[1:], scores))
    # combine scores and features into a pandas DataFrame, then sort
    k_best_features = pd.DataFrame(pairs, columns=['feature', 'score'])
    k_best_features = k_best_features.sort_values('score', ascending=False)
    
    
    #merge with null counts    
    df_nan_counts = get_nan_counts(dictionary)
    k_best_features = pd.merge(k_best_features,df_nan_counts,on= 'feature')  
    
    #eliminate infinite values
    k_best_features = k_best_features[np.isinf(k_best_features.score)==False]
    print('Feature Selection by k_best_features\n')
    print("{0} best features in descending order: {1}\n".format(k, k_best_features.feature.values[:k]))
    print('{0}\n'.format(k_best_features[:k]))
    
    
    return k_best_features[:k]
def splitIntoTrainingAndValidation(A, B):
	data1 = shuffle(sourceSets[A])    # Note this is a random shuffle, that's
	data2 = shuffle(sourceSets[B])    #                                   why we need many iterations
	freqM = np.minimum(freqs[A], freqs[B])
	freq1tr = np.round(freqM * 0.8)        # Randomly selected 80% for the training set,
	freq1va = freqM - freq1tr              # and the remaining 20% for the validation set
	freq2tr = np.copy(freq1tr)
	freq2va = np.copy(freq1va)
	trainingSetSize = int(sum(freq1tr))  # 1/2 size actually
	validatnSetSize = int(sum(freq1va))
	testSet1size = len(data1) - trainingSetSize - validatnSetSize
	testSet2size = len(data2) - trainingSetSize - validatnSetSize
	X  = np.zeros((trainingSetSize*2,         numFeatures))
	Xv = np.zeros((validatnSetSize*2,         numFeatures))
	Xt = np.zeros((testSet1size+testSet2size, numFeatures))
	y  = np.ravel([([0]*trainingSetSize) + ([1]*trainingSetSize)])
	yv = np.ravel([([0]*validatnSetSize) + ([1]*validatnSetSize)])
	yt = np.ravel([([0]*testSet1size)    + ([1]*testSet2size)])
	trnIdx = vldIdx = tstIdx = 0
	for item in data1:
		year = item[0]
		if   freq1tr[year] > 0:   X[trnIdx], trnIdx, freq1tr[year]  =  item[1:],  trnIdx+1,  freq1tr[year]-1
		elif freq1va[year] > 0:  Xv[vldIdx], vldIdx, freq1va[year]  =  item[1:],  vldIdx+1,  freq1va[year]-1
		else:                    Xt[tstIdx], tstIdx                 =  item[1:],  tstIdx+1
	assert trnIdx==trainingSetSize   and vldIdx==validatnSetSize   and tstIdx==testSet1size
	for item in data2:
		year = item[0]
		if   freq2tr[year] > 0:   X[trnIdx], trnIdx, freq2tr[year]  =  item[1:],  trnIdx+1,  freq2tr[year]-1
		elif freq2va[year] > 0:  Xv[vldIdx], vldIdx, freq2va[year]  =  item[1:],  vldIdx+1,  freq2va[year]-1
		else:                    Xt[tstIdx], tstIdx                 =  item[1:],  tstIdx+1
	assert trnIdx==trainingSetSize*2 and vldIdx==validatnSetSize*2 and tstIdx==testSet1size+testSet2size
	X, y = shuffle(X, y)   # Just in case... perhaps no reason to shuffle again here?
	fs = SelectKBest(f_classif, k = numFeatures)   # TODO: try other feature selection methods?
	fs.fit(np.concatenate((X, Xv)), np.concatenate((y, yv)))
	return X, y, Xv, yv, Xt, yt, testSet1size, testSet2size, fs.scores_
Example no. 7
    def _SelectKBest(self, X, y):

        print('Selecting K Best from whole image')

        from sklearn.feature_selection import SelectKBest, f_classif

        # ### Define the dimension reduction to be used.
        # Here we use a classical univariate feature selection based on the
        # ANOVA F-test; the number of features to keep is self.k_features.
        feature_selection = SelectKBest(f_classif, k=self.k_features)

        feature_selection.fit(X, y)

        scores = f_classif(X, y)[0]
        mask_k_best = np.zeros(scores.shape, dtype=bool)
        mask_k_best[np.argsort(scores, kind="mergesort")[-self.k_features:]]\
            = 1
        import nibabel
        mask_brain_img = nibabel.load(self.mask_non_brain)
        mask_brain = mask_brain_img.get_fdata().flatten().astype(bool)

        roi = np.zeros(mask_brain.flatten().shape)
        roi[mask_brain] = mask_k_best
        roi = roi.reshape(mask_brain_img.shape)

        img = nibabel.Nifti1Image(roi, np.eye(4))
        img.to_filename('/tmp/best.nii.gz')

        print('SelectKBest data reduction from: %s' % str(X.shape))
        X = feature_selection.transform(X)
        print('SelectKBest data reduction to: %s' % str(X.shape))

        self.feature_reduction_method = feature_selection

        return X
Example no. 8
def selectBestFeatures(data_dict, features_list, k, print_result):
    '''
    Using SelectKBest, find k best features.

    param:
        data_dict     : data set
        features_list : list of features
        k             : number of features to keep
    return:
        best_features : list of selected features
    '''
    best_features = {}
    
    #data = featureFormat(data_dict, features_list)
    #labels, features = targetFeatureSplit(data)
    labels, features = getFeaturesAndLabels(data_dict, features_list)

    k_best = SelectKBest(k=k)
    k_best.fit(features, labels)
    
    unsorted_pair_list = zip(features_list[1:], k_best.scores_)
    sorted_pair_list   = sorted(unsorted_pair_list, key = lambda x: x[1], reverse = True)
    
    k_features = [pair[0] for pair in sorted_pair_list]
    k_scores   = [pair[1] for pair in sorted_pair_list]

    best_features['feature'] = k_features[:k]
    best_features['score']   = k_scores[:k]
    
    if print_result:
        # print the final result
        print("--- SelectKBest Scores ---")
        print(pd.DataFrame(best_features))
        
    return best_features['feature']
Example no. 9
    def fit(self, X, y):

        support = range(X.shape[1])
        X0 = X
        while X.shape[1] > self.n_features:
            new_size = max(int(X.shape[1]*self.step), self.n_features)

            kbest = SelectKBest(f_regression, k=new_size)
            kbest.fit(X, y)
            score1 = kbest.scores_

            self.estimator.fit(X, y)
            score2 = abs(self.estimator.coef_)

            score1 = score1 / max(score1)
            score2 = (score2 / max(score2))**2

            score = (1 - self.p) * score1 + self.p * score2

            coefs = zip(score, support)
            coefs = sorted(coefs, key=lambda pair: pair[0], reverse=True)
            support = [b for (a, b) in coefs[:new_size]]
            X = X0[:, support]
        self.estimator.fit(X, y)
        self.support = support
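This fit() blends univariate F scores with the estimator's own coefficients, eliminating features step by step. Below is a hedged sketch of the constructor such a class would need; the name HybridSelector and the defaults are assumptions inferred from the attributes used above:

class HybridSelector:
    # Hypothetical wrapper around the fit() shown above (name and defaults assumed).
    def __init__(self, estimator, n_features=10, step=0.5, p=0.5):
        self.estimator = estimator    # linear model exposing coef_ after fitting
        self.n_features = n_features  # number of features to keep
        self.step = step              # fraction of features retained per iteration
        self.p = p                    # weight on |coef_| relative to the F scores
        self.support = None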
Example no. 10
def PerformFeatureSelection(adult_train, features, Output):
	selector = SelectKBest(f_classif, k=5)
	selector.fit(adult_train[features], adult_train[Output])
	scores = -numpy.log10(selector.pvalues_)
	plt.bar(range(len(features)), scores)
	plt.xticks(range(len(features)), features, rotation='vertical')
	plt.show()
Example no. 11
def select_parameter():
    selector = SelectKBest(f_classif, k=5)
    selector.fit(titanic[predictors], titanic["Survived"])
    scores = -numpy.log10(selector.pvalues_)
    plt.bar(range(len(predictors)), scores)
    plt.xticks(range(len(predictors)), predictors, rotation='vertical')
    plt.show()
def test_boundary_case_ch2():
    # Test boundary case, and always aim to select 1 feature.
    X = np.array([[10, 20], [20, 20], [20, 30]])
    y = np.array([[1], [0], [0]])
    scores, pvalues = chi2(X, y)
    assert_array_almost_equal(scores, np.array([4.0, 0.71428571]))
    assert_array_almost_equal(pvalues, np.array([0.04550026, 0.39802472]))

    filter_fdr = SelectFdr(chi2, alpha=0.1)
    filter_fdr.fit(X, y)
    support_fdr = filter_fdr.get_support()
    assert_array_equal(support_fdr, np.array([True, False]))

    filter_kbest = SelectKBest(chi2, k=1)
    filter_kbest.fit(X, y)
    support_kbest = filter_kbest.get_support()
    assert_array_equal(support_kbest, np.array([True, False]))

    filter_percentile = SelectPercentile(chi2, percentile=50)
    filter_percentile.fit(X, y)
    support_percentile = filter_percentile.get_support()
    assert_array_equal(support_percentile, np.array([True, False]))

    filter_fpr = SelectFpr(chi2, alpha=0.1)
    filter_fpr.fit(X, y)
    support_fpr = filter_fpr.get_support()
    assert_array_equal(support_fpr, np.array([True, False]))

    filter_fwe = SelectFwe(chi2, alpha=0.1)
    filter_fwe.fit(X, y)
    support_fwe = filter_fwe.get_support()
    assert_array_equal(support_fwe, np.array([True, False]))
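The asserted chi2 scores can be reproduced by hand. A short sketch of the computation sklearn performs on this 3x2 example: per-class feature totals are the observed counts, and class priors times the overall feature totals are the expected counts:

import numpy as np

X = np.array([[10, 20], [20, 20], [20, 30]], dtype=float)
y = np.array([1, 0, 0])

observed = np.array([X[y == 0].sum(axis=0),   # per-feature totals in class 0
                     X[y == 1].sum(axis=0)])  # per-feature totals in class 1
class_prior = np.array([2 / 3, 1 / 3]).reshape(-1, 1)
expected = class_prior * X.sum(axis=0)        # prior times overall feature totals

chi2_scores = ((observed - expected) ** 2 / expected).sum(axis=0)
print(chi2_scores)  # approx. [4.0, 0.714], matching the asserted values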
Example no. 13
def discriminatory_features():
	print('Finding most discriminatory features...')

	NUM_FEATURES = 10

	all_points = class1_song_points + class2_song_points
	true_labels = [0]*len(class1_song_points)+[1]*len(class2_song_points)

	feature_indices = []
	for i in range(NUM_FEATURES):
		selector = SelectKBest(chi2, k=i + 1)
		selector.fit(all_points, true_labels)

		new_indices = selector.get_support(indices=True)
		for index in new_indices:
			if index not in feature_indices:
				feature_indices.append(index)

	feature_descriptions = []

	for index in feature_indices:
		feature = feature_names[index]
		if feature.lower() in wsj_mapping.keys():
			key = wsj_mapping[feature.lower()]
			description = key + ': ' + wsj_to_description[key]
		elif feature in word_vocab:
			description = 'The word: ' + feature
		else:
			description = feature
		feature_descriptions.append(description)
	return jsonify(features=feature_descriptions)
def get_k_best(df, features_list, k):
    """ runs scikit-learn's SelectKBest feature selection
        returns dict where keys=features, values=scores
    """
    # feature, label = feature_format_scale(data_dict, features_list)
    from poi_dataprocess import *
    from feature_format import featureFormat, targetFeatureSplit

    data_dict_new = df[features_list].T.to_dict()

    data = featureFormat(data_dict_new, features_list, sort_keys=True)
    labels, features = targetFeatureSplit(data)

    # df = df[features_list]
    # features = df.drop('poi', axis=1)#.astype(float)
    # labels = df['poi']

    from sklearn import preprocessing

    scaler = preprocessing.MinMaxScaler()
    features = scaler.fit_transform(features)

    from sklearn.feature_selection import SelectKBest

    k_best = SelectKBest(k=k)
    k_best.fit(features, labels)
    scores = k_best.scores_
    unsorted_pairs = zip(features_list[1:], scores)
    sorted_pairs = list(reversed(sorted(unsorted_pairs, key=lambda x: x[1])))
    k_best_features = dict(sorted_pairs[:k])

    return k_best_features
Example no. 15
    def __anovaImportance(self, features, labels, features_list):
        sel = SelectKBest(f_classif, k=2)
        sel.fit(features, labels)
        sortIndexes = sel.scores_.argsort()[::-1]
        features_rank = np.array(features_list[1:])[sortIndexes]
        print("anova f test importance rank: ", features_rank)
        return features_rank
def get_k_best_features(data_dict, features_list, k):

  """
  runs scikit-learn's SelectKBest feature selection to get k best features
    
  Args:
    data_dict: data dictionary for enron
    feature_list: a list of features with first feature as target label
    k: Number of best features which need to be selected

  Returns:
    returns a list of k best features and list of lists where inner list's 
    first element is feature and the second element is feature score
  """

  data = featureFormat(data_dict, features_list)
  labels, features = targetFeatureSplit(data)

  k_best = SelectKBest(k=k)
  k_best.fit(features, labels)
  scores = k_best.scores_
  unsorted_pairs = zip(features_list[1:], scores)
  sorted_pairs = list(reversed(sorted(unsorted_pairs, key=lambda x: x[1])))
  k_best_features = dict(sorted_pairs[:k])
  return list(k_best_features.keys()), [list(pair) for pair in sorted_pairs]
def k_best_feature_selection(labels, features, features_list):
    """ 
        Identifies the best features using SelectKBest feature selection
    
        labels = target list as returned by the targetFeatureSplit script
        features = features list as returned by the targetFeatureSplit script
        features_list = list of the features to be assessed
    """
    from sklearn.feature_selection import SelectKBest
    
    k_best = SelectKBest(k = 10)
    k_best.fit(features, labels)
    scores = k_best.scores_
    
    features_list = features_list[1:]
    feature_scores = zip(features_list, scores)
    feature_scores = sorted(feature_scores, key = lambda x: x[1])
    feature_scores = feature_scores[::-1]

    print(feature_scores)

    print("Top 10 features identified using SelectKBest:")
    i = 1
    while i < 11:
        print("  ", i, "-", feature_scores[i - 1])
        i += 1
def test_mutual_info_classif():
    X, y = make_classification(
        n_samples=100,
        n_features=5,
        n_informative=1,
        n_redundant=1,
        n_repeated=0,
        n_classes=2,
        n_clusters_per_class=1,
        flip_y=0.0,
        class_sep=10,
        shuffle=False,
        random_state=0,
    )

    # Test in KBest mode.
    univariate_filter = SelectKBest(mutual_info_classif, k=2)
    X_r = univariate_filter.fit(X, y).transform(X)
    X_r2 = GenericUnivariateSelect(mutual_info_classif, mode="k_best", param=2).fit(X, y).transform(X)
    assert_array_equal(X_r, X_r2)
    support = univariate_filter.get_support()
    gtruth = np.zeros(5)
    gtruth[:2] = 1
    assert_array_equal(support, gtruth)

    # Test in Percentile mode.
    univariate_filter = SelectPercentile(mutual_info_classif, percentile=40)
    X_r = univariate_filter.fit(X, y).transform(X)
    X_r2 = GenericUnivariateSelect(mutual_info_classif, mode="percentile", param=40).fit(X, y).transform(X)
    assert_array_equal(X_r, X_r2)
    support = univariate_filter.get_support()
    gtruth = np.zeros(5)
    gtruth[:2] = 1
    assert_array_equal(support, gtruth)
def select_k_best(data_dict, features_list, k):
    # Create dataset from feature list
    data = featureFormat(data_dict, features_list)
    # Split dataset into labels and features
    labels, features = targetFeatureSplit(data)
    # Create Min/Max Scaler
    scaler = preprocessing.MinMaxScaler()
    # Scale Features
    features = scaler.fit_transform(features)
    # Create k_best feature selection
    k_best = SelectKBest(k=k)
    # Fit k_best
    k_best.fit(features, labels)
    # Get k_best scores
    scores = k_best.scores_
    # Create list with features and scores
    unsorted_pairs = zip(features_list[1:], scores)
    # Sort list
    sorted_pairs = list(reversed(sorted(unsorted_pairs, key=lambda x: x[1])))
    # Create dict
    if k == "all":
        k_best_features = dict(sorted_pairs)
    else:
        k_best_features = dict(sorted_pairs[:k])
    return k_best_features
Example no. 20
def choseFeature(TrainX, TrainY, TestX):
	cF = SelectKBest(chi2, k=100)
	cF.fit(TrainX, TrainY)
	check = cF.get_support()
	newTrainX = cF.transform(TrainX)
	newTestX = cF.transform(TestX)
	return (newTrainX, newTestX)
def features_importance(features_train, labels_train, feature_list):   
    X=SelectKBest()
    X.fit(features_train, labels_train)
    Scores=X.scores_
    Pvalues=X.pvalues_
    index=feature_list[1:]
    return pd.DataFrame({'Scores': Scores,'Pvalues': Pvalues},index=index)
Example no. 22
class FeatureSelector(BaseEstimator, TransformerMixin):

    def __init__(self, score_func=None, percentile=None, k_best=None):

        self.score_func = score_func
        self.percentile = percentile
        self.k_best = k_best
        self.selector = None

    def fit(self, x, y):

        if self.k_best is None and self.percentile is not None:
            self.selector = SelectPercentile(score_func=self.score_func, percentile=self.percentile)
            self.selector.fit(x, y)
            return self

        elif self.k_best is not None and self.percentile is None:
            self.selector = SelectKBest(score_func=self.score_func, k=self.k_best)
            self.selector.fit(x, y)
            return self
        else:
            raise ValueError("Specify exactly one of percentile or k_best")

    def transform(self, x):
        # print "# Features reduced from {} to {}".format(x.columns.shape[0],\
        #                                                x.columns[self.selector.get_support()].values.shape[0])
        x_transformed = pd.DataFrame(data=self.selector.transform(x), columns=x.columns[self.selector.get_support()],
                                     index=x.index)
        return x_transformed
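A minimal usage sketch of this transformer inside a Pipeline; the toy DataFrame and the classifier are assumptions:

import numpy as np
import pandas as pd
from sklearn.feature_selection import f_classif
from sklearn.linear_model import LogisticRegression
from sklearn.pipeline import Pipeline

rng = np.random.RandomState(0)
X = pd.DataFrame(rng.rand(100, 3), columns=['a', 'b', 'c'])  # toy data
y = (X['a'] + X['b'] > 1).astype(int)

pipe = Pipeline([
    ('select', FeatureSelector(score_func=f_classif, k_best=2)),  # class defined above
    ('clf', LogisticRegression()),
])
pipe.fit(X, y)  # keeps the 2 best-scoring columns, then fits the classifier on them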
def test_mutual_info_regression():
    X, y = make_regression(n_samples=100, n_features=10, n_informative=2,
                           shuffle=False, random_state=0, noise=10)

    # Test in KBest mode.
    univariate_filter = SelectKBest(mutual_info_regression, k=2)
    X_r = univariate_filter.fit(X, y).transform(X)
    assert_best_scores_kept(univariate_filter)
    X_r2 = GenericUnivariateSelect(
        mutual_info_regression, mode='k_best', param=2).fit(X, y).transform(X)
    assert_array_equal(X_r, X_r2)
    support = univariate_filter.get_support()
    gtruth = np.zeros(10)
    gtruth[:2] = 1
    assert_array_equal(support, gtruth)

    # Test in Percentile mode.
    univariate_filter = SelectPercentile(mutual_info_regression, percentile=20)
    X_r = univariate_filter.fit(X, y).transform(X)
    X_r2 = GenericUnivariateSelect(mutual_info_regression, mode='percentile',
                                   param=20).fit(X, y).transform(X)
    assert_array_equal(X_r, X_r2)
    support = univariate_filter.get_support()
    gtruth = np.zeros(10)
    gtruth[:2] = 1
    assert_array_equal(support, gtruth)
def pred_pH(train, val, test, all_vars, loop):
    data = (val, test, train)
    # variable selection
    pH_lassoed_vars = lass_varselect(train, all_vars, 'pH', .00000001)
    univ_selector = SelectKBest(score_func = f_regression, k = 1200)
    univ_selector.fit(train[all_vars], train['pH'])
    pvals = univ_selector.get_support()
    chosen =  []
    for x in range(0, len(all_vars)):
        if pH_lassoed_vars[x] | pvals[x]:
            chosen.append(all_vars[x])
    lass_only =  []
    for x in range(0, len(all_vars)):
        if pH_lassoed_vars[x]:
            lass_only.append(all_vars[x])
    # nearest randomforest
    neigh = RandomForestRegressor(n_estimators=100)
    neigh.fit(train.loc[:, chosen], train['pH'])
    for dset in data:
        dset['pH_for_prds'] = neigh.predict(dset.loc[:, chosen])
    # lasso
    lass = Lasso(alpha=.000000275, positive=True)
    lass.fit(train[all_vars], train['pH'])
    for dset in data:
        dset['pH_las_prds'] = lass.predict(dset[all_vars])
    # ridge
    pH_ridge = RidgeCV(np.array([.6]), normalize=True)
    pH_ridge.fit(train[all_vars], train['pH'])
    for dset in data:
        dset['pH_rdg_prds'] = pH_ridge.predict(dset[all_vars])
    # combination
    models= [ 'pH_rdg_prds', 'pH_las_prds', 
              'pH_for_prds', 'pH_for_prds' ] 
    name = 'pH_prds' + str(object=loop)
    write_preds(models, name, train, val, test, 'pH')
def select_k_best_features(dataset, features_list, k):
    """
    For E+F dataset, select k best features based on SelectKBest from 
    sklearn.feature_selection

    Input:
    dataset: data in dictionary format 
    features_list: the full list of features to selection from 
    k: the number of features to keep

    Return:
    the list of length of k+1 with the first element as 'poi' and other 
    k best features 

    """
    labels_train, __, features_train, __ = \
    test_training_stratified_split(dataset, features_list)
    
    k_best = SelectKBest(k=k)
    k_best.fit(features_train, labels_train)
    impt_unsorted = zip(features_list[1:], k_best.scores_)
    impt_sorted = list(sorted(impt_unsorted, key=lambda x: x[1], reverse=True))
    k_best_features = [elem[0] for elem in impt_sorted][:k]
    print k, "best features:"
    print k_best_features
    return ['poi'] + k_best_features
Example no. 26
def find_features(dataset, features, target):
    selector = SelectKBest(f_classif, k=5)
    selector.fit(dataset[features], dataset[target[0]])
    scores = -np.log10(selector.pvalues_)
    plt.bar(range(len(features)), scores)
    plt.xticks(range(len(features)), features, rotation="vertical")
    plt.show()
def preprocess(article_file, lable_file, k):

    features = pickle.load(open(article_file, 'rb'))
    features = np.array(features)

    # transform non-numerical labels (as long as they are hashable and comparable) to numerical labels
    lables = pickle.load(open(lable_file, 'rb'))
    le = preprocessing.LabelEncoder()
    le.fit(lables)
    lables = le.transform(lables)
    # print le.inverse_transform([0])

    ### text vectorization--go from strings to lists of numbers
    vectorizer = TfidfVectorizer(sublinear_tf=True, max_df=0.5, min_df=1,
                                 stop_words='english')
    features_train_transformed = vectorizer.fit_transform(features)

    # selector : SelectPercentile
    # selector = SelectPercentile(f_classif, percentile=30)
    # selector.fit(features_train_transformed, lables)

    # selector : SelectKBest
    selector = SelectKBest(k=k)
    selector.fit(features_train_transformed, lables)

    # selector : chi2
    # selector = SelectPercentile(score_func=chi2)
    # selector.fit(features_train_transformed, lables)

    features_train_transformed = selector.transform(features_train_transformed).toarray()

    return features_train_transformed, lables, vectorizer, selector, le, features
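Since preprocess() returns the fitted vectorizer, selector and label encoder, unseen articles should only be transformed with them, never re-fitted. A minimal sketch; the variable names for the returned objects and the new documents are assumptions:

# X_train, y_train, vectorizer, selector, le, raw = preprocess(article_file, lable_file, k)
new_docs = ["some unseen article text", "another unseen article"]
new_tfidf = vectorizer.transform(new_docs)               # transform only, no re-fit
new_features = selector.transform(new_tfidf).toarray()   # same k columns as in training
# numeric predictions can be mapped back to label names with le.inverse_transform(...)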
    def __init__(self, master, x_train, y_train, x_test, y_test, evaluator, df, console):
        Tk.Frame.__init__(self, master)
        self.x_train = x_train
        self.y_train = y_train
        self.x_test = x_test
        self.y_test = y_test
        self.evaluator = evaluator
        self.df = df
        self.console = console

        frame_train = Tk.Frame(self)
        frame_train.pack(fill=Tk.BOTH, expand=1, padx=15, pady=15)
        plt.figure(figsize=(12, 20))
        plt.subplot(111)
        
        # k best feature's names
        plt.figure(figsize=(12, 8))
        plt.subplot(111)
        selection = SelectKBest(f_classif, k=3)
        selection.fit(self.x_train, self.y_train)
        feature_scores = selection.scores_
        feature_names = df.columns.values
        feature_names = feature_names[feature_names != "NSP"]
        kbest_feature_indexes = selection.get_support()
        kbest_feature_names = feature_names[kbest_feature_indexes]

        # collect the scores and feature names as a DataFrame
        rec = zip(feature_scores, feature_names)
        data = pd.DataFrame(rec, columns=["Score", "Feature"])

        sns.barplot(x="Feature", y="Score", data=data)
        plt.xticks(rotation=-90)
        plt.title("Cardiotocography Feature Scores Ranking")
        self.attach_figure(plt.gcf(), frame_train)
Example no. 29
def data_yj(params):
    Ntrn = params['Ntrn']
    Ntst = params['Ntst']
    num_feat = params['num_feat']
    lowd = params['lowd']
    highd = params['highd']
    seed = params['seed']
    # Run Yousef/Jianping RNA Synthetic
    currdir = path.abspath('.')
    synloc = path.expanduser('~/GSP/research/samc/synthetic/rnaseq')

    YJparams = param_template.format(**params)

    try:
        os.chdir(synloc)
        fid,fname = tempfile.mkstemp(dir='params')
        fname = path.basename(fname)
        fid = os.fdopen(fid,'w')
        fid.write(YJparams)
        fid.close()
        inspec = 'gen -i params/%s -c 0.05 -l %f -h %f -s %d' % \
                (fname, lowd, highd, seed)
        spec = path.join(synloc, inspec).split()
        sb.check_call(spec)
    except Exception as e:
        print("ERROR in data_yj: " + str(e))
    finally:
        os.chdir(currdir)

    try:
        trn_path = path.join(synloc, 'out','%s_trn.txt'%fname)
        tst_path = path.join(synloc, 'out','%s_tst.txt'%fname)

        raw_trn_data = np.loadtxt(trn_path,
            delimiter=',', skiprows=1)
        selector = SelectKBest(f_classif, k=num_feat)
        trn_labels = np.hstack(( np.zeros(Ntrn), np.ones(Ntrn) ))
        selector.fit(raw_trn_data, trn_labels)

        raw_tst_data = np.loadtxt(tst_path,
                delimiter=',', skiprows=1)
    except Exception as e:
        print("ERROR in data_yj: " + str(e))
    finally:
        os.remove(trn_path)
        os.remove(tst_path)

    trn0, trn1, tst0, tst1 = gen_labels(Ntrn, Ntrn, Ntst, Ntst)
    rawdata = np.vstack(( raw_trn_data, raw_tst_data ))

    pvind = selector.pvalues_.argsort()

    np.random.shuffle(pvind)

    feats = np.zeros(rawdata.shape[1], dtype=bool)
    feats[pvind[:num_feat]] = True
    calib = ~feats

    return rawdata, trn0, trn1, tst0, tst1, feats, calib 
Example no. 30
def feature_selection():
    with open(CLF_PICKLE_FILENAME, "rb") as classifier_infile:
        classifier = pickle.load(classifier_infile)
        
    dataset = load_dataset()
    features_list = load_featurelist(FEATURE_LIST_FILENAME)
    
    data = featureFormat(dataset, features_list, sort_keys = True)
    labels, features = targetFeatureSplit(data)
    
    k_best = SelectKBest(score_func=f_classif, k='all')
    k_best.fit(features, labels)
    scores = k_best.scores_
    unsorted_pairs = zip(features_list[1:], scores)
    sorted_pairs = list(reversed(sorted(unsorted_pairs, key=lambda x: x[1])))
    print(sorted_pairs)
    k_best_features = dict(sorted_pairs)
    k_features = list(k_best_features.keys())
    
    accuracy_list=[]
    precision_list=[]
    recall_list=[]
    for k in range(1,len(k_features)+1):
        k_best_feature_list = k_features[0:k]
        k_best_feature_list.insert(0, 'poi')
        [accuracy, precision, recall] = tester('name', classifier, dataset, k_best_feature_list, folds = 500)
        accuracy_list.append(accuracy)
        precision_list.append(precision)
        recall_list.append(recall)
    """ 
    x=np.linspace(1,len(k_best_feature_list)-1,len(k_best_feature_list)-1)
    plt.plot(x,recall_list,label="recall")
    plt.plot(x,precision_list,label="precision")
    plt.legend(loc="lower right")
    plt.xlabel('k best features')
    plt.ylabel('score')
    plt.title('Precision and Recall vs. # of Features')
    plt.savefig('score_function_k.png')
    """
    
    # best # of features = 18
    k_best_feature_list = k_features[0:18]
    k_best_feature_list.insert(0,'poi')
    
    """ # Using SelectPercentile
    selector = SelectPercentile(percentile=50)
    selector.fit(features,labels)
    print selector.scores_
    indices=selector.get_support(indices=False)
    
    best_features=[]
    for elem in zip(indices,features_list[1:]):
        if elem[0]==True:
            best_features.append(elem[1])
    
    best_features.insert(0,'poi')
    """
    with open(FEATURE_LIST_FILENAME, "wb") as featurelist_outfile:
        pickle.dump(k_best_feature_list, featurelist_outfile)
Example no. 31
features_list += [
    'fraction_from_poi', 'fraction_to_poi', 'fraction_total_stock_value'
]

print("Total de features", len(features_list))
my_dataset = data_dict

### Extract features and labels from dataset for local testing
data = featureFormat(my_dataset, features_list, sort_keys=True)
labels, features = targetFeatureSplit(data)

### Ranking of features

best_features = SelectKBest()
best_features.fit(features, labels)

list_best_features = []
for n, i in enumerate(features_list[1:]):
    list_best_features.append({
        "feature": i,
        "score": best_features.scores_[n]
    })

newlist = sorted(list_best_features, key=lambda k: k['score'], reverse=True)
#print(newlist)

features_list_new = []
for i in newlist:
    features_list_new.append(i["feature"])
features_list_new.insert(0, 'poi')
# Feature Extraction with Univariate Statistical Tests (Chi-squared for classification)
import numpy as np
import pandas as pd
from pandas.plotting import scatter_matrix
import matplotlib.pyplot as plt
from sklearn.feature_selection import SelectKBest
from sklearn.feature_selection import chi2
# load data
url = "/Users/HP/Desktop/S4/Machine Learning/Dataset/insurance.csv"
names = ['age', 'sex', 'bmi', 'children', 'smoker', 'region', 'charges']
df = pd.read_csv(url, names=names)
df = df.apply(pd.to_numeric, errors='coerce')
#print(df[['sex', 'smoker', 'region']].describe())
array = df.values
X = array[:, 2:3]
Y = array[:, 6]
# feature extraction
test = SelectKBest(score_func=chi2, k=4)
fit = test.fit(X, Y)
# summarize scores
np.set_printoptions(precision=3)
print(fit.scores_)
features = fit.transform(X)
# summarize selected features
print(features[0:5, :])

# scatter_matrix(df[['age', 'bmi', 'children', 'charges']])
# plt.show()
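Two caveats for this example: chi2 is intended for non-negative features and a categorical target, while charges is continuous, and the slice array[:, 2:3] keeps a single column even though k=4. A hedged sketch of the same selection with f_regression over all numeric predictors, which suits a continuous target (the column choice is an assumption):

from sklearn.feature_selection import SelectKBest, f_regression

numeric_cols = ['age', 'bmi', 'children']          # numeric predictors in this file
num = df[numeric_cols + ['charges']].dropna()      # to_numeric coercion can leave NaNs

selector = SelectKBest(score_func=f_regression, k=2)
selector.fit(num[numeric_cols], num['charges'])
print(dict(zip(numeric_cols, selector.scores_.round(3))))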
Example no. 33
plt.figure()
plt.grid(axis='both')

# Features in the dataset
col = [
    'meanNN', 'STD_NN', 'HR', 'Lfnu', 'Hfnu', 'LF/HF', 'APEN', 'CD', 'PTT',
    'PTT_SD'
]

# Synthetic Minority oversampling technique to balance the dataset
oversample = SMOTE()
X, y = oversample.fit_resample(X, y)

# Feature Selection
feature_scores = SelectKBest(score_func=chi2, k=10)
fit = feature_scores.fit(X, y)
dfscores = pd.DataFrame(fit.scores_)
dfcol = pd.DataFrame(X.columns)
visual = pd.concat([dfcol, dfscores], axis=1)
visual.columns = ['Specs', 'Score']

# Weight each feature by its normalised chi-squared score
for i in range(len(fit.scores_)):
    X.iloc[:, i] *= (fit.scores_[i] / max(fit.scores_))

# Split the dataset into train and test set
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(X,
                                                    y,
                                                    train_size=0.8,
                                                    random_state=5)
Example no. 34
def feature_selection(X, Y):
    Selector = SelectKBest(chi2, k=70)
    Selector.fit(X, Y)
    return Selector.transform(X), Y, Selector.get_support(True)
Example no. 35
print(newdf_test['label'].value_counts())

X_U2R = newdf.drop('label', axis=1)
Y_U2R = newdf.label
X_U2R_test = newdf_test.drop('label', axis=1)
Y_U2R_test = newdf_test.label

colNames = list(X_U2R)

from sklearn.feature_selection import SelectKBest
from sklearn.feature_selection import f_classif
np.seterr(divide='ignore', invalid='ignore')
fclass = SelectKBest(
    f_classif,
    k=2)  #iterate the k from 1 to 120. The max. accuracy comes at k=2 .
fclass.fit(X_U2R, Y_U2R)
true = fclass.get_support()
fclasscolindex_U2R = [i for i, x in enumerate(true) if x]
fclasscolname_U2R = list(colNames[i] for i in fclasscolindex_U2R)
print('Features selected :', fclasscolname_U2R)

features = newdf[fclasscolname_U2R].astype(float)
features1 = newdf_test[fclasscolname_U2R].astype(float)
lab = newdf['label']
lab1 = newdf_test['label']

from sklearn.ensemble import AdaBoostClassifier
clf = AdaBoostClassifier()
t0 = time()
clf.fit(features, lab)
tt = time() - t0
sst = StandardScaler()
Xtrain2 = sst.fit_transform(Xtrain)
Xtest2 = sst.transform(Xtest)
model2 = LinearRegression()
model2.fit(Xtrain2, ytrain)

#3. PCA + Linear regression()
tPCA = PCA(n_components=1)
Xtrain3 = tPCA.fit_transform(Xtrain)
Xtest3 = tPCA.transform(Xtest)
model3 = LinearRegression()
model3.fit(Xtrain3, ytrain)

#4. SelectKBest(f_regression) + Linear regression()
fsk = SelectKBest(score_func=f_regression, k=1)
fsk.fit(Xtrain, ytrain)
Xtrain4 = fsk.transform(Xtrain)
Xtest4 = fsk.transform(Xtest)
model4 = LinearRegression()
model4.fit(Xtrain4, ytrain)

#5. Select by coef_ + Linear regression()
Xtrain5 = Xtrain.iloc[:, 0].values.reshape(-1, 1)
Xtest5 = Xtest.iloc[:, 0].values.reshape(-1, 1)
model5 = LinearRegression()
model5.fit(Xtrain5, ytrain)

print('Score1:' + str(model1.score(Xtest, ytest) * 100) + '\nScore2:' +
      str(model2.score(Xtest2, ytest) * 100) + '\nScore3:' +
      str(model3.score(Xtest3, ytest) * 100) + '\nScore4:' +
      str(model4.score(Xtest4, ytest) * 100) + '\nScore5:' +
      str(model5.score(Xtest5, ytest) * 100))
Example no. 37
#r=f.readlines()
for i in embeddings_index.keys():
    if (i not in stop_words):
        glove.append(i)

#f.close()

#tfidf
transformer = TfidfTransformer(smooth_idf=True)
count_vectorizer = CountVectorizer(ngram_range=(2, 3), vocabulary=glove)
counts = count_vectorizer.fit_transform(df['text'].values)
tfidf = transformer.fit_transform(counts)

target = df['label'].values.astype('int')
selector = SelectKBest(chi2, k=1000)
selector.fit(tfidf, target)
top_words = selector.get_support().nonzero()

# Pick only the most informative columns in the data.
chi_matrix = tfidf[:, top_words[0]]

# In[150]:

# Our list of functions to apply.
transform_functions = [
    lambda x: x.count(" ") / len(x.split()),
    lambda x: x.count(".") / len(x.split()),
    lambda x: x.count("!") / len(x.split()),
    lambda x: x.count("?") / len(x.split()),
    lambda x: x.count("-") / len(x.split()),
    lambda x: x.count(",") / len(x.split()),
Example no. 38
def discriminator(tweet_list, tweet_list_y, count_fake, count_total):
    list_words = ['http', 'https', 'twitter', 'com', 'www']
    count_vectorizer = CountVectorizer(stop_words=list_words)
    count_train = count_vectorizer.fit_transform(X_train)
    count_test = count_vectorizer.transform(tweet_list)

    tfidf_vectorizer = TfidfVectorizer(stop_words=list_words, max_df=0.7)
    tfidf_train = tfidf_vectorizer.fit_transform(X_train)
    tfidf_test = tfidf_vectorizer.transform(X_test)

    clf = SelectKBest(score_func=mutual_info_classif, k=1000)
    fit = clf.fit(count_train, y_train)
    count_x_train_ft = fit.transform(count_train)
    count_x_test_ft = fit.transform(count_test)

    clf = SelectKBest(score_func=mutual_info_classif, k=1000)
    fit = clf.fit(tfidf_train, y_train)
    tfidf_x_train_ft = fit.transform(tfidf_train)
    tfidf_x_test_ft = fit.transform(tfidf_test)

    print("MultinomialNB CountVectorizer")
    mn_count_clf = MultinomialNB()
    mn_count_clf.fit(count_x_train_ft, y_train)
    pred = mn_count_clf.predict(count_x_test_ft)
    score = metrics.accuracy_score(tweet_list_y, pred)
    print("accuracy:   %0.3f" % score)
    print(" ")
    final_score = (score * np.shape(tweet_list)[0] +
                   count_fake) / (np.shape(tweet_list)[0] + count_total)
    print("final_acc: ", final_score)

    print("MultinomialNB TfidfVectorizer")
    mn_tfidf_clf = MultinomialNB()
    mn_tfidf_clf.fit(tfidf_x_train_ft, y_train)
    pred = mn_tfidf_clf.predict(tfidf_x_test_ft)
    score = metrics.accuracy_score(y_test, pred)
    print("accuracy:   %0.3f" % score)
    print(" ")
    final_score = (score * np.shape(tweet_list)[0] +
                   count_fake) / (np.shape(tweet_list)[0] + count_total)
    print("final_acc: ", final_score)

    print("PassiveAggressiveClassifier: C = 0.01")
    pa_tfidf_clf = PassiveAggressiveClassifier(max_iter=50, C=0.01)
    pa_tfidf_clf.fit(count_x_train_ft, y_train)
    pred = pa_tfidf_clf.predict(count_x_test_ft)
    score = metrics.accuracy_score(tweet_list_y, pred)
    print("accuracy:   %0.3f" % score)
    print(" ")
    final_score = (score * np.shape(tweet_list)[0] +
                   count_fake) / (np.shape(tweet_list)[0] + count_total)
    print("final_acc: ", final_score)

    print("LinearSVC: C = 0")
    svc_tfidf_clf = LinearSVC()
    svc_tfidf_clf.fit(count_x_train_ft, y_train)
    pred = svc_tfidf_clf.predict(count_x_test_ft)
    score = metrics.accuracy_score(tweet_list_y, pred)
    print("accuracy:   %0.3f" % score)
    print(" ")
    final_score = (score * np.shape(tweet_list)[0] +
                   count_fake) / (np.shape(tweet_list)[0] + count_total)
    print("final_acc: ", final_score)

    print("SGDClassifier")
    sgd_tfidf_clf = SGDClassifier(max_iter=50)
    sgd_tfidf_clf.fit(count_x_train_ft, y_train)
    pred = sgd_tfidf_clf.predict(count_x_test_ft)
    score = metrics.accuracy_score(tweet_list_y, pred)
    print("accuracy:   %0.3f" % score)
    print(" ")
    final_score = (score * np.shape(tweet_list)[0] +
                   count_fake) / (np.shape(tweet_list)[0] + count_total)
    print("final_acc: ", final_score)