Example #1
def select_features(X, y):
  from sklearn.feature_selection import SelectPercentile
  from sklearn.feature_selection import f_classif, chi2
  from sklearn.preprocessing import Binarizer, scale

  # First select features based on chi2 and f_classif
  p = 3

  X_bin = Binarizer().fit_transform(scale(X))
  selectChi2 = SelectPercentile(chi2, percentile=p).fit(X_bin, y)
  selectF_classif = SelectPercentile(f_classif, percentile=p).fit(X, y)

  chi2_selected = selectChi2.get_support()
  chi2_selected_features = [f for i, f in enumerate(X.columns) if chi2_selected[i]]
  print('Chi2 selected {} features {}.'.format(chi2_selected.sum(),
     chi2_selected_features))
  f_classif_selected = selectF_classif.get_support()
  f_classif_selected_features = [f for i, f in enumerate(X.columns) if f_classif_selected[i]]
  print('F_classif selected {} features {}.'.format(f_classif_selected.sum(),
     f_classif_selected_features))
  selected = chi2_selected & f_classif_selected
  print('Chi2 & F_classif selected {} features'.format(selected.sum()))
  features = [f for f, s in zip(X.columns, selected) if s]
  print(features)
  return features
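A minimal usage sketch for the function above (the DataFrame, column names, and target below are synthetic stand-ins, not from the original project):

import numpy as np
import pandas as pd

# Synthetic data: 100 samples, 40 named feature columns, binary target.
rng = np.random.RandomState(0)
X_demo = pd.DataFrame(rng.rand(100, 40),
                      columns=['f{}'.format(i) for i in range(40)])
y_demo = rng.randint(0, 2, size=100)

# Returns the column names kept by both chi2 and f_classif at percentile p.
kept = select_features(X_demo, y_demo)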
Example #2
 def univariant_feature_selection(self, method, X, y, percentile):
     test = SelectPercentile(method, percentile=percentile).fit(X, y)
     print("The number of features selected by", method, "is:", test.get_support().sum())
     # Print the names of the selected columns.
     for i in range(len(self.X_train.columns)):
         if test.get_support()[i]:
             print(self.X_train.columns[i])
     return test.get_support()
Example #3
def main():
    parser = argparse.ArgumentParser(description='Feature Selection')
    required = parser.add_argument_group('required options')

    required.add_argument('-x', '--scaledfeaturelist', required=True, help='File containing feature values')
    required.add_argument('-y', '--targetdata', required=True, help='File containing target data')
    required.add_argument('-z', '--fetpercentile', required=True, type=int, help='Percentile to select highest scoring percentage of features')

    args = parser.parse_args()

    X = np.loadtxt(args.scaledfeaturelist)
    Y = np.genfromtxt(args.targetdata, dtype='str')

    #result = SelectPercentile(f_classif, percentile=args.fetpercentile).fit_transform(X,Y)
    sel = SelectPercentile(f_classif, percentile=args.fetpercentile)
    result = sel.fit_transform(X, Y)

    #selecting features for test programs
    if os.path.isfile('variancefeatures.txt'):
        varianceFeature = np.genfromtxt("variancefeatures.txt", dtype='str')
        featureFromSelectPercentile = sel.get_support(indices=True)
        featureFileforSelectPercentile = open("featuresToTestPrograms", "w")
        for i in featureFromSelectPercentile:
            featureFileforSelectPercentile.write(varianceFeature[i])
            featureFileforSelectPercentile.write("\n")
        featureFileforSelectPercentile.close()
        # remove variancefeatures.txt now that we no longer need it
        os.remove('variancefeatures.txt')

    np.savetxt('featurelist', result, fmt='%.2f', delimiter='\t')
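Since the script reads its inputs via argparse, a hypothetical invocation (script and file names are placeholders, not from the original project) would be:

    python feature_selection.py -x scaled_features.txt -y target_labels.txt -z 10

with the feature matrix in whitespace-delimited text form and one target label per line.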
Example #4
def test_select_percentile_regression():
    """
    Test whether the relative univariate feature selection
    gets the correct items in a simple regression problem
    with the percentile heuristic
    """
    X, y = make_regression(n_samples=200, n_features=20,
                           n_informative=5, shuffle=False, random_state=0)

    univariate_filter = SelectPercentile(f_regression, percentile=25)
    X_r = univariate_filter.fit(X, y).transform(X)
    assert_best_scores_kept(univariate_filter)
    X_r2 = GenericUnivariateSelect(
        f_regression, mode='percentile', param=25).fit(X, y).transform(X)
    assert_array_equal(X_r, X_r2)
    support = univariate_filter.get_support()
    gtruth = np.zeros(20)
    gtruth[:5] = 1
    assert_array_equal(support, gtruth)
    X_2 = X.copy()
    X_2[:, np.logical_not(support)] = 0
    assert_array_equal(X_2, univariate_filter.inverse_transform(X_r))
    # Check inverse_transform respects dtype
    assert_array_equal(X_2.astype(bool),
                       univariate_filter.inverse_transform(X_r.astype(bool)))
Example #5
def final_feature_set_reduction(reduced_feature_file_name, final_file_name, train_label_file):
    sorted_train_data = pd.read_csv('data/' + reduced_feature_file_name)
    y = []
    X = sorted_train_data.iloc[:,1:]
    fip = open('data/' + train_label_file)
    lines = fip.readlines()
    for line in lines:
        line = line.rstrip()
        y.append(int(line))

    print("Final feature reduction: {:s}".format(reduced_feature_file_name))
    print("Training labels length: {:d}".format(len(y)))
    print("X Feature set dimensionality: {:d} {:d}".format(X.shape[0], X.shape[1]))
    print("In Feature set dimensionality: {:d} {:d}".format(sorted_train_data.shape[0], sorted_train_data.shape[1]))

    # keep the top 10 percent of features by chi2 score, from ~1000 -> ~100 features
    fsp = SelectPercentile(chi2, percentile=10)
    X_new_10 = fsp.fit_transform(X,y)
    print("Final 10 Percent Dimensions: {:d} {:d}".format(X_new_10.shape[0], X_new_10.shape[1]))
    
    selected_names = fsp.get_support(indices=True)
    selected_names = selected_names + 1

    #data_reduced = sorted_train_data.iloc[:,[0] + selected_names]
    #Does not put the file_name as the first column.
    data_trimmed = sorted_train_data.iloc[:,selected_names]
    data_fnames = pd.DataFrame(sorted_train_data['file_name'])
    data_reduced = data_fnames.join(data_trimmed)
    
    data_reduced.to_csv('data/' + final_file_name, index=False)
    print("Completed reduction in {:s}".format(final_file_name))
    
    return
Example #6
def selectFeatures(features, labels, features_list):
    '''
    Select features according to the 20th percentile of the highest scores. 
    Return a list of features selected  and a dataframe showing the ranking 
    of each feature related to their p values
    features: numpy array with the features to be used to test sklearn models
    labels: numpy array with the real output 
    features_list: a list of names of each feature
    '''
    #feature selection
    selector = SelectPercentile(f_classif, percentile=20)
    selector.fit(features, labels)
    features_transformed = selector.transform(features)
    #filter names to be returned
    l_rtn = [x for x, t in zip(features_list, 
        list(selector.get_support())) if t]
    # pd.DataFrame(features_transformed, columns = l_labels2).head()
    #calculate scores
    scores = -np.log10(selector.pvalues_)
    scores /= scores.max()
    df_rtn = pd.DataFrame(pd.Series(dict(zip(features_list,scores))))
    df_rtn.columns = ["pValue_Max"]
    df_rtn = df_rtn.sort_values("pValue_Max", ascending=False)
    # df_rtn["different_from_zero"]=((df!=0).sum()*1./df.shape[0])


    return l_rtn, df_rtn
Example #7
def test_select_percentile_classif():
    # Test whether the relative univariate feature selection
    # gets the correct items in a simple classification problem
    # with the percentile heuristic
    X, y = make_classification(
        n_samples=200,
        n_features=20,
        n_informative=3,
        n_redundant=2,
        n_repeated=0,
        n_classes=8,
        n_clusters_per_class=1,
        flip_y=0.0,
        class_sep=10,
        shuffle=False,
        random_state=0,
    )

    univariate_filter = SelectPercentile(f_classif, percentile=25)
    X_r = univariate_filter.fit(X, y).transform(X)
    X_r2 = GenericUnivariateSelect(f_classif, mode="percentile", param=25).fit(X, y).transform(X)
    assert_array_equal(X_r, X_r2)
    support = univariate_filter.get_support()
    gtruth = np.zeros(20)
    gtruth[:5] = 1
    assert_array_equal(support, gtruth)
Example #8
def test_select_percentile_classif_sparse():
    # Test whether the relative univariate feature selection
    # gets the correct items in a simple classification problem
    # with the percentile heuristic
    X, y = make_classification(
        n_samples=200,
        n_features=20,
        n_informative=3,
        n_redundant=2,
        n_repeated=0,
        n_classes=8,
        n_clusters_per_class=1,
        flip_y=0.0,
        class_sep=10,
        shuffle=False,
        random_state=0,
    )
    X = sparse.csr_matrix(X)
    univariate_filter = SelectPercentile(f_classif, percentile=25)
    X_r = univariate_filter.fit(X, y).transform(X)
    X_r2 = GenericUnivariateSelect(f_classif, mode="percentile", param=25).fit(X, y).transform(X)
    assert_array_equal(X_r.toarray(), X_r2.toarray())
    support = univariate_filter.get_support()
    gtruth = np.zeros(20)
    gtruth[:5] = 1
    assert_array_equal(support, gtruth)

    X_r2inv = univariate_filter.inverse_transform(X_r2)
    assert_true(sparse.issparse(X_r2inv))
    support_mask = safe_mask(X_r2inv, support)
    assert_equal(X_r2inv.shape, X.shape)
    assert_array_equal(X_r2inv[:, support_mask].toarray(), X_r.toarray())
    # Check other columns are empty
    assert_equal(X_r2inv.getnnz(), X_r.getnnz())
Example #9
def test_boundary_case_ch2():
    # Test boundary case, and always aim to select 1 feature.
    X = np.array([[10, 20], [20, 20], [20, 30]])
    y = np.array([[1], [0], [0]])
    scores, pvalues = chi2(X, y)
    assert_array_almost_equal(scores, np.array([4.0, 0.71428571]))
    assert_array_almost_equal(pvalues, np.array([0.04550026, 0.39802472]))

    filter_fdr = SelectFdr(chi2, alpha=0.1)
    filter_fdr.fit(X, y)
    support_fdr = filter_fdr.get_support()
    assert_array_equal(support_fdr, np.array([True, False]))

    filter_kbest = SelectKBest(chi2, k=1)
    filter_kbest.fit(X, y)
    support_kbest = filter_kbest.get_support()
    assert_array_equal(support_kbest, np.array([True, False]))

    filter_percentile = SelectPercentile(chi2, percentile=50)
    filter_percentile.fit(X, y)
    support_percentile = filter_percentile.get_support()
    assert_array_equal(support_percentile, np.array([True, False]))

    filter_fpr = SelectFpr(chi2, alpha=0.1)
    filter_fpr.fit(X, y)
    support_fpr = filter_fpr.get_support()
    assert_array_equal(support_fpr, np.array([True, False]))

    filter_fwe = SelectFwe(chi2, alpha=0.1)
    filter_fwe.fit(X, y)
    support_fwe = filter_fwe.get_support()
    assert_array_equal(support_fwe, np.array([True, False]))
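The asserted chi2 scores can be reproduced by hand. A short illustrative sketch (not part of the original test) of the contingency computation that sklearn's chi2 performs on this toy data:

import numpy as np

X = np.array([[10, 20], [20, 20], [20, 30]], dtype=float)
y = np.array([1, 0, 0])

# Per-class sums of each feature are the observed frequencies.
observed = np.array([X[y == 1].sum(axis=0),   # class 1: [10, 20]
                     X[y == 0].sum(axis=0)])  # class 0: [40, 50]
# Expected frequencies: feature totals split by the class priors (1/3, 2/3).
priors = np.array([[1 / 3], [2 / 3]])
expected = priors * X.sum(axis=0)
chi2_scores = ((observed - expected) ** 2 / expected).sum(axis=0)
print(chi2_scores)  # [4.0, 0.71428571], matching the assertion above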
Example #10
def test(X, y):

    ###############################################################################
    # Univariate feature selection with F-test for feature scoring
    # We keep the 20% most significant features
    selector = SelectPercentile(f_regression, percentile=20)
    selector.fit(X, y)
    print([zero_based_index for zero_based_index in list(selector.get_support(indices=True))])
Example #11
 def feature_selection(self, mode='F'):

     print('Feature Selection...')
     print('Start:' + datetime.datetime.now().strftime('%Y-%m-%d %H:%M:%S'))

     X = self.train.copy()
     y = self.train_label['label'].values.copy()

     test = self.test.copy()

     # Compute raw scores for the chosen criterion.
     if mode.upper() == 'M':
         mi = mutual_info_classif(self.train.values, self.train_label['label'].values)
     elif mode.upper() == 'F':
         F, pval = f_classif(self.train.values, self.train_label['label'].values)
     elif mode.upper() == 'C':
         chi, pval = chi2(self.train.values, self.train_label['label'].values)

     features = self.train.columns.copy()

     fs_features = features.copy().tolist()

     if mode.upper() == 'M':
         fs_V = mi.copy().tolist()
     elif mode.upper() == 'F':
         fs_V = F.copy().tolist()
     elif mode.upper() == 'C':
         fs_V = chi.copy().tolist()

     if mode.upper() == 'M':
         selector = SelectPercentile(mutual_info_classif, percentile=80)
     elif mode.upper() == 'F':
         selector = SelectPercentile(f_classif, percentile=80)
     elif mode.upper() == 'C':
         selector = SelectPercentile(chi2, percentile=80)

     X_new = selector.fit_transform(X, y)

     selected = selector.get_support()

     # Drop the names of the features that were not selected.
     for i in range(len(features)):
         if not selected[i]:
             t = features[i]
             fs_features.remove(t)

     fs_V = np.array(fs_V)
     fs_features = np.array(fs_features)

     self.train = pd.DataFrame(X_new, columns=fs_features.tolist())
     self.test = test[fs_features]

     self.fs_features = fs_features

     feas = pd.DataFrame()
     feas['feature'] = fs_features

     print('End:' + datetime.datetime.now().strftime('%Y-%m-%d %H:%M:%S'))

     return X_new, feas
Example #12
def main(path,filename):

	#batchs = ['histogramaByN','histogramaColor','patrones2x2ByN','patrones3x3ByN','patronesCirculaesByN_2_5','patronesCirculaesByN_2_9','patronesCirculaesByN_3_9','patronesCirculaesByN_5_9','patronesCirculaesByN_3_5','patronesCirculaesByN_6_12','patronesCirculaesByN_8_12','patronesCirculaesByN_10_12']
	batchs = ['patronesCirculaesByN_2_5','patronesCirculaesByN_2_9','patronesCirculaesByN_3_9','patronesCirculaesByN_5_9','patronesCirculaesByN_3_5','patronesCirculaesByN_6_12','patronesCirculaesByN_8_12','patronesCirculaesByN_10_12']
	#batchs = ['histogramaByN','histogramaColor','patrones2x2ByN','patronesCircularesByN_2_5','patronesCircularesByN_2_9','patronesCircularesByN_3_9','patronesCircularesByN_5_9','patronesCircularesByN_3_5']
	#batchs = ['patrones2x2ByN','patrones3x3ByN','patronesCirculaesByN_2_5','patronesCirculaesByN_3_5']
	percentil = 20
	X = []
	y = []
	lens = []
	load_batch(y,path,'clases',filename)
	y = [j for i in y for j in i]
	for batch in batchs:
		load_batch(X,path,batch,filename)
		lens.append(len(X[0]))

	# Number of attributes contributed by each batch (lens holds cumulative end offsets).
	total = [lens[0]]
	for i in range(1,len(lens)):
		total.append(lens[i]-lens[i-1])
	print('Number of attributes per batch')
	print(total)
	sp = SelectPercentile(chi2, percentile=percentil)
	X_new = sp.fit_transform(X, y)
	sup = sp.get_support(True)
	#print(sup)
	res = [0]* len(batchs)
	for i in sup:
		for j in range(0,len(lens)):
			if i < lens[j]:  # strict: index lens[j]-1 is the last of batch j
				res[j] += 1
				break
	porcentajes = []
	for i in range(0,len(lens)):
		porcentajes.append((1.0*res[i])/total[i])
	print('Number of variables selected in the ' + str(percentil) + ' univariate percentile')
	print(res)

	print('Percentage of variables selected in the ' + str(percentil) + ' univariate percentile')
	print(porcentajes)

	clf = ExtraTreesClassifier()
	clf = clf.fit(X, y)
	fi = clf.feature_importances_

	res2 = [0]* len(batchs)
	for i in range(0,len(fi)):
		for j in range(0,len(lens)):
			if i < lens[j]:
				res2[j] += fi[i]
				break
	print('Cumulative percentage importance of the multivariate selection')
	print(res2)
	porcentajes2 = []
	for i in range(0,len(lens)):
		porcentajes2.append((1.0*res2[i])/total[i])

	print('Average percentage importance per variable of the multivariate selection')
	print(porcentajes2)
Example #13
def train_type_model():
    globals.read_configuration('config.cfg')
    parser = globals.get_parser()
    scorer_globals.init()

    datasets = ["webquestions_split_train", ]

    parameters = translator.TranslatorParameters()
    parameters.require_relation_match = False
    parameters.restrict_answer_type = False

    feature_extractor = FeatureExtractor(False, False, n_gram_types_features=True)
    features = []
    labels = []
    for dataset in datasets:
        queries = get_evaluated_queries(dataset, True, parameters)
        for index, query in enumerate(queries):
            tokens = [token.lemma for token in parser.parse(query.utterance).tokens]
            n_grams = get_grams_feats(tokens)

            answer_entities = [mid for answer in query.target_result
                               for mid in KBEntity.get_entityid_by_name(answer, keep_most_triples=True)]
            correct_notable_types = set(filter(lambda x: x,
                                               [KBEntity.get_notable_type(entity_mid)
                                                for entity_mid in answer_entities]))

            other_notable_types = set()
            for candidate in query.eval_candidates:
                entities = [mid for entity_name in candidate.prediction
                            for mid in KBEntity.get_entityid_by_name(entity_name, keep_most_triples=True)]
                other_notable_types.update(set([KBEntity.get_notable_type(entity_mid) for entity_mid in entities]))
            incorrect_notable_types = other_notable_types.difference(correct_notable_types)

            for type in correct_notable_types.union(incorrect_notable_types):
                if type in correct_notable_types:
                    labels.append(1)
                else:
                    labels.append(0)
                features.append(feature_extractor.extract_ngram_features(n_grams, [type, ], "type"))

    with open("type_model_data.pickle", 'wb') as out:
        pickle.dump((features, labels), out)

    label_encoder = LabelEncoder()
    labels = label_encoder.fit_transform(labels)
    vec = DictVectorizer(sparse=True)
    X = vec.fit_transform(features)
    feature_selector = SelectPercentile(chi2, percentile=5).fit(X, labels)
    vec.restrict(feature_selector.get_support())
    X = feature_selector.transform(X)
    type_scorer = SGDClassifier(loss='log', class_weight='auto',
                                n_iter=1000,
                                alpha=1.0,
                                random_state=999,
                                verbose=5)
    type_scorer.fit(X, labels)
    with open("type-model.pickle", 'wb') as out:
        pickle.dump((vec, type_scorer), out)
Example #14
def selectFeatures(Model, X, y):
    model = Model()
    fsel = SelectPercentile(score_func=f_classif, percentile=5)
    fsel.fit(X, y)
    arr = fsel.get_support()
    print("features: ", np.where(arr)[0])
    model.fit(X, y)  # fit before predicting
    plt.hist(model.predict(X))
    plt.hist(y)
    plt.show()
Example #15
def featureSelection(reduced_features,labels,clnd_features,percentile,n_components,results=False):
    """
        Parameters:
            reduced_features = Unique feature names in python list after dropping non-numeric
            features.
            labels = ground truth labels for the data points.
            clnd_features = data point features in numpy array format corresponding
            to the labels.
            percentile = the parameter for the SelectPercentile method;
            between 0 and 100.
            n_components = the n_components for the pca.
            results = False returns python list of selected features. If True
            returns the metrics of the feature selectors (F-statistic, and p-values from
            f_classif) and the top 'n' pca component variance measurements.

        Output:
           Resulting list of features from the SelectPercentile function and the
           number of principal components used. If results = True then the
           statistics of the SelectPercentile method using f_classif will be printed.
           In addition the explained variance of the top 'n' principal components will
           also be printed.
    """
    from sklearn.feature_selection import SelectPercentile, f_classif
    from sklearn.decomposition import PCA 
    from itertools import compress
    
    selector = SelectPercentile(f_classif, percentile=percentile)
    selector.fit_transform(clnd_features, labels)
    
    pca = PCA(n_components = n_components)
    pca.fit_transform(clnd_features, labels)
    
    if results == True:
    
        f_stat = sorted(zip(reduced_features[1:],f_classif(clnd_features,labels)[0]),\
                         key = lambda x: x[1], reverse=True)
        
        p_vals = sorted(zip(reduced_features[1:],f_classif(clnd_features,labels)[1]),\
                        key = lambda x: x[1])
  
        expl_var = pca.explained_variance_ratio_
        
        return f_stat,p_vals,expl_var
    else:
        ## return a boolean index of the retained features 
        retained_features = selector.get_support()
        
        ## index the original features by the boolean index of top x% features 
        ## return a python list of the features to be used for training 
        features_list = list(compress(reduced_features[1:],retained_features))
    
        ## add back in the 'poi' to the first position in the final features list
        features_list.insert(0,'poi')
        
        return features_list
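A minimal, hypothetical call of the helper above (feature names and arrays are illustrative, not from the original project):

import numpy as np

# 'poi' first, then the numeric feature names, mirroring the project's layout.
reduced_features = ['poi', 'salary', 'bonus', 'total_stock_value', 'expenses']
rng = np.random.RandomState(0)
clnd_features = np.abs(rng.randn(50, 4))  # 50 data points, 4 numeric features
labels = rng.randint(0, 2, size=50)

# Returns ['poi', ...] plus the feature names in the top 50% by F-score.
features_list = featureSelection(reduced_features, labels, clnd_features,
                                 percentile=50, n_components=2)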
Example #16
def univariate_feature_selection(dataset, features):
	# load the dataset
	spreadsheet = Spreadsheet('../../Downloads/ip/project data.xlsx')
	data = Data(spreadsheet)
	targets = data.targets


	X = dataset
	y = data.targets


	###############################################################################
	plt.figure(1)
	plt.clf()

	X_indices = np.arange(X.shape[-1])

	###############################################################################
	# Univariate feature selection with F-test for feature scoring
	# We use the default selection function: the 10% most significant features
	selector = SelectPercentile(f_classif, percentile=10)
	selector.fit(X, y)
	scores = -np.log10(selector.pvalues_)
	scores /= scores.max()
	plt.bar(X_indices - .45, scores, width=.2,
	        label=r'Univariate score ($-Log(p_{value})$)', color='g')

	###############################################################################
	# Compare to the weights of an SVM
	clf = svm.SVC(kernel='linear')
	clf.fit(X, y)

	svm_weights = (clf.coef_ ** 2).sum(axis=0)
	svm_weights /= svm_weights.max()

	plt.bar(X_indices - .25, svm_weights, width=.2, label='SVM weight', color='r')

	clf_selected = svm.SVC(kernel='linear')
	clf_selected.fit(selector.transform(X), y)

	svm_weights_selected = (clf_selected.coef_ ** 2).sum(axis=0)
	svm_weights_selected /= svm_weights_selected.max()

	plt.bar(X_indices[selector.get_support()] - .05, svm_weights_selected,
	        width=.2, label='SVM weights after selection', color='b')


	x = np.arange(0, len(features))
	plt.title("Comparing feature selection")
	plt.xlabel('Feature number')
	plt.xticks(x, features, rotation=45)
	plt.yticks(())
	#plt.axis('tight')
	plt.legend(loc='upper right')
	plt.show()
Example #17
def PredictionScore(X_train, X_test, y_train, y_test, header):

    outFile = open('output.txt', 'a')

    from sklearn.svm import SVC
    from sklearn.feature_extraction import DictVectorizer
    vec = DictVectorizer()


    names = ["Linear SVM","Nearest Neighbors",  "RBF SVM", "Decision Tree",
     "Random Forest", "AdaBoost", "Naive Bayes"]
    # names = ["Linear SVM","Linear SVM","Linear SVM","Linear SVM"]

    classifiers = [
    SVC(kernel="linear", C=0.025),
    KNeighborsClassifier(3),
    SVC(gamma=2, C=1),
    DecisionTreeClassifier(max_depth=5),
    RandomForestClassifier(max_depth=5, n_estimators=10, max_features=1),
    AdaBoostClassifier(),
    GaussianNB()]
    # classifiers = [
    # SVC(kernel="linear", C=0.025),
    # SVC(kernel="linear", C=0.02),
    # SVC(kernel="linear", C=0.01)
    # ]

    for name, clf in zip(names, classifiers):
        try:
            accuracy = 0.0

            vec = DictVectorizer()
            fit = vec.fit(X_train)
            select = SelectPercentile(score_func=chi2,percentile=10).fit(fit.transform(X_train),y_train)
            fit.restrict(select.get_support())
            X_train_counts = fit.transform(X_train)
            X_test_counts = fit.transform(X_test)
            # clf = SVC(kernel="linear", C=0.025)
            try:
                clf.fit(X_train_counts.toarray(), y_train)
                #predict = clf.predict(X_test_counts.toarray())
                accuracy += clf.score(X_test_counts.toarray(),y_test)
                # coef = clf._get_coef()
               # print(np.argsort(coef)[-20:])
                #for i in range(0,len(X_test)):
                    #print (X_test[i]['ID']+"\t"+y_test[i]+"\t"+predict[i])
            except BaseException as b:
                    print (b)
            print (name+"\t"+"\t"+str(accuracy))
            outFile.write(name+"\t"+"\t"+str(accuracy)+"\n")
        except BaseException as b:
            print (b)
    outFile.close()
Example #18
def fTestFeatureSelection(train_files, train_labels, test_files, test_labels):
    design_matrix, features, _ = vectorizeTrain(train_files, None, 0, False, 0, None)
    classifier = LogisticRegression()
    for p in range(10):
        percentile = 100 - p * 10
        print('Selecting {0}% of features'.format(percentile))
        feat_sel = SelectPercentile(f_regression, percentile=percentile)
        X_sel = feat_sel.fit_transform(design_matrix, train_labels)
        f_inds = feat_sel.get_support(indices=True)
        print('Using {0} features'.format(len(f_inds)))
        classifier.fit(X_sel, train_labels)
        test(test_files, test_labels, classifier, [features[d] for d in f_inds], None, 0, False, 0, None)
Example #19
def test_select_percentile_regression_full():
    # Test whether the relative univariate feature selection
    # selects all features when '100%' is asked.
    X, y = make_regression(n_samples=200, n_features=20, n_informative=5, shuffle=False, random_state=0)

    univariate_filter = SelectPercentile(f_regression, percentile=100)
    X_r = univariate_filter.fit(X, y).transform(X)
    assert_best_scores_kept(univariate_filter)
    X_r2 = GenericUnivariateSelect(f_regression, mode="percentile", param=100).fit(X, y).transform(X)
    assert_array_equal(X_r, X_r2)
    support = univariate_filter.get_support()
    gtruth = np.ones(20)
    assert_array_equal(support, gtruth)
Example #20
def select_with_chi2_and_f_classif(train):
  p = 3
  train = train.drop('ID', axis=1)
  train_y = train['TARGET']
  train_X = train.drop('TARGET', axis=1)

  X_bin = Binarizer().fit_transform(scale(train_X))
  selectChi2 = SelectPercentile(chi2, percentile=p).fit(X_bin, train_y)
  selectF_classif = SelectPercentile(f_classif, percentile=p).fit(train_X, train_y)

  chi2_selected = selectChi2.get_support()
  chi2_selected_features = [f for i, f in enumerate(train_X.columns) if chi2_selected[i]]
  print('Chi2 selected {} features {}.'.format(chi2_selected.sum(),
     chi2_selected_features))
  f_classif_selected = selectF_classif.get_support()
  f_classif_selected_features = [f for i, f in enumerate(train_X.columns) if f_classif_selected[i]]
  print('F_classif selected {} features {}.'.format(f_classif_selected.sum(),
     f_classif_selected_features))
  selected = chi2_selected & f_classif_selected
  print('Chi2 & F_classif selected {} features'.format(selected.sum()))
  features = [f for f, s in zip(train_X.columns, selected) if s]
  return features
Example #21
 def getFeatures(self, number_of_features=10):
  # X = self.training.iloc[:,:,-1]
  y = self.training['TARGET']
  X = self.training.drop(['TARGET'], axis=1)
  # Select features according to the k highest scores.
  # selectFeatures = SelectKBest(chi2, k=number_of_features)
  # Select the best `number_of_features` percentile
  # We can use other score functions as well, such as chi2
  selectFeatures = SelectPercentile(f_classif, percentile=number_of_features)
  selectFeatures.fit(X, y)
  # X_select = selectFeatures.transform(X)
  features = selectFeatures.get_support(indices=True)
  # print("Best feature: " + str(features[0]))
  return features
Example #22
def test_select_percentile_4():
    """Ensure that the TPOT select percentile outputs the same result as sklearn select percentile when 0 < percentile < 100"""
    tpot_obj = TPOT()
    non_feature_columns = ['class', 'group', 'guess']
    training_features = training_testing_data.loc[training_testing_data['group'] == 'training'].drop(non_feature_columns, axis=1)
    training_class_vals = training_testing_data.loc[training_testing_data['group'] == 'training', 'class'].values

    with warnings.catch_warnings():
        warnings.simplefilter('ignore', category=UserWarning)
        selector = SelectPercentile(f_classif, percentile=42)
        selector.fit(training_features, training_class_vals)
        mask = selector.get_support(True)
    mask_cols = list(training_features.iloc[:, mask].columns) + non_feature_columns

    assert np.array_equal(tpot_obj._select_percentile(training_testing_data, 42), training_testing_data[mask_cols])
Example #23
def get_top_chi2_candidate_ngrams(queries, f_extractor, percentile):
    """Get top ngrams features according to chi2.
    """
    ngrams_dict = dict()
    features, labels = construct_examples(queries, f_extractor)
    label_encoder = LabelEncoder()
    labels = label_encoder.fit_transform(labels)
    vec = DictVectorizer(sparse=True)
    X = vec.fit_transform(features)
    # ch2 = SelectKBest(chi2, k=n_features)
    ch2 = SelectPercentile(chi2, percentile=percentile)
    ch2.fit(X, labels)
    indices = ch2.get_support(indices=True)
    for i in indices:
        ngrams_dict[vec.feature_names_[i]] = 1
    return ngrams_dict
Example #24
    def featureSelection(self,X,y):
        '''
         Univariate feature selection with the chi2 test, keeping the top 10% of features.
        :param:
         a. X the training matrix.
         b. y the labels column corresponding to X.
        :return:
            a. The mask of the top 10% features.
            b. The transformed training matrix
        '''

        print(np.shape(X))

        selector = SelectPercentile(chi2, percentile=10)
        X_new = selector.fit_transform(X, y)

        return X_new, selector.get_support()
Example #25
def feature_reduction_percent(percentage, train_data_df, train_labels_df):
    # TODO: everything
    X = train_data_df.iloc[:,1:]
    y = np.array(train_labels_df.iloc[:,1])

    # keep the top `percentage` percent of features by chi2 score.
    fsp = SelectPercentile(chi2, percentile=percentage)

    X_reduced = fsp.fit_transform(X,y)
    selected_names = fsp.get_support(indices=True)
    selected_names = selected_names + 1
    data_trimmed = train_data_df.iloc[:,selected_names]
    data_fnames = pd.DataFrame(train_data_df['filename'])
    data_reduced = data_fnames.join(data_trimmed)
    data_reduced.to_csv('data/sorted-train-malware-features-asm-50percent.csv', index=False)

    return
Example #26
def determine_percentile():
    max_snr = list()
    for i in np.arange(5, 20):
        select = SelectPercentile(chi2, percentile=i)
        select.fit(np.abs(traces), np.reshape(Y, (Queries,)))
        indexes = select.get_support(True)
        print(indexes)
        filter_traces = np.zeros(np.shape(traces), np.complex128)
        filter_traces[:, indexes] = traces[:, indexes]
        filter_traces = np.fft.ifft(filter_traces, n=SAMPLES, axis=1)
        snr_t = np.abs(SNR.SNR(filter_traces, Y, 256, SAMPLES, np.complex128))
        max_snr.append(np.max(snr_t[300:1300]))
    fig, ax = plt.subplots()
    ax.plot(np.arange(5, 20), max_snr)  # match the percentile range scanned above
    ax.set_title("max SNR vs feature selection FFT percentile")
    ax.set_xlabel("percentile")
    plt.show()
Example #27
def best_features(train, test, perc):
    temp_trans = OrdinalEncoder(dtype='int')
    train[['protocol_type', 'service', 'flag',
           'target']] = temp_trans.fit_transform(
               train[['protocol_type', 'service', 'flag', 'target']])
    trans = SelectPercentile(f_classif, percentile=perc)
    trans.fit(train.drop('target', axis='columns'), train['target'])
    train[['protocol_type', 'service', 'flag',
           'target']] = temp_trans.inverse_transform(
               train[['protocol_type', 'service', 'flag', 'target']])
    support_mask = trans.get_support()  # True for columns that were kept
    bad_features = []
    for i in range(len(support_mask)):
        if not support_mask[i]:
            bad_features.append(train.columns[i])
    train.drop(bad_features, axis='columns', inplace=True)
    test.drop(bad_features, axis='columns', inplace=True)
    return train, test
Example #28
def percentile_k_features(df, k=20):
    y = df['SalePrice']
    X = df.loc[:, df.columns != 'SalePrice']

    kpsec = SelectPercentile(score_func=f_regression, percentile=k)
    percentileCols = kpsec.fit_transform(X, y)
    getIndices = np.asarray(kpsec.get_support(indices=True))
    scores = kpsec.scores_
    sorted_scores = np.argsort(scores)[::-1]

    list_cols = []
    for ind in sorted_scores:
        if (ind in getIndices):
            list_cols.append(X.columns[ind])

    #print(list_cols)

    return list_cols
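A quick, hypothetical call of this helper on synthetic data ('SalePrice' is the only column name the function assumes):

import numpy as np
import pandas as pd

rng = np.random.RandomState(0)
df_demo = pd.DataFrame(rng.rand(100, 10),
                       columns=['c{}'.format(i) for i in range(10)])
df_demo['SalePrice'] = 3 * df_demo['c0'] + rng.rand(100)

# With k=20, SelectPercentile keeps the top 2 of the 10 candidate columns.
print(percentile_k_features(df_demo, k=20))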
Example #29
def get_combined_separate_fsets(feature_sets, fs_fn='pct', ptile=10, nFeatures=5, score_fn=f_classif):
    df_lst = []
    for fset_name, df in feature_sets.items():
        X_train = df[df.partition == 'train'].drop(['partition', 'fatality_ind'], axis=1)
        y_train = df[df.partition == 'train'].fatality_ind
        df_X = df.drop(['partition', 'fatality_ind'], axis=1)
        if fs_fn == 'pct':
            featureSelector = SelectPercentile(score_func=score_fn, percentile=ptile)
        else:
            featureSelector = SelectKBest(score_func=score_fn, k=nFeatures)
        featureSelector.fit(X_train, y_train)
        fs = featureSelector.transform(df.drop(['partition', 'fatality_ind'], axis=1))
        cols_fs = df_X.columns[list(featureSelector.get_support(indices=True))]
        cols_fs_ref = [fset_name + ' ' + c for c in cols_fs]
        df_fs = pd.DataFrame(fs, index=df_X.index, columns=cols_fs_ref)
        df_lst.append(df_fs)
    df_comb = df[['partition', 'fatality_ind']].join(pd.concat(df_lst, axis=1))
    return df_comb
Example #30
def test_select_percentile_regression_full():
    # Test whether the relative univariate feature selection
    # selects all features when '100%' is asked.
    X, y = make_regression(n_samples=200,
                           n_features=20,
                           n_informative=5,
                           shuffle=False,
                           random_state=0)

    univariate_filter = SelectPercentile(f_regression, percentile=100)
    X_r = univariate_filter.fit(X, y).transform(X)
    assert_best_scores_kept(univariate_filter)
    X_r2 = GenericUnivariateSelect(f_regression, mode='percentile',
                                   param=100).fit(X, y).transform(X)
    assert_array_equal(X_r, X_r2)
    support = univariate_filter.get_support()
    gtruth = np.ones(20)
    assert_array_equal(support, gtruth)
Example #31
def addTagsMatrix(df, **params):
    '''
    Convert the user tags into a sparse matrix
    '''
    startTime = datetime.now()
    cv = CountVectorizer(min_df=0.001, max_df=0.8, binary=True)
    cv.fit(df['user_tags'].dropna())
    tagSelecter = SelectPercentile(chi2, percentile=10)
    tagSelecter.fit(cv.transform(df[df.flag >= 0]['user_tags'].fillna("")),
                    df[df.flag >= 0]['click'])
    tagsMatrix = tagSelecter.transform(cv.transform(
        df['user_tags'].fillna("")))
    tagsName = np.array(cv.get_feature_names())[tagSelecter.get_support()]
    tempDf = pd.DataFrame(tagsMatrix.toarray(),
                          columns=['tag_' + str(x) for x in tagsName],
                          index=df.index)
    df = pd.concat([df, tempDf], axis=1)
    print('tag matrix time:', datetime.now() - startTime)
    return df
Example #32
def test_select_percentile_classif():
    # Test whether the relative univariate feature selection
    # gets the correct items in a simple classification problem
    # with the percentile heuristic
    X, y = make_classification(n_samples=200, n_features=20,
                               n_informative=3, n_redundant=2,
                               n_repeated=0, n_classes=8,
                               n_clusters_per_class=1, flip_y=0.0,
                               class_sep=10, shuffle=False, random_state=0)

    univariate_filter = SelectPercentile(f_classif, percentile=25)
    X_r = univariate_filter.fit(X, y).transform(X)
    X_r2 = GenericUnivariateSelect(f_classif, mode='percentile',
                                   param=25).fit(X, y).transform(X)
    assert_array_equal(X_r, X_r2)
    support = univariate_filter.get_support()
    gtruth = np.zeros(20)
    gtruth[:5] = 1
    assert_array_equal(support, gtruth)
Example #33
def data():
    d1 = inputParser("data-2018-01-14-neworleans.csv")
    d2 = inputParser("data-2018-04-01-birmingham.csv")
    splitData(d1)
    splitData(d2)

    eliminator = SelectPercentile(mutual_info_classif, percentile=30)
    #eliminator = SelectKBest(mutual_info_classif, k=1)
    newDataMat = eliminator.fit_transform(dataMat, classLabel)
    used = (eliminator.get_support())
    output = "["
    for x in range(len(used)):
        if used[x]:
            output += colLabel[x + len(rowId[0])] + ", "
    print("factors used:")
    print(output[0:-2] + "]")
    print()
    filter(used)
    return [colLabelRelevent, dataRelevant, classLabel]
Example #34
def select_features(X, y, keep_percentage, features_outfile=None):
    feature_names = X.columns
    feature_finder = SelectPercentile(chi2, percentile=keep_percentage)
    X = feature_finder.fit_transform(X, y)
    support = feature_finder.get_support()
    feature_names = feature_names[support]
    if features_outfile is not None:
        scores = feature_finder.scores_
        pvals = feature_finder.pvalues_
        feature_scores = scores[support]
        feature_pvals = pvals[support]
        feature_data = zip(feature_names, feature_scores, feature_pvals)
        # Sort features by score.
        ranked = sorted(feature_data, key=lambda x: x[1], reverse=True)
        with open(features_outfile, 'w') as outF:
            for feat in ranked:
                outF.write("{} :: {:g} :: {:g}\n".format(*feat))
        print("Saved features to {}".format(features_outfile))
    return X, feature_names
Example #35
    def get_drop_columns_on_percentile_based_feature_selection(
            train, percent, all_cols):
        print('Percentile based feature selection:', percent)
        X = train.drop(["isFraud", 'TransactionID', 'TransactionDT'], axis=1)
        y = train["isFraud"]

        selector_f = SelectPercentile(f_classif, percentile=percent)
        X_best = selector_f.fit_transform(X, y)
        support = np.asarray(selector_f.get_support())
        #top 20% features
        features = np.asarray(X.columns.values)
        features_with_support = features[support]
        #top 20% f-scores
        fscores = np.asarray(selector_f.scores_)
        fscores_with_support = fscores[support]
        #top 20% p-values
        pvalues = np.asarray(selector_f.pvalues_)
        pvalues_with_support = pvalues[support]

        top_features = pd.DataFrame(
            {
                "F-Score": fscores_with_support,
                "P-Value": pvalues_with_support
            },
            index=features_with_support)
        print(
            "Top features best associated with y\n Number of features",
            len(features_with_support))
        top_features.sort_values(by='P-Value', ascending=True, inplace=True)
        print(top_features)
        print('Done!')
        final_features = top_features.index.values.tolist()
        droppable_cols = []
        for j in range(len(all_cols)):
            if all_cols[j] not in final_features:
                droppable_cols.append(all_cols[j])

        print(len(droppable_cols))
        print("Columns to drop:\n", droppable_cols)
        return droppable_cols
Example #36
    def fit(self, X, y):
        '''
        Inputs:
        -------
        X: a dataframe
        y: a series
        '''
        relevance = SelectPercentile(f_classif, percentile=self.percentile)
        feature_relevant = relevance.fit_transform(X, y.values.ravel())

        idx_most_relevant = relevance.get_support()
        names_most_relevant = X.columns[idx_most_relevant]

        scores = -np.log10(relevance.pvalues_[idx_most_relevant])
        scores /= scores.max()

        self.scores = scores
        self.relevant_features = names_most_relevant

Example #37
def percentile_k_features(df, k=20):
    X = df.drop('SalePrice', axis=1)
    y = df['SalePrice']
    selector = SelectPercentile(f_regression, percentile=k).fit(X, y)

    ids = selector.get_support(indices=True)
    scores = selector.scores_

    # Collect the selected columns in descending F-score order, e.g. for the
    # house-price data: ['OverallQual', 'GrLivArea', 'GarageCars', 'GarageArea',
    # 'TotalBsmtSF', '1stFlrSF', 'FullBath']
    list_cols = []
    for ind in np.argsort(scores)[::-1]:
        if ind in ids:
            list_cols.append(X.columns[ind])

    return list_cols
Example #38
def test_select_percentile_4():
    """Ensure that the TPOT select percentile outputs the same result as sklearn select percentile when 0 < percentile < 100"""
    tpot_obj = TPOT()
    non_feature_columns = ['class', 'group', 'guess']
    training_features = training_testing_data.loc[
        training_testing_data['group'] == 'training'].drop(non_feature_columns,
                                                           axis=1)
    training_class_vals = training_testing_data.loc[
        training_testing_data['group'] == 'training', 'class'].values

    with warnings.catch_warnings():
        warnings.simplefilter('ignore', category=UserWarning)
        selector = SelectPercentile(f_classif, percentile=42)
        selector.fit(training_features, training_class_vals)
        mask = selector.get_support(True)
    mask_cols = list(
        training_features.iloc[:, mask].columns) + non_feature_columns

    assert np.array_equal(
        tpot_obj._select_percentile(training_testing_data, 42),
        training_testing_data[mask_cols])
Example #39
def test_select_percentile_regression():
    """
    Test whether the relative univariate feature selection
    gets the correct items in a simple regression problem
    with the percentile heuristic
    """
    X, y = make_regression(n_samples=200, n_features=20,
                           n_informative=5, shuffle=False, random_state=0)

    univariate_filter = SelectPercentile(f_regression, percentile=25)
    X_r = univariate_filter.fit(X, y).transform(X)
    X_r2 = GenericUnivariateSelect(f_regression, mode='percentile',
                    param=25).fit(X, y).transform(X)
    assert_array_equal(X_r, X_r2)
    support = univariate_filter.get_support()
    gtruth = np.zeros(20)
    gtruth[:5] = 1
    assert_array_equal(support, gtruth)
    X_2 = X.copy()
    X_2[:, np.logical_not(support)] = 0
    assert_array_equal(X_2, univariate_filter.inverse_transform(X_r))
Example #40
File: tpot.py  Project: vsolano/tpot
    def _select_percentile(self, input_df, percentile):
        """Uses Scikit-learn's SelectPercentile feature selection to learn the subset of features that belong in the highest `percentile`
        according to a given scoring function

        Parameters
        ----------
        input_df: pandas.DataFrame {n_samples, n_features+['class', 'group', 'guess']}
            Input DataFrame to perform feature selection on
        percentile: int
            The features that belong in the top percentile to keep from the original set of features in the training data

        Returns
        -------
        subsetted_df: pandas.DataFrame {n_samples, n_filtered_features + ['guess', 'group', 'class']}
            Returns a DataFrame containing the best features in the given `percentile`

        """
        training_features = input_df.loc[input_df['group'] == 'training'].drop(['class', 'group', 'guess'], axis=1)
        training_class_vals = input_df.loc[input_df['group'] == 'training', 'class'].values

        if percentile < 0:
            percentile = 0
        elif percentile > 100:
            percentile = 100

        if len(training_features.columns.values) == 0:
            return input_df.copy()

        with warnings.catch_warnings():
            # Ignore warnings about constant features
            warnings.simplefilter('ignore', category=UserWarning)

            selector = SelectPercentile(f_classif, percentile=percentile)
            selector.fit(training_features, training_class_vals)
            mask = selector.get_support(True)

        mask_cols = list(training_features.iloc[:, mask].columns) + ['guess', 'class', 'group']
        return input_df[mask_cols].copy()
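A hypothetical sketch of the DataFrame contract this method expects (column layout per the docstring above; tpot_obj stands in for a constructed TPOT instance):

import numpy as np
import pandas as pd

rng = np.random.RandomState(0)
input_df = pd.DataFrame(rng.rand(60, 5),
                        columns=['x{}'.format(i) for i in range(5)])
input_df['class'] = rng.randint(0, 2, size=60)   # target labels
input_df['group'] = ['training'] * 40 + ['testing'] * 20
input_df['guess'] = 0                            # placeholder predictions

# subset_df = tpot_obj._select_percentile(input_df, 40)  # keep top 40% features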
Example #41
def PreProcessing(train, test, relation, prun_off_threshold, percent, ex_fac):
    #including instance weighting, necessary data preparation
    # assign label for each kind of relation, compose balanced data set
    train, instance_weight = Data_preparation(train, True, ex_fac, relation)
    test = Data_preparation(test, False, ex_fac, relation)
    #print "data preparation done"

    # convert feature list to feature string, for both baseline feature and my own feature.
    X_train, y_train = [x["features"]
                        for x in train], [x["Sense"] for x in train]
    X_test, y_test = [x["features"] for x in test], [
        x["clf_label"] for x in test
    ]  # y_test is actually useless, I put it here just to fit the nltk data structure
    #print "feature string done"

    vectorizer = CountVectorizer(min_df=prun_off_threshold,
                                 token_pattern='[^ ]{1,}',
                                 binary=True,
                                 lowercase=False)
    vectorizer.fit_transform(X_train)
    X_train_vec = vectorizer.transform(X_train)
    X_test_vec = vectorizer.transform(X_test)
    #print "data transformation done"

    #here, we can do some feature selection
    selection = SelectPercentile(chi2, percentile=percent).fit(X_train_vec, y_train)
    X_train_selected = selection.transform(X_train_vec)
    X_test_selected = selection.transform(X_test_vec)
    selected_index = selection.get_support(True)
    #print "feature selection done"

    feature_list = vectorizer.get_feature_names()
    selected_feature_list = [feature_list[x] for x in selected_index]

    #print "original data scale: "+ str(X_train_vec.shape)
    #print "feature dimension after selection: "+ str(len(selected_feature_list))
    #155570
    return X_train_selected, y_train, X_test_selected, instance_weight, X_train, X_test, y_test, selected_feature_list, test
Example #42
def tfidf_features(data,
                   y,
                   keep_percentage=100,
                   ngram_range=(1, 1),
                   binary=False):
    sentences = data["SENTENCE"].values
    y = y.ravel()
    vectorizer = TfidfVectorizer(ngram_range=ngram_range,
                                 stop_words="english",
                                 token_pattern=r'(?u)\b[\w-][\w-]+\b',
                                 binary=binary)
    X = vectorizer.fit_transform(sentences).toarray()
    feature_names = np.array(vectorizer.get_feature_names())
    print(X.shape)
    if keep_percentage < 100:
        feature_finder = SelectPercentile(f_classif,
                                          percentile=keep_percentage)
        X = feature_finder.fit_transform(X, y)
        print("After feature selection: {}".format(X.shape))
        support = feature_finder.get_support()
        feature_names = feature_names[support]
    feature_names = ['"{}"'.format(f) for f in feature_names]
    return X, feature_names
Example #43
def final_feature_set_reduction(reduced_feature_file_name, final_file_name,
                                train_label_file):
    sorted_train_data = pd.read_csv('data/' + reduced_feature_file_name)
    y = []
    X = sorted_train_data.iloc[:, 1:]
    fip = open('data/' + train_label_file)
    lines = fip.readlines()
    for line in lines:
        line = line.rstrip()
        y.append(int(line))

    print("Final feature reduction: {:s}".format(reduced_feature_file_name))
    print("Training labels length: {:d}".format(len(y)))
    print("X Feature set dimensionality: {:d} {:d}".format(
        X.shape[0], X.shape[1]))
    print("In Feature set dimensionality: {:d} {:d}".format(
        sorted_train_data.shape[0], sorted_train_data.shape[1]))

    # keep the top 10 percent of features by chi2 score, from ~1000 -> ~100 features
    fsp = SelectPercentile(chi2, percentile=10)
    X_new_10 = fsp.fit_transform(X, y)
    print("Final 10 Percent Dimensions: {:d} {:d}".format(
        X_new_10.shape[0], X_new_10.shape[1]))

    selected_names = fsp.get_support(indices=True)
    selected_names = selected_names + 1

    #data_reduced = sorted_train_data.iloc[:,[0] + selected_names]
    #Does not put the file_name as the first column.
    data_trimmed = sorted_train_data.iloc[:, selected_names]
    data_fnames = pd.DataFrame(sorted_train_data['file_name'])
    data_reduced = data_fnames.join(data_trimmed)

    data_reduced.to_csv('data/' + final_file_name, index=False)
    print("Completed reduction in {:s}".format(final_file_name))

    return
Example #44
class UnivariateFeatureSelection:
    def __init__(self, n_features, problem_type, scoring):
        if problem_type == "classification":
            valid_scoring = {
                "f_classifs": f_classif,
                "chi2": chi2,
                "mutual_info_classif": mutual_info_classif
            }
        elif problem_type == "regression":
            valid_scoring = {
                "f_regression": f_regression,
                "mutual_info_regression": mutual_info_regression
            }

        if scoring not in valid_scoring:
            raise Exception(
                f"Invalid scoring. Options are: {valid_scoring.keys()}")

        if isinstance(n_features, int):
            self.selection = SelectKBest(valid_scoring[scoring], k=n_features)
        elif isinstance(n_features, float):
            self.selection = SelectPercentile(valid_scoring[scoring],
                                              percentile=int(n_features * 100))
        else:
            raise Exception("Invalid n_features. It should be float or int.")

    def fit(self, X, y):
        return self.selection.fit(X, y)

    def transform(self, X):
        return self.selection.transform(X)

    def fit_transform(self, X, y):
        return self.selection.fit_transform(X, y)

    def get_support(self):
        return self.selection.get_support()
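A small sketch exercising the wrapper above on synthetic data (it assumes the sklearn selectors and score functions used by the class are imported):

from sklearn.datasets import make_classification

X, y = make_classification(n_samples=100, n_features=10, random_state=0)

# A float n_features maps to SelectPercentile keeping the top 30% of features.
ufs = UnivariateFeatureSelection(n_features=0.3, problem_type="classification",
                                 scoring="f_classif")
X_sel = ufs.fit_transform(X, y)
print(X_sel.shape)  # (100, 3)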
Example #45
 def train(self,
           train_data,
           train_labels,
           classes,
           feature_selection=False,
           percentile=100,
           batch_size=1000):
     if feature_selection:
         selector = SelectPercentile(chi2, percentile=percentile)
         X = selector.fit_transform(
             self.vectorizer.fit_transform(train_data), train_labels)
         new_vocab = list(
             np.array(self.vectorizer.get_feature_names())[selector.get_support()])
         self.vectorizer = TfidfVectorizer(dtype=np.float32,
                                           vocabulary=new_vocab)
     print(len(self.vectorizer.vocabulary))
     for i in range(0, train_data.size, batch_size):
         print(i)
         data = train_data[i:i + batch_size]
         X = self.vectorizer.fit_transform(data).toarray()
         # self.clf.partial_fit(X, train_labels[i:i+batch_size], classes=classes)
         self.clf.partial_fit(X,
                              train_labels[i:i + batch_size],
                              classes=classes)
Example #46
def learning_of_bread(data):
    # Binarize the star rating (threshold at 39)
    data[['Star']] = pre.Binarizer(threshold=39).transform(data[['Star']])
    # print('Binarized Star values:\n', data['Star'].values)
    # Encode the cuisine categorical variable as integers
    data['Cuisine'] = pre.LabelEncoder().fit_transform(data['Cuisine'])
    # print('Cuisine as numeric values:\n', data['Cuisine'].values)
    # Pick out the feature columns and the label
    features = data[[
        'Cuisine', 'Comments', 'Per_Consumption', 'Taste', 'Environment',
        'Service'
    ]].values
    label = data['Star'].values
    # Keep the most informative features
    fea_select = SP(percentile=85)
    fea_select.fit(features, label)
    # print(fea_select.get_support())
    # print(fea_select.scores_)
    fea_new = features[:, fea_select.get_support()]
    # print(fea_new)
    # Rescale features to [0, 1]
    stand_fea = pre.MinMaxScaler().fit_transform(fea_new)
    # print(stand_fea)
    return stand_fea, label
Example #47
def features_selection(x_train, y_train, featurs_selection, percent):
    # Select features
    vectorizer = TfidfVectorizer()
    vectors_train = vectorizer.fit_transform(x_train)
    features_names = vectorizer.get_feature_names()
    if percent == 1.0:
        return features_names
    # num_features_selected = int(vectors_train.shape[1] * 0.05)
    if featurs_selection in ['chi2', 'mutual_info_classif']:
        selection = SelectPercentile(eval(featurs_selection),
                                     percentile=int(percent * 100))
        selection.fit(vectors_train, y_train)
        features_names_selected =\
            [features_names[k]
             for k in selection.get_support(indices=True)]

    elif featurs_selection in ['WLLR', 'IG', 'MI']:
        features_names_selected = prepocessing_bugs.feature_selection(
            [doc.split() for doc in x_train], y_train, featurs_selection,
            percent)

    print('sklearn select features: %d' % len(features_names_selected))
    print(features_names_selected[:10])
    return features_names_selected
Example #48
    def select_features(self, method):
        if method == 'SelectPercentile':
            select = SelectPercentile(percentile=50)
        elif method == 'SelectKBest':
            select = SelectKBest(chi2, k=2)
        elif method == 'VarianceThreshold':
            select = VarianceThreshold(threshold=(.8 * (1 - .8)))
        elif method == 'TreeBased':
            select = SelectFromModel(RandomForestClassifier(), threshold='median')
        elif method == 'L1Based':
            select = SelectFromModel(LinearSVC(C=0.01, penalty='l1', dual=False), threshold='median')
        else:
            sys.exit('Method name not valid')

        # Fit the selector
        select.fit(self.data, self.labels)
        # Apply to features
        self.data = select.transform(self.data)
        print('Feature selection using %s method:' % method)
        mask = select.get_support()
        print(mask)
        self.mu = self.mu[mask]
        self.sigma = self.sigma[mask]
        self.features = self.features[mask]
Example #49
### Used feature selection through the SelectKBest and SelectPercentile methods.
### I used f_classif since I used features and labels and classification models.

X = features
y = labels
K = 5
P = 50

print("ORIGINAL FEATURES:")
print(features_list[1:])
print()
print("INTELLIGENT FEATURE SELECTION:")
selector1 = SelectPercentile(f_classif, percentile=P)
selector1.fit(X, y)
mask1 = selector1.get_support()
new_features1 = []
for selected, feature in zip(mask1, features_list[1:]):
    if selected:
        new_features1.append(feature)
print("SelectPercentile (percentile=50):", new_features1)

selector2 = SelectKBest(f_classif, k=K)
selector2.fit(X, y)
mask2 = selector2.get_support()
new_features2 = []
for selected, feature in zip(mask2, features_list[1:]):
    if selected:
        new_features2.append(feature)
print("SelectKBest (k=5):", new_features2)
Example #50
import numpy as np
import matplotlib.pyplot as plt
from sklearn.datasets import load_breast_cancer
from sklearn.feature_selection import SelectPercentile
from sklearn.model_selection import train_test_split

cancer = load_breast_cancer()
rng = np.random.RandomState(42)
noise = rng.normal(size=(len(cancer.data), 50))
X_w_noise = np.hstack([cancer.data, noise])
X_train, X_test, y_train, y_test = train_test_split(X_w_noise,
                                                    cancer.target,
                                                    random_state=0,
                                                    test_size=.5)

select = SelectPercentile(percentile=50)
select.fit(X_train, y_train)
X_train_selected = select.transform(X_train)

mask = select.get_support()
plt.matshow(mask.reshape(1, -1), cmap='gray_r')
plt.xlabel("Sample index")
plt.show()

from sklearn.linear_model import LogisticRegression
X_test_selected = select.transform(X_test)

lr = LogisticRegression()
lr.fit(X_train, y_train)
print("All features: {:.3f}".format(lr.score(X_test, y_test)))
lr.fit(X_train_selected, y_train)
print("Selected features: {:.3f}".format(lr.score(X_test_selected, y_test)))
Example #51
def _univariate_feature_screening(
        X, y, mask, is_classif, screening_percentile, smoothing_fwhm=2.):
    """
    Selects the most import features, via a univariate test

    Parameters
    ----------
    X : ndarray, shape (n_samples, n_features)
        Design matrix.

    y : ndarray, shape (n_samples,)
        Response Vector.

    mask : ndarray of booleans, shape (nx, ny, nz)
        Mask defining brain ROIs.

    is_classif: bool
        Flag telling whether the learning task is classification or regression.

    screening_percentile : float in the closed interval [0., 100.]
        Only the `screening_percentile` percent most important voxels will
        be retained.

    smoothing_fwhm : float, optional (default 2.)
        FWHM for isotropically smoothing the data X before F-testing. A value
        of zero means "don't smooth".

    Returns
    -------
    X_: ndarray, shape (n_samples, n_features_)
        Reduced design matrix with only columns corresponding to the voxels
        retained after screening.

    mask_ : ndarray of booleans, shape (nx, ny, nz)
        Mask with support reduced to only contain voxels retained after
        screening.

    support : ndarray of ints, shape (n_features_,)
        Support of the screened mask, as a subset of the support of the
        original mask.
    """
    # smooth the data (with isotropic Gaussian kernel) before screening
    if smoothing_fwhm > 0.:
        sX = np.empty(X.shape)
        for sample in range(sX.shape[0]):
            sX[sample] = ndimage.gaussian_filter(
                _unmask(X[sample].copy(),  # avoid modifying X
                        mask), (smoothing_fwhm, smoothing_fwhm,
                                smoothing_fwhm))[mask]
    else:
        sX = X

    # do feature screening proper
    selector = SelectPercentile(f_classif if is_classif else f_regression,
                                percentile=screening_percentile).fit(sX, y)
    support = selector.get_support()

    # erode and then dilate mask, thus obtaining a "cleaner" version of
    # the mask on which a spatial prior actually makes sense
    mask_ = mask.copy()
    mask_[mask] = (support > 0)
    mask_ = ndimage.binary_dilation(ndimage.binary_erosion(
        mask_)).astype(bool)
    mask_[np.logical_not(mask)] = 0
    support = mask_[mask]
    X = X[:, support]

    return X, mask_, support
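
# Aside (my sketch, not from the function above): the erode-then-dilate step is
# a morphological opening, which deletes isolated voxels that passed the
# univariate screen by chance while keeping contiguous clusters. Toy 2D version:
import numpy as np
from scipy import ndimage

toy = np.zeros((7, 7), dtype=bool)
toy[2:5, 2:5] = True   # a solid 3x3 cluster
toy[0, 6] = True       # one isolated voxel
opened = ndimage.binary_dilation(ndimage.binary_erosion(toy))
print(toy.sum(), opened.sum())  # 10 5 -- the lone voxel is gone, the cluster
                                # survives (shrunk to its 5-voxel core)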
Ejemplo n.º 52
0
    def use_pipeline_with_feature_selection(self):

        #####################
        # Build a classifier pipeline, then run feature selection and a grid
        # search for the best classifier parameters
        #####################

        pipeline = Pipeline([("selector", SelectPercentile()),
                             ('clf', SGDClassifier(random_state=42))])

        # Build a grid search to find the best parameter
        # Fit the pipeline on the training set using grid search for the parameters
        parameters = {
            'selector__score_func': (chi2, f_classif),
            'selector__percentile': (85, 95, 100),
            'clf__loss': ('hinge', 'log'),
            'clf__penalty': ('l2', 'l1', 'elasticnet'),
            'clf__n_iter': (5, 10),
            'clf__alpha': (0.001, 0.0001, 0.0005),
        }

        #################
        # Exhaustive search over the specified parameter values for an estimator,
        # using cv to generate the train/validation splits. GridSearchCV implements
        # the usual estimator API: when "fitting" it on a dataset, all possible
        # combinations of parameter values are evaluated and the best is retained.
        #################

        cv = StratifiedShuffleSplit(y_train,
                                    n_iter=5,
                                    test_size=0.2,
                                    random_state=42)
        grid_search = GridSearchCV(pipeline,
                                   param_grid=parameters,
                                   cv=cv,
                                   n_jobs=-1)
        clf_gs = grid_search.fit(x_train, y_train)

        ###############
        # print the best parameter set and its cross-validated score found by the grid search
        ###############

        best_parameters, score, _ = max(clf_gs.grid_scores_,
                                        key=lambda x: x[1])
        for param_name in sorted(parameters.keys()):
            print("%s: %r" % (param_name, best_parameters[param_name]))

        print("Score for gridsearch is %0.2f" % score)

        ###############
        # run the classifier again with the best parameters
        # in order to get 'clf' for get_important_feature function!
        ###############

        score_func = best_parameters['selector__score_func']
        percentile = best_parameters['selector__percentile']
        loss = best_parameters['clf__loss']
        penalty = best_parameters['clf__penalty']
        alpha = best_parameters['clf__alpha']

        #################
        # feature selection
        #################

        selector = SelectPercentile(score_func=score_func,
                                    percentile=percentile)

        combined_features = Pipeline([("feat_select", selector)])

        X_features = combined_features.fit_transform(x_train, y_train)
        X_test_features = combined_features.transform(x_test)

        print("Shape of train data after feature selection is " +
              str(X_features.shape))
        print("Shape of test data after feature selection is " +
              str(X_test_features.shape))

        # run classifier on selected features

        clf = SGDClassifier(loss=loss,
                            penalty=penalty,
                            alpha=alpha,
                            random_state=42).fit(X_features, y_train)

        # get the features which are selected and write to file

        feature_boolean = selector.get_support(indices=False)

        f = open(path_to_store_feature_selection_boolean_file, 'w')

        for fb in feature_boolean:
            f.write(str(fb) + '\n')

        f.close()

        ##################
        # get cross validation score
        ##################

        scores = cross_val_score(clf,
                                 X_features,
                                 y_train,
                                 cv=10,
                                 scoring='f1_weighted')
        print("Cross validation score: " + str(scores))

        # Get average performance of classifier on training data using 10-fold CV, along with standard deviation

        print("Cross validation accuracy: %0.2f (+/- %0.2f)" %
              (scores.mean(), scores.std() * 2))

        ##################
        # run classifier on test data
        ##################

        y_predicted = clf.predict(X_test_features)

        # print the mean accuracy on the given test data and labels

        print("Classifier score on test data is: %0.2f " %
              clf.score(X_test_features, y_test))

        # Print and plot the confusion matrix

        print(metrics.classification_report(y_test, y_predicted))
        cm = metrics.confusion_matrix(y_test, y_predicted)
        print(cm)

        return clf
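
# Aside (my sketch, not the author's code): `grid_scores_`, `n_iter`, and the
# `StratifiedShuffleSplit(y_train, ...)` signature above are pre-0.18
# scikit-learn APIs. Under current releases the same search would look roughly
# like this (`x_train`/`y_train` as in the snippet; 'log' became 'log_loss'):
from sklearn.feature_selection import SelectPercentile, chi2, f_classif
from sklearn.linear_model import SGDClassifier
from sklearn.model_selection import GridSearchCV, StratifiedShuffleSplit
from sklearn.pipeline import Pipeline

pipeline = Pipeline([("selector", SelectPercentile()),
                     ("clf", SGDClassifier(random_state=42))])
parameters = {
    "selector__score_func": (chi2, f_classif),
    "selector__percentile": (85, 95, 100),
    "clf__loss": ("hinge", "log_loss"),
    "clf__penalty": ("l2", "l1", "elasticnet"),
    "clf__alpha": (0.001, 0.0001, 0.0005),
}
cv = StratifiedShuffleSplit(n_splits=5, test_size=0.2, random_state=42)
grid_search = GridSearchCV(pipeline, param_grid=parameters, cv=cv, n_jobs=-1)
# grid_search.fit(x_train, y_train)
# print(grid_search.best_params_, grid_search.best_score_)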
Ejemplo n.º 53
0
    def train_classifier_use_feature_selection(self):

        #################
        # feature selection
        #################

        selector = SelectPercentile(score_func=_score_func,
                                    percentile=_percentile)

        print("Fitting data with feature selection ...")
        selector.fit(x_train, y_train)

        # get how many features are left after feature selection
        x_features = selector.transform(x_train)

        print("Shape of array after feature selection is " +
              str(x_features.shape))

        clf = SGDClassifier(loss=_loss,
                            penalty=_penalty,
                            alpha=_alpha,
                            n_iter=_n_iter,
                            random_state=42).fit(x_features, y_train)

        # get the boolean mask of the selected features

        feature_boolean = selector.get_support(indices=False)

        ##################
        # get cross validation score
        ##################

        scores = cross_val_score(clf,
                                 x_features,
                                 y_train,
                                 cv=10,
                                 scoring='f1_weighted')
        print("Cross validation score: " + str(scores))

        # Get average performance of classifier on training data using 10-fold CV, along with standard deviation

        print("Cross validation accuracy: %0.2f (+/- %0.2f)" %
              (scores.mean(), scores.std() * 2))

        ####################
        # test clf on test data
        ####################

        # apply feature selection on test data too

        x_test_selector = selector.transform(x_test)

        print("Shape of array for test data after feature selection is " +
              str(x_test_selector.shape))

        y_predicted = clf.predict(x_test_selector)

        # print the mean accuracy on the given test data and labels

        print("Classifier score on test data is: %0.2f " %
              clf.score(x_test_selector, y_test))

        print(metrics.classification_report(y_test, y_predicted))
        cm = metrics.confusion_matrix(y_test, y_predicted)
        print(cm)

        return clf
Ejemplo n.º 54
0
x = pd.DataFrame(x.todense(), columns=ft)
del X_test['Voter_name']
X_test = X_test.join(x, rsuffix='2', lsuffix='1')
X_test = X_test.fillna(0)  # fill NaNs (DataFrame.to_sparse no longer exists in recent pandas)

from sklearn.decomposition import PCA

# pca=PCA(n_components=140)
# pca.fit(X_train)
# X_train=pca.transform(X_train)
# X_test=pca.transform(X_test)
from sklearn.feature_selection import SelectPercentile, f_classif
percentile = SelectPercentile(percentile=20)

X_train = percentile.fit_transform(X_train, Y_train)
selected_features = percentile.get_support(True)

X_test = X_test.iloc[:, selected_features]
print(selected_features)
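
# Aside (my note): the fitted selector can be applied to the raw test frame
# directly instead of re-indexing by hand; given the untransformed test data,
# the two forms below keep the same 20% of columns:
#
#     X_test.iloc[:, selected_features]    # manual, as above
#     percentile.transform(X_test)         # selector route (returns an ndarray)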

# import xgboost as xgb

# mod = xgb.XGBClassifier()
# mod.fit(X_train, Y_train)
# print('xgb', mod.score(X_test, Y_test))
# exit(0)

# from sklearn.ensemble import RandomForestClassifier, AdaBoostClassifier
#
# mod =AdaBoostClassifier()
# mod.fit(X_train,Y_train)
Ejemplo n.º 55
0
# Import various select transforms along with the f_regression mode
from sklearn.feature_selection import SelectPercentile, f_regression

# Implement SelectPercentile: we take 50% of the 6 independent features,
# i.e. 3 of them
selectorP = SelectPercentile(score_func=f_regression, percentile=50)
x_p = selectorP.fit_transform(X, Y)

# Get f_score and p_values for the selected features
f_score = selectorP.scores_
p_values = selectorP.pvalues_

# Print the f_score and p_values
# Print the table of Features, F-Score and P-values
columns = list(X.columns)

print("\n\n ")
print("    Features     ", "F-Score    ", "P-Values")
print("    -----------  ---------    ---------")

for i in range(0, len(columns)):
    f1 = "%4.2f" % f_score[i]
    p1 = "%2.6f" % p_values[i]
    print("    ", columns[i].ljust(12), f1.rjust(8), "    ", p1.rjust(8))

cols = selectorP.get_support(indices=True)
selectedCols = X.columns[cols].to_list()

print(selectedCols)
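
# Aside (my addition): the same scores can be ranked explicitly; percentile=50
# keeps the top half of this ordering.
ranked = sorted(zip(columns, f_score), key=lambda t: t[1], reverse=True)
for name, score in ranked:
    print(name.ljust(12), "%8.2f" % score)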
Ejemplo n.º 56
0
X = training.iloc[:,:-1]
y = training.TARGET
X['n0'] = (X == 0).sum(axis=1)

from sklearn.feature_selection import SelectPercentile
from sklearn.feature_selection import f_classif,chi2
from sklearn.preprocessing import Binarizer, scale


p = 30

X_bin = Binarizer().fit_transform(scale(X))
selectChi2 = SelectPercentile(chi2, percentile=p).fit(X_bin, y)
selectF_classif = SelectPercentile(f_classif, percentile=p).fit(X, y)

chi2_selected = selectChi2.get_support()
chi2_selected_features = [f for i, f in enumerate(X.columns) if chi2_selected[i]]
print('Chi2 selected {} features {}.'.format(chi2_selected.sum(),
      chi2_selected_features))
f_classif_selected = selectF_classif.get_support()
f_classif_selected_features = [f for i, f in enumerate(X.columns) if f_classif_selected[i]]
print('F_classif selected {} features {}.'.format(f_classif_selected.sum(),
      f_classif_selected_features))
selected = chi2_selected & f_classif_selected
print('Chi2 & F_classif selected {} features'.format(selected.sum()))
features = [f for f, s in zip(X.columns, selected) if s]
print(features)

X_sel = X[features]

# X_train, X_test, y_train, y_test = cross_validation.train_test_split(X_sel,
Ejemplo n.º 57
0
# Compare to the weights of an SVM.
# Note: this fragment assumes `selector`, `X_indices`, `plt`, and `svm` were
# defined by the univariate-selection part of the original example.
clf = svm.SVC(kernel="linear")
clf.fit(X, y)

svm_weights = (clf.coef_ ** 2).sum(axis=0)
svm_weights /= svm_weights.max()

plt.bar(X_indices - 0.25, svm_weights, width=0.2, label="SVM weight", color="navy")

clf_selected = svm.SVC(kernel="linear")
clf_selected.fit(selector.transform(X), y)

svm_weights_selected = (clf_selected.coef_ ** 2).sum(axis=0)
svm_weights_selected /= svm_weights_selected.max()

plt.bar(
    X_indices[selector.get_support()] - 0.05,
    svm_weights_selected,
    width=0.2,
    label="SVM weights after selection",
    color="c",
)


plt.title("Comparing feature selection")
plt.xlabel("Feature number")
plt.yticks(())
plt.axis("tight")
plt.legend(loc="upper right")
plt.show()
Ejemplo n.º 58
0
def _univariate_feature_screening(
        X, y, mask, is_classif, screening_percentile, smoothing_fwhm=2.):
    """
    Selects the most important features via a univariate test.

    Parameters
    ----------
    X : ndarray, shape (n_samples, n_features)
        Design matrix.

    y : ndarray, shape (n_samples,)
        Response Vector.

    mask : ndarray of booleans, shape (nx, ny, nz)
        Mask defining brain ROIs.

    is_classif: bool
        Flag telling whether the learning task is classification or regression.

    screening_percentile : float in the closed interval [0., 100.]
        Only the `screening_percentile` percent highest-scoring voxels will
        be retained.
    %(smoothing_fwhm)s
        Default=2.

    Returns
    -------
    X_: ndarray, shape (n_samples, n_features_)
        Reduced design matrix with only columns corresponding to the voxels
        retained after screening.

    mask_ : ndarray of booleans, shape (nx, ny, nz)
        Mask with support reduced to only contain voxels retained after
        screening.

    support : ndarray of ints, shape (n_features_,)
        Support of the screened mask, as a subset of the support of the
        original mask.
    """
    # smooth the data (with isotropic Gaussian kernel) before screening
    if smoothing_fwhm > 0.:
        sX = np.empty(X.shape)
        for sample in range(sX.shape[0]):
            sX[sample] = ndimage.gaussian_filter(
                _unmask_from_to_3d_array(X[sample].copy(),  # avoid modifying X
                                         mask), (smoothing_fwhm, smoothing_fwhm,
                                                 smoothing_fwhm))[mask]
    else:
        sX = X

    # do feature screening proper
    selector = SelectPercentile(f_classif if is_classif else f_regression,
                                percentile=screening_percentile).fit(sX, y)
    support = selector.get_support()

    # erode and then dilate mask, thus obtaining a "cleaner" version of
    # the mask on which a spatial prior actually makes sense
    mask_ = mask.copy()
    mask_[mask] = (support > 0)
    mask_ = ndimage.binary_dilation(ndimage.binary_erosion(
        mask_)).astype(bool)
    mask_[np.logical_not(mask)] = 0
    support = mask_[mask]
    X = X[:, support]

    return X, mask_, support
Ejemplo n.º 59
0
def train(data):

    test_data = data

    ## aggregates the predictions from the classifiers over the four corpora into one
    target_psfl = test_data.loc[:, test_data.columns == 'psfl']
    target_zh = test_data.loc[:, test_data.columns == 'zh']
    target_wiki = test_data.loc[:, test_data.columns == 'wiki']
    target_brescola = test_data.loc[:, test_data.columns == 'brescola']

    # checks that the lengths of the four target columns line up
    length = len(target_brescola)
    if length != len(target_zh) or length != len(target_wiki) or length != len(
            target_psfl):
        print("ERROR: Lengths of four targets are not all the same.")

    ## aggregates the results into one column
    difficulties = []

    # for each observation...
    for i in range(length):
        results = [
            target_psfl.values[i], target_zh.values[i], target_wiki.values[i],
            target_brescola.values[i]
        ]

        # takes the maximum occurrence, defaulting towards "difficult" in case of tie
        s = 0
        d = 0
        for res in results:
            if res == 'd':
                d += 1
            elif res == 's':
                s += 1
            else:
                print('ERROR: Target value not \'d\' nor \'s\'.')

        # TO-WRITE:
        # if using the weighted system, tiebreaking in favor of d brings down overall accuracy but balances between labels
        # doing the opposite brings up accuracy and f1 but heavily leans in favor of predicting for 's'
        # just using psfl brings up both scores -> DISCUSS THIS -> why might this be? maybe those docs are all from psfl???

        # TODO - Plot the differences here!
        if d >= s:
            difficulties.append('d')
        else:
            difficulties.append('s')

    ## partitions data into features and target, dropping old target columns as well as docid column
    data = test_data.drop("psfl", axis=1)
    data = data.drop("zh", axis=1)
    data = data.drop("wiki", axis=1)
    data = data.drop("brescola", axis=1)
    data = data.drop("docid", axis=1)

    # sets the target data according to user input
    print(
        'Please select the corpora to use as ground truth: 0 - PSFL, 1 - ZH, 2 - Wiki, 3 - BrEscola, 4 - Weighted Average'
    )
    temp = int(input())

    if (temp == 0):
        target = target_psfl.values.reshape(-1, ).tolist()
    elif (temp == 1):
        target = target_zh.values.reshape(-1, ).tolist()
    elif (temp == 2):
        target = target_wiki.values.reshape(-1, ).tolist()
    elif (temp == 3):
        target = target_brescola.values.reshape(-1, ).tolist()
    elif (temp == 4):
        target = difficulties
    else:
        raise ValueError('Invalid corpus selection.')

    # partitions data into training and test sets
    X_train, X_test, y_train, y_test = train_test_split(data,
                                                        target,
                                                        test_size=0.2)

    # partitions the training set into a general set (for use in the model) and a tuning set (for use in refining other components)
    X_general, X_tuning, y_general, y_tuning = train_test_split(X_train,
                                                                y_train,
                                                                test_size=0.3)

    X_tuning_copy = X_tuning.copy()

    # Uses GridSearchCV to pick random-forest hyperparameters
    model = RandomForestClassifier()
    grid = GridSearchCV(estimator=model,
                        param_grid=dict(n_estimators=[10, 50, 100, 200],
                                        criterion=['gini', 'entropy'],
                                        max_depth=[None, 1, 10, 100]))
    grid.fit(X_tuning, y_tuning)

    op_estims = grid.best_estimator_.n_estimators
    op_crit = grid.best_estimator_.criterion
    op_depth = grid.best_estimator_.max_depth

    print('Optimal Params (FOREST):', op_estims, op_crit, op_depth)

    ## performs feature selection using scikit-learn's univariate utilities

    # TODO - Average all metrics out over many rounds for better selection
    # TODO - Select only a couple of metrics
    print('FEATURE SELECTION METRICS (FOREST):')
    for j in range(20, 80, 5):
        index = X_tuning.index.tolist()
        cols = X_tuning.columns.tolist()

        # creates selector for given percentile of features
        selector = SelectPercentile(percentile=j)
        X_new = selector.fit_transform(X_tuning, y_tuning)

        temp = selector.get_support(True)

        X_tuning = pd.DataFrame(data=X_tuning,
                                index=index,
                                columns=[cols[i] for i in temp])

        # splits tuning set into training and testing data
        X_train_iter, X_test_iter, y_train_iter, y_test_iter = train_test_split(
            X_tuning, y_tuning, test_size=0.2)

        # fits a basic decision tree model on the data
        forest = RandomForestClassifier(n_estimators=op_estims,
                                        criterion=op_crit,
                                        max_depth=op_depth)
        fitted = forest.fit(X_train_iter, y_train_iter)

        # calculates the accuracy, precision, recall, and f1-measure
        y_pred = fitted.predict(X_test_iter).tolist()
        y_true = y_test_iter

        # NOTE - Order in lists is 'd','s'
        accuracy = accuracy_score(y_true, y_pred)
        precisions_by_label = [
            precision_score(y_true, y_pred, average='binary', pos_label='d'),
            precision_score(y_true, y_pred, average='binary', pos_label='s')
        ]
        precision_global = precision_score(y_true, y_pred, average='micro')
        recalls_by_label = recall_score(y_true, y_pred, average=None)
        recall_global = recall_score(y_true, y_pred, average='micro')
        f1s_by_label = f1_score(y_true, y_pred, average=None)
        f1_global = f1_score(y_true, y_pred, average='micro')

        print(j / 100, 'acc:', round(accuracy, 2), 'precisions_by_label',
              [round(x, 2) for x in precisions_by_label], 'precision_global',
              round(precision_global, 2), 'recalls_by_label',
              [round(x, 2) for x in recalls_by_label], 'recall_global',
              round(recall_global, 2), 'f1s_by_label',
              [round(x, 2) for x in f1s_by_label], 'f1_global',
              round(f1_global, 2))

        X_tuning = X_tuning_copy

    # TODO - Plots for all of the metrics chosen - do it for just one model to give reader idea of the range in performance depending on selected features + how we visualized it
    #ax = plt.gca()
    #ax.plot(percs, [x[1] for x in nrmses])
    #plt.xlabel('percentage of features included')
    #plt.ylabel('NRMSE')
    #plt.title('NMRSE change over feature inclusion thresholds')
    #plt.axis('tight')
    #plt.show(block=False)
    #plt.savefig('linear_feature_threshold_nrmses.png')
    #plt.clf()

    # the decimal threshold for feature selection could be read interactively;
    # it is fixed at 1.0 here (keep all features)
    #print('Please input a reasonable decimal threshold for feature selection:')
    #thresh = float(input())
    thresh = 1

    # uses the percent threshold to perform feature selection, applying it to training and test sets
    index_test = X_test.index.tolist()
    index_train = X_general.index.tolist()
    cols = X_test.columns.tolist()

    selector = SelectPercentile(percentile=(thresh * 100))
    X_new = selector.fit_transform(X_general, y_general)

    temp = selector.get_support(True)

    X_general = pd.DataFrame(data=X_general,
                             index=index_train,
                             columns=[cols[i] for i in temp])
    X_test = pd.DataFrame(data=X_test,
                          index=index_test,
                          columns=[cols[i] for i in temp])

    # fits a decision tree model to the testing data
    forest = RandomForestClassifier(n_estimators=op_estims,
                                    criterion=op_crit,
                                    max_depth=op_depth)
    fitted = forest.fit(X_general, y_general)

    print(fitted.n_features_in_)

    # prints evaluation metrics
    y_pred = fitted.predict(X_test).tolist()
    y_true = y_test

    accuracy = accuracy_score(y_true, y_pred)
    precisions_by_label = [
        precision_score(y_true, y_pred, average='binary', pos_label='d'),
        precision_score(y_true, y_pred, average='binary', pos_label='s')
    ]
    precision_global = precision_score(y_true, y_pred, average='micro')
    recalls_by_label = recall_score(y_true, y_pred, average=None)
    recall_global = recall_score(y_true, y_pred, average='micro')
    f1s_by_label = f1_score(y_true, y_pred, average=None)
    f1_global = f1_score(y_true, y_pred, average='micro')

    # TODO - Print metrics more intelligently
    print('FOREST:', 'acc:', round(accuracy, 2), 'precisions_by_label',
          [round(x, 2) for x in precisions_by_label], 'precision_global',
          round(precision_global, 2), 'recalls_by_label',
          [round(x, 2) for x in recalls_by_label], 'recall_global',
          round(recall_global, 2),
          'f1s_by_label', [round(x, 2) for x in f1s_by_label], 'f1_global',
          round(f1_global, 2))

    # TODO - Plot all evaluation metrics!
    # visualizes the residuals
    #plt.xlabel('Predicted Value')
    #plt.ylabel('Residual')
    #plt.title('Residuals (Linear Regression)')
    #plt.axis('tight')
    #plt.savefig('linear_residuals.png')
    #plt.show()
    #plt.clf()

    # saves the model to a file
    #filename = 'forest.sav'
    #pickle.dump(fitted, open(filename, 'wb'))

    # prints the mean accuracy
    #loaded_model = pickle.load(open(filename, 'rb'))
    #result = loaded_model.score(X_test, y_test)
    #print('Mean Accuracy: ', result)

    return fitted
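
# Aside (my sketch, not the author's code): the manual percentile sweep above
# could also be folded into the grid search by chaining SelectPercentile and
# the forest in one Pipeline, so the percentile is tuned like any other
# hyperparameter:
from sklearn.ensemble import RandomForestClassifier
from sklearn.feature_selection import SelectPercentile
from sklearn.model_selection import GridSearchCV
from sklearn.pipeline import Pipeline

pipe = Pipeline([("select", SelectPercentile()),
                 ("forest", RandomForestClassifier())])
param_grid = {
    "select__percentile": list(range(20, 80, 5)),
    "forest__n_estimators": [10, 50, 100, 200],
    "forest__criterion": ["gini", "entropy"],
    "forest__max_depth": [None, 1, 10, 100],
}
# grid = GridSearchCV(pipe, param_grid=param_grid).fit(X_train, y_train)
# print(grid.best_params_)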
Ejemplo n.º 60
0
	# Concatenate non-categorical and categorical data
	X_train1 = numpy.concatenate((X_train_temp,X_train.iloc[:,10:c-1]),axis=1)
	X_test1 = numpy.concatenate((X_test_temp,X_test.iloc[:,10:c-1]),axis=1)

	scaled_features_train_df = pd.DataFrame(X_train1, index=X_train.index, columns=X_train.columns)
	scaled_features_test_df = pd.DataFrame(X_test1, index=X_test.index, columns=X_test.columns)
#----------------------------------------------------------------
	from sklearn.feature_selection import SelectPercentile
	from sklearn.feature_selection import f_classif
	
	# Write your solution here:
	skb = SelectPercentile(score_func=f_classif, percentile=20)
	predictors = skb.fit_transform(X_train1, Y_train)
	scores = skb.scores_.tolist()
	#print(scores)
	top_k_index = skb.get_support(indices=True)
	print(top_k_index)
	top_k_predictors = list(scaled_features_train_df.columns[top_k_index])
	print(top_k_predictors)
#---------------------------------------------------------------
	from sklearn.multiclass import OneVsRestClassifier
	from sklearn.linear_model import LogisticRegression
	from sklearn.metrics import accuracy_score, classification_report, confusion_matrix, precision_score
	
	clf=OneVsRestClassifier(LogisticRegression())
	clf1=OneVsRestClassifier(LogisticRegression())
	
	model_fit_all_features = clf1.fit(X_train,Y_train)
	predictions_all_features = clf1.predict(X_test)
	
	score_all_features = accuracy_score(Y_test,predictions_all_features)