コード例 #1
0
ファイル: PipeTasks.py プロジェクト: Sandy4321/ProFET
def plotRFECV (X,y,stepSize=0.05,scoring='f1'):
    '''
    Run RFECV around a linear SVC and plot the cross-validation score curve.

    Adapted from the scikit-learn example:
    http://scikit-learn.org/stable/auto_examples/plot_rfe_with_cross_validation.html#example-plot-rfe-with-cross-validation-py

    :param X: feature matrix handed to ``RFECV.fit``.
    :param y: targets; also used to build the 2-fold ``StratifiedKFold``.
    :param stepSize: forwarded to ``RFECV`` as ``step``.
    :param scoring: forwarded to ``RFECV`` as ``scoring`` and shown in the
        y-axis label of the plot.
    :return: the fitted ``RFECV`` instance.
    '''
    from sklearn.svm import SVC
    from sklearn.cross_validation import StratifiedKFold
    from sklearn.feature_selection import RFECV

    # Create the RFE object and compute a cross-validated score.
    svc = SVC(kernel="linear",class_weight='auto', cache_size=1400)
    rfecv = RFECV(estimator=svc, step=stepSize, cv=StratifiedKFold(y, 2),
                  scoring=scoring)
    rfecv.fit(X, y)

    print("Optimal number of features : %d" % rfecv.n_features_)

    # Plot number of features VS. cross-validation scores
    import matplotlib.pyplot as plt
    plt.figure()
    plt.xlabel("Number of features selected")
    # The metric is configurable (default 'f1'), so name it on the axis
    # instead of hard-coding an accuracy description.
    plt.ylabel("Cross validation score ({})".format(scoring))
    # NOTE(review): the x positions are just the 1-based index of each grid
    # score; that presumably equals the feature count only when one feature is
    # removed per iteration -- confirm against the RFECV version in use.
    plt.plot(range(1, len(rfecv.grid_scores_) + 1), rfecv.grid_scores_)
    plt.show()

    return rfecv
コード例 #2
0
def feature_selection_RFE(fn ,ax=None, sel="all", goal="Referee", verbosity=0, nf=7):
    """Select features with RFECV and optionally plot the score curve.

    Data is loaded through ``data_prepare`` and the estimator comes from
    ``get_clf('svm')``; RFECV runs with step 1, 'f1' scoring and a 2-fold
    ``StratifiedKFold``. Warnings raised while fitting are suppressed.

    :param fn: passed to ``data_prepare``.
    :param ax: object with ``set_xlabel``/``set_ylabel``/``plot`` (presumably a
        matplotlib Axes -- confirm with callers); when ``None`` no plot is drawn.
    :param sel: passed to ``data_prepare``.
    :param goal: passed to ``data_prepare``.
    :param verbosity: values above 1 print the feature names; ``verbosity-1``
        is forwarded to ``data_prepare``.
    :param nf: accepted but not used by this function.
    :return: the entries of ``names`` whose RFECV ranking is 1.
    """
    X, y, names = data_prepare(fn, sel=sel, goal=goal, verbosity=verbosity-1)
    if verbosity > 1:
        print ("names:", ",".join(names))

    # Create the RFE object and compute a cross-validated score.
    estimator = get_clf('svm')
    scoring = 'f1'
    cv = cross_validation.StratifiedKFold(y, 2)
    rfecv = RFECV(estimator=estimator, step=1, cv=cv, scoring=scoring)

    with warnings.catch_warnings():
        warnings.simplefilter("ignore")
        rfecv.fit(X, y)

    # Plot number of features VS. cross-validation scores.
    # ``ax`` defaults to None, so only draw when the caller supplied one.
    if ax is not None:
        ax.set_xlabel("Number of features selected")
        ax.set_ylabel("Cross validation score ({})".format(scoring))
        ax.plot(range(1, len(rfecv.grid_scores_) + 1), rfecv.grid_scores_)

    best = names[rfecv.ranking_==1]
    return best
コード例 #3
0
ファイル: features.py プロジェクト: Patechoc/labs-untested
class RFECVSelection(SelectionModel):
    """Selection model that fits an ``RFECV`` selector at construction time.

    The selector wraps ``self.estimator`` (step 1, cv=5,
    'mean_squared_error' scoring) and is fitted on ``self.x_array`` /
    ``self.y_array``, which the base class initialiser is expected to provide.
    """
    name = "RFECV"

    def __init__(self, *args):
        """Initialise the base model, then fit the selector and expose its mask."""
        SelectionModel.__init__(self, *args)
        self.selector = RFECV(
            self.estimator,
            step=1,
            cv=5,
            scoring='mean_squared_error',
        )
        self.selector.fit(self.x_array, self.y_array)
        self.support_ = self.selector.support_

    def print_rankings(self):
        """Print one ``column name: rank`` line per entry of ``self.columns``."""
        print("Rankings for: ", RFECVSelection.name)
        ranking = self.selector.ranking_
        for column, rank in zip(self.columns, ranking):
            label = data.column_names[column]
            print("{0}: {1}".format(label, rank))

    def plot_num_of_feat_vs_cv_score(self):
        """Plot the selector's grid scores against their 1-based position."""
        scores = self.selector.grid_scores_
        positions = range(1, len(scores) + 1)
        plt.figure()
        plt.xlabel("Number of features selected")
        plt.ylabel("Cross validation scores (mse)")
        plt.plot(positions, scores)
        plt.show()

    def plot_rankings(self):
        """Draw a bar chart of the RFECV rank of every feature column."""
        n_columns = self.x_array.shape[1]
        tick_labels = [data.column_names[column] for column in self.columns]
        plt.figure()
        plt.title("Ranking of features in RFECV")
        plt.bar(range(n_columns), self.selector.ranking_, align="center", color="r")
        plt.xticks(range(n_columns), tick_labels)
        plt.show()
def RFE_featureSelection(X_train,Y_train):
    """Pick feature columns with RFECV around an L1 logistic regression.

    The inputs are first resampled through
    ``randomSampling.randomSampling().getRandomSample`` and re-indexed, then
    RFECV (step 1, cv=5, 'recall' scoring) is fitted, the score curve is
    plotted and the selected column labels are printed.

    :param X_train: feature table; must support ``reset_index`` and ``columns``
        (presumably a pandas DataFrame -- confirm with callers).
    :param Y_train: targets; must support ``reset_index``.
    :return: ``X_train.columns`` restricted to the features RFECV kept.
    """
    ## Sampling
    RSObj = randomSampling.randomSampling()
    (X_train, Y_train) = RSObj.getRandomSample(X_train, Y_train)
    X_train.reset_index(drop=True, inplace=True)
    Y_train.reset_index(drop=True, inplace=True)

    ## Select classifier and parameters
    logistic = linear_model.LogisticRegression(C=10, class_weight=None, dual=False, fit_intercept=True,
              intercept_scaling=1, max_iter=100, multi_class='ovr', n_jobs=1,
              penalty='l1', random_state=None, solver='liblinear', tol=0.01,
              verbose=0, warm_start=False)

    ## Initialise RFE
    rfecv = RFECV(estimator=logistic, step=1, cv=5,
                  scoring='recall')

    ## Fit data
    rfecv.fit(X_train, Y_train)

    ## Selected Features
    print("Optimal number of features : %d" % rfecv.n_features_)

    ## Plot importance
    plt.figure()
    plt.xlabel("Number of features selected")
    plt.ylabel("Cross validation score")
    plt.plot(range(1, len(rfecv.grid_scores_) + 1), rfecv.grid_scores_)
    plt.show()

    selected_columns = X_train.columns[list(rfecv.support_)]
    # Format the message *before* printing: calling .format() on the result of
    # print(...) fails when print is a function, because print returns None.
    print('\n Selectd Columns : {0}'.format(selected_columns))
    return selected_columns
def benchmark_features_selection(clf,name):
    """Time RFECV fitting and prediction for ``clf`` and store the predictions.

    Reads ``X_train``, ``y_train``, ``X_test``, ``cursor`` and
    ``testing_identifiant_produit_list`` from the enclosing module scope.

    :param clf: estimator wrapped by RFECV (step 1, 2-fold StratifiedKFold,
        'accuracy' scoring).
    :param name: label prefixed to the feature-count message and passed to
        ``save_results_data``.
    :return: tuple ``(clf_descr, train_time, test_time)`` where ``clf_descr``
        is the text of ``str(clf)`` before the first ``'('``.
    """
    print('_' * 80)
    print("Training: ")
    print(clf)

    fit_start = time()
    folds = StratifiedKFold(y_train, 2)
    rfecv = RFECV(estimator=clf, step=1, cv=folds, scoring='accuracy')
    rfecv.fit(X_train, y_train)
    train_time = time() - fit_start
    print("train time: %0.3fs" % train_time)

    feature_count_msg = "Optimal number of features : %d" % rfecv.n_features_
    print(name + feature_count_msg)

    # Plot number of features VS. cross-validation scores
    scores = rfecv.grid_scores_
    plt.figure()
    plt.xlabel("Number of features selected")
    plt.ylabel("Cross validation score (nb of correct classifications)")
    plt.plot(range(1, len(scores) + 1), scores)
    plt.show()

    predict_start = time()
    pred = rfecv.predict(X_test)
    test_time = time() - predict_start
    print("test time:  %0.3fs" % test_time)

    # Only report coefficient statistics when ``clf`` itself carries ``coef_``.
    if hasattr(clf, 'coef_'):
        coefficients = clf.coef_
        print("dimensionality: %d" % coefficients.shape[1])
        print("density: %f" % density(coefficients))

    print("Saving data to database:")
    save_results_data(cursor, name, testing_identifiant_produit_list, pred)
    print()
    clf_descr = str(clf).split('(')[0]
    return clf_descr, train_time, test_time
コード例 #6
0
def test_model(model, xtrain, ytrain, feature_list, prefix):
    """ use train_test_split to create validation train/test samples """
    xTrain, xTest, yTrain, yTest = train_test_split(xtrain, ytrain,
                                                    test_size=0.4)

    if DO_RFECV:
        model.fit(xtrain, ytrain)
        if hasattr(model, 'coef_'):
            model = RFECV(estimator=model, verbose=0, step=1,
                          scoring=score_fn, cv=3)

    model.fit(xTrain, yTrain)
    print 'score', model.score(xTest, yTest)
    ypred = model.predict(xTest)
    ### don't allow model to predict negative number of orders
    if any(ypred < 0):
        print ypred[ypred < 0]
        ypred[ypred < 0] = 0

    print 'RMSE', np.sqrt(mean_squared_error(ypred, yTest))

#    debug_output(model, feature_list)

    debug_plots(model, yTest, ypred, prefix)

    return
コード例 #7
0
ファイル: fb_funcs.py プロジェクト: yskmt/kaggle-facebook
def recursive_feature_selection(info_humans, info_bots, params, scale=False):

    X, y, features, scaler = get_Xy(info_humans, info_bots, scale=scale)

    print "first feature selection by variance test"
    skb = VarianceThreshold(threshold=(.8 * (1 - .8)))
    X_new = skb.fit_transform(X)
    features_1 = features[skb.get_support()]

    print "second feature selection by ch2 test"
    skb = SelectKBest(chi2, k=200)
    # skb = SelectFpr(chi2, alpha=0.005)
    X_new = skb.fit_transform(X_new, y)
    features_2 = features_1[skb.get_support()]

    # skb = PCA(n_components=250)
    # X_new = skb.fit_transform(X_new, y)
    
    print "third feature selection by recursive featue elimination (RFECV)"
    clf = LogisticRegression(penalty=params['penalty'],
                             C=params['C'])
    # clf = SVC(kernel="linear")
    rfecv = RFECV(estimator=clf, step=1,
                  cv=cross_validation.StratifiedKFold(y, 5),
                  scoring='roc_auc', verbose=1)
    rfecv.fit(X_new, y)

    print("Optimal number of features : %d" % rfecv.n_features_)
    
    return skb, rfecv
コード例 #8
0
def select_features(clf, x_train, y_train, columns, num_folds, step=19, random_state=0):
    """
    automatic tuning of the number of features selected with cross-validation.
    :param clf: estimator
    :param x_train:
    :param y_train:
    :return: the fitted rfecv object
    """
    print '================= select_features ================'
    # Create the RFE object and compute a cross-validated score.
    cvObj = KFold(len(y_train), n_folds=num_folds, shuffle=True, random_state=random_state)

    # The "accuracy" scoring is proportional to the number of correct classifications
    rfecv = RFECV(estimator=clf, step=step, cv=cvObj, scoring=scorer, verbose=2)
    rfecv.fit(x_train, y_train)

    print '------------ Results: ----------------'
    print '>>>> Optimal number of features : %d' % rfecv.n_features_
    print '>>>> grid scores:'
    pprint(rfecv.grid_scores_)
    print '>>>> ranking of columns:'
    pprint(np.array(columns)[rfecv.ranking_-1])


    # Plot number of features VS. cross-validation scores
    plt.figure()
    plt.xlabel("Number of features selected")
    plt.ylabel("Cross validation score (nb of correct classifications)")
    plt.plot(range(1, len(rfecv.grid_scores_) + 1), rfecv.grid_scores_)
    plt.show()

    return rfecv
コード例 #9
0
ファイル: classification.py プロジェクト: ouceduxzk/asap
def get_top_features(windows_data_frame, drop_only_almost_positives = False, drop_duplicates = True,
        transformer = DEFAULT_TRANSFORMER, classifier = RFECV_FEATURE_SELECTION_DEFAULT_CLASSIFIER,
        n_folds = 3, step = 0.05, scoring = 'f1'):

    '''
    Find the top features of the given windows with sklearn.feature_selection.RFECV.
    @param windows_data_frame (pandas.DataFrame):
        A data frame of the windows' CSV.
    @param drop_only_almost_positives (boolean, default False):
        Forwarded to get_windows_data.
    @param drop_duplicates (boolean, default True):
        Forwarded to get_windows_data.
    @param transformer (default DEFAULT_TRANSFORMER):
        Forwarded to get_windows_data.
    @param classifier (default RFECV_FEATURE_SELECTION_DEFAULT_CLASSIFIER):
        The estimator handed to RFECV.
    @param n_folds (int, default 3):
        Number of folds of the shuffled StratifiedKFold (seeded with SEED) used by RFECV.
    @param step (default 0.05):
        See sklearn.feature_selection.RFECV
    @param scoring (default 'f1'):
        See sklearn.feature_selection.RFECV
    @return:
        The result of util.apply_mask on the feature names and the RFECV support mask.
    '''

    feature_names, X, y = get_windows_data(
        windows_data_frame,
        drop_only_almost_positives,
        drop_duplicates,
        transformer,
    )
    folds = StratifiedKFold(y, n_folds = n_folds, shuffle = True, random_state = SEED)
    selector = RFECV(estimator = classifier, cv = folds, step = step, scoring = scoring)
    selector.fit(X, y)
    selected_mask = selector.support_
    return util.apply_mask(feature_names, selected_mask)
コード例 #10
0
ファイル: Plots.py プロジェクト: ncvc/BigData
def recursiveFeatureElimination():
    """Run RFECV per point of interest and draw one score subplot for each.

    Points of interest come from ``getPointsOfInterest()``; each one's data is
    loaded through ``loadData`` with the ``DB`` handle. The subplots are laid
    out on a grid that always has at least as many cells as there are points.
    """
    with DB() as db:
        POIs = getPointsOfInterest()
        numCols = int(math.sqrt(len(POIs))) + 1
        # Ceiling division: int(sqrt(n)) rows is too few for some n (3, 7 or 8
        # points would overflow the grid and make plt.subplot raise), while
        # this gives the same row count whenever the old grid was big enough.
        numRows = (len(POIs) + numCols - 1) // numCols

        plt.figure()
        plt.subplots_adjust(left=None, bottom=None, right=None, top=None, wspace=0.5, hspace=0.5)
        for fignum, POI in enumerate(POIs, 1):
            x, y = loadData(db, POI['LAT'], POI['LONG'], generateAllFeatures)
            x, y = np.array(x), np.array(y)

            # Create the RFE object and compute a cross-validated score.
            # NOTE(review): SVR is a regressor but the CV is stratified and the
            # scoring is 'accuracy' -- confirm this combination is intended.
            svr = SVR(kernel="linear")
            rfecv = RFECV(estimator=svr, step=1, cv=StratifiedKFold(y, 2), scoring='accuracy')
            rfecv.fit(x, y)

            print("Optimal number of features : %d" % rfecv.n_features_)

            # Plot number of features VS. cross-validation scores
            plt.subplot(numRows, numCols, fignum)
            plt.title(POI['NAME'])
            plt.xlabel("Number of features selected")
            plt.ylabel("Cross validation score (nb of misclassifications)")
            plt.plot(range(1, len(rfecv.grid_scores_) + 1), rfecv.grid_scores_)
    plt.show()
コード例 #11
0
def main():
    args = getOptions()
    print args

    print "train file read"
    train_x, train_y = readfile_noid(args.train,'train')
    train_x_new, id = extractID(train_x)
    del id
    print "test file read"
    test_x, test_y = readfile_noid(args.test,'test')
    test_x_new, id = extractID(test_x)
    
    #remove feature with no distinction and less important
    print "remove feature with no distinction and less important"
    
    sel = VarianceThreshold()
    train_x_uniq = sel.fit_transform(train_x_new)
    test_x_uniq = sel.transform(test_x_new)

    #normalization
    print "normalization"
    train_x_nor, mean, std = normalize(train_x_uniq)
    test_x_nor, mean, std = normalize(test_x_uniq, mean, std)

    #feature selection
    print "feature selection"
    # Create the RFE object and compute a cross-validated score.
    svc = SVC(kernel="linear")
    # The "accuracy" scoring is proportional to the number of correct
    # classifications
    rfecv = RFECV(estimator=svc, step=1, cv=StratifiedKFold(train_y, 10),
                  scoring='accuracy')
    rfecv.fit(train_x_nor, train_y)
    
    print("Optimal number of features : %d" % rfecv.n_features_)
コード例 #12
0
ファイル: featureSelection.py プロジェクト: ekyauk/BeepBoop
def featureSelection(train_x, train_y):
    # Create the RFE object and compute a cross-validated score.
    svc = LinearSVC(C=1, class_weight='balanced')
    # The "accuracy" scoring is proportional to the number of correct
    # classifications
    lasso = RandomizedLasso()
    lasso.fit(train_x, train_y)
    rfecv = RFECV(estimator=svc, step=1, cv=5, scoring='accuracy')
    rfecv.fit(train_x, train_y)

    print("Optimal number of features : %d" % rfecv.n_features_)
    rankings = rfecv.ranking_
    lasso_ranks = lasso.get_support()
    lassoFeats = []
    recursiveFeats = []
    shouldUseFeats = []

    for i in range(len(rankings)):
        if lasso_ranks[i]:
            lassoFeats.append(feats[i])
        if rankings[i] == 1:
            recursiveFeats.append(feats[i])
            if lasso_ranks[i]:
                shouldUseFeats.append(feats[i])
    keyboard()
    print 'Should use ' + ', '.join(shouldUseFeats)
    # Plot number of features VS. cross-validation scores
    plt.figure()
    plt.xlabel("Number of features selected")
    plt.ylabel("Cross validation score (nb of correct classifications)")
    plt.plot(range(1, len(rfecv.grid_scores_) + 1), rfecv.grid_scores_)
    plt.show()
コード例 #13
0
ファイル: churn.py プロジェクト: melifluos/churn
def run_rfecv(X, y, clf_class, **kwargs):
    clf = clf_class(**kwargs)
    rfecv = RFECV(estimator=clf, step=1, cv=StratifiedKFold(y, 2), scoring='accuracy')
    rfecv.fit(X, y)
    plot_rfcev(rfecv)
    print "Optimal number of features : {0} for model: {1}".format(rfecv.n_features_, clf_class)
    return rfecv
コード例 #14
0
ファイル: main2.py プロジェクト: Zerowxm/kdd-cup2009
def plot_rfe(X,label):
    """Run RFECV with a linear SVC for the target column ``label`` and plot it.

    The 'churn', 'appetency', 'upselling' and ``label`` columns are dropped
    from ``X`` to form the feature table; ``X[label]`` is the target.
    """
    from sklearn.svm import SVC
    from sklearn.cross_validation import StratifiedKFold
    from sklearn.feature_selection import RFECV

    target = X[label]
    features = X.drop(['churn','appetency','upselling',label],axis='columns')

    # Create the RFE object and compute a cross-validated score.
    svc = SVC(kernel="linear")
    folds = StratifiedKFold(target, 2)
    rfecv = RFECV(estimator=svc, step=1, cv=folds, scoring='accuracy')
    rfecv.fit(features, target)

    print("Optimal number of features : %d" % rfecv.n_features_)

    # Plot number of features VS. cross-validation scores
    scores = rfecv.grid_scores_
    plt.figure()
    plt.xlabel("Number of features selected")
    plt.ylabel("Cross validation score (nb of correct classifications)")
    plt.plot(range(1, len(scores) + 1), scores)
    plt.show()
コード例 #15
0
ファイル: test.py プロジェクト: zedoul/air
def optimalFeatures(train,target):
    """Fit RFECV around a linear SVC with 3-fold StratifiedKFold and return it."""
    folds = StratifiedKFold(target,n_folds=3)
    selector = RFECV(SVC(kernel='linear'),cv=folds)
    selector.fit(train,target)
    print("Optimal number of features : %d" % selector.n_features_)
    return selector
コード例 #16
0
ファイル: featuresSelection.py プロジェクト: Johayon/BGD-Work
def featureSelection(X,y):
    """Return the columns of ``X`` kept by RFECV around a random forest.

    RFECV runs with step 1, cv=5 and 'accuracy' scoring on a 35-tree forest.

    :param X: feature table exposing ``columns`` (presumably a pandas
        DataFrame -- confirm with callers).
    :param y: targets passed to ``RFECV.fit``.
    :return: ``X.columns`` restricted to the RFECV support mask.
    """
    class RandomForestClassifierWithCoef(RandomForestClassifier):
        """Random forest that mirrors ``feature_importances_`` as ``coef_``."""

        def fit(self, *args, **kwargs):
            super(RandomForestClassifierWithCoef, self).fit(*args, **kwargs)
            self.coef_ = self.feature_importances_
            # Estimators are expected to return themselves from fit() so that
            # chained calls such as est.fit(X, y).predict(X) keep working.
            return self

    randfor = RandomForestClassifierWithCoef(n_estimators=35)
    rfecv = RFECV(estimator=randfor, step=1, cv=5,
                  scoring='accuracy',verbose=2)
    rfecv.fit(X,y)
    return X.columns[rfecv.get_support()]
コード例 #17
0
ファイル: actual.py プロジェクト: kenluck2001/AnswerClassify
def selectFeatures (clf, X, Y):
    """Reduce ``X`` to the columns selected by RFECV.

    RFECV wraps ``clf`` with step 1, a 5-fold StratifiedKFold on ``Y`` and
    'accuracy' scoring.

    :return: tuple ``(X[:, indices], indices)`` where ``indices`` is whatever
        ``find(support_mask, True)`` yields for the RFECV support mask.
    """
    folds = StratifiedKFold(Y, 5)
    selector = RFECV(estimator=clf, step=1, cv=folds, scoring='accuracy')
    selector.fit(X, Y)
    support_mask = selector.get_support()
    indices = find(support_mask, True)
    reduced = X[:, indices]
    return reduced, indices
コード例 #18
0
def main():
    xtrain=np.load('data/x_train.npy')
    ytrainreg=np.load('data/loss.npy')
    xtrain=xtrain[ytrainreg>0]
    ytrainreg=ytrainreg[ytrainreg>0]
    reg1=linear_model.SGDRegressor(loss='epsilon_insensitive',random_state=0,n_iter=5)
    selector1=RFECV(estimator=reg1,scoring='mean_squared_error',verbose=10)
    selector1.fit(xtrain,np.log(ytrainreg)) #training on the log of the loss
    print "sel1, optimal number of features:", selector1.n_features_
    np.save('features/reg_sel_sgd_eps.npy', selector1.support_)
コード例 #19
0
def feature_selection_with_scikit():
    """
    1-VarianceThreshold is a simple baseline approach to feature selection. It removes all features whose variance doesn’t
     meet some threshold. By default, it removes all zero-variance features, i.e. features that have the same value in
     all samples.
    2-Univariate feature selection works by selecting the best features based on univariate statistical tests.
     It can be seen as a preprocessing step to an estimator
    """
    p=0.8
    selector = VarianceThreshold(threshold=(p * (1 - p)))
    c=selector.fit_transform(X)
    print  "Number of the attribute before: ",X.shape[1]
    print "number of the attribute after:",c.shape[1]

    # selecting k best attribute instead of chi2, f_classif can also be used
    skb=SelectKBest(chi2, k=10)
    X_new=skb.fit_transform(X, y)
    attr=np.where(skb._get_support_mask(),attributeNames,'-1')

    print "Best attribute choosen with SelectKBest: "
    i=1
    for att in attr:
        if att!='-1':
            print i, ": ",att
            i+=1

    #using  ExtraTreesClassifier
    print "Using feature importance..."
    etc=ExtraTreesClassifier()
    etc.fit(X,y).transform(X)
    print etc.feature_importances_
    print etc.max_features
    print etc.max_depth

    print "Recursive feature selection : "
    from sklearn.svm import SVC
    import sklearn.linear_model as lm
    from sklearn.cross_validation import StratifiedKFold
    from sklearn.feature_selection import RFECV
    # Create the RFE object and compute a cross-validated score.
    estim=lm.LinearRegression()
    # The "accuracy" scoring is proportional to the number of correct
    # classifications
    rfecv = RFECV(estimator=estim, step=1, cv=StratifiedKFold(y, 2),
                  scoring='accuracy')
    rfecv.fit(X, y)

    print("Optimal number of features : %d" % rfecv.n_features_)

    # Plot number of features VS. cross-validation scores
    plt.figure()
    plt.xlabel("Number of features selected")
    plt.ylabel("Cross validation score (nb of correct classifications)")
    plt.plot(range(1, len(rfecv.grid_scores_) + 1), rfecv.grid_scores_)
    plt.show()
コード例 #20
0
ファイル: predictive_analysis.py プロジェクト: orazaro/kgml
def feature_selection_RFE_draft(fn ,ax=None, sel="all", goal="Linebreak", isclass=True,
        verbosity=0, nf=7):
    X, y, names = data_prepare(fn, sel=sel, goal=goal, verbosity=verbosity-1)
    if verbosity > 1:
        print "names:", ",".join(names)
    
    # Create the RFE object and compute a cross-validated score.
    if isclass:
        #estimator = svm.SVC(kernel="linear",C=1.0)
        estimator = get_clf('svm')    
        scoring = 'f1'
        cv = cross_validation.StratifiedKFold(y, 2)
    else:
        if False:
            from sklearn.ensemble import RandomForestRegressor
            if not hasattr(RandomForestRegressor,'coef_'):
                RandomForestRegressor.coef_ = property(lambda self:self.feature_importances_)
            estimator = RandomForestRegressor(n_estimators=100, max_depth=2, min_samples_leaf=2)
        else:
            estimator = linear_model.RidgeCV()
        scoring = 'mean_squared_error'
        cv = 3

    # The "accuracy" scoring is proportional to the number of correct
    # classifications
    if True:
        rfecv = RFECV(estimator=estimator, step=1, cv=cv, scoring=scoring)
    else:
        from kgml.rfecv import RFECVp
        f_estimator = get_clf('svm')
        rfecv = RFECVp(estimator=estimator,f_estimator=f_estimator, step=1, cv=cv, scoring=scoring)
        
    with warnings.catch_warnings():
        warnings.simplefilter("ignore")
        rfecv.fit(X, y)

    # Plot number of features VS. cross-validation scores
    ax.set_xlabel("Number of features selected")
    ax.set_ylabel("Cross validation score ({})".format(scoring))
    ax.plot(range(1, len(rfecv.grid_scores_) + 1), rfecv.grid_scores_)

    #print("Optimal number of features : %d" % rfecv.n_features_)
    best = names[rfecv.ranking_==1]

    rfe = RFE(estimator, n_features_to_select=1)
    rfe.fit(X,y)
    ranks = sorted(zip(map(lambda x: round(x, 4), rfe.ranking_), names))

    # reorder best using ranks
    best_set = set(best)
    best = [name for (i,name) in ranks if name in best_set]
    #print "The best features:", ', '.join(best)
    assert len(best) == len(best_set)

    return best, ranks
コード例 #21
0
def select_optimal_features(feature_matrix, y, classifier):
    """Keep only the columns of ``feature_matrix`` that RFECV ranks first.

    RFECV wraps ``classifier`` with step 1, a 5-fold StratifiedKFold on ``y``
    and 'accuracy' scoring. A score-curve figure is created but not shown.

    :param feature_matrix: 2-D array indexed as ``feature_matrix[:, col]``.
    :param y: targets passed to ``RFECV.fit``.
    :param classifier: estimator wrapped by RFECV.
    :return: tuple ``(selected_features, index)`` -- a float64 array holding
        the selected columns, and the list of their column positions.
    """
    cv = StratifiedKFold(y, 5)

    rfecv = RFECV(estimator=classifier, step=1, cv=cv, scoring="accuracy")
    print ("going to select optimal features")
    rfecv.fit(feature_matrix, y)
    print ("done selecting optimal features")

    print ("Optimal number of features : %d" % rfecv.n_features_)
    # ranking_ : array of shape [n_features]
    # ranking_[i] is the ranking position of the i-th feature; selected
    # (i.e., estimated best) features are assigned rank 1.
    ranked_features = rfecv.ranking_.tolist()

    # Compare by value: ``is 1`` tests object identity, which only happens to
    # work for small integers on some interpreters.
    index = [i for i, rank in enumerate(ranked_features) if rank == 1]

    print ("index is" + str(index))

    selected_features = np.zeros(shape=(len(feature_matrix), len(index)), dtype=np.float64)
    for column, val in enumerate(index):
        selected_features[:, column] = feature_matrix[:, val]

    plt.figure()
    plt.xlabel("Number of features selected")
    plt.ylabel("Cross validation score (nb of correct classifications)")
    plt.plot(range(1, len(rfecv.grid_scores_) + 1), rfecv.grid_scores_)

    return selected_features, index
コード例 #22
0
def recursiveFeatSelection():
    """Fit RFECV on the svmlight training file and print the feature count.

    File locations come from the module-level ``svmPath``, ``trainFile`` and
    ``testFile``. The test file is loaded with the training feature count but
    is not used afterwards.
    """
    train_path = svmPath + "/" + trainFile
    X_train, y_train = load_svmlight_file(train_path)
    test_path = svmPath + "/" + testFile
    X_test, y_test = load_svmlight_file(test_path, n_features=X_train.shape[1])

    clf = svm.SVC(kernel='linear', C=1024.0)
    folds = StratifiedKFold(y_train, 2)
    rfecv = RFECV(estimator=clf, step=1, cv=folds, scoring='f1')
    rfecv.fit(X_train, y_train)

    print("Optimal number of features : %d" % rfecv.n_features_)
コード例 #23
0
def SelectFeatures(featuresStructuresArray, labels):
    """Return the field names of the structured array that RFECV keeps.

    The structured array is converted with ``castStructuredArrayToRegular``
    and RFECV (cv=8) is fitted around a logistic regression.

    :return: numpy array of the selected field names.
    """
    estimator = LogisticRegression('l2', False)

    field_names = featuresStructuresArray.dtype.names
    plain_data = castStructuredArrayToRegular(featuresStructuresArray)

    selector = RFECV(estimator, cv=8)
    selector.fit(plain_data , labels)
    support_mask = selector.get_support()

    return np.array(field_names)[support_mask]
コード例 #24
0
def decision_tree():
    # Runs the same experiment twice, first on the module-level ``bc_*``
    # train/test data and then on the ``v_*`` data: decision trees with gini
    # and entropy criteria (unbounded and for each depth in DEPTHS) are handed
    # to ``_decision_tree`` together with a result label, then an RFECV
    # selector (cv=10) is fitted and its support, ranking, test score and
    # cross-validation score curve are reported.
    print "---bc---"
    clf = tree.DecisionTreeClassifier(criterion="gini")

    # The selector is built around the unbounded gini tree created above; it
    # is only fitted after all the _decision_tree runs for this data set.
    rfecv = RFECV(clf, cv=10)

    _decision_tree(clf, bc_data_train, bc_data_test, bc_target_train, bc_target_test, "bc_gini")
    for depth in DEPTHS:
        clf = tree.DecisionTreeClassifier(criterion="gini", max_depth=depth)
        _decision_tree(clf, bc_data_train, bc_data_test, bc_target_train, bc_target_test, "bc_gini" + str(depth))

    clf = tree.DecisionTreeClassifier(criterion="entropy")
    _decision_tree(clf, bc_data_train, bc_data_test, bc_target_train, bc_target_test, "bc_entropy")
    for depth in DEPTHS:
        clf = tree.DecisionTreeClassifier(criterion="entropy", max_depth=depth)
        _decision_tree(clf, bc_data_train, bc_data_test, bc_target_train, bc_target_test, "bc_entropy" + str(depth))

    # Feature elimination report for the bc data set.
    rfecv.fit(bc_data_train, bc_target_train)
    print rfecv.support_
    print rfecv.ranking_
    print rfecv.score(bc_data_test, bc_target_test)
    plt.figure()
    plt.xlabel("Number of features selected")
    plt.ylabel("Cross validation score (nb of correct classifications)")
    plt.plot(range(1, len(rfecv.grid_scores_) + 1), rfecv.grid_scores_)
    plt.show()


    print "---v---"
    clf = tree.DecisionTreeClassifier(criterion="gini")

    # Fresh selector for the second data set, again around the unbounded gini tree.
    rfecv = RFECV(clf, cv=10)

    _decision_tree(clf, v_data_train, v_data_test, v_target_train, v_target_test, "v_gini")
    for depth in DEPTHS:
        clf = tree.DecisionTreeClassifier(criterion="gini", max_depth=depth)
        _decision_tree(clf, v_data_train, v_data_test, v_target_train, v_target_test, "v_gini" + str(depth))

    clf = tree.DecisionTreeClassifier(criterion="entropy")
    _decision_tree(clf, v_data_train, v_data_test, v_target_train, v_target_test, "v_entropy")
    for depth in DEPTHS:
        clf = tree.DecisionTreeClassifier(criterion="entropy", max_depth=depth)
        _decision_tree(clf, v_data_train, v_data_test, v_target_train, v_target_test, "v_entropy" + str(depth))

    # Feature elimination report for the v data set.
    rfecv.fit(v_data_train, v_target_train)
    print rfecv.support_
    print rfecv.ranking_
    print rfecv.score(v_data_test, v_target_test)
    plt.figure()
    plt.xlabel("Number of features selected")
    plt.ylabel("Cross validation score (nb of correct classifications)")
    plt.plot(range(1, len(rfecv.grid_scores_) + 1), rfecv.grid_scores_)
    plt.show()
コード例 #25
0
ファイル: modeling.py プロジェクト: Divergent914/yakddcup2015
def lr_with_fs():
    """
    Logistic regression with RFECV feature selection.

    Submission: lr_with_fs_0703_01.csv

    The fitted RFECV selector is cached under 'lr_with_fs.RFECV.pkl'; it is
    only fitted (and its AUC curve saved to 'lr_with_fs.refcv') when the cache
    lookup returns None. The final pipeline scales the raw features, applies
    the selector, rescales and fits a LogisticRegressionCV.
    """
    from sklearn.linear_model import LogisticRegressionCV, LogisticRegression
    from sklearn.preprocessing import StandardScaler
    from sklearn.pipeline import Pipeline
    from sklearn.cross_validation import StratifiedKFold
    from sklearn.feature_selection import RFECV
    import pylab as pl

    X, y = dataset.load_train()

    raw_scaler = StandardScaler()
    raw_scaler.fit(X)
    X_scaled = raw_scaler.transform(X)

    cache_file = Path.of_cache('lr_with_fs.RFECV.pkl')
    rfe = IO.fetch_cache(cache_file)
    if rfe is None:
        base_lr = LogisticRegression(class_weight='auto')
        rfe = RFECV(estimator=base_lr, cv=StratifiedKFold(y, 5),
                    scoring='roc_auc')
        rfe.fit(X_scaled, y)
        IO.cache(rfe, cache_file)

        print("Optimal number of features : %d" % rfe.n_features_)

        # Plot number of features VS. cross-validation scores
        scores = rfe.grid_scores_
        pl.figure()
        pl.xlabel("Number of features selected")
        pl.ylabel("Cross validation score (AUC)")
        pl.plot(range(1, len(scores) + 1), scores)
        pl.savefig('lr_with_fs.refcv')

    X_pruned = rfe.transform(X_scaled)

    # Rescale after pruning so the final classifier sees standardised inputs.
    new_scaler = StandardScaler()
    new_scaler.fit(X_pruned)
    X_new = new_scaler.transform(X_pruned)

    clf = LogisticRegressionCV(cv=10, scoring='roc_auc', n_jobs=-1)
    clf.fit(X_new, y)

    print('CV scores: %s' % clf.scores_)
    print('Ein: %f' % Util.auc_score(clf, X_new, y))

    steps = [
        ('scale_raw', raw_scaler),
        ('rfe', rfe),
        ('scale_new', new_scaler),
        ('lr', clf),
    ]
    IO.dump_submission(Pipeline(steps), 'lr_with_fs_0703_01')
コード例 #26
0
ファイル: features.py プロジェクト: sirfoga/hal
 def get_best(self):
     """Finds the optimal number of features

     Fits RFECV (step 1, 2-fold StratifiedKFold on ``self.y_train``,
     "log_loss" scoring) around a linear SVC on ``self.x_train``.

     :return: optimal number of features and ranking
     """
     # "log_loss" is scored from class probabilities, and SVC only provides
     # predict_proba when it is built with probability=True.
     svc = SVC(kernel="linear", probability=True)
     rfecv = RFECV(
         estimator=svc,
         step=1,
         cv=StratifiedKFold(self.y_train, 2),
         scoring="log_loss"
     )
     rfecv.fit(self.x_train, self.y_train)
     return rfecv.n_features_, rfecv.ranking_
コード例 #27
0
ファイル: classify.py プロジェクト: fxfactorial/macholibre
def feature_selection_rfecv(x, y):
    # Create the RFE object and compute a cross-validated score.
    dtc = DecisionTreeClassifier()
    # The "accuracy" scoring is proportional to the number of correct classifications
    rfecv = RFECV(estimator=dtc, step=1, cv=StratifiedKFold(y, 2), scoring='accuracy')
    rfecv.fit(x, y)
    print 'Optimal number of features: %d' % rfecv.n_features_
    # Plot number of features VS. cross-validation scores
    plt.figure()
    plt.xlabel('Number of features selected')
    plt.ylabel('Cross validation score (nb of correct classifications)')
    plt.plot(range(1, len(rfecv.grid_scores_) + 1), rfecv.grid_scores_)
    plt.show()
コード例 #28
0
def adjust_optimal_features_using_recursive_feature_elimination(class_name, training_set):
  """Select the top-ranked numerical characteristics for one class via RFECV.

  For every sample in ``training_set`` the label ``sample["classes"][class_name]``
  and the features returned by ``select_numerical_characteristics(sample)`` are
  collected, a linear-SVC ``RFECV`` (step 1, 2-fold stratified CV, accuracy
  scoring) is fitted on them, and the entries of ``numerical_characteristics``
  whose ranking is 1 are returned.

  :param class_name: key into each sample's ``"classes"`` mapping
  :param training_set: iterable of samples (iterated twice)
  :return: list of the selected entries of ``numerical_characteristics``
  """
  # Build real lists: on Python 3 ``map`` yields a one-shot iterator, which
  # would be exhausted by StratifiedKFold before ``fit`` sees it and is not
  # an array-like that scikit-learn accepts.
  class_names = [sample["classes"][class_name] for sample in training_set]
  numerical_characteristics_training_set = [
      select_numerical_characteristics(sample) for sample in training_set
  ]
  # Create the RFE object and compute a cross-validated score.
  svc = SVC(kernel="linear")
  # The "accuracy" scoring is proportional to the number of correct classifications
  rfecv = RFECV(estimator=svc, step=1, cv=StratifiedKFold(class_names, 2), scoring='accuracy')
  # ToDo: check if class has more than one representant always
  rfecv.fit(numerical_characteristics_training_set, class_names)
  optimal_features_indexes = [i for i, x in enumerate(rfecv.ranking_) if x == 1]
  print("Optimal number of features : %d" % rfecv.n_features_)
  # NOTE(review): ``numerical_characteristics`` is not defined in this function;
  # presumably a module-level sequence of feature names -- confirm.
  return [numerical_characteristics[i] for i in optimal_features_indexes]
コード例 #29
0
ファイル: training.py プロジェクト: cms-ttH/ttH-TauRoast
def run_feature_elimination(outdir, bdts, x, y, setup):
    """Run RFECV for every classifier in ``bdts`` and log the feature ranking.

    For the n-th classifier an ``RFECV`` (step 1, module-level ``CV`` splitter,
    ROC-AUC scoring) is fitted on ``x`` / ``y``, handed to
    ``plot_feature_elimination`` and summarised in
    ``<outdir>/bdt-<n>/log-feature-elimination.txt``: the optimal feature count
    followed by one ranking line per entry of ``setup["variables"]``.
    """
    logging.info("starting feature selection")
    for index, classifier in enumerate(bdts):
        selector = RFECV(estimator=classifier, step=1, cv=CV, scoring='roc_auc')  # new in 18.1: , n_jobs=NJOBS)
        selector.fit(x, y)

        plot_feature_elimination(outdir, selector, index)

        # Collect the report pieces and join once at the end.
        pieces = [
            u'Feature selection\n=================\n\n',
            u'optimal feature count: {}\n\nranking\n-------\n'.format(selector.n_features_),
        ]
        for position, variable in enumerate(setup["variables"]):
            pieces.append(u'{:30}: {:>5}\n'.format(variable, selector.ranking_[position]))
        report = u''.join(pieces)

        log_path = os.path.join(outdir, "bdt-{}".format(index), "log-feature-elimination.txt")
        with codecs.open(log_path, "w", encoding="utf8") as handle:
            handle.write(report)
コード例 #30
0
def rfe_cross_validate(X, y):
    """Fit a logistic-regression RFECV on (X, y) and plot its CV score curve.

    The selector removes one feature per step, uses 2-fold stratified CV
    built from ``y`` and accuracy scoring. Nothing is returned; the plot of
    ``grid_scores_`` against the number of features is shown.
    """
    selector = RFECV(estimator=LogisticRegression(), step=1,
                     cv=StratifiedKFold(y, 2), scoring='accuracy')
    selector.fit(X, y)

    # Draw the score-per-feature-count curve.
    plt.figure()
    plt.xlabel("Number of features selected")
    plt.ylabel("Cross validation score (nb of correct classifications)")
    cv_scores = selector.grid_scores_
    feature_counts = range(1, len(cv_scores) + 1)
    plt.plot(feature_counts, selector.grid_scores_)
    plt.show()
    "metric": 'auc',
    "verbosity": -1,
    'reg_alpha': 0.2899927210061127,
    'reg_lambda': 0.4485237330340494,
    'random_state': 53
}

# In[9]:

# LightGBM classifier configured from the ``params`` dict defined above.
clf = lgb.LGBMClassifier(**params)
#(n_splits=6, shuffle=False) 'accuracy', 'binary_logloss', 'precision', 'recall'
# RFECV removing 10 features per round, 5-fold CV, scored by ROC AUC.
rfe = RFECV(estimator=clf, step=10, cv=5, scoring='roc_auc', verbose=2)

# In[10]:

rfe.fit(train_x, train_y)

# In[11]:

# Print the columns RFECV ranked 1 (i.e. the selected ones).
for col in train_x.columns[rfe.ranking_ == 1]:
    print(col)

# In[14]:

# Persist the selected column names as a one-column CSV.
most_influential = pd.DataFrame(
    [col for col in train_x.columns[rfe.ranking_ == 1]], columns=['features'])
most_influential.to_csv('Import_feature.csv')

# In[8]:

# NOTE(review): this reads 'Inputs/Import_feature.csv', not the
# 'Import_feature.csv' written just above -- confirm the intended path.
most_influential = pd.read_csv('Inputs/Import_feature.csv')
コード例 #32
0
ファイル: Model.py プロジェクト: NachoAG76/Kaggle
score
accuracy=accuracy_score(y_test, classes)

t2=pd.DataFrame(classes)

pd.DataFrame(y_test2).describe()
pd.DataFrame(classes).describe()
#Mélange 2 modeles



import matplotlib.pyplot as plt
from sklearn.cross_validation import StratifiedKFold
from sklearn.feature_selection import RFECV

# Create the RFE object and compute a cross-validated score.
# Despite the variable name, the estimator is a logistic regression.
svc = LogisticRegression(C=0.6)
# Feature subsets are scored with 'log_loss' (not accuracy); step and cv
# are left at the RFECV defaults.
rfecv = RFECV(estimator=svc,scoring='log_loss')
rfecv.fit(train,targets_tr)

print("Optimal number of features : %d" % rfecv.n_features_)

# Plot number of features VS. cross-validation scores
# NOTE(review): the y-axis label below talks about correct classifications,
# but the scorer configured above is 'log_loss' -- confirm which is intended.
plt.figure()
plt.xlabel("Number of features selected")
plt.ylabel("Cross validation score (nb of correct classifications)")
plt.plot(range(1, len(rfecv.grid_scores_) + 1), rfecv.grid_scores_)
plt.show()
コード例 #33
0
    def go(self, all_data, cols, colsP):
        """Run several feature-selection methods and return the combined column set.

        The rows of ``all_data`` with ``SalePrice > 0`` are scaled with a
        ``RobustScaler`` and fed to, in order: OLS backward elimination
        (``self.backwardElimination``), Lasso-based ``RFECV``, a sequential
        feature selector, two ``SelectKBest`` passes (``f_regression`` and
        ``mutual_info_regression``) and an XGBoost importance-threshold sweep.
        Summaries are printed along the way.

        :param all_data: frame holding the ``cols`` columns and ``SalePrice``
        :param cols: candidate feature columns; used via ``.values`` and
            ``.difference`` (presumably a pandas Index -- confirm)
        :param colsP: extra columns unioned into the final selection
        :return: tuple ``(DataObject(self.trainingData, self.testingData,
            self.combinedData), totalCols, RFEcv, XGBestCols)``
        """
        # Training rows are those with a positive SalePrice; rows with
        # SalePrice == 0 form ``test`` (not used further in this method).
        train = all_data.loc[(all_data.SalePrice > 0),
                             cols].reset_index(drop=True, inplace=False)
        y_train = all_data.SalePrice[all_data.SalePrice > 0].reset_index(
            drop=True, inplace=False)
        test = all_data.loc[(all_data.SalePrice == 0),
                            cols].reset_index(drop=True, inplace=False)
        # Main script here
        scale = RobustScaler()
        df = pd.DataFrame(scale.fit_transform(train[cols]), columns=cols)
        #select features based on P values
        ln_model = sm.OLS(y_train, df)
        result = ln_model.fit()
        print(result.summary2())

        pv_cols = cols.values
        SL = 0.051
        pv_cols, LR = self.backwardElimination(df, y_train, SL, pv_cols)

        pred = LR.predict(df[pv_cols])

        # This y_pred is not read before being reassigned in the threshold
        # loop further down.
        y_pred = pred.apply(lambda x: 1 if x > 0.5 else 0)

        print('Fvalue: {:.6f}'.format(LR.fvalue))
        print('MSE total on the train data: {:.4f}'.format(LR.mse_total))

        # Recursive feature elimination with 5-fold CV around a Lasso model.
        ls = Lasso(alpha=0.0005,
                   max_iter=161,
                   selection='cyclic',
                   tol=0.002,
                   random_state=101)
        rfecv = RFECV(estimator=ls,
                      n_jobs=-1,
                      step=1,
                      scoring='neg_mean_squared_error',
                      cv=5)
        rfecv.fit(df, y_train)

        select_features_rfecv = rfecv.get_support()
        RFEcv = cols[select_features_rfecv]
        print('{:d} Features Select by RFEcv:\n{:}'.format(
            rfecv.n_features_, RFEcv.values))

        # Sequential feature selection scored with R^2, using a fresh Lasso
        # configured identically to the one above.
        score = r2_score
        ls = Lasso(alpha=0.0005,
                   max_iter=161,
                   selection='cyclic',
                   tol=0.002,
                   random_state=101)
        sbs = SequentialFeatureSelection(ls, k_features=1, scoring=score)
        sbs.fit(df, y_train)

        print('Best Score: {:2.2%}\n'.format(max(sbs.scores_)))
        print('Best score with:{0:2d}.\n'.\
           format(len(list(df.columns[sbs.subsets_[np.argmax(sbs.scores_)]]))))
        # Among the subsets that reach the maximum score, take the one with
        # the highest position in ``sbs.scores_``.
        SBS = list(df.columns[list(sbs.subsets_[max(
            np.arange(0,
                      len(sbs.scores_))[(sbs.scores_ == max(sbs.scores_))])])])
        print('\nBest score with {0:2d} features:\n{1:}'.format(len(SBS), SBS))

        # Univariate selection: top 80 by F-regression ...
        skb = SelectKBest(score_func=f_regression, k=80)
        skb.fit(df, y_train)
        select_features_kbest = skb.get_support()
        kbest_FR = cols[select_features_kbest]
        # ``scores`` is assigned here and below but not read in this method.
        scores = skb.scores_[select_features_kbest]

        # ... and top 80 by mutual information.
        skb = SelectKBest(score_func=mutual_info_regression, k=80)
        skb.fit(df, y_train)
        select_features_kbest = skb.get_support()
        kbest_MIR = cols[select_features_kbest]
        scores = skb.scores_[select_features_kbest]

        # Hold out 30% of the training rows to evaluate the XGBoost sweep.
        X_train, X_test, y, y_test = train_test_split(df,
                                                      y_train,
                                                      test_size=0.30,
                                                      random_state=101)

        # fit model on all training data
        #importance_type='gain'
        model = XGBRegressor(base_score=0.5,
                             colsample_bylevel=1,
                             colsample_bytree=1,
                             gamma=0,
                             max_delta_step=0,
                             random_state=101,
                             min_child_weight=1,
                             missing=None,
                             n_jobs=4,
                             scale_pos_weight=1,
                             seed=None,
                             silent=True,
                             subsample=1)

        model.fit(X_train, y)

        # Using each unique importance as a threshold
        thresholds = np.sort(np.unique(model.feature_importances_))
        # Running best (lowest) MSE and the feature count that achieved it.
        # ``my_model`` and ``threshold`` are tracked but not returned.
        best = 1e36
        colsbest = 31
        my_model = model
        threshold = 0

        for thresh in thresholds:
            # select features using threshold
            selection = SelectFromModel(model, threshold=thresh, prefit=True)
            select_X_train = selection.transform(X_train)
            # train model
            selection_model = XGBRegressor(base_score=0.5,
                                           colsample_bylevel=1,
                                           colsample_bytree=1,
                                           gamma=0,
                                           max_delta_step=0,
                                           random_state=101,
                                           min_child_weight=1,
                                           missing=None,
                                           n_jobs=4,
                                           scale_pos_weight=1,
                                           seed=None,
                                           silent=True,
                                           subsample=1)
            selection_model.fit(select_X_train, y)
            # eval model
            select_X_test = selection.transform(X_test)
            y_pred = selection_model.predict(select_X_test)
            # Predictions are rounded before R^2 / MSE are computed.
            predictions = [round(value) for value in y_pred]
            r2 = r2_score(y_test, predictions)
            mse = mean_squared_error(y_test, predictions)
            print(
                "Thresh={:1.3f}, n={:d}, R2: {:2.2%} with MSE: {:.4f}".format(
                    thresh, select_X_train.shape[1], r2, mse))
            # ``>=`` means a tie replaces the earlier (lower) threshold,
            # since thresholds are visited in ascending order.
            if (best >= mse):
                best = mse
                colsbest = select_X_train.shape[1]
                my_model = selection_model
                threshold = thresh

        feature_importances = [
            (score, feature)
            for score, feature in zip(model.feature_importances_, cols)
        ]
        # Keep the ``colsbest`` most important features, then re-sort that
        # slice in ascending (score, feature) order.
        XGBest = pd.DataFrame(sorted(
            sorted(feature_importances, reverse=True)[:colsbest]),
                              columns=['Score', 'Feature'])
        XGBestCols = XGBest.iloc[:, 1].tolist()

        # Union and intersection across all six selection methods; the
        # intersection is only printed.
        bcols = set(pv_cols).union(set(RFEcv)).union(set(kbest_FR)).union(
            set(kbest_MIR)).union(set(XGBestCols)).union(set(SBS))
        intersection = set(SBS).intersection(set(kbest_MIR)).intersection(
            set(RFEcv)).intersection(set(pv_cols)).intersection(
                set(kbest_FR)).intersection(set(XGBestCols))
        print(intersection, '\n')
        print('_' * 75, '\nUnion All Features Selected:')
        print('Total number of features selected:', len(bcols))
        print('\n{0:2d} features removed if use the union of selections: {1:}'.
              format(len(cols.difference(bcols)), cols.difference(bcols)))

        totalCols = list(bcols.union(set(colsP)))
        #self.trainingData = self.trainingData.loc[list(totalCols)].reset_index(drop=True, inplace=False)
        #self.testingData = self.testingData.loc[list(totalCols)].reset_index(drop=True, inplace=False)
        #self.combinedData = [self.trainingData, self.testingData]

        # The instance's data attributes are passed through as they are;
        # this method does not assign to them.
        return DataObject(self.trainingData, self.testingData,
                          self.combinedData), totalCols, RFEcv, XGBestCols
コード例 #34
0
# Build a classification task using 3 informative features
X, y = make_classification(n_samples=1000,
                           n_features=25,
                           n_informative=3,
                           n_redundant=2,
                           n_repeated=0,
                           n_classes=8,
                           n_clusters_per_class=1,
                           random_state=0)

# Create the RFE object and compute a cross-validated score.
svc = SVC(kernel="linear")
# The "accuracy" scoring is proportional to the number of correct
# classifications
rfecv = RFECV(estimator=svc, step=1, cv=StratifiedKFold(2), scoring='accuracy')
rfecv.fit(X, y)

# Print the result
print("Optimal number of features : %d" % rfecv.n_features_)
# Plot number of features VS. cross-validation scores

# 1-based indices of the selected features (argwhere is 0-based, hence + 1)
support = np.argwhere(rfecv.support_ == True) + 1
print("Optimal index of features : {}".format(support))

# Left panel of a 1x2 grid: CV score against number of features.
plt.figure()
plt.subplot(1, 2, 1)
plt.xlabel("Number of features selected")
plt.ylabel("Cross validation score (nb of correct classifications)")
plt.plot(range(1, len(rfecv.grid_scores_) + 1), rfecv.grid_scores_)
コード例 #35
0
# When a full report is requested, run RFECV once per candidate classifier
# and record the selected features and the run time of each.
if full_report:
    dic_rfecv_alg = []
    for clf in [dtc, abc, rfc]:
        t000 = time()

        #### rfecv object
        # One feature removed per step; 10 stratified shuffle splits with a
        # fixed seed; F1 scoring.
        rfecv = RFECV(estimator=clf,
                      step=1,
                      cv=StratifiedShuffleSplit(fullset_labels,
                                                10,
                                                random_state=15),
                      scoring='f1')

        #### fit on fullset of features
        rfecv_clf = rfecv.fit(fullset_features, fullset_labels)
        rfecv_best = rfecv_clf.estimator_

        # Keep the feature names whose support flag is set.
        # NOTE(review): the first entry of full_features_list is skipped --
        # presumably it is the label column; confirm.
        rfecv_features_list = []
        for (x, y) in zip(full_features_list[1:], rfecv.support_):
            if y:
                rfecv_features_list.append(x)

        #### populate list of rfecv times
        dic_rfecv_alg.append({
            "algorithm": clf,
            "run_time": round(time() - t000, 2),
            "rfecv_features_list": rfecv_features_list
        })

        #### Plot number of features VS. cross-validation scores
コード例 #36
0
# Default: drop 10 features per RFECV round.
step = 10
# For the user 'stone', work on the first 2000 rows only and use a
# fractional step of 0.1 instead.
if getpass.getuser() == 'stone':
    train_df = train_df[:2000]
    step = 0.1
print('数据量:', train_df.shape)

# ---------------- Feature selection ------------------

xgb_estimator = XGBClassifier(
    objective='binary:logistic',
    nthread=-1,
    seed=36)

# Iteratively select features with cross-validated XGBoost (5-fold, ROC AUC).
print('XGB交叉验证迭代筛选特征...')
selector = RFECV(xgb_estimator, step=step, cv=5, scoring='roc_auc')
selector.fit(train_df[predictors], train_df[target])

# Apply the selection
print('筛选最优特征...')
selected_features = selector.transform(all_features_df[predictors])
# Boolean mask of kept predictors, indexed by predictor name.
support = pd.Series(selector.support_, index=predictors)

# ------------- Save the selection result --------------

print("存储选择结果中:")
print('位置:../../data/data_4/selection_result_xgb.csv')
support.to_csv('../../data/data_4/selection_result_xgb.csv', encoding='gbk')

# ---------------- Print the elapsed time --------------------
# Reported as whole seconds and as whole minutes (time_spend // 60).
time_spend = time.time() - time_begin
print('\n运行时间:%d 秒,约%d分钟\n' % (time_spend, time_spend // 60))
コード例 #37
0
def main():
    """Train a linear SVM on the cleaned data set, then run RFECV on it.

    Reads the CSV, drops the leftover ``'Unnamed: 0'`` column, splits 90/10
    into train/test, fits the SVM and prints the classification report and
    confusion matrix. It then fits a 5-fold stratified, accuracy-scored RFECV
    on the training split, prints the optimal feature count and ranking,
    shows the score curve and finally prints the elapsed wall-clock time.
    """
    t_start = time.time()

    # Load the data and remove the stray index column.
    csv_path = '/home/markg/Documents/TCD/ML/ML1819--task-107--team-11/cleanData.csv'
    frame = pd.read_csv(csv_path, encoding='latin-1')
    frame.drop(columns=['Unnamed: 0'], inplace=True)

    # Predictors are every column except the target.
    predictors = frame.drop('gender_catg', axis=1)
    target = frame['gender_catg']

    # 90% training / 10% testing.
    X_train, X_test, y_train, y_test = train_test_split(
        predictors, target, test_size=0.10)

    # Fit the classifier and report its hold-out metrics.
    classifier = SVC(C=1, gamma=0.3, kernel='linear')
    classifier.fit(X_train, y_train)

    predicted = classifier.predict(X_test)
    print(classification_report(y_test, predicted))
    print(confusion_matrix(y_test, predicted))

    # Cross-validated recursive feature elimination around the same SVM.
    selector = RFECV(
        estimator=classifier,
        step=1,
        cv=StratifiedKFold(5),
        scoring='accuracy',
    )
    selector.fit(X_train, y_train)
    print("Optimal number of features : %d" % selector.n_features_)
    print("Feature ranking: ", selector.ranking_)

    # CV score against number of features.
    plt.figure()
    plt.xlabel("Number of features selected")
    plt.title("SVM")
    plt.ylabel("Accuracy 5-Fold Cross validation Score")
    plt.plot(range(1, len(selector.grid_scores_) + 1), selector.grid_scores_)
    plt.show()

    # Elapsed time.
    t_end = time.time()
    elapsed = t_end - t_start
    print("Time taken:", elapsed)
def roc_auc_with_multi_labels(estimator, X, y, n_classes=3, weight='micro'):
    '''
    Score a multi-label estimator by micro- or macro-averaged ROC AUC.

    ROC is originally defined for binary classification; for the multi-label
    extension see https://www.jianshu.com/p/00ef5b63dfc8

    :param estimator: fitted estimator whose ``predict_proba(X)`` yields one
        array per label; column 1 of each array is used as that label's score
    :param X: samples to score
    :param y: iterable of comma-separated 0/1 strings, one per sample
        (e.g. ``'1,0,0'``)
    :param n_classes: number of per-label curves averaged for ``'macro'``
    :param weight: ``'micro'`` or ``'macro'``
    :return: the averaged ROC AUC
    :raises ValueError: if ``weight`` is neither ``'micro'`` nor ``'macro'``
    '''
    # Validate first so a bad ``weight`` fails before any prediction work.
    # (The original raise sat at the end with an unterminated string literal,
    # which made the whole module a SyntaxError.)
    if weight not in ('micro', 'macro'):
        raise ValueError('"weight" is not in ("macro", "micro")')

    y_pred = estimator.predict_proba(X)
    y_pred_ = np.array([_[:, 1] for _ in y_pred]).T
    y_test = np.array([list(map(int, _.split(','))) for _ in y])
    fpr = dict()
    tpr = dict()
    roc_auc = dict()
    # Per-label ROC curves.
    for i in range(y_test.shape[1]):
        fpr[i], tpr[i], _ = roc_curve(y_test[:, i], y_pred_[:, i])
        roc_auc[i] = auc(fpr[i], tpr[i])

    # Compute micro-average ROC curve and ROC area (method two)
    fpr["micro"], tpr["micro"], _ = roc_curve(y_test.ravel(), y_pred_.ravel())
    roc_auc["micro"] = auc(fpr["micro"], tpr["micro"])

    if weight == 'micro':
        return roc_auc["micro"]

    # weight == 'macro'
    # First aggregate all false positive rates
    all_fpr = np.unique(np.concatenate([fpr[i] for i in range(n_classes)]))

    # Then interpolate all ROC curves at these points; np.interp replaces
    # the bare ``interp`` alias, which newer SciPy releases no longer export.
    mean_tpr = np.zeros_like(all_fpr)
    for i in range(n_classes):
        mean_tpr += np.interp(all_fpr, fpr[i], tpr[i])

    # Finally average it and compute AUC
    mean_tpr /= n_classes
    fpr["macro"] = all_fpr
    tpr["macro"] = mean_tpr
    roc_auc["macro"] = auc(fpr["macro"], tpr["macro"])
    return roc_auc["macro"]


def plot_roc_cv(Scores):
    """Plot the spread of RFECV scores across repeated runs.

    ``Scores`` is converted to a 2-D array (one row per run, one column per
    feature count). The band between the column-wise min and max is shaded,
    and the max, min, mean and mean +/- std curves plus two horizontal guide
    lines at the overall extremes are drawn. The figure is shown, not
    returned.
    """
    score_matrix = np.array(Scores)
    n_fea = score_matrix.shape[1]
    fig, ax = plt.subplots()

    # Column-wise statistics, computed once and reused below.
    xs = range(1, n_fea + 1)
    col_max = score_matrix.max(axis=0)
    col_min = score_matrix.min(axis=0)
    overall_max = max(col_max)
    overall_min = min(col_min)

    ax.fill_between(xs, col_max, col_min, color='skyblue')

    col_mean = score_matrix.mean(axis=0)
    col_std = score_matrix.std(axis=0)
    line_specs = [
        (col_max, '-*', dict(lw=2, color='orange', label='max')),
        (col_min, '-*', dict(lw=2, color='blue', label='min')),
        (col_mean, '--', dict(lw=1, color='green', label='mean')),
        (col_mean + col_std, '--', dict(lw=1.5, color='darksalmon', label='mean + std')),
        (col_mean - col_std, '--', dict(lw=1.5, color='darkviolet', label='mean - std')),
        ([overall_max] * n_fea, '--', dict(lw=1, color='lawngreen')),
        ([overall_min] * n_fea, '--', dict(lw=1, color='lawngreen')),
    ]
    for values, fmt, options in line_specs:
        ax.plot(xs, values, fmt, **options)

    plt.legend(loc='upper right', fontsize=7)
    plt.ylim([0, 1])
    plt.xlim([1, 9])
    plt.xlabel('feature num')
    plt.ylabel('AUC')

    # Regular ticks every 0.2 plus the two observed extremes.
    tick_values = [_/10 for _ in range(0, 11, 2)]
    tick_values.append(round(overall_max, 3))
    tick_values.append(round(overall_min, 3))
    tick_values = sorted(tick_values)
    plt.yticks(tick_values, tick_values)

    plt.title('Feature importance selected by RFECV \n with RF(100 tree)')
    plt.show()


if __name__ == '__main__':
    # Features are every column but the last two; the target is 'Transfer'.
    df = pd.read_csv('feature_label.csv', sep='\t')
    X = df[df.columns[:-2]]
    y = df['Transfer'].values
    # Number of shuffled repetitions of the RFECV experiment.
    N = 100
    Scores = []
    Best_fea_num = []
    Ranks = []
    t0 = time.time()

    for i in range(1, N+1):
        # One-vs-rest indicator matrix, shuffled with a per-round seed and
        # then encoded as 'a,b,c' strings, one per sample.
        y1 = label_binarize(y, classes=['Normal', 'I_III', 'IV'])
        X1, y1 = shuffle(X, y1, random_state=i)
        y1 = list(map(lambda x:'{},{},{}'.format(*x), y1))
        # NOTE(review): RandomForestClassifierRefcv and
        # roc_auc_with_multi_labels_binary are not defined in the visible
        # part of this file -- confirm they exist elsewhere.
        model = RandomForestClassifierRefcv(n_estimators=100)
        rfecv = RFECV(estimator=model, step=1, cv=StratifiedKFold(6),
                  n_jobs=-1, scoring=roc_auc_with_multi_labels_binary)
        rfecv.fit(X1, y1)
        Best_fea_num.append(rfecv.n_features_)
        Scores.append(rfecv.grid_scores_)
        Ranks.append(rfecv.ranking_)
        # Progress report every 10 rounds.
        if i % 10 == 0:
            t1 = time.time()
            print('finish {} round test in {} sec'.format(i, t1-t0))
            t0 = t1
    # This call was indented by three spaces, which matched no enclosing
    # block and raised an IndentationError; it belongs to the guard body.
    plot_roc_cv(Scores)
コード例 #39
0
#%%
# Keep the 3 features with the highest chi-squared score.
select_feature = SelectKBest(chi2, k=3).fit(x_train, y_train)
x_train_chi = select_feature.transform(x_train)
x_test_chi = select_feature.transform(x_test)
# %%
# NOTE(review): clf_lr is fitted again further down; if fit() returns the
# estimator itself (the scikit-learn convention), lr_chi_model and
# lr_rfe_model are the same object -- confirm this is intended.
lr_chi_model = clf_lr.fit(x_train_chi, y_train)

# %%
# Recursive feature elimination without cross-validation, one feature per step.
rfe = RFE(estimator=clf_lr, step=1)
rfe = rfe.fit(x_train, y_train)
x_train_rfe = rfe.transform(x_train)
x_test_rfe = rfe.transform(x_test)
lr_rfe_model = clf_lr.fit(x_train_rfe, y_train)

# %%
# Same elimination, with the feature count chosen by 5-fold CV accuracy.
rfecv = RFECV(estimator=clf_lr, step=1, cv=5, scoring='accuracy')
rfecv = rfecv.fit(x_train, y_train)

#%%
# Cross-validated accuracy against the number of features kept.
plt.figure()
plt.xlabel("Number of features selected")
plt.ylabel("Cross validated Accuracy")
# The same curve is drawn twice: once as a plain line, once with markers.
plt.plot(range(1, len(rfecv.grid_scores_) + 1), rfecv.grid_scores_)
plt.plot(range(1, len(rfecv.grid_scores_) + 1), rfecv.grid_scores_, '-o')
plt.title("Recursive Feature Elimination with Cross Validation")
# Save before show(): once show() has displayed and released the figure, a
# later savefig() writes a new, empty one.
plt.savefig("D:\\Autumn 2019\\Research\\Feature_Extracting\\StimOn")
plt.show()

# %%
コード例 #40
0
def get_new_mask_2(X, y):
    """Return the RFECV support mask for a linear regression on (X, y).

    One feature is removed per step and subsets are compared with 2-fold
    cross-validation.
    """
    fitted = RFECV(LinearRegression(), step=1, cv=2).fit(X, y)
    return fitted.support_
コード例 #41
0
def get_new_mask(X, y, model=LinearRegression()):
    """Return the RFECV support mask for ``model`` fitted on (X, y).

    One feature is removed per step and subsets are compared with 3-fold
    cross-validation. ``model`` defaults to a linear regression.
    """
    eliminator = RFECV(model, step=1, cv=3)
    return eliminator.fit(X, y).support_
コード例 #42
0
    plt.figure()
    sns.distplot(train['wind_speed'])
    plt.figure()
    sns.distplot(train['wind_direction'])
    plt.figure()
    sns.distplot(train['precipitation'])
    plt.figure()
    sns.distplot(train['temp'])

    # Feature selection
    features = [
        'wind_speed', 'wind_direction', 'temp', 'precipitation', 'hour',
        'month'
    ]
    max_power = train['Power'].max()
    train['Power'] = train['Power'].divide(max_power)
    train, test = train_test_split(train, train_size=0.85, shuffle=False)
    scaler = StandardScaler()
    X_train = scaler.fit_transform(train[features].values)
    y_train = train['Power'].divide(train['Power'].max()).values

    model = lgb.LGBMRegressor()
    rfe = RFECV(estimator=model, cv=10, step=1)
    rfe = rfe.fit(X_train, y_train)
    print rfe.support_

    model = xgb.XGBRegressor()
    rfe = RFECV(estimator=model, cv=10, step=1)
    rfe = rfe.fit(X_train, y_train)
    print rfe.support_
            'NumCharacter'
        ]])

test_X.to_csv('test_cleaned.csv')
train_X.to_csv('train_X_cleaned.csv')
train_Y.to_csv('train_Y_cleaned.csv')

from sklearn.feature_selection import RFECV
from sklearn.linear_model import LinearRegression
# Hold out 20% of the cleaned training data, with a fixed seed.
X_train, X_test, y_train, y_test = train_test_split(train_X,
                                                    train_Y,
                                                    test_size=0.2,
                                                    random_state=0)
model = LinearRegression()
# Recursive feature elimination, one feature per step, scored by negative MSE.
rfecv = RFECV(model, step=1, scoring='neg_mean_squared_error')
rfecv.fit(X_train, y_train.values.ravel())
# Recursive feature elimination
# Number of best features
rfecv.n_features_
# The number of best features is 56, which means that, based on the recursive cross-validation result, we should use every feature.
rfecv.support_
rfecv.ranking_
# Reduce X_train to the selected features (the result is not stored here)
rfecv.transform(X_train)
# Use the fitted selector to predict the X_test values
ypred1 = rfecv.predict(X_test)
mean_squared_error(y_test.values, ypred1)
#  0.07404738031954539

#Maybe try another feature selection method
コード例 #44
0
ファイル: find.py プロジェクト: andrewcistola/fracture-proof
forest = RandomForestRegressor(n_estimators = 1000, max_depth = 10) # Random forest with 1000 trees, depth capped at 10. For a further explanation see readme included in repository.
forest.fit(X, Y) # Fit Forest model, This will take time
rf = forest.feature_importances_ # Output importances of features
l_rf = list(zip(X, rf)) # Pair each variable with its importance score (presumably iterating X yields column names -- confirm)
df_rf = pd.DataFrame(l_rf, columns = ["Features", "Gini"]) # Create data frame of importances with variables and gini column names
df_rf = df_rf[(df_rf["Gini"] > df_rf["Gini"].mean())] # Subset by Gini values higher than mean
df_rf = df_rf.sort_values(by = ["Gini"], ascending = False) # Sort rows by Gini, highest first
print(df_rf)

### Recursive Feature Elimination
df_pca_rf = pd.merge(df_pca, df_rf, on = "Features", how = "inner") # Join by column while keeping only items that exist in both, select outer or left for other options
pca_rf = df_pca_rf["Features"].tolist() # Save features from data frame
X = df_prep[pca_rf] # Save features columns as predictor data frame
Y = df_prep["quant"] # Selected quantitative outcome from original data frame
recursive = RFECV(estimator = LinearRegression(), min_features_to_select = 5) # Define selection parameters: linear regression, never fewer than 5 features. See Readme for more info
recursive.fit(X, Y) # This will take time
rfe = recursive.support_ # Save Boolean values as numpy array
l_rfe = list(zip(X, rfe)) # Create list of variables alongside RFE value
df_rfe = pd.DataFrame(l_rfe, columns = ["Features", "RFE"]) # Create data frame of variables with their RFE selection flag
df_rfe = df_rfe[df_rfe.RFE == True] # Select Variables that were True
print(df_rfe)

### Multiple Regression
pca_rf_rfe = df_rfe["Features"].tolist() # Save chosen features as list
X = df_prep.filter(pca_rf_rfe) # Keep only selected columns from rfe
Y = df_prep["quant"] # Add outcome variable
regression = LinearRegression() # Linear Regression in scikit learn
regression.fit(X, Y) # Fit model
coef = regression.coef_ # Fitted coefficients of the model
l_reg = list(zip(X, coef)) # Create list of variables alongside coefficient
df_reg = pd.DataFrame(l_reg, columns = ["Features", "Coefficients"]) # Create data frame of variables with their regression coefficients
コード例 #45
0
''' 1: Logistic Regression_v1'''
from sklearn.feature_selection import RFE, RFECV
from sklearn.linear_model import LogisticRegression
# Baseline: logistic regression fitted and evaluated on the same data (X, y).
mod_lr = LogisticRegression()
mod_lr.fit(X, y)
y_pred = mod_lr.predict(X)

# Making the Confusion Matrix
from sklearn.metrics import confusion_matrix
from sklearn import metrics
# These two results are not stored or printed here.
confusion_matrix(y, y_pred)
metrics.accuracy_score(y, y_pred)

# Cross-validated RFE: 5 folds, one feature per step, never below 10 features.
rfe = RFECV(mod_lr, min_features_to_select=10, step=1, cv=5)
#rfe = RFE(mod_lr, 10, step = 1)
fit = rfe.fit(X, y)

print("Num Attribute: %d" % fit.n_features_)
print("Selected Attribute: %s" % fit.support_)
print("Feature Ranking: %s" % fit.ranking_)

# Table of attribute name / selected flag / rank, best rank first.
# NOTE(review): assumes the feature columns of df start at position 2 and
# line up with the columns of X -- confirm.
feature_imp = pd.DataFrame({
    'Attribute': df.iloc[:, 2:].columns.tolist(),
    'Select': fit.support_,
    'Rank': fit.ranking_
}).sort_values(by='Rank', ascending=True)

# Names of the attributes ranked 1 (the selected ones).
pickup = feature_imp[feature_imp['Rank'] == 1]['Attribute']
#pickup  = df_us.iloc[:, np.r_[3:15, 39:57, 94:187]].columns[fit.support_]

# Rebuild X from the selected attributes only.
X = df.loc[:, pickup].values
from sklearn.datasets import make_classification
import matplotlib.pyplot as plt

# Build a classification task using 3 informative features
X, y = make_classification(n_samples=1000,
                           n_features=25,
                           n_informative=3,
                           n_redundant=2,
                           n_repeated=0,
                           n_classes=8,
                           n_clusters_per_class=1,
                           random_state=0)

# Create the RFE object and compute a cross-validated score.
svc = SVC(kernel="linear")
# The "accuracy" scoring is proportional to the number of correct classifications.
rfecv = RFECV(estimator=svc,
              step=1,
              cv=StratifiedKFold(y, 2),
              scoring='accuracy')
rfecv.fit(X, y)

# print() with one parenthesised argument runs on Python 3 and prints the
# same text on Python 2 (the bare print statement was a SyntaxError on 3).
print("Optimal number of features : %d" % rfecv.n_features_)

# Plot number of features VS. cross-validation scores
plt.figure()
plt.xlabel("Number of features selected")
plt.ylabel("Cross validation score (nb of correct classifications)")
plt.plot(range(1, len(rfecv.grid_scores_) + 1), rfecv.grid_scores_)
plt.show()
コード例 #47
0
def main():
    """Compare LDA on the UCI HAR data with and without RFECV feature selection.

    Loads the train/test splits, obtains a feature mask (read from
    ``LDA-features-mask.out`` if that file exists, otherwise computed with
    RFECV around an LDA estimator and then saved), trains LDA on the masked
    and on the full feature sets, prints the metrics returned by
    ``commonFunc.checkAccuracy`` for both, and shows the mask together with
    the two confusion matrices in a single figure.
    """
    # Load the data
    print('*' * 20, "程序开始-读取数据", '*' * 20)
    X_Train = commonFunc.parseFile('../UCI HAR Dataset/train/X_train.txt')
    Y_Train = commonFunc.parseFile(
        '../UCI HAR Dataset/train/y_train.txt').flatten()
    X_Test = commonFunc.parseFile('../UCI HAR Dataset/test/X_test.txt')
    Y_Test = commonFunc.parseFile(
        '../UCI HAR Dataset/test/y_test.txt').flatten()
    activityLabels = [
        'WALKING', 'WALKING_UPSTAIRS', 'WALKING_DOWNSTAIRS', 'SITTING',
        'STANDING', 'LAYING'
    ]
    print("数据读取完成~\n")

    # Parameter settings (echoed to stdout)
    print('*' * 20, "设置参数表", '*' * 20)
    print("LDA:")
    SOLVER = 'svd'
    print("solver:{0}".format(SOLVER))
    print("RFE:")
    STEPSET = 5
    MINFEATURETOSET = 300
    CROSSVALIDATION = 20
    CPUCHANNEL = 6
    print(
        "estimate:LDA \t step:{0} \t min_feature_to_select:{1} \t CrossValidation:{2} \t CPUChannel:{3} \n"
        .format(STEPSET, MINFEATURETOSET, CROSSVALIDATION, CPUCHANNEL))

    # Feature selection: reuse a saved mask when one exists, since the
    # RFECV run is timed and reported in minutes.
    print('*' * 20, "读取特征文件", '*' * 20)
    maskSaveName = "LDA-features-mask.out"
    if (os.path.exists(maskSaveName)):
        print("存在特征文件,开始读取...")
        # The mask is stored as integers; convert it back to booleans.
        maskInteger = np.loadtxt(maskSaveName)
        mask = (maskInteger == 1)
        print("读取完成,准备显示...")
        print("特征选择数量: {0}".format(sum(mask == 1)))
    else:
        print("特征文件不存在~")
        print("开始特征选择...")
        start = perf_counter()
        estimator = LDA(solver=SOLVER)
        selector = RFECV(estimator,
                         step=STEPSET,
                         min_features_to_select=MINFEATURETOSET,
                         cv=CROSSVALIDATION,
                         n_jobs=CPUCHANNEL)
        selector = selector.fit(X_Train, Y_Train)
        mask = selector.get_support()
        print("特征选择完成!")
        print("用时 {0:.2f}mins".format((perf_counter() - start) / 60))
        print("特征选择数量: {0}".format(sum(mask == 1)))
        # Persist the mask as 0/1 integers for later runs.
        np.savetxt(maskSaveName, mask, fmt='%d')

    # Plotting: the top row of the 2x2 grid shows the mask as a 1-row image
    plt.figure(figsize=(14, 14))
    plt.subplot(2, 2, (1, 2))
    plt.imshow(mask.reshape(1, -1), cmap='tab20c_r')
    plt.title("Feature Selected: {0}".format(sum(mask == 1)),
              fontsize=14,
              y=2.5)
    plt.ylim([-5, 5])
    plt.xlabel("Feature Index(Deeper Color means Selected)", fontsize=10)
    #    plt.show()
    print('\n')

    # Train and evaluate on the selected features only
    print('*' * 20, "特征选择后的数据结果", '*' * 20)
    X_Train_selected = X_Train[:, mask]
    X_Test_selected = X_Test[:, mask]
    clf_selected = LDA(solver=SOLVER)
    clf_selected.fit(X_Train_selected, Y_Train)
    Y_predict_selected = clf_selected.predict(X_Test_selected)
    prec_selected, rec_selected, f_score_selected = commonFunc.checkAccuracy(
        Y_Test, Y_predict_selected)
    print("训练结果:")
    print("准确率:{0}\n召回率:{1}\nF1度量:{2}".format(prec_selected, rec_selected,
                                              f_score_selected))

    # Confusion matrix (bottom-left panel)
    plt.subplot(2, 2, 3)
    cm = commonFunc.createConfusionMatrix(Y_predict_selected, Y_Test)
    plot_confusion_matrix(cm,
                          activityLabels,
                          normalize=False,
                          title='Selected_F Confusion matrix')
    print('\n')

    # Train and evaluate on the original, unselected features
    print('*' * 20, "特征选择前的数据结果", '*' * 20)
    clf = LDA(solver=SOLVER)
    clf.fit(X_Train, Y_Train)
    Y_predict = clf.predict(X_Test)
    prec, rec, f_score = commonFunc.checkAccuracy(Y_Test, Y_predict)
    print("训练结果:")
    print("准确率:{0}\n召回率:{1}\nF1度量:{2}".format(prec, rec, f_score))

    # Confusion matrix (bottom-right panel)
    plt.subplot(2, 2, 4)
    cm = commonFunc.createConfusionMatrix(Y_predict, Y_Test)
    plot_confusion_matrix(cm,
                          activityLabels,
                          normalize=False,
                          title='All_F Confusion matrix')

    #    plt.tight_layout()
    plt.show()
コード例 #48
0
# Maps the CV score at the optimal feature count to an index into C
# (equal scores share a key, so the later index wins)
dict_1 = {}
# Maps an index into C to the fitted RFECV object
dict_2 = {}

time_prebucle = time.time()

# Iterate over the candidate values of C
for i, c in enumerate(C):
    time_temp1 = time.time()
    clf_temp = SVC(C=c,
                   kernel=kernel,
                   class_weight=class_weight,
                   random_state=random_state)
    rfecv_temp = RFECV(clf_temp, cv=skf, scoring=scoring)
    rfecv_temp.fit(X, y)
    # With the default step of 1, grid_scores_[k] is the score for k + 1
    # features, so the score at the optimum lives at n_features_ - 1.
    # Indexing with n_features_ read the next subset's score and ran off the
    # end of the array when every feature was kept.
    dict_1[rfecv_temp.grid_scores_[rfecv_temp.n_features_ - 1]] = i
    dict_2[i] = rfecv_temp
    time_temp2 = time.time()
    print(f'Time iteration {i}: {time_temp2-time_temp1}')

time_bucle = time.time()
print(f'Time loop: {time_bucle-time_prebucle}')

# Pick the run with the highest score and report its C.
maximo = max(dict_1)
indice_maximo = dict_1[maximo]
rfecv = dict_2[indice_maximo]
best_c = rfecv.estimator_.get_params()['C']
print(f'Best C: {best_c}')

# Imprimimos el número de características resultante
コード例 #49
0
ファイル: rfecv_fnc_ica.py プロジェクト: BhaskarRay/BrainAge
import warnings
# Silence every warning raised from here on.
warnings.filterwarnings('ignore')
from sklearn.svm import SVR
from sklearn.model_selection import StratifiedKFold
from sklearn.feature_selection import RFECV
from sklearn.feature_selection import RFE
from sklearn import preprocessing

# Load the table; columns 5..104 (by position) are used as predictors and
# 'age_at_cnb' as the regression target.
intersect_fnc_l = pd.read_csv("/data/mialab/users/bray14/rfecv_fnc/intersect_fnc_ica_2_filtered.csv")
X = intersect_fnc_l.iloc[:,5:105]
target = intersect_fnc_l['age_at_cnb']
# Min-max scale the predictors before fitting.
mm_scaler = preprocessing.MinMaxScaler()
X_train_minmax = mm_scaler.fit_transform(X)
# Recursive feature elimination around a linear SVR, dropping one feature
# per step and scoring each subset by negated RMSE.
# NOTE(review): StratifiedKFold stratifies on the target's values; confirm
# 'age_at_cnb' is discrete (e.g. whole years), otherwise a plain KFold is
# probably what is wanted for a regression target.
Svr_linear = SVR(kernel='linear')
rfecv = RFECV(estimator=Svr_linear, step=1, cv=StratifiedKFold(10), scoring='neg_root_mean_squared_error')
rfecv.fit(X_train_minmax, target)

print('Optimal number of features: {}'.format(rfecv.n_features_))


# Tabulate the coefficients of the final estimator (fitted on the selected
# features only) next to the matching column names and RFE ranks, sorted
# by coefficient, and write the table to CSV.
coef_rmse_rfecv= rfecv.estimator_.coef_
rfecv_rmse_featureCoeff = pd.DataFrame()
rfecv_rmse_featureCoeff['attr'] = X.columns[rfecv.support_]
# coef_ is transposed so the coefficients run down the rows of the column.
rfecv_rmse_featureCoeff['coefficient'] = coef_rmse_rfecv.transpose(1,0)
rfecv_rmse_featureCoeff['rank']= rfecv.ranking_[rfecv.support_]
rfecv_rmse_featureCoeff = rfecv_rmse_featureCoeff.sort_values(by='coefficient', ascending=False)
rfecv_rmse_featureCoeff.to_csv('rfecv_rmse_featureCoeff.csv',encoding='utf-8',index=False,na_rep='NA')

# Start the figure for the score-vs-feature-count plot (the plotting call
# itself is not part of the lines shown here).
plt.figure(figsize=(16, 9))
plt.title('Recursive Feature Elimination with Cross-Validation', fontsize=18, fontweight='bold', pad=20)
plt.xlabel('Number of features selected', fontsize=14, labelpad=20)
コード例 #50
0
# NOTE(review): reg_ledroit is used here but only assigned further down in
# the lines shown; presumably an earlier definition exists above -- confirm.
ledroit_Evaluator = FeaturesEvaluator.FeaturesEvaluator(reg_ledroit, X_train_ledroit, y_train_ledroit, X_test_ledroit, y_test_ledroit)








# Hyper-parameters for the gradient boosting regressor.
# NOTE(review): 'ls' is the legacy spelling of the squared-error loss; check
# it against the installed scikit-learn version.
params = {'n_estimators': 128,
        'max_depth': 7,
        'min_samples_split': 5,
        'learning_rate': 0.01,
        'loss': 'ls'}

reg_ledroit = ensemble.GradientBoostingRegressor(**params)

# Recursive feature elimination with 5-fold CV, one feature per step,
# never going below 30 features.
selector = RFECV(reg_ledroit, step=1, cv=5, verbose =3 , min_features_to_select=30)
selector = selector.fit(X_train_ledroit, y_train_ledroit)

# Bare expressions: they only display a value in an interactive session.
selector.support_
selector.grid_scores_
# Column labels that the selector did NOT keep.
X_to_remove = X_train_ledroit.keys()[np.logical_not(selector.support_)]

# The regressor is refitted on the full training frame here; X_to_remove is
# not applied in the lines shown.
reg_ledroit.fit(X_train_ledroit, y_train_ledroit)


predict_ledroit = reg_ledroit.predict(X_test_ledroit)

# Training data reduced to the columns kept by the selector.
test = pd.DataFrame(selector.transform(X_train_ledroit))
コード例 #51
0
 def Find(df, quant, path="", title=""):
     """Screen predictors of a quantitative outcome and export the survivors.

     The features are imputed (median) and standardised, then filtered in
     three stages:

     1. PCA: keep features whose largest absolute loading, over the
        components with above-average explained-variance ratio, is above
        the mean of those maxima.
     2. Random forest regression: keep features whose importance is above
        the mean importance.
     3. RFECV around a linear regression, run on the features that passed
        both 1 and 2.

     A final linear regression is fitted on the RFECV survivors.  The
     intermediate tables and the final table are printed, and the final
     table is written to ``path + title + "_fp_v1.4_quant.csv"``.

     Args:
         df: pandas DataFrame holding the features and the outcome column.
             The outcome column is removed from this frame in place.
         quant: label of the outcome column in ``df``.
         path: prefix prepended to the output file name.
         title: second part of the output file name.

     Returns:
         None.
     """
     import pandas as pd
     from sklearn.decomposition import PCA
     from sklearn.ensemble import RandomForestRegressor
     from sklearn.feature_selection import RFECV
     from sklearn.impute import SimpleImputer
     from sklearn.linear_model import LinearRegression
     from sklearn.preprocessing import StandardScaler

     # Detach the outcome.  The feature frame is rebuilt from plain arrays
     # below and so gets a fresh 0..n-1 index; give the outcome the same
     # index so it is re-attached row by row instead of being aligned on
     # the caller's labels (which yields NaN for any non-default index).
     pop = df.pop(quant).reset_index(drop=True)

     # Keep columns with at least 75% non-missing values, then impute the
     # remaining gaps with the column median and standardise.
     df = df.dropna(axis=1, thresh=0.75 * len(df))
     df = pd.DataFrame(SimpleImputer(strategy="median").fit_transform(df),
                       columns=df.columns)
     df = pd.DataFrame(StandardScaler().fit_transform(df.values),
                       columns=df.columns)
     df = df.dropna()  # Should drop nothing after imputation; sanity check

     # --- Stage 1: PCA --------------------------------------------------
     # First pass: count the components whose eigenvalue exceeds 1.
     degree = len(df.columns) - 1
     pca = PCA(n_components=degree)
     pca.fit(df)
     df_comp = pd.DataFrame(pca.explained_variance_)
     df_comp = df_comp[df_comp[0] > 1]
     components = len(df_comp.index) - 1

     # Second pass: refit with that many components and collect loadings.
     pca = PCA(n_components=components)
     pca.fit(df)
     df_pc = pd.DataFrame(pca.components_, columns=df.columns)
     df_pc["Variance"] = pca.explained_variance_ratio_
     # Keep components with an above-average explained-variance ratio.
     df_pc = df_pc[df_pc["Variance"] > df_pc["Variance"].mean()]
     df_pc = df_pc.abs()
     df_pc = df_pc.drop(columns=["Variance"])
     # Largest absolute loading per feature, kept when above the mean.
     df_pca = pd.DataFrame(df_pc.max(), columns=["MaxEV"])
     df_pca = df_pca[df_pca.MaxEV > df_pca.MaxEV.mean()]
     df_pca = df_pca.reset_index()  # Feature labels become column "index"
     df_pca = df_pca.rename(columns={"index": "Features"})
     print(df_pca)

     # --- Stage 2: random forest importances ----------------------------
     df.insert(0, "quant", pop)  # Re-attach the outcome as first column
     X = df.drop(columns=["quant"])
     Y = df["quant"]
     forest = RandomForestRegressor(n_estimators=1000, max_depth=10)
     forest.fit(X, Y)  # This will take time
     # Iterating a DataFrame yields its column labels, so this pairs each
     # label with its importance.  The column keeps its historical name
     # "Gini" because that name is part of the printed output.
     df_rf = pd.DataFrame(list(zip(X, forest.feature_importances_)),
                          columns=["Features", "Gini"])
     df_rf = df_rf[df_rf["Gini"] > df_rf["Gini"].mean()]
     print(df_rf)

     # Features that passed both the PCA and the forest filter.
     df_pca_rf = pd.merge(df_pca, df_rf, on="Features", how="inner")
     pca_rf = df_pca_rf["Features"].tolist()

     # --- Stage 3: recursive feature elimination -------------------------
     X = df[pca_rf]
     Y = df["quant"]
     recursive = RFECV(estimator=LinearRegression(),
                       min_features_to_select=5)
     recursive.fit(X, Y)  # This will take time
     df_rfe = pd.DataFrame(list(zip(X, recursive.support_)),
                           columns=["Features", "RFE"])
     df_rfe = df_rfe[df_rfe["RFE"].astype(bool)]  # Keep selected features
     print(df_rfe)
     pca_rf_rfe = df_rfe["Features"].tolist()

     # --- Final model: linear regression on the surviving features ------
     X = df.filter(pca_rf_rfe)
     Y = df["quant"]
     regression = LinearRegression()
     regression.fit(X, Y)
     df_reg = pd.DataFrame(list(zip(X, regression.coef_)),
                           columns=["Features", "Coefficients"])
     print(df_reg)

     # Loadings, importances and coefficients side by side for the
     # features present in every table.
     df_final = pd.merge(df_pca_rf, df_reg, on="Features", how="inner")
     print(df_final)  # Show in terminal
     df_final.to_csv(path + title +
                     "_fp_v1.4_quant.csv")  # Export df as csv
コード例 #52
0
# Fit the best algorithm to the data.
clf.fit(X_train_cross, Y_train_cross)

# Accuracy on the held-out split.
predictions = clf.predict(X_test_cross)
print(accuracy_score(Y_test_cross, predictions))

plot_variable_importance(X_train_cross, Y_train_cross)

# Train score followed by test score.
print(clf.score(X_train_cross, Y_train_cross),
      clf.score(X_test_cross, Y_test_cross))

# Recursive feature elimination around the same classifier, one feature per
# step, scored by accuracy.
# NOTE(review): StratifiedKFold(labels, 2) matches the legacy
# sklearn.cross_validation call form -- confirm which module the file imports.
rfecv = RFECV(estimator=clf,
              step=1,
              cv=StratifiedKFold(Y_train_cross, 2),
              scoring='accuracy')
rfecv.fit(X_train_cross, Y_train_cross)


# KFold
def run_kfold(clf):
    """Fit ``clf`` and predict on each of 10 folds of the training data.

    Reads the module-level ``X_train`` and ``Y_train`` and indexes their
    ``.values`` with the fold indices.

    NOTE(review): in the lines shown, ``outcomes`` is never filled and
    nothing is returned; the rest of the body is presumably outside this
    excerpt.
    """
    # NOTE(review): KFold(n, n_folds=...) matches the legacy
    # sklearn.cross_validation call form; 891 is presumably the number of
    # rows in X_train -- confirm.
    kf = KFold(891, n_folds=10)
    outcomes = []
    fold = 0
    for train_index, test_index in kf:
        fold += 1
        X_train_cross, X_test_cross = X_train.values[
            train_index], X_train.values[test_index]
        Y_train_cross, Y_test_cross = Y_train.values[
            train_index], Y_train.values[test_index]
        clf.fit(X_train_cross, Y_train_cross)
        predictions = clf.predict(X_test_cross)
コード例 #53
0
import numpy
import pickle

# Print floats without scientific notation.
numpy.set_printoptions(suppress=True)

JOBS = 10
SEED = 0

# Column dtypes for the feature file: V1..V28 and Amount, all float32.
types = {f'V{i}': 'float32' for i in range(1, 29)}
types['Amount'] = 'float32'

X = pandas.read_csv('./data/features.csv', header=0, dtype=types)
y = pandas.read_csv('./data/target.csv', header=0, dtype={'Class': 'int32'})

rf = RandomForestClassifier(random_state=SEED)
xgb = XGBClassifier(random_state=SEED)

# Recursive feature elimination around the XGBoost classifier: one feature
# per step, shuffled stratified 5-fold CV, scored by precision.
selector = RFECV(
    estimator=xgb,
    step=1,
    cv=StratifiedKFold(n_splits=5, shuffle=True, random_state=SEED),
    n_jobs=JOBS,
    verbose=10,
    scoring='precision',
    min_features_to_select=1,
)

filename = './artifacts/rfe_precision_xgb.pkl'
# The target frame is flattened to 1-D before fitting.
rfe = selector.fit(X.to_numpy(), y.to_numpy().reshape(-1, ))
# Write through a context manager so the file is flushed and closed even if
# pickling fails; the original left the handle open.
with open(filename, 'wb') as artifact:
    pickle.dump(obj=rfe, file=artifact)
コード例 #54
0
plt.show()

#--------------------------------------------------------
"""
Feature Engineering
Logistic Regression — Feature Selection
"""
#--------------------------------------------------------
from sklearn.feature_selection import RFECV

# Recursive feature elimination around a logistic regression, removing one
# feature per step and scoring every subset by cross-validated accuracy.
logreg_model = LogisticRegression()
rfecv = RFECV(
    estimator=logreg_model,
    step=1,
    cv=strat_k_fold,
    scoring='accuracy',
)
rfecv.fit(X, y)

# Cross-validation score against the number of features kept.
plt.figure()
plt.plot(range(1, len(rfecv.grid_scores_) + 1), rfecv.grid_scores_)
plt.title('Logistic Regression CV score vs No of Features')
plt.xlabel("Number of features selected")
plt.ylabel("Cross validation score (nb of correct classifications)")
plt.show()

# Pair every feature name with its keep/drop flag and collect the names
# that were kept.
feature_importance = list(zip(feature_names, rfecv.support_))
new_features = [name for name, selected in feature_importance if selected]

print(new_features)
コード例 #55
0
from sklearn.model_selection import KFold

### PARSING
df0=pd.read_csv('RAs_and_rescue.csv').drop(columns=['genotype']).set_index('names')

### SCALING RELATIVE ABUNDANCES
scaler = StandardScaler()
# Take the row and column labels from the same "everything but 'group'"
# selection that is scaled.  The original labelled the scaled columns with
# df0.columns[:-1], which is only right when 'group' is the last column.
df = pd.DataFrame(scaler.fit_transform(df0.drop(columns='group')),
                  index=df0.index,
                  columns=df0.columns.drop('group'))
df=df.merge(df0[['group']], left_index=True, right_index=True)

### SVM-RFE
# One fold per sample (n_splits equals the number of rows), one feature
# removed per step.
clf = svm.SVC(kernel='linear')
rfe = RFECV(clf, step=1, cv=KFold(n_splits=df.shape[0]),min_features_to_select=1,n_jobs=40)
rfe.fit(df.drop(columns=['group']),df['group'])

# RETURNING RESULTS
# This prints the mean of the scores over ALL candidate feature counts, not
# the score of the selected subset.
print('\nscore= ', sum(rfe.grid_scores_)/len(rfe.grid_scores_))
print('\n')
# Names of the selected features, looked up among the feature columns only
# so the result does not depend on where 'group' sits.
cols = list(df.columns.drop('group')[rfe.get_support(indices=True)])
coeffs=pd.DataFrame()
# First row of coef_ of the final estimator, which was fitted on the
# selected features only.
coeffs['svm_coeff']=list(rfe.estimator_.coef_[0])
coeffs['names']=cols
print(coeffs.set_index('names').sort_values(by='svm_coeff'))
print(len(coeffs),' features kept')

coeffs.to_csv('svm_output.csv')

コード例 #56
0
#on very high-dimensional datasets.

# Build the data array from the dictionary and split it into labels and
# features.
data = featureFormat(data_dict, features_list)
labels, features = targetFeatureSplit(data)

import matplotlib.pyplot as plt
from sklearn.linear_model import LogisticRegression
from sklearn.cross_validation import StratifiedKFold
from sklearn.feature_selection import RFECV
lr = LogisticRegression()

# Recursive feature elimination around a logistic regression, one feature
# per step, 50 stratified folds, scored by precision.
rfecv = RFECV(estimator=lr,
              step=1,
              cv=StratifiedKFold(labels, 50),
              scoring='precision')
rfecv.fit(features, labels)
print("Optimal number of features : %d" % rfecv.n_features_)

# Plot number of features VS. cross-validation scores
# NOTE(review): the y-label mentions correct classifications, but the
# scores plotted come from scoring='precision'.
plt.figure()
plt.xlabel("Number of features selected")
plt.ylabel("Cross validation score (nb of correct classifications)")
plt.plot(range(1, len(rfecv.grid_scores_) + 1), rfecv.grid_scores_)
plt.show()

# Payment ratio of poi = salary/total_payment, gives the ratio of above values.
# Remove outlier entries from the dictionary (the default 0 avoids a
# KeyError when a key is absent).
# NOTE(review): `data`, `labels` and `features` were built before these
# pops, so the feature selection above presumably still saw these entries.
data_dict.pop('TOTAL', 0)
data_dict.pop('THE TRAVEL AGENCY IN THE PARK', 0)

# Engineered features , this could give some more insights on usage of available data:
コード例 #57
0
X = dataset.data
y = dataset.target
features = dataset.feature_names

# ---------------------------------------------------------------------------
# Baseline: 5-fold CV MSE of a linear regression on every feature.
# The scorer returns negated MSE, so the sign is flipped back.
# ---------------------------------------------------------------------------
est = LinearRegression()
score = -cross_val_score(
    est, X, y, scoring="neg_mean_squared_error", cv=5)
print("CV MSE before feature selection: {:.2f}".format(score.mean()))

# ---------------------------------------------------------------------------
# Same measurement on the columns kept by RFECV.
# ---------------------------------------------------------------------------
rfe = RFECV(est, scoring="neg_mean_squared_error", cv=5)
rfe.fit(X, y)
score = -cross_val_score(
    est, X[:, rfe.support_], y, scoring="neg_mean_squared_error", cv=5)
print("CV MSE after RFE feature selection: {:.2f}".format(score.mean()))

# ---------------------------------------------------------------------------
# Same measurement on the columns whose random-forest importance is
# above 0.01.
# ---------------------------------------------------------------------------
rf = RandomForestRegressor(n_estimators=500, random_state=SEED)
rf.fit(X, y)
support = rf.feature_importances_ > 0.01
score = -cross_val_score(
    est, X[:, support], y, scoring="neg_mean_squared_error", cv=5)
print("CV MSE after Feature Importance feature selection: {:.2f}".format(
    score.mean()))
コード例 #58
0
# Load libraries
import warnings
from sklearn.datasets import make_regression
from sklearn.feature_selection import RFECV
from sklearn import datasets, linear_model

# Suppress an annoying but harmless warning
warnings.filterwarnings(action="ignore",
                        module="scipy",
                        message="^internal gelsd")

# Generate a features matrix and a target vector: 10000 samples and 100
# features, of which 2 are informative.  The true coefficients are not
# requested (coef is left at its default), so only two values are returned.
features, target = make_regression(n_samples=10000,
                                   n_features=100,
                                   n_informative=2,
                                   random_state=1)

# Create a linear regression
ols = linear_model.LinearRegression()

# Recursively eliminate features, one per step, scoring each subset by
# negated mean squared error
rfecv = RFECV(estimator=ols, step=1, scoring="neg_mean_squared_error")
rfecv.fit(features, target)
# Reduce the matrix to the selected features; the result is not stored, so
# it is only visible in an interactive session.
rfecv.transform(features)
コード例 #59
0
def feature_selection_classifier(features, labels, estimator=None):
    """Fit an accuracy-scored RFECV selector and return it.

    When no estimator is supplied, a linear-kernel SVC (C=0.1, gamma=1) is
    used.  Features are eliminated one per step under 10-fold stratified
    cross-validation, with every available core (n_jobs=-1).
    """
    base_model = (SVC(kernel='linear', C=0.1, gamma=1)
                  if estimator is None else estimator)
    selector = RFECV(
        estimator=base_model,
        step=1,
        cv=StratifiedKFold(10),
        scoring='accuracy',
        n_jobs=-1,
    )
    selector.fit(features, labels)
    return selector
コード例 #60
0
ファイル: main.py プロジェクト: xhxseu/DMProj
plt.title('ROC')
plt.show()

# Bare expressions: they display a value in an interactive cell and have no
# effect when the file runs as a script.
score
lg_auc
#%%
from utility_functions import generate_result_csv
# Predictions keyed by label, written out as a result CSV.
res = {}
# NOTE(review): `svm` is presumably the model produced by the commented-out
# fit_by_label call below -- confirm it is defined before this cell runs.
#svm = fit_by_label(la, fea, 10, params)
res[la] = predict_by_label(la, fea, svm)
generate_result_csv(res, 'result_d5.csv')

#%%
# Recursive feature elimination around a linear SVC, removing two features
# per iteration.  `step` is passed by keyword: it is the second parameter of
# RFECV, and current scikit-learn accepts only the estimator positionally.
clf = SVC(kernel='linear')
selector = RFECV(clf, step=2)
selector.fit(X_train, y_train)

#%%
# Keep only the selected columns, then train and score an RBF SVC with the
# best gamma and C found by the grid search `gs`.
X_train_red = X_train[:, selector.support_]
X_test_red = X_test[:, selector.support_]
clf = SVC(kernel='rbf', gamma=gs.best_params_['gamma'], C=gs.best_params_['C'])
clf.fit(X_train_red, y_train)
# Bare expression: the test score is only displayed in an interactive cell.
clf.score(X_test_red, y_test)

#%%
#la1 = 'Dog_4'
#fea1 = ['band_power']
#from sklearn.discriminant_analysis import LinearDiscriminantAnalysis
#from sklearn.linear_model import LogisticRegressionCV
#X_train, X_test, y_train, y_test = generate_fit_data(la1, fea1, 1, cv=True)
#