Example #1
def recursive_feature_selection(info_humans, info_bots, params, scale=False):

    X, y, features, scaler = get_Xy(info_humans, info_bots, scale=scale)

    print "first feature selection by variance test"
    skb = VarianceThreshold(threshold=(.8 * (1 - .8)))
    X_new = skb.fit_transform(X)
    features_1 = features[skb.get_support()]

    print "second feature selection by ch2 test"
    skb = SelectKBest(chi2, k=200)
    # skb = SelectFpr(chi2, alpha=0.005)
    X_new = skb.fit_transform(X_new, y)
    features_2 = features_1[skb.get_support()]

    # skb = PCA(n_components=250)
    # X_new = skb.fit_transform(X_new, y)
    
    print "third feature selection by recursive featue elimination (RFECV)"
    clf = LogisticRegression(penalty=params['penalty'],
                             C=params['C'])
    # clf = SVC(kernel="linear")
    rfecv = RFECV(estimator=clf, step=1,
                  cv=cross_validation.StratifiedKFold(y, 5),
                  scoring='roc_auc', verbose=1)
    rfecv.fit(X_new, y)

    print("Optimal number of features : %d" % rfecv.n_features_)
    
    return skb, rfecv
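Example #1 targets the pre-0.18 scikit-learn API (Python 2 prints, sklearn.cross_validation). A minimal sketch of the same variance -> chi2 -> RFECV chain as a single Pipeline on a current scikit-learn; the data is a random placeholder and k is shrunk to keep the sketch fast.

import numpy as np
from sklearn.feature_selection import VarianceThreshold, SelectKBest, chi2, RFECV
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import StratifiedKFold
from sklearn.pipeline import Pipeline

rng = np.random.RandomState(0)
X = rng.randint(0, 5, size=(200, 300))   # chi2 requires non-negative features
y = rng.randint(0, 2, size=200)

selection = Pipeline([
    ("variance", VarianceThreshold(threshold=0.8 * (1 - 0.8))),
    ("chi2", SelectKBest(chi2, k=50)),
    ("rfecv", RFECV(LogisticRegression(penalty="l2", C=1.0),
                    step=1, cv=StratifiedKFold(5), scoring="roc_auc")),
])
selection.fit(X, y)
print("Optimal number of features: %d" % selection.named_steps["rfecv"].n_features_)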
Example #2
def main():
    train_df = munge_data('./data/train.csv', False)
    train_df = train_df.drop('PassengerId', axis=1)
    target_df = train_df['Survived']
    train_df = train_df.drop('Survived', axis=1)
    train_df = train_df.sort_index(axis=1)  # DataFrame.sort() was removed; sort columns by name

    test_df = munge_data('./data/test.csv')
    test_ids = test_df.PassengerId.values
    test_df = test_df.drop('PassengerId', axis=1)
    test_df = test_df.sort_index(axis=1)
    
    train_data = train_df.values
    target_data = target_df.values
    test_data = test_df.values

    clf = svm.SVC(kernel='linear')
    selector = RFECV(clf, step=1, cv=5, scoring='accuracy')
    
    train_data, cx_data, target_data, cx_target_data = cross_validation.train_test_split(
        train_data, target_data, test_size=0.2)

    selector = selector.fit(train_data, target_data)
    
    print(selector.score(cx_data, cx_target_data))
    cx_predictions = selector.predict(cx_data)
    print(classification_report(cx_target_data, cx_predictions))
    predictions = selector.predict(test_data)

    with open('output.csv', 'w') as o:
        o.write('PassengerId,Survived\n')
        for passenger, prediction in zip(test_ids, predictions):
            o.write('{},{}\n'.format(passenger, prediction))
Example #3
def test_model(model, xtrain, ytrain, feature_list, prefix):
    """ use train_test_split to create validation train/test samples """
    xTrain, xTest, yTrain, yTest = train_test_split(xtrain, ytrain,
                                                    test_size=0.4)

    if DO_RFECV:
        # fit once on the full training data so the fitted coef_ attribute
        # exists before deciding whether RFECV can wrap this model
        model.fit(xtrain, ytrain)
        if hasattr(model, 'coef_'):
            model = RFECV(estimator=model, verbose=0, step=1,
                          scoring=score_fn, cv=3)

    model.fit(xTrain, yTrain)
    print('score', model.score(xTest, yTest))
    ypred = model.predict(xTest)
    ### don't allow model to predict negative number of orders
    if any(ypred < 0):
        print(ypred[ypred < 0])
        ypred[ypred < 0] = 0

    print('RMSE', np.sqrt(mean_squared_error(ypred, yTest)))

#    debug_output(model, feature_list)

    debug_plots(model, yTest, ypred, prefix)

    return
Example #4
def plotRFECV (X,y,stepSize=0.05,scoring='f1'):
    '''
    Plot recursive feature elimination example with automatic tuning of the number of features selected with cross-validation.
    http://scikit-learn.org/stable/auto_examples/plot_rfe_with_cross_validation.html#example-plot-rfe-with-cross-validation-py
    '''
    from sklearn.svm import SVC
    from sklearn.cross_validation import StratifiedKFold
    from sklearn.feature_selection import RFECV

    # Create the RFE object and compute a cross-validated score.
    # svc = SVC(kernel="linear")
    svc = SVC(kernel="linear",class_weight='auto', cache_size=1400)
    # The "accuracy" scoring is proportional to the number of correct
    # classifications
    rfecv = RFECV(estimator=svc, step=stepSize, cv=StratifiedKFold(y, 2),
                  scoring=scoring)
    rfecv.fit(X, y)

    print("Optimal number of features : %d" % rfecv.n_features_)

    # Plot number of features VS. cross-validation scores
    import matplotlib.pyplot as plt
    plt.figure()
    plt.xlabel("Number of features selected")
    plt.ylabel("Cross validation score (nb of correct classifications)")
    plt.plot(range(1, len(rfecv.grid_scores_) + 1), rfecv.grid_scores_)
    plt.show()

    return rfecv
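One caveat for readers on a current scikit-learn: grid_scores_ was deprecated in 1.0 and removed in 1.2, and sklearn.cross_validation is long gone. A hedged sketch of the same plot against the modern API, using cv_results_ and synthetic data:

import matplotlib.pyplot as plt
from sklearn.datasets import make_classification
from sklearn.feature_selection import RFECV
from sklearn.model_selection import StratifiedKFold
from sklearn.svm import SVC

X, y = make_classification(n_samples=200, n_features=20, random_state=0)
svc = SVC(kernel="linear", class_weight="balanced")  # 'auto' was renamed 'balanced'
rfecv = RFECV(estimator=svc, step=1, cv=StratifiedKFold(2), scoring="f1")
rfecv.fit(X, y)

scores = rfecv.cv_results_["mean_test_score"]  # replaces rfecv.grid_scores_
plt.figure()
plt.xlabel("Number of features selected")
plt.ylabel("Mean cross-validation score (f1)")
plt.plot(range(1, len(scores) + 1), scores)
plt.show()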
Example #5
def recursiveFeatureSelectorCV(classifier_model, train_data, train_labels, test_data, number_of_features):
    # Note: RFECV's second positional argument is `step` (features removed per
    # iteration), not a target feature count -- RFECV chooses the optimal
    # number of features itself via cross-validation.
    rfe = RFECV(classifier_model, step=number_of_features)
    transformed_train_data = rfe.fit_transform(train_data, train_labels)
    transformed_test_data = rfe.transform(test_data)
    
    return transformed_train_data,transformed_test_data
Example #6
class RFECVSelection(SelectionModel):
    name = "RFECV"

    def __init__(self, *args):
        SelectionModel.__init__(self, *args)
        self.selector = RFECV(self.estimator, step=1, cv=5, scoring='mean_squared_error')
        self.selector.fit(self.x_array, self.y_array)
        self.support_ = self.selector.support_

    def print_rankings(self):
        print("Rankings for: ", RFECVSelection.name)
        for (i, rank) in zip(self.columns, self.selector.ranking_):
            print("{0}: {1}".format(data.column_names[i], rank))

    # number of features vs. cv scores
    def plot_num_of_feat_vs_cv_score(self):
        plt.figure()
        plt.xlabel("Number of features selected")
        plt.ylabel("Cross validation scores (mse)")
        plt.plot(range(1, len(self.selector.grid_scores_) + 1),
                self.selector.grid_scores_)
        plt.show()

    def plot_rankings(self):
        plt.figure()
        plt.title("Ranking of features in RFECV")
        plt.bar(range(self.x_array.shape[1]), self.selector.ranking_, align="center", color="r")
        plt.xticks(range(self.x_array.shape[1]), [data.column_names[i] for i in self.columns])
        plt.show()
Example #7
def find_best_features(df_train, y_train):
    rfr = RandomForestRegressor(n_estimators=500, max_depth=6, n_jobs=16)

    # vals_pearson = df_train.corr('pearson').values
    vals_pearson = joblib.load("vals_pearson.pkl")
    # vals_kendall = df_train.corr('kendall').values
    # vals_spearman = df_train.corr('spearman').values
    vals_spearman = joblib.load("vals_spearman.pkl")

    vals = (vals_pearson + vals_spearman) / 2

    dumped_cols = []
    res_cols = [True] * vals.shape[0]
    for i in range(vals.shape[0]):
        if i not in dumped_cols:
            for j in range(vals.shape[1]):
                if i != j:
                    if abs(vals[i, j]) > 0.90:
                        dumped_cols.append(j)
                        res_cols[j] = False

    # df_train2 = df_train[df_train.columns[res_cols]]

    rfecv = RFECV(rfr, step=10, cv=5, scoring=rmse_scorer, verbose=2)  # a float step can raise an error on the final iteration

    # rfecv.fit(df_train2, y_train)
    rfecv = joblib.load("rfecv.pkl")

    return (res_cols, rfecv.get_support())
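The nested loop above drops the later feature of every pair whose averaged correlation exceeds 0.90. A compact sketch of the same idea using the upper triangle of the correlation matrix; `vals` is assumed to be the averaged (n_features, n_features) array from the function above:

import numpy as np

def prune_correlated(vals, threshold=0.90):
    """Boolean keep-mask: keeps the earlier feature of each correlated pair."""
    n = vals.shape[0]
    keep = np.ones(n, dtype=bool)
    upper = np.triu(np.abs(vals), k=1)  # only pairs (i, j) with j > i
    for i in range(n):
        if keep[i]:
            keep[upper[i] > threshold] = False
    return keep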
Example #8
def benchmark_features_selection(clf, name):
    print('_' * 80)
    print("Training: ")
    print(clf)
    t0 = time()
    rfecv = RFECV(estimator=clf, step=1, cv=StratifiedKFold(y_train, 2),
              scoring='accuracy')
    rfecv.fit(X_train, y_train)
    train_time = time() - t0
    print("train time: %0.3fs" % train_time)

    print(name + ": optimal number of features: %d" % rfecv.n_features_)

    # Plot number of features vs. cross-validation scores
    plt.figure()
    plt.xlabel("Number of features selected")
    plt.ylabel("Cross validation score (nb of correct classifications)")    
    plt.plot(range(1, len(rfecv.grid_scores_) + 1), rfecv.grid_scores_)
    plt.show()

    t0 = time()
    pred = rfecv.predict(X_test)
    test_time = time() - t0
    print("test time:  %0.3fs" % test_time)

    if hasattr(clf, 'coef_'):
        print("dimensionality: %d" % clf.coef_.shape[1])
        print("density: %f" % density(clf.coef_))

    print("Saving data to database:")
    save_results_data(cursor, name, testing_identifiant_produit_list, pred)
    print()
    clf_descr = str(clf).split('(')[0]
    return clf_descr,train_time,test_time
Example #9
def select_features(clf, x_train, y_train, columns, num_folds, step=19, random_state=0):
    """
    automatic tuning of the number of features selected with cross-validation.
    :param clf: estimator
    :param x_train:
    :param y_train:
    :return: the fitted rfecv object
    """
    print('================= select_features ================')
    # Create the RFE object and compute a cross-validated score.
    cvObj = KFold(len(y_train), n_folds=num_folds, shuffle=True, random_state=random_state)

    # The "accuracy" scoring is proportional to the number of correct classifications
    rfecv = RFECV(estimator=clf, step=step, cv=cvObj, scoring=scorer, verbose=2)
    rfecv.fit(x_train, y_train)

    print('------------ Results: ----------------')
    print('>>>> Optimal number of features : %d' % rfecv.n_features_)
    print('>>>> grid scores:')
    pprint(rfecv.grid_scores_)
    print('>>>> columns ordered by RFECV ranking (best first):')
    pprint(np.array(columns)[np.argsort(rfecv.ranking_)])


    # Plot number of features VS. cross-validation scores
    plt.figure()
    plt.xlabel("Number of features selected")
    plt.ylabel("Cross validation score (nb of correct classifications)")
    plt.plot(range(1, len(rfecv.grid_scores_) + 1), rfecv.grid_scores_)
    plt.show()

    return rfecv
Example #10
def main():
    args = getOptions()
    print(args)

    print("train file read")
    train_x, train_y = readfile_noid(args.train,'train')
    train_x_new, id = extractID(train_x)
    del id
    print "test file read"
    test_x, test_y = readfile_noid(args.test,'test')
    test_x_new, id = extractID(test_x)
    
    # remove features with no distinguishing power (zero variance)
    print("remove features with no distinguishing power")
    
    sel = VarianceThreshold()
    train_x_uniq = sel.fit_transform(train_x_new)
    test_x_uniq = sel.transform(test_x_new)

    #normalization
    print "normalization"
    train_x_nor, mean, std = normalize(train_x_uniq)
    test_x_nor, mean, std = normalize(test_x_uniq, mean, std)

    #feature selection
    print "feature selection"
    # Create the RFE object and compute a cross-validated score.
    svc = SVC(kernel="linear")
    # The "accuracy" scoring is proportional to the number of correct
    # classifications
    rfecv = RFECV(estimator=svc, step=1, cv=StratifiedKFold(train_y, 10),
                  scoring='accuracy')
    rfecv.fit(train_x_nor, train_y)
    
    print("Optimal number of features : %d" % rfecv.n_features_)
Example #11
def run_rfecv(X, y, clf_class, **kwargs):
    clf = clf_class(**kwargs)
    rfecv = RFECV(estimator=clf, step=1, cv=StratifiedKFold(y, 2), scoring='accuracy')
    rfecv.fit(X, y)
    plot_rfcev(rfecv)
    print "Optimal number of features : {0} for model: {1}".format(rfecv.n_features_, clf_class)
    return rfecv
Example #12
def get_top_features(windows_data_frame, drop_only_almost_positives = False, drop_duplicates = True, transformer = DEFAULT_TRANSFORMER, \
        classifier = RFECV_FEATURE_SELECTION_DEFAULT_CLASSIFIER, n_folds = 3, step = 0.05, scoring = 'f1'):

    '''
    Uses the sklearn.feature_selection.RFECV model to find the top features of the given windows (provided as a CSV data frame).
    @param windows_data_frame (pandas.DataFrame):
        A data frame of the windows' CSV.
    @param drop_only_almost_positives (boolean, default False):
        Same as in train_window_classifier.
    @param drop_duplicates (boolean, default True):
        Whether to drop duplicate windows in the dataset, based on their neighbourhood property, prior to RFECV.
    @param transformer (sklearn transformer, optional, default sklearn.preprocessing.StandardScaler):
        A preprocessing transformer to use for the data before applying RFECV. If None, will not perform any preprocessing transformation.
    @param classifier (sklearn classifier, default a special version of random forest suitable for RFECV):
        The classifier to use as the estimator of RFECV.
    @param n_folds (int, default 3):
        The n_folds to use in the kfold cross-validation as part of the RFECV process.
    @param step (default 0.05):
        See sklearn.feature_selection.RFECV
    @param scoring (default 'f1'):
        See sklearn.feature_selection.RFECV
    @return:
        A list of the top features, each represented as a string.
    '''

    features, X, y = get_windows_data(windows_data_frame, drop_only_almost_positives, drop_duplicates, transformer)
    kfold = StratifiedKFold(y, n_folds = n_folds, shuffle = True, random_state = SEED)
    rfecv = RFECV(estimator = classifier, cv = kfold, step = step, scoring = scoring)
    rfecv.fit(X, y)
    return util.apply_mask(features, rfecv.support_)
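A hypothetical invocation of get_top_features; the CSV path is a placeholder and the column layout is whatever get_windows_data expects:

import pandas as pd

windows_df = pd.read_csv('windows.csv')  # placeholder path
top_features = get_top_features(windows_df, n_folds=3, step=0.05, scoring='f1')
print('\n'.join(top_features))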
Example #13
def featureSelection(train_x, train_y):
    # Create the RFE object and compute a cross-validated score.
    svc = LinearSVC(C=1, class_weight='balanced')
    # The "accuracy" scoring is proportional to the number of correct
    # classifications
    lasso = RandomizedLasso()
    lasso.fit(train_x, train_y)
    rfecv = RFECV(estimator=svc, step=1, cv=5, scoring='accuracy')
    rfecv.fit(train_x, train_y)

    print("Optimal number of features : %d" % rfecv.n_features_)
    rankings = rfecv.ranking_
    lasso_ranks = lasso.get_support()
    lassoFeats = []
    recursiveFeats = []
    shouldUseFeats = []

    for i in range(len(rankings)):
        if lasso_ranks[i]:
            lassoFeats.append(feats[i])
        if rankings[i] == 1:
            recursiveFeats.append(feats[i])
            if lasso_ranks[i]:
                shouldUseFeats.append(feats[i])
    keyboard()
    print('Should use ' + ', '.join(shouldUseFeats))
    # Plot number of features VS. cross-validation scores
    plt.figure()
    plt.xlabel("Number of features selected")
    plt.ylabel("Cross validation score (nb of correct classifications)")
    plt.plot(range(1, len(rfecv.grid_scores_) + 1), rfecv.grid_scores_)
    plt.show()
Example #14
def plot_RFE(X, y):
    from sklearn.svm import SVC
    from sklearn.cross_validation import StratifiedKFold
    from sklearn.feature_selection import RFECV
    from sklearn.datasets import make_classification
    from sklearn.metrics import zero_one_loss
    import matplotlib.pylab as pl

    # Create the RFE object and compute a cross-validated score.
    # svc= SVC(kernel="linear", class_weight="auto", cache_size=1200, shrinking=True)
    svc=LinearSVC(penalty='l1', loss='l2', dual=False, class_weight='auto',multi_class='ovr')
#    SGD = SGDClassifier(penalty='elasticnet',class_weight='auto',n_jobs=-1,n_iter=10,l1_ratio =0.15)
##    rfecv = RFECV(estimator=svc, step=0.1, cv=StratifiedKFold(y, 5), scoring='roc_auc')
    rfecv = RFECV(estimator=svc, step=0.2,cv=StratifiedKFold(y, 2), scoring='f1')
    X_RFE = rfecv.fit_transform(X, y)

    print("Optimal number of features in X_RFE : %d" % rfecv.n_features_)
    # Plot number of features VS. cross-validation scores
    pl.figure()
    pl.xlabel("Number of features selected")
    pl.ylabel("Cross validation score (nb of misclassifications)")
    pl.plot(range(1, len(rfecv.grid_scores_) + 1), rfecv.grid_scores_)
    pl.show()
    print('RFE optimal feature set CV score:')
    CV_multi_stats(X_RFE,y,svc)
    return (X_RFE,rfecv)
Example #15
def feature_selection_RFE(fn, ax=None, sel="all", goal="Referee", verbosity=0, nf=7):
    X, y, names = data_prepare(fn, sel=sel, goal=goal, verbosity=verbosity-1)
    if verbosity > 1:
        print ("names:", ",".join(names))
    
    # Create the RFE object and compute a cross-validated score.
    #estimator = svm.SVC(kernel="linear",C=1.0)
    estimator = get_clf('svm')    
    scoring = 'f1'
    cv = cross_validation.StratifiedKFold(y, 2)

    # The "accuracy" scoring is proportional to the number of correct
    # classifications
    if True:
        rfecv = RFECV(estimator=estimator, step=1, cv=cv, scoring=scoring)
    else:
        from kgml.rfecv import RFECVp
        f_estimator = get_clf('svm')
        rfecv = RFECVp(estimator=estimator,f_estimator=f_estimator, step=1, cv=cv, scoring=scoring)
        
    with warnings.catch_warnings():
        warnings.simplefilter("ignore")
        rfecv.fit(X, y)

    # Plot number of features VS. cross-validation scores
    ax.set_xlabel("Number of features selected")
    ax.set_ylabel("Cross validation score ({})".format(scoring))
    ax.plot(range(1, len(rfecv.grid_scores_) + 1), rfecv.grid_scores_)

    #print("Optimal number of features : %d" % rfecv.n_features_)
    best = names[rfecv.ranking_==1]
    #print "The best features:", ', '.join(best)
    return best
Example #16
def recursiveFeatureElimination():
	with DB() as db:
		POIs = getPointsOfInterest()
		numRows, numCols = int(math.sqrt(len(POIs))), int(math.sqrt(len(POIs))) + 1

		# for hour in xrange(24):
		plt.figure()
		plt.subplots_adjust(left=None, bottom=None, right=None, top=None, wspace=0.5, hspace=0.5)
		fignum = 1
		for POI in POIs:
			x, y = loadData(db, POI['LAT'], POI['LONG'], generateAllFeatures)
			x, y = np.array(x), np.array(y)

			# Create the RFE object and compute a cross-validated score.
			# Note: SVR is a regressor, while StratifiedKFold and 'accuracy'
			# assume classification; a regression scorer such as
			# 'neg_mean_squared_error' with plain KFold would be consistent here.
			svr = SVR(kernel="linear")
			rfecv = RFECV(estimator=svr, step=1, cv=StratifiedKFold(y, 2), scoring='accuracy')
			rfecv.fit(x, y)

			print("Optimal number of features : %d" % rfecv.n_features_)

			# Plot number of features VS. cross-validation scores
			plt.subplot(numRows, numCols, fignum)
			plt.title(POI['NAME'])
			plt.xlabel("Number of features selected")
			plt.ylabel("Cross validation score (nb of misclassifications)")
			plt.plot(range(1, len(rfecv.grid_scores_) + 1), rfecv.grid_scores_)

			fignum += 1
	plt.show()
Example #17
def plot_rfe(X,label):
    y=X[label]
    X=X.drop(['churn','appetency','upselling',label],axis='columns')
    from sklearn.svm import SVC
    from sklearn.cross_validation import StratifiedKFold
    from sklearn.feature_selection import RFECV
    # Build a classification task using 3 informative features
#    X, y = make_classification(n_samples=1000, n_features=25, n_informative=3,
#                               n_redundant=2, n_repeated=0, n_classes=8,
#                               n_clusters_per_class=1, random_state=0)
    
    # Create the RFE object and compute a cross-validated score.
    svc = SVC(kernel="linear")
    # The "accuracy" scoring is proportional to the number of correct
    # classifications
    rfecv = RFECV(estimator=svc, step=1, cv=StratifiedKFold(y, 2),
                  scoring='accuracy')
    rfecv.fit(X, y)
    
    print("Optimal number of features : %d" % rfecv.n_features_)
    
    # Plot number of features vs. cross-validation scores
    plt.figure()
    plt.xlabel("Number of features selected")
    plt.ylabel("Cross validation score (nb of correct classifications)")
    plt.plot(range(1, len(rfecv.grid_scores_) + 1), rfecv.grid_scores_)
    plt.show()
Example #18
def RFE_featureSelection(X_train, Y_train):
	## Sampling
	RSObj=randomSampling.randomSampling()
	(X_train,Y_train)=RSObj.getRandomSample(X_train,Y_train)
	X_train.reset_index(drop=True,inplace=True)
	Y_train.reset_index(drop=True,inplace=True)

	## Select classifier and parameters
	logistic = linear_model.LogisticRegression(C=10, class_weight=None, dual=False, fit_intercept=True,
	          intercept_scaling=1, max_iter=100, multi_class='ovr', n_jobs=1,
	          penalty='l1', random_state=None, solver='liblinear', tol=0.01,
	          verbose=0, warm_start=False)

	## Initialize RFE
	rfecv = RFECV(estimator=logistic, step=1, cv=5,
	              scoring='recall')

	## Fit data
	rfecv.fit(X_train, Y_train)

	## Selected Features
	print("Optimal number of features : %d" % rfecv.n_features_)

	## Plot importance
	plt.figure()
	plt.xlabel("Number of features selected")
	plt.ylabel("Cross validation score")
	plt.plot(range(1, len(rfecv.grid_scores_) + 1), rfecv.grid_scores_)
	plt.show()

	#print('\n Selected Columns : {0}'.format(list(rfecv.support_)))
	print('\n Selected Columns : {0}'.format(X_train.columns[list(rfecv.support_)]))
	selected_columns = X_train.columns[list(rfecv.support_)]
	return selected_columns
Example #19
def main():
    print("Loading paths")
    paths = json.loads(open("SETTINGS.json").read())

    print("Getting features for deleted papers from the disk files")
    features_conf = [feature for feature in csv.reader(open(paths["trainpos_features"]))]
    features_deleted = [feature for feature in csv.reader(open(paths["trainneg_features"]))]

    features = np.array([list(map(float, x[2:])) for x in features_deleted + features_conf])
    target = np.array([0 for x in range(len(features_deleted))] + [1 for x in range(len(features_conf))])

    '''classifier = RandomForestClassifier(n_estimators=360,
                                        verbose=2,
                                        n_jobs=4,
                                        min_samples_split=10,
                                        random_state=1)
    classifier = SVR(kernel="linear")
    '''
    classifier = LinearSVC(penalty='l2', loss='l2', dual=True, tol=0.0001, C=1.0,\
                multi_class='ovr',fit_intercept=True, intercept_scaling=1,\
                class_weight=None, verbose=0, random_state=None)

    print("Start feature selection")
    selector = RFECV(classifier, step=1, cv=5)
    print(features.shape)
    selector = selector.fit(features, target)

    print("Ouput feature selection results")
    print selector.support_
    print selector.ranking_

    writer = csv.writer(open(paths["selection_result"], "w"))
    writer.writerow(selector.support_.tolist())
    writer.writerow(selector.ranking_.tolist())
Example #20
def select_features(estimator, X_train, y_train):
    log.info("Selecting best features:")
    estimator = SVR(kernel="rbf")
   
    selector = RFECV(estimator, step=1, cv=5)
    selector = selector.fit(X_train, y_train)
    log.info("Support mask: %s", selector.support_)
    log.info("Feature ranking: %s", selector.ranking_)
    return selector.ranking_
Example #21
File: test.py Project: zedoul/air
def optimalFeatures(train,target):
    sk = StratifiedKFold(target,n_folds=3)
    est = SVC(kernel='linear')
    rfecv = RFECV(est,cv=sk)
    rfecv.fit(train,target)
    print("Optimal number of features : %d" % rfecv.n_features_)
    
    
    return rfecv
def select_features(data, class_attribute):
    X = [[float(v) for v in row[3:]] for row in data[1:]]
    X = preprocessing.MinMaxScaler().fit_transform(X)
    y = [0 if row[class_attribute] == 'low' else 1 for row in data[1:]]
    print("Loaded %d sessions" % len(y))
    svc = LinearSVC(class_weight='auto')
    rfecv = RFECV(estimator=svc, step=1, cv=StratifiedKFold(y, 10), scoring='f1')
    print("RFE in progress")
    rfecv = rfecv.fit(X, y)
    return rfecv
Example #23
def feature_selection(data_matrix, target):
    from sklearn.feature_selection import RFECV
    from sklearn.linear_model import SGDClassifier
    estimator = SGDClassifier(average=True, shuffle=True, penalty='elasticnet')
    # perform feature rescaling with elastic penalty
    data_matrix = estimator.fit_transform(data_matrix, target)
    # perform recursive feature elimination
    selector = RFECV(estimator, step=0.1, cv=10)
    data_matrix = selector.fit_transform(data_matrix, target)
    return data_matrix
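Note that estimator.fit_transform here leans on the coefficient-based transform that linear models exposed in old scikit-learn releases; it was deprecated and later removed. A hedged rewrite of the same two-stage selection for current versions, with SelectFromModel standing in for the removed call:

from sklearn.feature_selection import RFECV, SelectFromModel
from sklearn.linear_model import SGDClassifier

def feature_selection_modern(data_matrix, target):
    estimator = SGDClassifier(average=True, shuffle=True, penalty='elasticnet')
    # coarse selection driven by the elastic-net coefficients
    data_matrix = SelectFromModel(estimator).fit_transform(data_matrix, target)
    # recursive elimination, dropping 10% of the remaining features per step
    selector = RFECV(estimator, step=0.1, cv=10)
    return selector.fit_transform(data_matrix, target)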
Example #24
def selectBestFeaturesRFECV(samples, classifications,
                            featureNames, classifierClass):
    fs = RFECV(classifierClass.getEstimator())
    if (not sprs.issparse(samples)):
        samples = sprs.csr_matrix(samples)
    samples = fs.fit_transform(samples.toarray(), classifications)
    sup = fs.get_support()
    
    featureNames = [featureNames[i] for (i,s) in enumerate(sup) if s]
    return [samples,featureNames]
Example #25
def main():
    xtrain=np.load('data/x_train.npy')
    ytrainreg=np.load('data/loss.npy')
    xtrain=xtrain[ytrainreg>0]
    ytrainreg=ytrainreg[ytrainreg>0]
    reg1=linear_model.SGDRegressor(loss='epsilon_insensitive',random_state=0,n_iter=5)
    selector1=RFECV(estimator=reg1,scoring='mean_squared_error',verbose=10)
    selector1.fit(xtrain,np.log(ytrainreg)) #training on the log of the loss
    print "sel1, optimal number of features:", selector1.n_features_
    np.save('features/reg_sel_sgd_eps.npy', selector1.support_)
Example #26
def cls_create(xs, ys):

    def score_fn(expected, actual):
        r, p, f1 = rpf1(expected, actual)
        return 1.0 - f1

    clf = LDA()
    # note: loss_func is an old scikit-learn argument, since replaced by `scoring`
    selector = RFECV(clf, step=50, cv=3, loss_func=score_fn)
    selector = selector.fit(xs, ys)
    return selector
Example #27
def selectFeatures (clf, X, Y):
    # Create the RFE object and compute a cross-validated score.
    # The "accuracy" scoring is proportional to the number of correct
    # classifications
    rfecv = RFECV(estimator=clf, step=1, cv=StratifiedKFold(Y, 5),
                  scoring='accuracy')
    rfecv.fit(X, Y)
    lst = rfecv.get_support()
    indices = find(lst, True)
    return X[:, indices], indices
Example #28
def main():
    #read in  data, parse into training and target sets
    data = csv_io.read_data("./hotness_features_classes.csv")
    target = np.array( [x[0] for x in data] )
    train = np.array( [x[1:] for x in data] )
    train_scaled = preprocessing.scale(train)
    clf = SVC(kernel='linear')
    selector = RFECV(clf, step=1, cv=10)
    selector = selector.fit(train_scaled, target)
    print(selector.support_)
Example #29
def recursive_fs_cv(X, y, clf):
    # create the RFECV model and let cross-validation pick the number of attributes
    rfe = RFECV(clf, step=1, cv=5)

    start = time.time()
    rfe = rfe.fit(X, y)
    # summarize the selection of the attributes
    end = time.time()
    print ("Training Time: " + str((end - start)) + "s")
    return rfe
Example #30
def featureSelection(X, y):
	class RandomForestClassifierWithCoef(RandomForestClassifier):
		def fit(self, *args, **kwargs):
			super(RandomForestClassifierWithCoef, self).fit(*args, **kwargs)
			# expose feature_importances_ as coef_ so (older) RFECV can rank features
			self.coef_ = self.feature_importances_
	randfor = RandomForestClassifierWithCoef(n_estimators=35)
	rfecv = RFECV(estimator=randfor, step=1, cv=5,
	              scoring='accuracy', verbose=2)
	rfecv.fit(X, y)
	return X.columns[rfecv.get_support()]
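The coef_ wrapper above dates from RFECV versions that ranked only via coef_; recent scikit-learn reads feature_importances_ directly (configurable through the importance_getter argument added in 0.24), so a plain random forest works. A hedged sketch:

from sklearn.ensemble import RandomForestClassifier
from sklearn.feature_selection import RFECV

def featureSelection_modern(X, y):
    # importance_getter='auto' falls back to feature_importances_, so no wrapper is needed
    rfecv = RFECV(estimator=RandomForestClassifier(n_estimators=35),
                  step=1, cv=5, scoring='accuracy', verbose=2)
    rfecv.fit(X, y)
    return X.columns[rfecv.get_support()]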
Example #31
def test_rfecv():
    generator = check_random_state(0)
    iris = load_iris()
    X = np.c_[iris.data, generator.normal(size=(len(iris.data), 6))]
    y = list(iris.target)  # regression test: list should be supported

    # Test using the score function
    rfecv = RFECV(estimator=SVC(kernel="linear"), step=1)
    rfecv.fit(X, y)
    # non-regression test for missing worst feature:
    assert len(rfecv.grid_scores_) == X.shape[1]
    assert len(rfecv.ranking_) == X.shape[1]
    X_r = rfecv.transform(X)

    # All the noisy variable were filtered out
    assert_array_equal(X_r, iris.data)

    # same in sparse
    rfecv_sparse = RFECV(estimator=SVC(kernel="linear"), step=1)
    X_sparse = sparse.csr_matrix(X)
    rfecv_sparse.fit(X_sparse, y)
    X_r_sparse = rfecv_sparse.transform(X_sparse)
    assert_array_equal(X_r_sparse.toarray(), iris.data)

    # Test using a customized loss function
    scoring = make_scorer(zero_one_loss, greater_is_better=False)
    rfecv = RFECV(estimator=SVC(kernel="linear"), step=1, scoring=scoring)
    ignore_warnings(rfecv.fit)(X, y)
    X_r = rfecv.transform(X)
    assert_array_equal(X_r, iris.data)

    # Test using a scorer
    scorer = get_scorer('accuracy')
    rfecv = RFECV(estimator=SVC(kernel="linear"), step=1, scoring=scorer)
    rfecv.fit(X, y)
    X_r = rfecv.transform(X)
    assert_array_equal(X_r, iris.data)

    # Test fix on grid_scores
    def test_scorer(estimator, X, y):
        return 1.0

    rfecv = RFECV(estimator=SVC(kernel="linear"), step=1, scoring=test_scorer)
    rfecv.fit(X, y)
    assert_array_equal(rfecv.grid_scores_, np.ones(len(rfecv.grid_scores_)))
    # In the event of cross validation score ties, the expected behavior of
    # RFECV is to return the FEWEST features that maximize the CV score.
    # Because test_scorer always returns 1.0 in this example, RFECV should
    # reduce the dimensionality to a single feature (i.e. n_features_ = 1)
    assert rfecv.n_features_ == 1

    # Same as the first two tests, but with step=2
    rfecv = RFECV(estimator=SVC(kernel="linear"), step=2)
    rfecv.fit(X, y)
    assert len(rfecv.grid_scores_) == 6
    assert len(rfecv.ranking_) == X.shape[1]
    X_r = rfecv.transform(X)
    assert_array_equal(X_r, iris.data)

    rfecv_sparse = RFECV(estimator=SVC(kernel="linear"), step=2)
    X_sparse = sparse.csr_matrix(X)
    rfecv_sparse.fit(X_sparse, y)
    X_r_sparse = rfecv_sparse.transform(X_sparse)
    assert_array_equal(X_r_sparse.toarray(), iris.data)

    # Verifying that steps < 1 don't blow up.
    rfecv_sparse = RFECV(estimator=SVC(kernel="linear"), step=.2)
    X_sparse = sparse.csr_matrix(X)
    rfecv_sparse.fit(X_sparse, y)
    X_r_sparse = rfecv_sparse.transform(X_sparse)
    assert_array_equal(X_r_sparse.toarray(), iris.data)
Example #32
df_step5 = df_gini
df_step5.info() # Get class, memory, and column info: names, data types, obs.
df_step5.head() # Print first 5 observations

### Step 6: Recursive Feature Elimination

### Collect features from RF and PC
df_pc_gini = pd.merge(df_pc, df_gini, on = "Features", how = "inner") # inner join on "Features": keep only rows present in both (use outer/left for other behaviour)
pc_gini_features = df_pc_gini["Features"].tolist() # Save features from data frame
df_rfecv = df_step3[pc_gini_features] # Add selected features to df

### Setup RFE model
X = df_rfecv # Save features columns as predictor data frame
Y = df_step3["outcome"] # Use outcome data frame 
RFE = LinearRegression() # Use regression coefficient as estimator
selector = RFECV(estimator = RFE, min_features_to_select = 5) # define selection parameters, in this case all features are selected. See Readme for more info

### Fit RFE model
selected = selector.fit(X, Y) # This will take time

### Collect features from RFE model
ar_rfe = selected.support_ # Save Boolean values as numpy array
l_rfe = list(zip(X, ar_rfe)) # Create list of variables alongside RFE value 
df_rfe = pd.DataFrame(l_rfe, columns = ["Features", "RFE"]) # Create data frame of features with their RFE flag
df_rfe = df_rfe[df_rfe.RFE == True] # Select Variables that were True
df_rfe = df_rfe.reset_index() # Reset Index
df_rfe = df_rfe.filter(["Features"]) # Keep only selected columns

### Rename and Verify
df_step6 = df_rfe
df_step6.info() # Get class, memory, and column info: names, data types, obs.
Example #33
    def default_pipeline(self,
                         name,
                         n_pca=10,
                         n_best=10,
                         lda_shrink=10,
                         svm_C=10,
                         svm_gamma=10,
                         fdr_alpha=[0.05],
                         fpr_alpha=[0.05]):
        """Use a default combination of parameters for building a pipeline

        Args:
            name: string
                The string for building a default pipeline (see examples below)

        Kargs:
            n_pca: integer, optional, (def: 10)
                The number of components to search

            n_best: integer, optional, (def: 10)
                Number of best features to consider using a statistical method

            lda_shrink: integer, optional, (def: 10)
                Number of shrinkage values to try when optimising the LDA

            svm_C/svm_gamma: integer, optional, (def: 10/10)
                Parameters to optimize for the svm

            fdr/fpr_alpha: list, optional, (def: [0.05])
                List of float for selecting features using a fdr or fpr

        Examples:
            >>> # Basic classifiers :
            >>> name = 'lda' # or name = 'svm_linear' for a linear SVM
            >>> # Combine a classifier with a feature selection method :
            >>> name = 'lda_fdr_fpr_kbest_pca'
            >>> # The method above will use an LDA for the features evaluation
            >>> # and will combine a FDR, FPR, k-Best and PCA feature selection.
            >>> # Now we can combine with classifier optimisation :
            >>> name = 'lda_optimized_pca' # will try to optimize an LDA with a pca
            >>> name = 'svm_kernel_C_gamma_kbest' # optimize a SVM by trying
            >>> # different kernels (linear/RBF), and optimize C and gamma parameters
            >>> # combine with a k-Best features selection.
        """
        # ----------------------------------------------------------------
        # DEFINED COMBINORS
        # ----------------------------------------------------------------
        pca = PCA()
        selection = SelectKBest()
        scaler = StandardScaler()
        fdr = SelectFdr()
        fpr = SelectFpr()

        # ----------------------------------------------------------------
        # RANGE DEFINITION
        # ----------------------------------------------------------------
        pca_range = np.arange(1, n_pca + 1)
        kbest_range = np.arange(1, n_best + 1)
        C_range = np.logspace(-5, 15, svm_C,
                              base=2.)  #np.logspace(-2, 2, svm_C)
        gamma_range = np.logspace(-15, 3, svm_gamma,
                                  base=2.)  #np.logspace(-9, 2, svm_gamma)

        # Check range :
        if not kbest_range.size: kbest_range = [1]
        if not pca_range.size: pca_range = [1]
        if not C_range.size: C_range = [1.]
        if not gamma_range.size: gamma_range = ['auto']

        # ----------------------------------------------------------------
        # DEFINED PIPELINE ELEMENTS
        # ----------------------------------------------------------------
        pipeline = []
        grid = {}
        combine = []

        # ----------------------------------------------------------------
        # BUILD CLASSIFIER
        # ----------------------------------------------------------------
        # -> SCALE :
        if name.lower().find('scale') != -1:
            pipeline.append(("scaler", scaler))

        # -> LDA :
        if name.lower().find('lda') != -1:

            # Default :
            if name.lower().find('optimized') == -1:
                clf = LinearDiscriminantAnalysis(
                    priors=np.array([1 / self._nclass] * self._nclass))

            # Optimized :
            elif name.lower().find('optimized') != -1:
                clf = LinearDiscriminantAnalysis(priors=np.array(
                    [1 / self._nclass] * self._nclass),
                                                 solver='lsqr')
                grid['clf__shrinkage'] = np.linspace(0., 1., lda_shrink)

        # -> SVM :
        elif name.lower().find('svm') != -1:

            # Linear/RBF standard kernel :
            if name.lower().find('linear') != -1:
                kwargs = {'kernel': 'linear'}
            elif name.lower().find('rbf') != -1:
                kwargs = {'kernel': 'rbf'}
            else:
                kwargs = {}

            # Optimized :
            if name.lower().find('optimized') != -1:

                # Kernel optimization :
                if name.lower().find('kernel') != -1:
                    grid['clf__kernel'] = ('linear', 'rbf')

                # C optimization :
                if name.lower().find('_c_') != -1:
                    grid['clf__C'] = C_range

                # Gamma optimization :
                if name.lower().find('gamma') != -1:
                    grid['clf__gamma'] = gamma_range

            clf = SVC(**kwargs)

        # ----------------------------------------------------------------
        # BUILD COMBINE
        # ----------------------------------------------------------------
        # -> FDR :
        if name.lower().find('fdr') != -1:
            combine.append(("fdr", fdr))
            grid['features__fdr__alpha'] = fdr_alpha

        # -> FPR :
        if name.lower().find('fpr') != -1:
            combine.append(("fpr", fpr))
            grid['features__fpr__alpha'] = fpr_alpha

        # -> PCA :
        if name.lower().find('pca') != -1:
            combine.append(("pca", pca))
            grid['features__pca__n_components'] = pca_range

        # -> kBest :
        if name.lower().find('kbest') != -1:
            combine.append(("kBest", selection))
            grid['features__kBest__k'] = kbest_range

        # -> RFECV :
        if name.lower().find('rfecv') != -1:
            rfecv = RFECV(clf)
            combine.append(("RFECV", rfecv))

        # if combine is empty, select all features :
        if not len(combine):
            combine.append(("kBest", SelectKBest(k='all')))

        self.combine = FeatureUnion(combine)

        # ----------------------------------------------------------------
        # SAVE PIPELINE
        # ----------------------------------------------------------------
        # Build ordered pipeline :
        if len(combine):
            pipeline.append(("features", self.combine))
        pipeline.append(("clf", clf))

        # Save pipeline :
        self.pipeline = Pipeline(pipeline)
        self.grid = grid
        self._pipename = name
Example #34
def rfecv_features(X, y, rfecv_params):
    """
    Feature ranking with recursive feature elimination and cross-validated
    selection of the best number of features. Determines the minimum number
    of features that are needed to maximize the model's performance.
    
    Parameters
    ----------
    X : pandas dataframe
        A data set where each row is an observation and each column a feature.
        
    y: numpy array
        A numpy array containing the targets
    
    rfecv_params: dict,
        A dictionary containing the set of parameters use to initialize RFECV sklearn
        class.
    
    
    Examples
    --------
    # Initialize estimator
    estimator = RandomForestClassifier()
    
    # Define RFECV parameters
    rfecv_params = {'estimator': estimator,
                    'cv': 2,
                    'step': 1,
                    'scoring': 'accuracy',
                    'verbose': 50}
    
    # Get rfecv feature labels
    labels = rfecv_features(X = X, y = y, rfecv_params = rfecv_params)
    
    
    Returns
    -------
    labels: list
        A list with the labels identifying the subset of features needed
        to maximize the model's performance.
    
    feature_selector: fitted RFECV object
    
    
    References
    ----------
    Find more details about RFECV here:
    https://scikit-learn.org/stable/modules/generated/sklearn.feature_selection.RFECV.html
    
    """
    
    # Initialize RFECV object
    feature_selector = RFECV(**rfecv_params)
    
    # Fit RFECV
    feature_selector.fit(X, y)
    
    # Get the feature labels
    feature_labels = X.columns
    
    # Get selected features
    labels = feature_labels[feature_selector.support_].tolist()
    
    return labels, feature_selector
Example #35
# meanwhile, we also split the training data into folds and loop over them, which greatly reduces overfitting.
from sklearn.model_selection import RepeatedStratifiedKFold
from sklearn.feature_selection import RFECV
from sklearn.metrics import mean_squared_error, mean_absolute_error, roc_auc_score, r2_score
from sklearn.metrics.pairwise import cosine_similarity, euclidean_distances

# divide the training dataset into `splits` folds, using splits-1 of them for training and 1 for validation.
# repeat the division `repeats` times; the overall number of fitted models is splits*repeats
rskf = RepeatedStratifiedKFold(n_splits=splits,
                               n_repeats=repeats,
                               random_state=random_seed)
# rskf=StratifiedShuffleSplit(n_splits=splits, test_size=test_size,
#                        random_state=random_seed)
# select the important features for a specific estimator: create a feature selector
feature_selector = RFECV(gridsearch.best_estimator_,
                         step=steps,
                         cv=cv,
                         scoring=myScorer)

# start loop for the model fitting for every single split. total loop number: splits*repeats
counter = 0
predictions = pd.DataFrame()
print(
    "counter, mse, mae, auc, r2, grid_search.best_score_, feature_selector.n_features_, message"
)
for train_index, val_index in rskf.split(X_train, y_train):
    # train a model for every single split.
    #select train_index rows of data for training
    X, X_val = X_train[train_index], X_train[val_index]
    y, y_val = y_train[train_index], y_train[val_index]

    # fit the RFE model and automatically tune the number of selected features
Example #36
    def predict_features(self, df_features, df_target, idx=0, **kwargs):
        estimator = SVR(kernel='linear')
        selector = RFECV(estimator, step=1)
        selector = selector.fit(df_features.values, df_target.values[:, 0])  # .as_matrix() was removed from pandas; .values is the replacement

        return selector.grid_scores_
Example #37
# ### **Recursive feature elimination with cross validation and random forest classification**

# Now, we will find how many attributes we need for the best accuracy

# In[ ]:

X = train_df.drop(['Survived'], axis=1)
y = train_df.Survived

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3)

# In[ ]:

rf = RandomForestClassifier()
rfecv = RFECV(estimator=rf, step=1, cv=5,
              scoring='accuracy')  #5-fold cross-validation
rfecv = rfecv.fit(X_train, y_train)

print('Optimal number of features :', rfecv.n_features_)
print('Best features :', X_train.columns[rfecv.support_])

# In[ ]:

X = train_df.drop(['Survived', 'Embarked'], axis=1)

# ## **Model selection**

# OK, we have many models and algorithms. First, we will try with all features.

# In[ ]:
Example #38
# Dictionary mapping each RFE accuracy to an index
dict_1 = {}
# Dictionary mapping each index to its RFECV object
dict_2 = {}

time_prebucle = time.time()

# Iterate over the candidate values of C
for i, c in enumerate(C):
    time_temp1 = time.time()
    clf_temp = SVC(C=c,
                   kernel=kernel,
                   class_weight=class_weight,
                   random_state=random_state)
    rfecv_temp = RFECV(clf_temp, cv=skf, scoring=scoring)
    rfecv_temp.fit(X, y)
    dict_1[rfecv_temp.grid_scores_[rfecv_temp.n_features_ - 1]] = i  # grid_scores_[k-1] is the CV score with k features (step=1)
    dict_2[i] = rfecv_temp
    time_temp2 = time.time()
    print(f'Time iteration {i}: {time_temp2-time_temp1}')

time_bucle = time.time()
print(f'Time loop: {time_bucle-time_prebucle}')

maximo = max(dict_1)
indice_maximo = dict_1[maximo]
rfecv = dict_2[indice_maximo]
best_c = rfecv.estimator_.get_params()['C']
print(f'Best C: {best_c}')
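The manual loop above can also be phrased as a grid search over C wrapped around RFECV, since RFECV exposes the inner estimator's parameters as estimator__&lt;name&gt;. A hedged sketch, assuming X, y, C, kernel, class_weight, random_state, skf and scoring are the same objects defined above:

from sklearn.feature_selection import RFECV
from sklearn.model_selection import GridSearchCV
from sklearn.svm import SVC

base = SVC(kernel=kernel, class_weight=class_weight, random_state=random_state)
search = GridSearchCV(RFECV(base, cv=skf, scoring=scoring),
                      param_grid={'estimator__C': list(C)},
                      cv=skf, scoring=scoring)
search.fit(X, y)
print('Best C:', search.best_params_['estimator__C'])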
Example #39
def test_RFECV():
    from sklearn.datasets import load_boston
    from sklearn.datasets import load_breast_cancer
    from sklearn.datasets import load_iris
    from sklearn.feature_selection import RFECV

    # Regression
    X, y = load_boston(return_X_y=True)
    bst = xgb.XGBRegressor(booster='gblinear', learning_rate=0.1,
                           n_estimators=10, n_jobs=1,
                           objective='reg:squarederror',
                           random_state=0, verbosity=0)
    rfecv = RFECV(
        estimator=bst, step=1, cv=3, scoring='neg_mean_squared_error')
    rfecv.fit(X, y)

    # Binary classification
    X, y = load_breast_cancer(return_X_y=True)
    bst = xgb.XGBClassifier(booster='gblinear', learning_rate=0.1,
                            n_estimators=10, n_jobs=1,
                            objective='binary:logistic',
                            random_state=0, verbosity=0)
    rfecv = RFECV(estimator=bst, step=1, cv=3, scoring='roc_auc')
    rfecv.fit(X, y)

    # Multi-class classification
    X, y = load_iris(return_X_y=True)
    bst = xgb.XGBClassifier(base_score=0.4, booster='gblinear',
                            learning_rate=0.1,
                            n_estimators=10, n_jobs=1,
                            objective='multi:softprob',
                            random_state=0, reg_alpha=0.001, reg_lambda=0.01,
                            scale_pos_weight=0.5, verbosity=0)
    rfecv = RFECV(estimator=bst, step=1, cv=3, scoring='neg_log_loss')
    rfecv.fit(X, y)
Example #40
        labels_train = []
        labels_test = []
        for ii in train_idx:
            features_train.append(features[ii])
            labels_train.append(labels[ii])
        for jj in test_idx:
            features_test.append(features[jj])
            labels_test.append(labels[jj])

    # Cross-validation for SVC
    if clf_choice == 'svc':
        from sklearn.cross_validation import StratifiedKFold
        from sklearn.feature_selection import RFECV
        # Recursive feature elimination with cross-validation
        svc_clf = RFECV(estimator=clf,
                        step=1,
                        cv=StratifiedKFold(labels_train, n_folds=10),
                        scoring='accuracy')
        svc_clf = svc_clf.fit(features_train, labels_train)
        print("Optimal number of features: %d" % svc_clf.n_features_)
        print(svc_clf.support_)
        print(svc_clf.ranking_)

    # Cross-validation for DecisionTreeClassifier
    if clf_choice == 'dtc':
        from sklearn.cross_validation import StratifiedKFold
        from sklearn.grid_search import GridSearchCV
        param_grid = {
            'criterion': ['gini', 'entropy'],
            'max_features': ['auto', 'sqrt', 'log2', None],
            'max_depth': [4, 5, 6, 7, 8, None],
            'min_samples_split': [2, 3, 4, 5],
Example #41
# Make a fake patient by randomly selecting a value from each feature
fake_patient = X.apply(np.random.choice, axis=0)
fake_prediction = best_classifier.predict(np.array([fake_patient.to_numpy()]))
# -

# ### How to account for data collection cost for each feature?

# #### Try recursive feature elimination with CV

# RFECV attempts to select the best combination of features by recursively fitting models (with 5-fold CV) and eliminating one feature at a time.

# +
rfecv = RFECV(estimator=LogisticRegression(
    solver='liblinear',
    C=best_classifier.get_params()['C'],
    penalty=best_classifier.get_params()['penalty'],
    random_state=48),
              step=1,
              cv=5,
              scoring='f1')
rfecv.fit(X, y)

print('Recommended to select the following {} features:'.format(
    rfecv.n_features_))
print(X.columns[rfecv.support_])

best_feature_subset_classifier = LogisticRegression(
    solver='liblinear',
    C=best_classifier.get_params()['C'],
    penalty=best_classifier.get_params()['penalty'],
    random_state=48)
best_feature_subset_classifier.fit(X_train, y_train)
Example #42
# -*- coding: utf-8 -*-
import scipy.io
import numpy as np
from sklearn.linear_model import LogisticRegression
from sklearn.feature_selection import RFECV
from sklearn.metrics import accuracy_score
import matplotlib.pyplot as plt

mat = scipy.io.loadmat("./input/arcene.mat")
y_train = np.ravel(mat["y_train"])
y_test = np.ravel(mat["y_test"])
X_train = mat["X_train"]
X_test = mat["X_test"]

est = LogisticRegression(solver="lbfgs")

rfe = RFECV(estimator=est, step=50, verbose=1)
rfe = rfe.fit(X_train, y_train)

rfe.support_

plt.plot(range(0, 10001, 50), rfe.grid_scores_)

score = accuracy_score(y_test, rfe.predict(X_test))

print('Test accuracy', score)
Example #43
print("Shape X matrix: ", x.shape)
print("prop: ", y.value_counts() / y.shape[0])

#validation set
data2 = pd.read_csv("data\\val_imp.csv", header=0)
y_v = data2.iloc[:, -1]
x_v = data2.iloc[:, :-1]
print("Shape X_v matrix: ", x_v.shape)

#############################
#Feature selection
#############################
#setting up feature selection algorithm
k_fold = KFold(n_splits=10)
est = LogisticRegression()
selector = RFECV(est, cv=k_fold)
selector.fit(x, y)
#keeping selected variables and printing names for control
x_b = x.loc[:, selector.get_support()]
xv_b = x_v.loc[:, selector.get_support()]
print("Optimal number of features : %d" % selector.n_features_)
print("Support", x.loc[:, selector.get_support()].columns)
# Plot number of features VS. cross-validation scores
plt.figure()
plt.xlabel("Number of features selected")
plt.ylabel("Cross validation score (nb of correct classifications)")
plt.plot(range(1, len(selector.grid_scores_) + 1), selector.grid_scores_)
plt.savefig("plots\\featknn.pdf", bbox_inches='tight')
plt.close()

##############################
Example #44
import numpy as np
from scipy.io import loadmat
from sklearn.linear_model import LogisticRegression
from sklearn.feature_selection import RFECV
import matplotlib.pyplot as plt
from sklearn.metrics import accuracy_score

arcene = loadmat('arcene.mat')

X_train = arcene['X_train']
X_test = arcene['X_test']
y_train = arcene['y_train']
y_test = arcene['y_test']

y_train = np.ravel(y_train)
y_test = np.ravel(y_test)

model = LogisticRegression()

selector = RFECV(model, step=50, verbose=1)
selector.fit(X_train, y_train)

# number of selected features
count = int(np.sum(selector.support_))

plt.plot(range(0, 10001, 50), selector.grid_scores_)

y_pred = selector.predict(X_test)
score = accuracy_score(y_test, y_pred)
print(score)
Example #45
        kepler_X_trans = trans.fit_transform(train_X, train_y)
        columns_retained_RFE = train_X.iloc[:, :].columns[
            trans.get_support()].values
        print('Cols to keep:', columns_retained_RFE)

        clf = linear_model.LinearRegression().fit(
            train_X[columns_retained_RFE], train_y)
        stats.summary(clf, oos_X[columns_retained_RFE], oos_y,
                      columns_retained_RFE)
        print('Train R^2:', clf.score(train_X[columns_retained_RFE], train_y))
        print('OOS R^2:', clf.score(oos_X[columns_retained_RFE], oos_y))

        print('############ RFECV ############')

        clf = linear_model.LinearRegression()
        trans = RFECV(clf)
        kepler_X_trans = trans.fit_transform(train_X, train_y)
        columns_retained_RFECV = train_X.iloc[:, :].columns[
            trans.get_support()].values
        print('Cols to keep:', columns_retained_RFECV)

        clf = linear_model.LinearRegression().fit(
            train_X[columns_retained_RFECV], train_y)
        stats.summary(clf, oos_X[columns_retained_RFECV], oos_y,
                      columns_retained_RFECV)
        print('Train R^2:', clf.score(train_X[columns_retained_RFECV], train_y))
        print('OOS R^2:', clf.score(oos_X[columns_retained_RFECV], oos_y))

        #clf = linear_model.BayesianRidge() #LinearRegression()
        #clf.fit(train_X[columns_retained_RFECV], train_y)
        #print('train R:', clf.score(train_X[columns_retained_RFECV], train_y))
Example #46
    features = data[list(data.columns)[:-1]]
    features = features.to_numpy()
    #select the labels
    labels = data[list(data.columns)[-1]]
    labels = labels.to_numpy()
    #60-40 train-test split
    numTrain = int(0.6*len(features))
    trainData = features[:numTrain]
    trainLbl = labels[:numTrain]
    testData = features[numTrain:]
    testLbl = labels[numTrain:]

    clf = RandomForestClassifier(n_estimators=30, max_depth=20, n_jobs=-1, random_state=42)

    #cv = None -> defaults to 5-fold cross validation
    rfecv = RFECV(estimator=clf, step=1, cv=None, scoring='f1_weighted')

    #5-fold on the whole dataset
    rfecv.fit(features, labels)

    plt.figure()
    plt.xlabel("Number of features selected")
    plt.ylabel("Cross validation score (F-score)")
    fig_feat = plt.gcf()
    plt.plot(range(1, len(rfecv.grid_scores_) + 1), rfecv.grid_scores_)
    plt.show()
    plt.draw()
    fig_feat.savefig('%s.png'%(graph_file), bbox_inches='tight')

    numImportantFeatures = rfecv.n_features_
    print("Optimal number of features : %d" % numImportantFeatures)
Example #47
    def perform(emotion, train_tweets, y_train, task_name, estimator_dict):
        #Select the scoring metric, depending upon the task name
        scoring = Dictionaries.scoring.get(task_name)

        # Perform the preprocessing and feature engineering tasks
        preprocess_train_df = Preprocessor.perform(train_tweets, emotion,
                                                   'train', task_name)
        X_train = Feature_Transformer.perform(preprocess_train_df, emotion,
                                              'train', task_name)

        #Iterate through all the estimators
        for estimator_name, estimator in estimator_dict.items():
            #pipeline for original data
            pipeline = make_pipeline(
                MinMaxScaler(feature_range=(0, 1), copy=True),
                RFECV(estimator, step=1, cv=5, scoring=scoring, n_jobs=-1))

            scores = cross_validate(pipeline,
                                    X_train,
                                    y_train,
                                    scoring=scoring,
                                    cv=5,
                                    return_train_score=False)
            print(scores)
            pipeline.fit(X_train, y_train)

            print(pipeline.steps)
            #Get number of features selected, the features selected and its ranking
            selected_features = pipeline.steps[1][1].n_features_
            feature_mask = pipeline.steps[1][1].support_
            feature_rank = pipeline.steps[1][1].ranking_

            # Classification task
            if (task_name == 'c'):
                #Get F1 scores
                cv_feature_scores = pipeline.steps[1][1].grid_scores_  # f1

                Writer.write_class_feat_rank_anal_results_in_file(
                    emotion, 'original', estimator_name, selected_features,
                    feature_mask, feature_rank, cv_feature_scores)
                # Pipeline with resamplers - SMOTE, TomekLinks, SMOTETomek
                for resampler_name, resampler in Dictionaries.resampler_dict.items(
                ):
                    #pipeline for resampling
                    pipeline = make_pipeline_imb(
                        MinMaxScaler(feature_range=(0, 1), copy=True),
                        resampler,
                        RFECV(estimator,
                              step=1,
                              cv=5,
                              scoring=scoring,
                              n_jobs=-1))

                    # Fit the pipeline with data
                    pipeline.fit(X_train, y_train)

                    print(pipeline.steps)
                    selected_features = pipeline.steps[2][1].n_features_
                    feature_mask = pipeline.steps[2][1].support_
                    feature_rank = pipeline.steps[2][1].ranking_
                    cv_feature_scores = pipeline.steps[2][1].grid_scores_  # f1

                    Writer.write_class_feat_rank_anal_results_in_file(
                        emotion, resampler_name, estimator_name,
                        selected_features, feature_mask, feature_rank,
                        cv_feature_scores)
                    gc.collect()
            # Regression task
            if (task_name == 'r'):
                #Get rmse scores
                cv_feature_scores = np.sqrt(-pipeline.steps[1][1].grid_scores_
                                            )  # sqrt(-neg_mean_squared_error)

                Writer.write_reg_feat_rank_anal_results_in_file(
                    emotion, estimator_name, selected_features, feature_mask,
                    feature_rank, cv_feature_scores)
                gc.collect()
Example #48
#%%
fimp.sort_values().head(10).index

#%%
from sklearn.feature_selection import RFE
rfe = RFE(estimator=logreg, n_features_to_select=1, step=1)
rfe.fit(X, y)
ranking = rfe.ranking_
#%%
fimp = pd.Series(ranking, index = viz.features_)
fimp.sort_values(ascending=True).head(20).index

#%%
from sklearn.feature_selection import RFECV

rfecv = RFECV(estimator=logreg, step=1, cv=2, n_jobs=-1, scoring='accuracy')
rfecv.fit(X, y)

print("Optimal number of features : %d" % rfecv.n_features_)

# Plot number of features VS. cross-validation scores
plt.figure()
plt.xlabel("Number of features selected")
plt.ylabel("Cross validation score (nb of correct classifications)")
plt.plot(range(1, len(rfecv.grid_scores_) + 1), rfecv.grid_scores_)
plt.show()

#%%
ranking = rfecv.ranking_
fimp = pd.Series(ranking, index = features)
fimp.sort_values(ascending=True).head(rfecv.n_features_).index
Example #49
import pandas as pd
from sklearn.cross_validation import StratifiedKFold  # old API, matching StratifiedKFold(y, 2) below
from sklearn.feature_selection import RFECV
from sklearn.svm import SVC
from sklearn.tree import DecisionTreeClassifier
from sklearn.preprocessing import StandardScaler

df_train = pd.read_csv("train.csv")
feats = df_train.drop("revenue", axis=1)

X = feats.values  #features
y = df_train["revenue"].values  #target

# Create the RFE object and compute a cross-validated score.
svc = SVC(kernel="linear")
# The "accuracy" scoring is proportional to the number of correct
# classifications
rfecv = RFECV(estimator=svc,
              step=1,
              cv=StratifiedKFold(2),
              scoring='accuracy')

t = StandardScaler()
X = t.fit_transform(X)
# the target is left unscaled: StandardScaler expects 2-D input, and scaling
# class labels would be meaningless anyway

count = 0
for elem in y:
    print(elem)
    count += 1
    if count > 10:
        break

count = 0
for elem in X:
    print(elem)
    count += 1
    if count > 10:
        break
Example #50
0
    param_names = sorted(param_grid)
    combinations = list(itertools.product(*(param_grid[name] for name in param_names)))
    print("Grid contains {} hyper-parameter combinations...".format(len(combinations)))

    result_recorder = []
    featr_support = []
    rfetr_num = []

    counter = 1

    for i in combinations:
        print("CV for {} set hyperparameters...".format(counter))
        tmp_hyper = dict(zip(param_names, i))
        rfr_hyper = RandomForestRegressor(**tmp_hyper, random_state=random_state)
        rfe_cv = RFECV(estimator=rfr_hyper, step=1, cv=5, scoring="neg_root_mean_squared_error", n_jobs=-1, verbose=False)

        rfe_cv.fit(X_train,y_train)
        result_recorder.append(rfe_cv.grid_scores_)  # record mean of CV
        featr_support.append(rfe_cv.support_)  # record selected features of CV
        rfetr_num.append(rfe_cv.n_features_)

        counter += 1

    result_recorder = np.array(result_recorder) * -1  # flip neg RMSE back to RMSE
    all_rmse = np.min(result_recorder, axis=0)  # best RMSE per feature count

    rfecv_res = pd.DataFrame(np.column_stack([np.arange(1, len(all_rmse) + 1), all_rmse]),
                             columns=["Number of features", "Mean RMSE"])

    rfecv_res.to_csv("rfecv_results/rfecv_term_{}.csv".format(term_type))  # for plotting purpose
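
For reference, the hand-rolled grid above can also be phrased as GridSearchCV wrapped around RFECV, reaching the forest's hyper-parameters through the estimator__ prefix. A minimal sketch under the same assumptions (param_grid, X_train, y_train and random_state as above):

from sklearn.ensemble import RandomForestRegressor
from sklearn.feature_selection import RFECV
from sklearn.model_selection import GridSearchCV

rfe_cv = RFECV(RandomForestRegressor(random_state=random_state), step=1, cv=5,
               scoring="neg_root_mean_squared_error", n_jobs=-1)
search = GridSearchCV(rfe_cv,
                      {"estimator__" + k: v for k, v in param_grid.items()},
                      scoring="neg_root_mean_squared_error", cv=5)
search.fit(X_train, y_train)
print(search.best_params_, -search.best_score_)  # best setting and its RMSE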
Example #51
0
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import StratifiedKFold
from sklearn.metrics import f1_score, confusion_matrix, roc_curve, auc
from sklearn.feature_selection import RFECV

for piece in range(7, 8):

    feature = brk_ext(piece)
    train_data = feature.loc[feature['train_test'] == 1].copy()
    # binarize the label: activities 0/1 -> class 0, everything else -> class 1
    train_data.loc[(train_data['activities'] == 0) |
                   (train_data['activities'] == 1), 'activities'] = 0
    train_data.loc[(train_data['activities'] != 0) &
                   (train_data['activities'] != 1), 'activities'] = 1

    col = []
    for i in range(0, 6 * piece):
        col.append("max" + str(i + 1))
        col.append("mean" + str(i + 1))
        col.append("std" + str(i + 1))
    train_target = train_data.iloc[:, -1]
    train_data = train_data[col]
    train_data.to_csv("./f.csv", index=False, header=True)
    train_target.to_csv("./ff.csv", index=False, header=True)
    model = LogisticRegression(max_iter=20)
    clf = RFECV(model, step=1, cv=5, n_jobs=-1)
    clf = clf.fit(train_data, train_target)
    row, p = train_data.iloc[:, clf.get_support()].shape
    accuracy = clf.score(train_data, train_target)
    f1 = f1_score(train_target, clf.predict(train_data))
    print("piece: ", piece, "  best p :", p, "accuracy: ", accuracy,
          "  F1-score :", f1)
Example #52
0
    classifier.fit(train_feature_data, train_class_data)

    # predict using model and test data
    test_predicted_data = classifier.predict(test_feature_data)

    # calculate metrics
    print('score={}'.format(
        accuracy_score(test_class_data, test_predicted_data)))
    print(confusion_matrix(test_class_data, test_predicted_data))
    print(classification_report(test_class_data, test_predicted_data))

    # using RFECV (recursive feature elimination with cross-validation)
    print('Logistic Regression classifier after using RFECV')
    classifier = LogisticRegression(max_iter=10000, solver='lbfgs')
    rfecv = RFECV(estimator=classifier,
                  step=1,
                  cv=StratifiedKFold(2),
                  scoring='accuracy')

    rfecv_data = rfecv.fit_transform(feature_data_processed, class_data)

    # Plot number of features VS. cross-validation scores
    plot.figure()
    plot.xlabel("Number of features selected")
    plot.ylabel("Cross validation score (nb of correct classifications)")
    plot.plot(range(1, len(rfecv.grid_scores_) + 1), rfecv.grid_scores_)
    plot.show()

    selected_columns = rfecv.get_support(indices=True)
    columns_rfecv = [
        feature_data_processed.columns[selected]
        for selected in selected_columns
    ]
Example #53
0
def RecursiveFeatureSelectionCrossValidated(model):
    rfe = RFECV(model, cv=5, step=1)
    return rfe
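
A hypothetical usage sketch for the helper above (the logistic model and the X_train/y_train arrays are assumptions; any estimator exposing coef_ or feature_importances_ works):

from sklearn.linear_model import LogisticRegression

selector = RecursiveFeatureSelectionCrossValidated(LogisticRegression(max_iter=1000))
selector.fit(X_train, y_train)
X_train_reduced = selector.transform(X_train)  # keeps only the selected columns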
Example #54
0
    MI_selector = SelectKBest(mutual_info_classif, k=5)
    X_train_MI = MI_selector.fit_transform(X_train, y_train)
    print("MI scores: {}, MI p-values: {}".format(MI_selector.scores_,
                                                  MI_selector.pvalues_))
    table1_output += "MI scores: {}, MI p-values: {}".format(
        MI_selector.scores_, MI_selector.pvalues_)

    #estimator for recursive feature elimination
    # estimator = RandomForestClassifier(n_estimators = 10, criterion = 'entropy', class_weight= None, max_features = None, random_state = 42,n_jobs=-1)
    estimator = DecisionTreeClassifier(random_state=42)
    #inner cv for recursive feature elimination
    inner_cv = StratifiedShuffleSplit(n_splits=2,
                                      test_size=0.2,
                                      random_state=42)
    RFE_selector = RFECV(estimator, step=1, cv=inner_cv, n_jobs=-1)
    X_train_RFE = RFE_selector.fit_transform(X_train, y_train)
    print("RFE rankings: {}, RFE grid-scores: {}".format(
        RFE_selector.ranking_, RFE_selector.grid_scores_))
    table1_output += "RFE rankings: {}, RFE grid-scores: {}".format(
        RFE_selector.ranking_, RFE_selector.grid_scores_)

    ############# TEST RESULTS FOR FEATURE SELECTION METHODS #############
    fold_info = {
        model_name: {
            'FULL': {score_name: 0
                     for score_name, score in scoring},
            'ANOVA': {score_name: 0
                      for score_name, score in scoring},
            'MI': {score_name: 0
                   for score_name, score in scoring},
Example #55
0
print(X.shape)
clf = LinearSVC(penalty='l2',dual=False)
scores = cross_validation.cross_val_score(clf, X, data[1], cv=10)

print(scores)
print("L2 SVM trained on the features selected by the L1 SVM. \n  Accuracy: %0.2f (+/- %0.2f)" % (scores.mean(), scores.std() * 2))




# L2 SVM using the RFECV class, which automatically selects the number of features

clf = LinearSVC(penalty='l2',dual=False)
# The "accuracy" scoring is proportional to the number of correct
# classifications
rfecv = RFECV(estimator=clf, step=1, cv=StratifiedKFold(data[1], 2),scoring='accuracy')
rfecv.fit(data[0], data[1])
#scores = cross_validation.cross_val_score(rfecv, data[0], data[1], cv=10)
print("Optimal number of features : %d" % rfecv.n_features_)
scores = rfecv.grid_scores_
print(scores)
print("L2 SVM that use the class RFECV which automatically selects the number of features. \n Accuracy: %0.2f (+/- %0.2f)" % (scores.mean(), scores.std() * 2))

'''
# Plot number of features VS. cross-validation scores
plt.figure()
plt.xlabel("Number of features selected")
plt.ylabel("Cross validation score (nb of correct classifications)")
plt.plot(range(1, len(rfecv.grid_scores_) + 1), rfecv.grid_scores_)
plt.show()
'''
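
Incidentally, the commented-out cross_val_score line above is the unbiased way to report this number: cross-validating the whole RFECV selector keeps feature selection inside each training fold. A minimal sketch with the same legacy-API objects (an integer cv avoids reusing fold indices built on the full data):

rfecv = RFECV(estimator=LinearSVC(penalty='l2', dual=False), step=1,
              cv=2, scoring='accuracy')
scores = cross_validation.cross_val_score(rfecv, data[0], data[1], cv=10)
print("Nested-CV accuracy: %0.2f (+/- %0.2f)" % (scores.mean(), scores.std() * 2))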
Example #56
0
                           multi_class='ovr')), ['<NAME0>', '<NAME2>']),
    (SelectFromModel(
        PermutationImportance(
            LogisticRegression(solver='liblinear', random_state=42),
            cv=5,
            random_state=42,
            refit=False,
        ),
        threshold=0.1,
    ), ['<NAME2>', '<NAME3>']),
    (RFE(LogisticRegression(solver='liblinear',
                            random_state=42,
                            multi_class='ovr'),
         n_features_to_select=2), ['<NAME1>', '<NAME3>']),
    (RFECV(LogisticRegression(
        solver='liblinear', random_state=42, multi_class='ovr'),
           cv=3), ['<NAME0>', '<NAME1>', '<NAME2>', '<NAME3>']),
] + _additional_test_cases)
def test_transform_feature_names_iris(transformer, expected, iris_train):
    X, y, _, _ = iris_train
    transformer.fit(X, y)
    # Test in_names being provided
    res = transform_feature_names(transformer,
                                  ['<NAME0>', '<NAME1>', '<NAME2>', '<NAME3>'])
    assert res == expected
    # Test in_names being None
    expected_default_names = [
        re.sub('<NAME([0-9]+)>', r'x\1', name) for name in expected
    ]
    assert transform_feature_names(transformer, None) == expected_default_names
Example #57
0
    def fit(self, X, Y):
        params = self.get_params()
        model = sk_SVR(**params)
        self.rfe = RFECV(model)
        self.rfe.fit(X, Y)
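
One assumption this wrapper depends on: RFECV ranks features through the fitted estimator's coef_ or feature_importances_, which SVR only exposes with a linear kernel (the default rbf kernel has neither). A minimal sketch, assuming sk_SVR is sklearn.svm.SVR:

from sklearn.feature_selection import RFECV
from sklearn.svm import SVR as sk_SVR

# kernel="linear" is required; an rbf SVR has no coef_ for RFECV to rank by
rfe = RFECV(sk_SVR(kernel="linear"))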
Example #58
0
del data_test['fmri_select']

######################################################################################################################

import pandas as pd
from pandas import Series
from sklearn.ensemble import RandomForestClassifier
from sklearn.feature_selection import RFECV

class RandomForestClassifierWithCoef(RandomForestClassifier):
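    # expose feature_importances_ under the name coef_, since older RFECV
    # versions only looked for coef_ when ranking features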
    def fit(self, *args, **kwargs):
        super(RandomForestClassifierWithCoef, self).fit(*args, **kwargs)
        self.coef_ = self.feature_importances_


#fitting on train 
rf = RandomForestClassifierWithCoef(n_estimators=1000, min_samples_leaf=5, n_jobs=-1)
rfecv = RFECV(estimator=rf, step=1, cv=3, scoring='accuracy', verbose=30)
selector=rfecv.fit(data_train._get_numeric_data(), labels_train)


# collect only the important features
df_important_features_train = data_train._get_numeric_data()[data_train._get_numeric_data().columns[rfecv.get_support()]]
df_important_features_test = data_test._get_numeric_data()[data_test._get_numeric_data().columns[rfecv.get_support()]]
# selector.get_support()
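
(For reference, the manual column subsetting above is the DataFrame version of what the fitted selector's transform does on the raw arrays:)

# array equivalent of indexing with rfecv.get_support()
important_features_test = rfecv.transform(data_test._get_numeric_data())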

# accuracy in test
from sklearn.metrics import  accuracy_score
accuracy_score(labels_test, rfecv.predict(data_test._get_numeric_data()))

Example #59
0
"""Recursive feature elimination with cross-validation

A recursive feature elimination example with automatic tuning of the
number of features selected with cross-validation.
"""
print __doc__

from sklearn.svm import SVC
from sklearn.cross_validation import StratifiedKFold
from sklearn.feature_selection import RFECV
from sklearn.datasets import make_classification
from sklearn.metrics import zero_one

# Build a classification task using 3 informative features
X, y = make_classification(n_samples=1000, n_features=25, n_informative=3,
        n_redundant=2, n_repeated=0, n_classes=8, n_clusters_per_class=1,
        random_state=0)

# Create the RFE object and compute a cross-validated score.
svc = SVC(kernel="linear")
rfecv = RFECV(estimator=svc, step=1, cv=StratifiedKFold(y, 2),
        loss_func=zero_one)
rfecv.fit(X, y)

print "Optimal number of features : %d" % rfecv.n_features_

# Plot number of features VS. cross-validation scores
import pylab as pl
pl.figure()
pl.xlabel("Number of features selected")
pl.ylabel("Cross validation score (nb of misclassifications)")
pl.plot(xrange(1, len(rfecv.cv_scores_) + 1), rfecv.cv_scores_)
pl.show()
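
(This snippet predates scikit-learn 0.15: the cross_validation module, the loss_func argument, and the cv_scores_ attribute are all long gone. A minimal modern equivalent of the selector, assuming scikit-learn >= 1.0:)

from sklearn.model_selection import StratifiedKFold
from sklearn.feature_selection import RFECV

rfecv = RFECV(estimator=svc, step=1, cv=StratifiedKFold(2), scoring="accuracy")
rfecv.fit(X, y)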
Example #60
0
    p = precision_score(labels_test, labels_pred, average='micro')
    r = recall_score(labels_test, labels_pred, average='micro')
    if p > 0.3 and r > 0.3:
        return f1_score(labels_test, labels_pred, average='macro')
    return 0


#Recursive Feature Selection

import matplotlib.pyplot as plt
from sklearn.svm import SVC
from sklearn.cross_validation import StratifiedKFold
from sklearn.feature_selection import RFECV
clf = DecisionTreeClassifier(max_depth=5)
rfecv = RFECV(estimator=clf,
              step=1,
              cv=StratifiedKFold(labels, 50),
              scoring='precision')
rfecv.fit(features, labels)
print("Optimal number of features : %d" % rfecv.n_features_)
print rfecv.support_
features = features[:, rfecv.support_]
# Plot number of features VS. cross-validation scores
plt.figure()
plt.xlabel("Number of features selected")
plt.ylabel("Cross validation score (nb of correct classifications)")
plt.plot(range(1, len(rfecv.grid_scores_) + 1), rfecv.grid_scores_)
plt.show()
# DecisionTreeClassifier tuning
t0 = time()
parameters = {
    'max_depth': [1, 2, 3, 4, 5, 6, 8, 9, 10],