Example No. 1
def df5():
    import time
    import numpy as np
    import matplotlib.pyplot as plt
    from sklearn.feature_selection import RFE
    from sklearn.ensemble import RandomForestClassifier
    from sklearn.datasets import load_breast_cancer
    from sklearn.model_selection import train_test_split
    from sklearn.linear_model import LogisticRegression

    select = RFE(RandomForestClassifier(n_estimators=100,random_state=42),
                 n_features_to_select=40)
    cancer = load_breast_cancer()
    rng = np.random.RandomState(42)
    noise = rng.normal(size=(len((cancer.data)), 50))
    X_w_noise = np.hstack([cancer.data, noise])
    X_train, X_test, y_train, y_test = train_test_split(X_w_noise, cancer.target, random_state=0, test_size=.5)

    start_time = time.time()
    select.fit(X_train,y_train)
    print("Estimated execution time: {} seconds".format((time.time()-start_time)))

    X_train_rfe = select.transform(X_train)
    X_test_rfe = select.transform(X_test)

    score = LogisticRegression().fit(X_train_rfe,y_train).score(X_test_rfe,y_test)
    print("Score: {:.3f}".format(score))

    mask = select.get_support()
    plt.matshow(mask.reshape(1,-1),cmap='gray_r')
    plt.xlabel("Sample index")
    plt.show()
 def feature_selection(self) -> None:
     """
     Feature selection
     """
     #######################
     # FEATURE SELECTION
     # selector = SelectPercentile(score_func=mutual_info_classif, percentile=100)
     # selector = RFE(estimator=LogisticRegression(max_iter=1500), n_features_to_select=15)
     selector = RFE(estimator=GradientBoostingClassifier(),
                    n_features_to_select=15)
     self.log.info(
         f"[FEATURE SELECTION] Feature selection using {type(selector).__qualname__}"
     )
     selector.fit(self.training.X, self.training.y)
     self.training.X = selector.transform(self.training.X)
     self.log.debug(
         f"[FEATURE SELECTION] Feature index after {type(selector).__qualname__}: {selector.get_support(indices=True)}"
     )
     self.test.X = selector.transform(self.test.X)
     self.log.debug(
         f"[FEATURE SELECTION] Train shape after feature selection: {self.training.X.shape} | {self.training.y.shape}"
     )
     self.log.debug(
         f"[FEATURE SELECTION] Test shape after feature selection: {self.test.X.shape} | {self.test.y.shape}"
     )
Example No. 3
def Feature_Selection_Recursive(k, function, model, xtrain, xtest, ytrain,
                                ytest):

    selector = RFE(function, n_features_to_select=k)
    selector = selector.fit(xtrain, ytrain)

    xtrain = selector.transform(xtrain)
    xtest = selector.transform(xtest)
    clf = model

    clf.fit(xtrain, ytrain)
    log_detail = {
        'Model': '',
        'Select-Method': '',
        'Select-Function': '',
        'Feature-Count': 0,
        'Train-S': 0,
        'Test-S': 0,
        'R2': 0,
        'RMSE': 0,
        'AE': 0
    }
    log_detail['Model'] = str(clf.__class__).split('.')[-1].replace("'>", '')
    log_detail['Select-Method'] = 'Recursive'
    log_detail['Select-Function'] = str(
        function.__class__).split('.')[-1].replace("'>", '')
    log_detail['Feature-Count'] = k
    log_detail['Train-S'] = clf.score(xtrain, ytrain)
    log_detail['Test-S'] = clf.score(xtest, ytest)
    log_detail['R2'] = r2_score(ytest, clf.predict(xtest))
    log_detail['RMSE'] = sqrt(mean_squared_error(ytest, clf.predict(xtest)))
    log_detail['AE'] = mean_absolute_error(ytest, clf.predict(xtest))
    return log_detail
Example No. 4
def determine_num_feat_for_selection(X_train, Y_train, max_num_features, alpha):
  nums = numpy.arange(5, max_num_features + 1, 5)
  scores = []

  kf = model_selection.KFold(n_splits=5, shuffle=True, random_state=RAND)
  r = linear_model.Ridge(alpha=alpha)

  for n in nums:
    fold_mses = []
    for tr_index, val_index in kf.split(X_train, Y_train):
      X_tr = X_train[tr_index]
      X_val = X_train[val_index]
      Y_tr = Y_train[tr_index]
      Y_val = Y_train[val_index]

      selection = RFE(r, n_features_to_select=n, step=1).fit(X_tr, Y_tr)
      X_tr = selection.transform(X_tr)
      X_val = selection.transform(X_val)

      r.fit(X_tr, Y_tr)
      Y_pred = r.predict(X_val)
      mse = metrics.mean_squared_error(Y_val, Y_pred)
      fold_mses = numpy.append(fold_mses, mse)

    scores = numpy.append(scores, fold_mses.mean())

  min_score_index = numpy.argmin(scores)
  print('Selecting', nums[min_score_index], 'features')
  return nums[min_score_index]
Example No. 5
def feature_selection():
    print("Start training...")

    svc = svm.LinearSVC(C=0.01, penalty='l1', dual=False)
    svc.fit(train_vec, train_label.ravel())

    tree = ExtraTreesClassifier()
    tree.fit(train_vec, train_label.ravel())

    print("Training Accuracy:%.4f" % svc.score(train_vec, train_label))
    print("Testing Accuracy:%.4f" % svc.score(test_vec, test_label))

    model = SelectFromModel(svc, prefit=True)

    rfe = RFE(svc, n_features_to_select=20, )
    rfe.fit(train_vec, train_label.ravel())

    X_train = rfe.transform(train_vec)
    X_test = rfe.transform(test_vec)

    print(X_train.shape)

    clf = svm.SVC(C=0.9, kernel='rbf', gamma=80, decision_function_shape='ovo', )
    clf.fit(X_train, train_label.ravel())
    print("After feature selection...")
    print("Training Accuracy:%.4f" % clf.score(X_train, train_label))
    print("Testing Accuracy:%.4f" % clf.score(X_test, test_label))
    return X_train, X_test
Example No. 6
def in46():
    import numpy as np
    import matplotlib.pyplot as plt
    from sklearn.feature_selection import RFE
    from sklearn.ensemble import RandomForestClassifier
    select=RFE(RandomForestClassifier(n_estimators=100,random_state=42),n_features_to_select=40)

    from sklearn.datasets import load_breast_cancer
    from sklearn.model_selection import train_test_split
    cancer = load_breast_cancer()
    rng = np.random.RandomState(42)
    noise = rng.normal(size=(len(cancer.data), 50))
    #   print(cancer.data.shape)  -> (569, 30)
    x_w_noise = np.hstack([cancer.data, noise])
    x_train, x_test, y_train, y_test = train_test_split(x_w_noise, cancer.target, random_state=0, test_size=0.5)

    select.fit(x_train,y_train)

    mask = select.get_support()
    plt.matshow(mask.reshape(1, -1), cmap='gray_r')
    plt.xlabel('Feature index')
    plt.show()

    x_train_rfe=select.transform(x_train)
    x_test_rfe=select.transform(x_test)

    from sklearn.linear_model import LogisticRegression
    print(LogisticRegression().fit(x_train, y_train).score(x_test, y_test))
    print(LogisticRegression().fit(x_train_rfe, y_train).score(x_test_rfe, y_test))
Example No. 7
def lsvm_rfe(c, n_feat, trainX, trainy, testX):
    svc = SVC(kernel="linear", C=c)
    rfe = RFE(estimator=svc, n_features_to_select=n_feat, step=1)
    rfe.fit(trainX, trainy)
    train_X = rfe.transform(trainX)
    test_X = rfe.transform(testX)
    return train_X, test_X
def Feature_Optimization_RF(X_train, y_train, X_test, y_test):
    results = pd.DataFrame(
        columns=['Number of Features', 'Accuracy Score', 'Micro F1 Score', 'Macro F1 Score', 'Weighted F1 Score',
                 'Micro Precision Score', 'Macro Precision Score',
                 'Weighted Precision Score', 'Micro Recall Score', 'Macro Recall Score', 'Weighted Recall Score'])

    for index in np.arange(len(X_train.columns)):
        sel = RFE(RandomForestClassifier(random_state=42, n_jobs=-1), n_features_to_select=index + 1)
        sel.fit(X_train, y_train)
        x_train_rfe = sel.transform(X_train)
        x_test_rfe = sel.transform(X_test)
        model = RandomForestClassifier(n_estimators=50, random_state=42, n_jobs=-1)
        model.fit(x_train_rfe, y_train)
        results.loc[index] = [index + 1,
                              round(accuracy_score(y_test, model.predict(x_test_rfe)), 4),
                              round(f1_score(y_test, model.predict(x_test_rfe), average='micro'), 4),
                              round(f1_score(y_test, model.predict(x_test_rfe), average='macro'), 4),
                              round(f1_score(y_test, model.predict(x_test_rfe), average='weighted'), 4),
                              round(precision_score(y_test, model.predict(x_test_rfe), average='micro'), 4),
                              round(precision_score(y_test, model.predict(x_test_rfe), average='macro'), 4),
                              round(precision_score(y_test, model.predict(x_test_rfe), average='weighted'), 4),
                              round(recall_score(y_test, model.predict(x_test_rfe), average='micro'), 4),
                              round(recall_score(y_test, model.predict(x_test_rfe), average='macro'), 4),
                              round(recall_score(y_test, model.predict(x_test_rfe), average='weighted'), 4)]
    return results
Example No. 9
def test_rfe():
    generator = check_random_state(0)
    iris = load_iris()
    X = np.c_[iris.data, generator.normal(size=(len(iris.data), 6))]
    X_sparse = sparse.csr_matrix(X)
    y = iris.target

    # dense model
    clf = SVC(kernel="linear")
    rfe = RFE(estimator=clf, n_features_to_select=4, step=0.1)
    rfe.fit(X, y)
    X_r = rfe.transform(X)
    clf.fit(X_r, y)
    assert len(rfe.ranking_) == X.shape[1]

    # sparse model
    clf_sparse = SVC(kernel="linear")
    rfe_sparse = RFE(estimator=clf_sparse, n_features_to_select=4, step=0.1)
    rfe_sparse.fit(X_sparse, y)
    X_r_sparse = rfe_sparse.transform(X_sparse)

    assert X_r.shape == iris.data.shape
    assert_array_almost_equal(X_r[:10], iris.data[:10])

    assert_array_almost_equal(rfe.predict(X), clf.predict(iris.data))
    assert rfe.score(X, y) == clf.score(iris.data, iris.target)
    assert_array_almost_equal(X_r, X_r_sparse.toarray())
Example No. 10
class LinearRegSoccerGame:
    # create classifier for home and away
    # x - features list per game
    # y - 0 1 2 == home draw away
    def __init__(self, x, y):
        clf = LinearRegression()
        self.rfe = RFE(estimator=clf, n_features_to_select=102)
        self.rfe.fit(x, y)
        best_features = self.rfe.transform(x)
        self.clf = LinearRegression()
       # self.clf = ElasticNet(normalize=params['normalize'], fit_intercept=params['fit_intercept'], alpha =params['alpha'],
        #                 selection=params['selection'], l1_ratio=params['l1_ratio'], random_state=42).fit(x,y)
        self.X = best_features
        self.Y = y
        # the original snippet never fitted self.clf; fit it on the selected features so predict() works
        self.clf.fit(self.X, self.Y)

    # return the rounded predicted outcome per game in the test group
    # x - test group
    def take_the_more_prob(self, x):
        bet = [round(k) for k in self.clf.predict(self.rfe.transform(x))]

        return bet

    def calculate_prob_for_test_group(self, x):
        home = []
        draw = []
        away = []
        for k in self.clf.predict(self.rfe.transform(x)):
            if k == 0.0:
                home.append(1)
                draw.append(0)
                away.append(0)
            elif k == 1.0:
                home.append(0)
                draw.append(1)
                away.append(0)
            elif k == 2.0:
                home.append(0)
                draw.append(0)
                away.append(1)
            else:
                h = 1 / k
                d = 1 / abs(1 - k)
                a = 1 / abs(2 - k)
                norm_ = h + d + a
                home.append(h / norm_)
                draw.append(d / norm_)
                away.append(a / norm_)
        return home, draw, away
    def predict_proba(self, x):
        return [self.calculate_prob_for_test_group(x)]
    def fit(self, x, y):
        return self
    # return in how much games the result with more probability is equal to the real result
    # 1 return parameter: number of games
    # 2 return parameter: number of the probability right
    # 3 return parameter:  the two above
    """def take_the_more_prob(self, x, cost_per_game, real_results):
Example No. 11
def RFE_Feature(x_train,y_train,x_test,y_test):
  from sklearn.feature_selection import RFE
  from sklearn.ensemble import RandomForestClassifier
  # define model
  rfc = RandomForestClassifier(n_estimators=100)
  rfe = RFE(estimator=rfc, n_features_to_select=3)
  # fit the model
  rfe.fit(x_train, y_train)
  #transform the data
  x_train = rfe.transform(x_train)
  x_test = rfe.transform(x_test)
  return x_train, y_train, x_test, y_test
Example No. 12
def subtest(model, XL, YL, XT, YT, feature_names):
    nfeatures = XL.shape[1]
    rfe = RFE(model, n_features_to_select=nfeatures - 1)
    print("BEFORE")
    model.fit(XL, YL)
    print_performance(YT, model.predict(XT))
    print("AFTER")
    rfe.fit(XL, YL)
    print_performance(YT, rfe.predict(XT))
    print("REMOVED FEATURE %s" % feature_names[np.where(~rfe.support_)[0][0]])
    print("")
    return rfe.transform(XL), rfe.transform(XT), feature_names[rfe.support_]
Example No. 13
def rfe(X, y, x_test):
    selector = RFE(Lasso(), n_features_to_select=30, step=1)
    selector = selector.fit(X, y)
    selected_cols = X.columns[selector.get_support()]
    X_sel = selector.transform(X)
    x_test_sel = selector.transform(x_test)
    return pd.DataFrame(X_sel, columns=selected_cols), pd.DataFrame(x_test_sel,
                                                                    columns=selected_cols)


# def feature_union(X,x_test=None,verbose=False):
#     return
Example No. 14
def RFE_method(X_train, X_test, y_train, y_test):
    #def RFE_method(X_train, y_train):
    # define model
    rfc = RandomForestClassifier(n_estimators=100)
    rfe = RFE(estimator=rfc, n_features_to_select=2)
    # fit the model
    rfe.fit(X_train, y_train)
    # transform the data
    #X_train, y_train = rfe.transform(X_train, y_train)
    #X_test, y_test = rfe.transform(X_test, y_test)
    X_train = rfe.transform(X_train)
    X_test = rfe.transform(X_test)
    #return X_train, y_train, X_test, y_test
    return X_train, X_test
Example No. 15
 def select_features_RFE_lasso(X, y, columns, iteration):
     clf = LassoCV(max_iter=iteration).fit(X, y)
     importance = np.abs(clf.coef_)
     print("importance")
     print(importance)
     idx_third = importance.argsort()[-4]
     threshold = importance[idx_third] + 0.00000001
     idx_features = (-importance).argsort()[:8]
     name_features = np.array(columns)[idx_features]
     print('Selected features RFE from LassoCV : {}'.format(name_features))
     sfm = RFE(clf)
     sfm.fit(X, y)
     X_transform = sfm.transform(X)
     n_features = sfm.transform(X).shape[1]
     return name_features
Example No. 16
def feature_selection(func=DecisionTreeClassifier):
    x, y = load_data()
    clf = func()
    x_train, x_test, y_train, y_test = train_test_split(x, y, test_size=0.3)

    # rfecv = RFECV(estimator=clf, step=1, cv=5, scoring='accuracy')
    rfe = RFE(estimator=clf, step=1, n_features_to_select=5)
    rfe = rfe.fit(x_train, y_train)
    print('Best 5 features chosen by RFE:', x_train.columns[rfe.support_])
    feat_name = list(x_train.columns[rfe.support_])
    x_train = rfe.transform(x_train)
    x_test = rfe.transform(x_test)
    data_set = np.hstack((x_train, np.array([[i] for i in y_train])))
    test_set = np.hstack((x_test, np.array([[i] for i in y_test])))
    return data_set.tolist(), test_set.tolist(), feat_name
Example No. 17
def recursiveFeatureSelector(classifier_model,train_data,train_labels,test_data,number_of_features):
    
    rfe = RFE(classifier_model, n_features_to_select=number_of_features)
    transformed_train_data = rfe.fit_transform(train_data,train_labels)
    transformed_test_data = rfe.transform(test_data)
    
    return transformed_train_data,transformed_test_data 
Example No. 18
def run_once(df_feature, df_label):
    df_ohe = dynamic_get_dummies(df_feature)
    train_x, test_x, train_y, test_y = train_test_split(df_ohe,
                                                        df_label,
                                                        test_size=0.3,
                                                        random_state=99)
    # dbg_recover = pd.concat([test_x, test_y], axis=1)
    # over sample
    train_x, train_y = over_sample(train_x, train_y)
    # build model
    lg_regression = linear_model.LogisticRegression(solver='lbfgs')
    rfe = RFE(lg_regression, n_features_to_select=best_nof_feature)
    rfe_train_x = rfe.fit_transform(train_x, train_y)
    rfe_test_x = rfe.transform(test_x)
    lg_regression.fit(rfe_train_x, train_y)
    labels = df_label.unique()
    # predict probs
    test_y_predict_probs = lg_regression.predict_proba(rfe_test_x)
    test_y_predict_prob = test_y_predict_probs[:, 1]
    prob_df = pd.DataFrame(test_y_predict_prob)
    prob_df['predict'] = np.where(prob_df[0] >= lg_threshold, 1, 0)
    get_accuracy("logistic regression predict_probs", test_y,
                 prob_df['predict'], labels)
    # print features
    cols = list(df_ohe.columns)
    temp = pd.Series(rfe.support_, index=cols)
    selected_features_rfe = temp[temp == True].index
    save_print("Top " + str(best_nof_feature) + " features are: ")
    save_print(selected_features_rfe)
    # dump model
    joblib.dump(lg_regression, root_folder + "lg_regression.pkl")
    save_print("lg_regression Model dumped!")
    joblib.dump(selected_features_rfe, root_folder + "lg_regression_cols.pkl")
    save_print("lg_regression models columns dumped!")
Example No. 19
 def rfe(self, n):
     rfe = RFE(self.clf, n_features_to_select=n)
     logger.info("Fitting RFE to data...")
     fit = rfe.fit(self.X, self.y)
     logger.info(f"RFE support: {fit.support_}")
     logger.info(f"RFE ranking: {fit.ranking_}")
     self.X = rfe.transform(self.X)
Example No. 20
def run_rfe(df_feature, df_label):
    df_ohe = dynamic_get_dummies(df_feature)
    train_x, test_x, train_y, test_y = train_test_split(df_ohe,
                                                        df_label,
                                                        test_size=0.3,
                                                        random_state=99)
    # dbg_recover = pd.concat([test_x, test_y], axis=1)
    # over sample
    train_x, train_y = over_sample(train_x, train_y)

    # build model
    nof_list = np.arange(1, (max_feature_try_numbers + 1))
    class_1_precision_list = []
    class_1_recall_list = []
    for n in range(len(nof_list)):
        save_print("********Current nof features are: " + str(nof_list[n]))
        dc_tree = DecisionTreeClassifier(criterion='entropy',
                                         min_samples_split=20,
                                         random_state=99)
        rfe = RFE(dc_tree, n_features_to_select=nof_list[n])
        rfe_train_x = rfe.fit_transform(train_x, train_y)
        rfe_test_x = rfe.transform(test_x)
        dc_tree.fit(rfe_train_x, train_y)
        labels = df_label.unique()
        # predict
        test_y_predict = dc_tree.predict(rfe_test_x)
        class_1_precision, class_1_recall = get_accuracy(
            "decision tree", test_y, test_y_predict, labels)
        class_1_precision_list.append(class_1_precision)
        class_1_recall_list.append(class_1_recall)
    plot_pre_recall(nof_list, class_1_precision_list, class_1_recall_list,
                    'decision tree')
Example No. 21
    def Recursive_Feature_Elimination(self, X_train, X_test, y_train, y_test, x, y, file_name = 'model.sav'):
        nof_list = np.arange(1, len(x.columns))
        high_score=0
        nof=0
        score_list =[]
        for n in range(len(nof_list)):
            model = LinearRegression()
            rfe = RFE(model, n_features_to_select=nof_list[n])
            X_train_rfe = rfe.fit_transform(X_train, y_train)
            X_test_rfe = rfe.transform(X_test)
            model.fit(X_train_rfe, y_train)
            score = model.score(X_test_rfe, y_test)
            score_list.append(score)
            if(score>high_score):
                high_score = score
                nof = nof_list[n]

        print("Optimum number of features: %d with score: %f" % (nof, high_score))

        cols = list(x.columns)
        model = LinearRegression()
        rfe = RFE(model, n_features_to_select=nof)
        X_rfe = rfe.fit_transform(x,y)
        model.fit(X_rfe,y)
        temp = pd.Series(rfe.support_,index = cols)
        selected_features_rfe = temp[temp==True].index
        pickle.dump(model, open(file_name, 'wb'))

        with open('parameters_selection.txt', 'w') as f:
            for item in selected_features_rfe:
                f.write("%s\n" % item)

        return selected_features_rfe
Example No. 22
class RFE_RandomForestRegPrim(primitive):
    def __init__(self, random_state=0):
        super(RFE_RandomForestRegPrim, self).__init__(name='RFE_RandomForestReg')
        self.id = 44
        self.PCA_LAPACK_Prim = []
        self.type = 'feature selection'
        self.description = "Feature ranking with recursive feature elimination with Random-Forest regressor. Given an external estimator that assigns weights to features (e.g., the coefficients of a linear model), the goal of recursive feature elimination (RFE) is to select features by recursively considering smaller and smaller sets of features. First, the estimator is trained on the initial set of features and the importance of each feature is obtained either through a coef_ attribute or through a feature_importances_ attribute. Then, the least important features are pruned from current set of features. That procedure is recursively repeated on the pruned set until the desired number of features to select is eventually reached."
        self.hyperparams_run = {'default': True}
        self.selector = RFE(RandomForestRegressor())
        self.accept_type = 'c_r'

    def can_accept(self, data):
        return self.can_accept_c(data, 'Regression')

    def is_needed(self, data):
        if data['X'].shape[1] < 3:
            return False
        return True

    def fit(self, data):
        data = handle_data(data)
        self.selector.fit(data['X'], data['Y'])

    def produce(self, data):
        output = handle_data(data)
        cols = list(output['X'].columns)
        mask = self.selector.get_support(indices=False)
        final_cols = list(compress(cols, mask))
        output['X'] = pd.DataFrame(self.selector.transform(output['X']), columns=final_cols)
        final_output = {0: output}
        return final_output
Example No. 23
def select_useful_features(X_scaled, y, k_features=k_features):
    auc_scores = np.array([])
    accuracy_scores_RFE = []
    f1_scores_RFE = []
    for i in k_features:
        model = LogisticRegression(random_state=0)
        selector = RFE(model, n_features_to_select=i,
                       step=1, verbose=True)
        selector.fit(X_scaled, y)
        X_selected = selector.transform(X_scaled)
        auc_score = compute_score(model, X_selected, y)
        auc_scores = np.append(auc_scores, auc_score)
        accuracy_score = compute_score_accuracy(model, X_selected, y)
        accuracy_scores_RFE.append(accuracy_score)
        f1_score = compute_score_f1(model, X_selected, y)
        f1_scores_RFE.append(f1_score)
    scores_df = pd.DataFrame(auc_scores, index=k_features,
                             columns=['scores'])
    max_value = scores_df[max(scores_df.values)==scores_df.values].index.values[0]
    plt.plot(string_array, auc_scores, linewidth=1, color='black',
             marker='o', markersize=7, label='AUC')   
    plt.plot(string_array, accuracy_scores_RFE, linewidth=1,
             marker='o', markersize=7, label='accuracy',
             color='red')
    plt.plot(string_array, f1_scores_RFE, linewidth=1,
             marker='o', markersize=7, label='f1')
    plt.axvline(x=10, color='red', linewidth=1,
                linestyle='--', label='chosen with {} features'.format(11))
    plt.ylabel('scoring', fontsize=20)
    plt.xlabel('number of features selected', fontsize=20)
    plt.title('Recursive feature elimination', fontsize=30)
    plt.xticks(fontsize=14)
    plt.yticks(fontsize=14)
    plt.grid(linewidth=0.5)
    plt.legend(fontsize=14)
Example No. 24
def run_once(df_feature, df_label):
    df_ohe = dynamic_get_dummies(df_feature)
    train_x, test_x, train_y, test_y = train_test_split(df_ohe,
                                                        df_label,
                                                        test_size=0.3,
                                                        random_state=99)
    # dbg_recover = pd.concat([test_x, test_y], axis=1)
    # over sample
    train_x, train_y = over_sample(train_x, train_y)
    # build model
    dc_tree = DecisionTreeClassifier(criterion='entropy',
                                     min_samples_split=20,
                                     random_state=99)
    rfe = RFE(dc_tree, n_features_to_select=best_nof_feature)
    rfe_train_x = rfe.fit_transform(train_x, train_y)
    rfe_test_x = rfe.transform(test_x)
    dc_tree.fit(rfe_train_x, train_y)
    labels = df_label.unique()
    # predict
    test_y_predict = dc_tree.predict(rfe_test_x)
    get_accuracy("decision tree", test_y, test_y_predict, labels)
    # print features
    cols = list(df_ohe.columns)
    temp = pd.Series(rfe.support_, index=cols)
    selected_features_rfe = temp[temp == True].index
    save_print("Top " + str(best_nof_feature) + " features are: ")
    save_print(selected_features_rfe)
    # dump model
    joblib.dump(dc_tree, root_folder + "dc_tree.pkl")
    save_print("dc_tree Model dumped!")
    joblib.dump(selected_features_rfe, root_folder + "dc_tree_cols.pkl")
    save_print("dc_tree models columns dumped!")
Example No. 25
def run_rfe(df_feature, df_label):
    df_ohe = dynamic_get_dummies(df_feature)
    train_x, test_x, train_y, test_y = train_test_split(df_ohe,
                                                        df_label,
                                                        test_size=0.3,
                                                        random_state=99)
    # dbg_recover = pd.concat([test_x, test_y], axis=1)
    # over sample
    train_x, train_y = over_sample(train_x, train_y)

    # build model
    nof_list = np.arange(1, (max_feature_try_numbers + 1))
    class_1_precision_list = []
    class_1_recall_list = []
    for n in range(len(nof_list)):
        save_print("********Current nof features are: " + str(nof_list[n]))
        lg_regression = linear_model.LogisticRegression(solver='lbfgs')
        rfe = RFE(lg_regression, n_features_to_select=nof_list[n])
        rfe_train_x = rfe.fit_transform(train_x, train_y)
        rfe_test_x = rfe.transform(test_x)
        lg_regression.fit(rfe_train_x, train_y)
        labels = df_label.unique()
        # predict probs
        test_y_predict_probs = lg_regression.predict_proba(rfe_test_x)
        test_y_predict_prob = test_y_predict_probs[:, 1]
        prob_df = pd.DataFrame(test_y_predict_prob)
        prob_df['predict'] = np.where(prob_df[0] >= lg_threshold, 1, 0)
        class_1_precision, class_1_recall = get_accuracy(
            "logistic regression predict_probs", test_y, prob_df['predict'],
            labels)
        class_1_precision_list.append(class_1_precision)
        class_1_recall_list.append(class_1_recall)
    plot_pre_recall(nof_list, class_1_precision_list, class_1_recall_list,
                    'logistic regression')
Example No. 26
def RFE():
    from sklearn.feature_selection import RFE
    model = LinearRegression()
    #Initializing RFE model
    rfe = RFE(model, n_features_to_select=5)
    #Transforming data using RFE
    X_rfe = rfe.fit_transform(X, y)
    #Fitting the data to model
    model.fit(X_rfe, y)
    print(rfe.support_)
    print(rfe.ranking_)

    #no of features
    nof_list = np.arange(1, 13)
    high_score = 0
    #Variable to store the optimum features
    nof = 0
    score_list = []
    for n in range(len(nof_list)):
        X_train, X_test, y_train, y_test = train_test_split(X,
                                                            y,
                                                            test_size=0.25,
                                                            random_state=100)
        model = LinearRegression()
        rfe = RFE(model, n_features_to_select=nof_list[n])
        X_train_rfe = rfe.fit_transform(X_train, y_train)
        X_test_rfe = rfe.transform(X_test)
        model.fit(X_train_rfe, y_train)
        score = model.score(X_test_rfe, y_test)
        score_list.append(score)
        if (score > high_score):
            high_score = score
            nof = nof_list[n]
    print("Optimum number of features: %d" % nof)
    print("Score with %d features: %f" % (nof, high_score))
Example No. 27
def RFE_nof(df, target, normalize):

    y = df[target]
    X = df.drop(target, axis=1)

    nof_list = np.arange(1, len(X.columns))
    high_score = 0
    #Variable to store the optimum features
    nof = 0
    score_list = []

    for n in range(len(nof_list)):
        X_train, X_test, y_train, y_test = train_test_split(X,
                                                            y,
                                                            test_size=0.3,
                                                            random_state=0)
        model = LinearRegression(copy_X=True,
                                 fit_intercept=True,
                                 n_jobs=None,
                                 normalize=normalize)
        rfe = RFE(model, n_features_to_select=nof_list[n])
        X_train_rfe = rfe.fit_transform(X_train, y_train)
        X_test_rfe = rfe.transform(X_test)
        model.fit(X_train_rfe, y_train)
        score = model.score(X_test_rfe, y_test)
        score_list.append(score)
        if (score > high_score):
            high_score = score
            nof = nof_list[n]
    return nof
Example No. 28
def RFE(X, y, num_features=4, classification_tasks=True, model=None):
    """
    Implements feature selection using Recursive Feature Elimination
    :return:
    """
    from sklearn.feature_selection import RFE
    from sklearn.linear_model import LogisticRegression, LinearRegression

    # column values
    col_names = X.columns

    if not model:
        if classification_tasks:
            model = LogisticRegression()
        else:
            model = LinearRegression()

    rfe = RFE(model, n_features_to_select=num_features)
    fit = rfe.fit(X, y)
    features = rfe.transform(X)

    feature_scores = pd.DataFrame(fit.ranking_, index=col_names, columns=["scores"])
    # best features has a #1 score value
    feature_scores = feature_scores.sort_values(by="scores", ascending=True)

    print("============== Feature scores - RFE ===========")
    print("Feature Ranking:\n {}".format(feature_scores))
    print("Selected Features: {}".format(features))

    return features, feature_scores
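A short usage sketch for the wrapper above; the iris DataFrame and the variable names are illustrative assumptions, and only the call signature comes from the function as defined:

# Hypothetical call of the RFE() helper defined above (assumes it is in scope)
import pandas as pd
from sklearn.datasets import load_iris

iris = load_iris()
X_df = pd.DataFrame(iris.data, columns=iris.feature_names)
y = iris.target

# keep the top 2 features with the default LogisticRegression estimator
selected, ranking = RFE(X_df, y, num_features=2, classification_tasks=True)
print(ranking)   # per-feature ranks; the best features are ranked 1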
Example No. 29
    def fitpositiveLassoCVRFE(self):
        cv = TimeSeriesSplit(n_splits=3)  #3 is the default
        lassomodel = LassoCV(n_alphas=2, alphas=np.linspace(0.01, 0.1, num=2), fit_intercept=False, precompute=True, \
                max_iter=2000, cv=cv,\
                positive=True, random_state=9999, selection='random')  #alpha=0.01 #alphas=np.linspace(0.01, 0.1, num=10),

        rfe = RFE(lassomodel, n_features_to_select=globe.SecuritiesPerBasket)
        fit = rfe.fit(self.xdata, self.ydata)
        print(fit)
        #print("Num Features: " + str(fit.n_features_))
        #print("Selected Features: " + str(fit.support_))
        #print("Feature Ranking: " + str(fit.ranking_))
        i = 0
        indices = list()
        for included in fit.support_:
            if included:
                indices.append(i)
            i += 1
        print('Indices selected by RFE:')
        print(indices)
        lmodel = rfe.estimator_
        X = rfe.transform(self.xdata)
        port = subportfolio.subportfolio(lmodel, X, indices, self.ydata, 'positiveLassoCVRFE', 'RFE', self.plotmodelresults, \
            self.plt, self.catdata)
        port.evaluatemodelaccuracy(self.figurenr, self.listofsubportfolios)
Example No. 30
def optimal_number_of_features(X_train, y_train, X_test, y_test):
    '''
    optimal_number_of_features(X_train, y_train, X_test, y_test)
    RETURNS: number_of_features
    
    discover the optimal number of features, n, using our scaled x and y dataframes, recursive feature
    elimination and linear regression (to test the performance with each number of features).
    We will use the output of this function (the number of features) as input to the next function
    optimal_features, which will then run recursive feature elimination to find the n best features

    Shamelessly stolen from David Espinola
    '''

    number_of_attributes = X_train.shape[1]
    number_of_features_list = np.arange(1, number_of_attributes)
    high_score = 0

    #Variable to store the optimum features
    number_of_features = 0
    score_list = []

    for n in range(len(number_of_features_list)):
        model = LinearRegression()
        rfe = RFE(model, n_features_to_select=number_of_features_list[n])
        X_train_rfe = rfe.fit_transform(X_train, y_train)
        X_test_rfe = rfe.transform(X_test)
        model.fit(X_train_rfe, y_train)
        score = model.score(X_test_rfe, y_test)
        score_list.append(score)
        if (score > high_score):
            high_score = score
            number_of_features = number_of_features_list[n]
    return number_of_features
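A minimal usage sketch for the helper above on synthetic data; the dataset, split, and imports are illustrative assumptions (the helper itself expects pre-split arrays and relies on LinearRegression, RFE, and numpy being in scope):

# Illustrative call of optimal_number_of_features() (data below is made up)
import numpy as np
from sklearn.datasets import make_regression
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
from sklearn.feature_selection import RFE

X, y = make_regression(n_samples=300, n_features=12, n_informative=5, noise=10, random_state=42)
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)

n = optimal_number_of_features(X_train, y_train, X_test, y_test)
print("Best number of features:", n)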
Example No. 31
def optimal_number_of_features(X, y):
    '''discover the optimal number of features, n, using our scaled x and y dataframes, recursive feature
    elimination and linear regression (to test the performance with each number of features).
    We will use the output of this function (the number of features) as input to the next function
    optimal_features, which will then run recursive feature elimination to find the n best features
    '''
    # X_train/X_test/y_train/y_test were never defined in this snippet; split X and y here
    # (assumes sklearn's train_test_split is imported)
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=0)
    number_of_attributes = X_train.shape[1]
    number_of_features_list = np.arange(
        1, number_of_attributes)  # len(features_range)

    # set "high score" to be the lowest possible score
    high_score = 0

    # variables to store the feature list and number of features
    number_of_features = 0
    score_list = []

    for n in range(len(number_of_features_list)):
        model = LinearRegression()
        rfe = RFE(model, n_features_to_select=number_of_features_list[n])
        X_train_rfe = rfe.fit_transform(X_train, y_train)
        X_test_rfe = rfe.transform(X_test)
        model.fit(X_train_rfe, y_train)
        score = model.score(X_test_rfe, y_test)
        score_list.append(score)
        if (score > high_score):
            high_score = score
            number_of_features = number_of_features_list[n]
    return number_of_features
Example No. 32
class BackwardStepwise(object):
    def __init__(self, n, estimator, step=100):
        assert type(
            n
        ) is int and n > 0, "Invalid parameter type or value %s (number of features)" % n
        assert type(
            step
        ) is int and step > 0, "Invalid parameter type or value %s (step)" % step

        self.__estimator = estimator
        self.__n = n
        self.__step = step
        self.__model = RFE(self.__estimator,
                           n_features_to_select=self.__n,
                           step=self.__step)

    def score_features(self, X, Y):
        self.__model.fit(X, Y)
        return self.__model.ranking_

    def select_features(self, X):
        return self.__model.transform(X)

    def __str__(self):
        return '''
        Backward stepwise feature selection:
            Top features selected: %s
            Step size: %s
            Estimator: %s
        ''' % (self.__n, self.__step, self.__estimator)
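A brief usage sketch of the class above; the estimator, data, and parameter values below are illustrative assumptions:

# Illustrative use of BackwardStepwise (names and numbers below are assumptions)
from sklearn.datasets import make_classification
from sklearn.linear_model import LogisticRegression

X, Y = make_classification(n_samples=500, n_features=300, n_informative=10, random_state=0)

stepwise = BackwardStepwise(n=10, estimator=LogisticRegression(max_iter=1000), step=50)
print(stepwise)                            # summary from __str__
ranking = stepwise.score_features(X, Y)    # fits the underlying RFE, returns ranking_
X_top = stepwise.select_features(X)        # keeps the 10 surviving columns
print(X_top.shape)                         # (500, 10)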
Example No. 33
class LogReg:

  """
  Initialization sets the object's model, vectorizer, labels, and corpus
  variables. Initialization also performs the initial training for the model
  and vectorizer using the given reviews.
  """
  def __init__(
      self,
      reviews,
      vectorizer = TfidfVectorizer(stop_words = 'english', max_df = 1,
        ngram_range = (1, 2)),
      model = LogisticRegression()
      ):
    self.model = model
    self.vectorizer = vectorizer
    self.selector = RFE(self.model, step = 100, verbose = 100)

    corpus = []
    labels = []
    for review in reviews:
      corpus += [review[1]["text"]]
      labels += [review[0]]

    #setting variables for the object
    self.corpus = corpus
    self.labels = labels
    self.reviews = reviews

    X = self.vectorizer.fit_transform(self.corpus)
    self.feature_names = self.vectorizer.get_feature_names_out()
    y = self.labels
    for string in self.feature_names:
      print(string.encode("ascii", 'ignore'))

    #Training the model
    X_new = self.selector.fit_transform(X, self.labels)
    self.model.fit(X_new, self.labels)

  def classify_all(self, all_test_data):
    test_corpus = []
    y = []
    for review in all_test_data:
      test_corpus += [review[1]['text']]
      y += [review[0]]

    #Used transform instead of fit_transform
    #for test data so number of features will match
    X = self.vectorizer.transform(test_corpus)
    X_new = self.selector.transform(X)
    results = self.model.predict(X_new)
    categories = ["spring", "summer", "fall", "winter"]
    for i, category in enumerate(categories):
      top10 = np.argsort(self.model.coef_[i])[-20:]
      for j in top10:
        print("%s: %s" % (category, "".join(self.feature_names[j])))
    return results
Example No. 34
  def vocabulary(self, all_test_data):
    test_corpus = []
    y = []
    for review in all_test_data:
      test_corpus += [review[1]['text']]
      y += [review[0]]

    X = self.vectorizer.transform(test_corpus)
    results = self.model.predict(X)
    selector = RFE(self.model, n_features_to_select=100, step=1)
    sel_result = selector.fit(X, y)
    print(selector.transform(X))
Example No. 35
def featureSelector(train_path,predict_path):
    X, y = load_svmlight_file("data/comment_test")
    Xt, yt = load_svmlight_file("data/test_predict")
    estimator = SVR(kernel="linear")
    selector = RFE(estimator, step=1)
    selector = selector.fit(X, y)
    newtX = selector.transform(Xt)
    print(newtX)

    """
    sel = VarianceThreshold(threshold=(.8 * (1 - .8)))
    newX=sel.fit_transform(X)
    print newX
    """
#featureSelector("","")
Example No. 36
import numpy as np
from sklearn.svm import SVC
from sklearn.model_selection import StratifiedKFold
from sklearn.feature_selection import RFECV, RFE
from sklearn.datasets import make_classification
from sklearn.metrics import zero_one_loss
from scipy import sparse

# Build a classification task using 3 informative features
X, y = make_classification(n_samples=1000,
                           n_features=5000,
                           n_informative=3,
                           n_redundant=2,
                           n_repeated=0,
                           n_classes=8,
                           n_clusters_per_class=1,
                           random_state=0)

X_sparse = sparse.csr_matrix(X)
print(X.shape, "x", y.shape)

# sparse model
clf_sparse = SVC(kernel="linear")
rfe_sparse = RFE(estimator=clf_sparse, n_features_to_select=4, step=0.20)
rfe_sparse.fit(X_sparse, y)
X_r_sparse = rfe_sparse.transform(X_sparse)

print(X_r_sparse.shape)
'''


#selector = SelectKBest(f_classif, k=18)
#print(selector.get_support)
S = SVC(kernel='linear')

# create the RFE model for the svm classifier 
# and select attributes
rfe = RFE(S, n_features_to_select=30)
rfe = rfe.fit(first, y)
# print summaries for the selection of attributes
print(rfe.support_)
print(rfe.ranking_)
print(rfe.n_features_)
yoo = rfe.transform(first)
#yoo = ["A3","B3","B4","B6","C2","C6","C7","C8","C11","C16c","C17","D1","D2","D3","D5","D6","D7","D8","D9","D10","D11","E4","E3","E5"]

#print(rfe.score(first,y))


'''
rfecv = RFECV(estimator=S, step=1, cv=StratifiedKFold(2))
rfecv.fit(yoo, y)
print(rfecv.grid_scores_) 
'''
'''

clf = DecisionTreeClassifier()
clf.fit(first, y)
sfm = SelectFromModel(clf,threshold=0.022)
def featureReductionTest(useFeature,trueSet,falseSet,dim=10,state=-1):
    if(state==-1):
        state = np.random.randint(10000)

    # load data and split
    X_true = []
    for dn in trueSet:
        fin = open("./learn/data/"+useFeature+"_"+dn+".pkl","rb")
        X_true.append(pickle.load(fin))
        fin.close()
    X_true = np.vstack(X_true)
#    print(X_true.shape)

    X_false = []
    for dn in falseSet:
        fin = open("./learn/data/"+useFeature+"_"+dn+".pkl","rb")
        X_false.append(pickle.load(fin))
        fin.close()
    X_false = np.vstack(X_false)
#    print(X_false.shape)

    test_size = 0.3
    X_true_train,X_true_test = train_test_split(X_true ,test_size=test_size,random_state=state)
    X_false_train, X_false_test = train_test_split(X_false ,train_size=len(X_true_train),test_size=len(X_true_test),random_state=state+1)

    X = np.vstack([X_true_train,X_false_train])
    X_ = np.vstack([X_true_test,X_false_test])
    Y = [1]*len(X_true_train)+[0]*len(X_false_train)
    Y_ = [1]*len(X_true_test)+[0]*len(X_false_test)
    X,Y = shuffle(X,Y)
    X_,Y_ = shuffle(X_,Y_)

    featNames = ml_feature_name.getFeatureName(useFeature)

    clf = LinearSVC(C=0.1)
    rfe = RFE(estimator =clf, n_features_to_select=dim,step=10)
    rfe.fit(X,Y)
    Xs = rfe.transform(X)
    Xs_ = rfe.transform(X_)
    clf.fit(Xs,Y)
    Yp = clf.predict(Xs)
    Yp_ = clf.predict(Xs_)

    supIndex = rfe.get_support(indices=True)  # indices of the selected features
    feats = [[abs(clf.coef_[0][i]),clf.coef_[0][i],v,featNames[v]] for i,v in enumerate(supIndex)]
    feats.sort()
    print("\n".join(list(map(str,feats))[::-1]))
    print(classification_report(Y,Yp))
    print(classification_report(Y_,Yp_))

    featNames = ml_feature_name.getFeatureName(useFeature)
    arr = Xs.T[0]
    reg = list(zip(arr,Y))
    reg.sort()
#    plt.plot(list(range(len(reg))),reg)
#    plt.ylim(0,2)
#    plt.show()


    if(useFeature=="rp"):
        fin = open("./feature/id_rhythm_barkband.txt","r")
        bark = [int(v) for v in fin.readline().split(",")]
        fin.close()

        barkName = []
        for ind in range(len(bark)):
            if(ind==0):
                barkName.append("0-"+str(bark[ind])+"Hz")
            else:
                barkName.append(str(bark[ind-1])+"-"+str(bark[ind])+"Hz")

        flucName = ["{0:.0f}bpm".format(((v)+1)*0.17*60) for v in range(60)]
        barkName.reverse()
        mat = np.zeros((60,24))
        for i,ind in enumerate(supIndex):
            val = clf.coef_[0][i]
            mat[ind//24,ind%24]=val
        mat =np.fliplr(mat)
        plt.yticks(range(24),barkName)
        plt.xticks(range(60),flucName,rotation="vertical")
        plt.imshow(mat.T,cmap="Greys_r")
        plt.savefig("./learn/feature/rp_rank"+str(dim)+".png")
        plt.show()
    return f1_score(Y,Yp),f1_score(Y_,Yp_)
Example No. 39
print('Variance score Train: %.2f' % selector2.score(X,y));
print('Variance score Test: %.2f' % selector2.score(Xtest,ytest));
print('Coeff of Test: ', selector2.ranking_);
print('No of Features selected by RFE = %.2f' %sum(selector2.support_));

plotfit(selector2,X,y, title = 'Training fit');
plotfit(selector2,Xtest,ytest,c='blue', title = 'Test fit');


# Forecast, 
# Since lagged variables are selected for our model, forecasting is done iteratively 
# by using the predicted values at time t as lagged variables for time t+1,t+2...
# In practice however, this is not required as the true price will be known before prediction.
yfcast = [];
for i in  list(range(len(Xfcast))):
    Xfcast_trim = selector2.transform(Xfcast);
    x = Xfcast_trim[i,:];
    x = x.reshape(1,-1);
    ypred = predictor.predict(x);
    yfcast.append(ypred);
    k = 9;j=1;
    if i < len(Xfcast)-1:
        for l in list(range(0,lag)): 
            #print ('l = ', l , 'i+j = ', i+j, ' k+1 = ', k+1);
            if i+j <= 49:
                Xfcast[i+j,k] = ypred;
            k = k+10;
            j = j+1;
#np.savetxt('../Xfcast-loaded.csv', Xfcast, fmt="%f", delimiter = ',');            

# Write predictions
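The forecasting comment in the example above describes iterative (recursive) prediction: each forecast is fed back in as a lagged input for the next step. A self-contained sketch of that idea with a single lag and a plain LinearRegression (the toy series, lag structure, and horizon are assumptions for illustration, not the pipeline above):

# Iterative one-step-ahead forecasting with a lagged feature (illustrative sketch)
import numpy as np
from sklearn.linear_model import LinearRegression

rng = np.random.RandomState(0)
series = np.cumsum(rng.normal(size=120))          # a toy price-like series

# build a one-lag design matrix: predict series[t] from series[t-1]
X_lag = series[:-1].reshape(-1, 1)
y_next = series[1:]
model = LinearRegression().fit(X_lag[:100], y_next[:100])

# forecast 10 steps ahead, feeding each prediction back in as the next lag
horizon = 10
last_value = series[100]
forecasts = []
for _ in range(horizon):
    pred = model.predict(np.array([[last_value]]))[0]
    forecasts.append(pred)
    last_value = pred                              # predicted value becomes the lag for t+1
print(forecasts)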
Example No. 40
def svc_1():
    """
    Submission: svc_1_0620_01.csv
    E_val: 0.866856950449
    E_in: 0.855948
    E_out: 0.8546898189645258
    """
    from sklearn.pipeline import Pipeline
    from sklearn.preprocessing import StandardScaler
    from sklearn.svm import LinearSVC
    from sklearn.cross_validation import StratifiedKFold
    from sklearn.feature_selection import RFE
    from sklearn.grid_search import RandomizedSearchCV
    from sklearn.calibration import CalibratedClassifierCV
    from sklearn.linear_model import LogisticRegression
    from scipy.stats import expon

    logger.debug('svc_1')

    X = util.fetch(util.cache_path('train_X_before_2014-08-01_22-00-47'))
    y = util.fetch(util.cache_path('train_y_before_2014-08-01_22-00-47'))

    raw_scaler = StandardScaler()
    raw_scaler.fit(X)
    X_scaled = raw_scaler.transform(X)

    rfe = RFE(estimator=LogisticRegression(class_weight='auto'), step=1,
              n_features_to_select=21)
    rfe.fit(X_scaled, y)
    util.dump(rfe, util.cache_path('feature_selection.RFE.21'))

    X_pruned = rfe.transform(X_scaled)

    logger.debug('Features selected.')

    new_scaler = StandardScaler()
    new_scaler.fit(X_pruned)
    X_new = new_scaler.transform(X_pruned)

    svc = LinearSVC(dual=False, class_weight='auto')
    rs = RandomizedSearchCV(svc, n_iter=50, scoring='roc_auc', n_jobs=-1,
                            cv=StratifiedKFold(y, 5),
                            param_distributions={'C': expon()})
    rs.fit(X_new, y)

    logger.debug('Got best SVC.')
    logger.debug('Grid scores: %s', rs.grid_scores_)
    logger.debug('Best score (E_val): %s', rs.best_score_)
    logger.debug('Best params: %s', rs.best_params_)

    svc = rs.best_estimator_
    util.dump(svc, util.cache_path('new_data.SVC'))

    isotonic = CalibratedClassifierCV(svc, cv=StratifiedKFold(y, 5),
                                      method='isotonic')
    isotonic.fit(X_new, y)
    util.dump(isotonic,
              util.cache_path('new_data.CalibratedClassifierCV.isotonic'))

    logger.debug('Got best isotonic CalibratedClassifier.')
    logger.debug('E_in (isotonic): %f', auc_score(isotonic, X_new, y))

    to_submission(Pipeline([('scale_raw', raw_scaler),
                            ('rfe', rfe),
                            ('scale_new', new_scaler),
                            ('svc', isotonic)]), 'svc_1_0620_01')
pre= machine_1[['A','B','C','D','E','F','G','H','I','J','K','L','M','N','O','P','Q','R','S','T','U','V','W',]]
clf= ExtraTreesClassifier().fit(pre,machine_1['X'])
clf.feature_importances_
model= SelectFromModel(clf, prefit=True)
new=model.transform(pre)
new.shape



# Recursive Feature Elimination
from sklearn.feature_selection import RFE
model = ExtraTreesClassifier()
rfe = RFE(model)
rfe = rfe.fit(pre,machine_1['X'])
# summarize the selection of the attributes
machine_model= rfe.transform(pre)
print(rfe.support_)
print(rfe.ranking_)
#after comparing the output of the two models, it was concluded that Recursive Feature Elimination gives better results

machine_2= pd.read_csv("G:\\Datasets\\7z assignment\\Train\\machine2.csv")
machine_2.isnull().sum()
predictors_2= machine_2[['A','B','C','D','E','F','G','H','I','J','K','L','M','N','O','P','Q','R','S','T','U','V','W',]]
clf2= ExtraTreesClassifier().fit(predictors_2,machine_2['X'])
clf2.feature_importances_
model_2= SelectFromModel(clf2, prefit=True)
new_model2=model_2.transform(predictors_2)


df_machine1= pd.DataFrame(new)
df_machine1.loc[:,'X']= pd.Series(machine_1['X'],index=df_machine1.index)
Example No. 42
def select_train_predict(X, Y, Z, feature_list, selection_method, estimator_method, n_features, selection_args, estimator_args):
    W = []
    features = []

    if selection_method != '2step_kbest':
        n_features = min(n_features, len(feature_list))

    if estimator_method == 'svm' and selection_method == 'rfe':
        estimator_args['kernel'] = 'linear'

    estimator = ESTIMATORS[estimator_method](**estimator_args)

    if selection_method == 'cluster':
        agglom = FeatureAgglomeration(n_clusters=n_features, affinity='cosine', linkage='average')
        clusters = agglom.fit_predict(X).tolist()
        sample = [clusters.index(i) for i in range(n_features)]
        X = X[:,sample]
        Z = Z[:,sample]
        selection_method = None

    if selection_method is None:
        for i, y in enumerate(Y):
            estimator.fit(X, y)
            w = estimator.predict(Z)
            W.append(w)
            if (i + 1) % (len(Y) // 10) == 0:
                print('.', end=' ')

    if selection_method == 'rfe':
        selector = RFE(estimator=estimator, n_features_to_select=n_features, **selection_args)

        for i, y in enumerate(Y):
            selector = selector.fit(X, y)
            features.append(feature_list[selector.support_])
            w = selector.predict(Z)
            W.append(w)
            if (i + 1) % (len(Y) // 10) == 0:
                print('.', end=' ')

    if selection_method == 'myrfe':
        selector = MyRFE(estimator=estimator, n_features=n_features, **selection_args)

        for i, y in enumerate(Y):
            selector.fit(X, y)
            features.append(feature_list[selector.support])
            w = selector.predict(Z)
            W.append(w)
            if (i + 1) % (len(Y) // 10) == 0:
                print('.', end=' ')

    if selection_method == 'kbest':
        selector = SelectKBest(f_regression, k=n_features, **selection_args)
        for i, y in enumerate(Y):
            X2 = selector.fit_transform(X, y)
            Z2 = selector.transform(Z)
            features.append(feature_list[selector.get_support()])
            estimator.fit(X2, y)
            w = estimator.predict(Z2)
            W.append(w)
            if (i + 1) % (len(Y) // 10) == 0:
                print('.', end=' ')

    print()

    return W, features
Example No. 43
from sklearn import model_selection
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import cross_val_predict
from sklearn.metrics import precision_score, recall_score, f1_score, accuracy_score, classification_report
from sklearn.linear_model import LogisticRegression
# from sklearn.svm import SVR

model = LogisticRegression()
selector = RFE(model, n_features_to_select=12, step=1)
selector.fit(X,y)

# summarize the selection of the features
print(X.columns[selector.get_support()])

#get the only selected features from X
X_new=selector.transform(X)
X_new = pd.DataFrame(X_new, columns=X.columns[selector.get_support()])

# 5-folder cross validation
y_pred=cross_val_predict(model,X_new,y, cv=5)

#print precision_score(y,y_pred,average=None)
#print recall_score(y,y_pred,average=None)
#print f1_score(y,y_pred,average=None)
#print accuracy_score(y,y_pred)
#print classification_report(y,y_pred)

#######################################################
# def full_precision (estimator, X_test, y_test):
#     y_pred = estimator.predict(X_test)
#     return precision_score(y_test,y_pred, average=None)