Example #1
def select_features(features, X_train_all, y_train_all):
    learningrates = [0.05, 0.1, 0.5, 1]
    valid_score = pd.DataFrame(index = ['accuracy'], columns = features)
    learning_rates = pd.DataFrame(index = ['accuracy'], columns = learningrates)
    valid_score[:] = 0
    learning_rates[:] = 0
    
    for j in reversed(range(1,51,1)):
        print(j)
        X_train = X_train_all[:-j,:]
        X_valid = X_train_all[-j,:]

        y_train = y_train_all[:-j]
        y_valid = y_train_all[-j]
        
        for f in features:
            for lr in learningrates:
                estimator = AdaBoostClassifier(learning_rate = lr, random_state=random_state)
                selector = RFE(estimator, n_features_to_select=f, step=1)
                selector = selector.fit(X_train, y_train)
                valid_score.loc['accuracy',f] += 1/(90*len(learningrates)) * selector.score(X_valid.reshape(1, -1), y_valid.flatten())
                learning_rates.loc['accuracy',lr] += 1/(90*len(features)) * selector.score(X_valid.reshape(1,-1), y_valid.flatten())
                print("number {}, feature number {}, learning rate{}, accuracy {}".format(j, f, lr, selector.score(X_valid.reshape(1, -1), y_valid.flatten())))
    best_lr = float(learning_rates.idxmax(axis=1))
    best_n_features = int(valid_score.idxmax(axis=1))
    print(best_lr)
    print(best_n_features)
    estimator_select = AdaBoostClassifier(learning_rate = best_lr, random_state=random_state)
    selector_select = RFE(estimator_select, n_features_to_select=best_n_features, step=1)
    selector_select_fit = selector_select.fit(X_train_all, y_train_all)
    #valid_error = 1 - valid_score.max(axis=1)
    #print('validation error is:', valid_error)
    return selector_select_fit, best_lr
Example #2
def feature_selection(X, Y, estimator):
	selector = RFE(estimator)
	chosen = selector
	score = 0
	for i in range(1, 7):
		selector = RFE(estimator, n_features_to_select=i)
		selector = selector.fit(X, Y)
		if selector.score(X, Y) > score:
			chosen = selector
			score = selector.score(X, Y)
	print(chosen.ranking_)
	return chosen.transform(X)
def get_best_features_Nums(X_train, y_train, originNum):
    feature_num = 0
    best_acc = 0
    print(originNum)
    for i in range(originNum):
        clf = LinearSVC()
        model = RFE(clf, n_features_to_select=i + 1)
        model.fit(X_train, y_train)
        if model.score(X_train, y_train) > best_acc:
            best_acc = model.score(X_train, y_train)
            feature_num = i + 1
            print(best_acc)
    return feature_num
    def __init__(self, datafile, n_range, estimators=100, nsteps=5, ftest=True, mutual=True,
                 rfe=True, seed=1, loss_='deviance', testsize=0.15):
        self.data = pd.read_csv(datafile)
        self.x, y = self.data.iloc[:, :-1], self.data.iloc[:, -1]
        self.feature_names=list(self.x.columns)
        self.x_train, self.x_test, self.y_train, self.y_test = train_test_split(self.x, y, test_size=testsize, random_state=0)

        self.acc = {'f_classif':[], 'mutual':[], 'rfe':[]}
        for n in n_range:
            np.random.seed(seed)
            #########################################################################
            # Train and test on GradientBoostingClassifier with f_classif
            #########################################################################

            if ftest:
                Ftest = SelectKBest(score_func=f_classif, k=n)
                boost_clf = Pipeline((("Feature_select", Ftest),
                                    ("Classify", GradientBoostingClassifier(loss=loss_, n_estimators=estimators))))

                self.y_pred_f = boost_clf.fit(self.x_train, self.y_train).predict(self.x_test)
                self.acc['f_classif'].append(accuracy_score(self.y_test, self.y_pred_f))
                mask_Ftest  = Ftest.get_support()

            #########################################################################
            # Train and test on GradientBoostingClassifier with mutual_info_classif
            #########################################################################

            if mutual:
                mutual = SelectKBest(score_func=mutual_info_classif, k=n)
                boost_m_clf = Pipeline((("Feature_select", mutual),
                                    ("Classify", GradientBoostingClassifier(loss=loss_, n_estimators=estimators))))

                self.y_pred_m = boost_m_clf.fit(self.x_train, self.y_train).predict(self.x_test)
                self.acc['mutual'].append(accuracy_score(self.y_test, self.y_pred_m))
                mask_mutual = mutual.get_support()

            #########################################################################
            # Recursive Feature Elimination with GradientBoostingClassifier
            #########################################################################

            if rfe:
                rfe = RFE(estimator=GradientBoostingClassifier(loss=loss_, n_estimators=estimators), step=nsteps, n_features_to_select=n)
                self.selector = rfe.fit(self.x_train, self.y_train)
                self.y_pred_r = self.selector.predict(self.x_test)

                self.acc['rfe'].append(rfe.score(self.x_test, self.y_test))
                self.r2 = r2_score(self.y_test, self.y_pred_r)
                mask_rfe    = self.selector.support_

            #########################################################################
            # Test for correlation between features
            #########################################################################

            if ftest:
                self.Ftest_features  = [feature for keep, feature in zip(mask_Ftest, self.feature_names) if keep]
            if mutual:
                self.mutual_features = [feature for keep, feature in zip(mask_mutual, self.feature_names) if keep]
            if rfe:
                self.rfe_features = [feature for keep, feature in zip(mask_rfe, self.feature_names) if keep]
                self.features_rank = self.selector.ranking_
Example #5
def test_rfe():
    generator = check_random_state(0)
    iris = load_iris()
    X = np.c_[iris.data, generator.normal(size=(len(iris.data), 6))]
    X_sparse = sparse.csr_matrix(X)
    y = iris.target

    # dense model
    clf = SVC(kernel="linear")
    rfe = RFE(estimator=clf, n_features_to_select=4, step=0.1)
    rfe.fit(X, y)
    X_r = rfe.transform(X)
    clf.fit(X_r, y)
    assert len(rfe.ranking_) == X.shape[1]

    # sparse model
    clf_sparse = SVC(kernel="linear")
    rfe_sparse = RFE(estimator=clf_sparse, n_features_to_select=4, step=0.1)
    rfe_sparse.fit(X_sparse, y)
    X_r_sparse = rfe_sparse.transform(X_sparse)

    assert X_r.shape == iris.data.shape
    assert_array_almost_equal(X_r[:10], iris.data[:10])

    assert_array_almost_equal(rfe.predict(X), clf.predict(iris.data))
    assert rfe.score(X, y) == clf.score(iris.data, iris.target)
    assert_array_almost_equal(X_r, X_r_sparse.toarray())
Example #6
def main(params, inputs, outputs):

    ### Read the input variables and the target variable ###
    x = pd.read_pickle(inputs.x)
    y = pd.read_pickle(inputs.y)

    ### Read the parameters ###
    step = params.step
    n_features = params.n_features

    ### Define the estimator used by RFE ###
    estimator = RandomForestClassifier(n_estimators=20,
                                       criterion='gini',
                                       class_weight='balanced',
                                       n_jobs=-1)
    ### Train with RFE ###
    rfe = RFE(estimator, step=step, n_features_to_select=n_features)
    rfe.fit(x, y)

    ### Training accuracy ###
    score = rfe.score(x, y)

    ### Build the new DataFrame ###
    df_rfe = pd.DataFrame(index=x.columns,
                          data=rfe.support_,
                          columns=['support'])
    rfe_columns = list(df_rfe[df_rfe.support == True].index)
    x_new = x[rfe_columns]
    y_new = y.copy()

    ### Outputs ###
    x_new.to_pickle(outputs.x_new)
    y_new.to_pickle(outputs.y_new)
Example #7
def Ftr_elm(X, y):
    ''' feature elimination using RFE'''
    adj_R2 = []
    feature_set = []
    max_adj_R2_so_far = 0
    n = len(X)
    k = len(X[0])
    selected_ranking=[]
    for i in range(1,k+1):
        selector = RFE(LogisticRegression(), n_features_to_select=i, verbose=1)
        selector = selector.fit(X, y)
        current_R2 = selector.score(X,y)
        current_adj_R2 = 1-(n-1)*(1-current_R2)/(n-i-1) 
        adj_R2.append(current_adj_R2)
        feature_set.append(selector.support_)
        if max_adj_R2_so_far < current_adj_R2:
            max_adj_R2_so_far = current_adj_R2
            selected_features = selector.support_
            #selected_ranking= selector.ranking_
            selected_ranking.append(selector.ranking_)
        print('End of iteration no. {}'.format(i))
        print('selector support is :', selector.support_)
        #print('selected ranking is ;', selector.ranking_)
        print('selected ranking is ;', selected_ranking)      
    X_sub = X[:,selected_features]    
    return (adj_R2, selector.support_, selector.ranking_, X_sub )
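The loop above ranks each feature count by adjusted R², which penalizes adding features that do not improve the fit. As a quick reference, the formula on the current_adj_R2 line can be written as a small standalone helper (hypothetical name, illustrative numbers):

def adjusted_r2(r2, n_samples, n_features):
    # Adjusted R^2 = 1 - (1 - R^2) * (n - 1) / (n - k - 1)
    return 1 - (n_samples - 1) * (1 - r2) / (n_samples - n_features - 1)

# R^2 = 0.90 with 100 samples and 10 features -> about 0.889
print(adjusted_r2(0.90, 100, 10))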
Example #8
    def get_score_and_features(self, n_features):
        features = list(self.df.columns.values)
        features.remove('Label')
        
        df_X = self.df[features]
        df_Y = self.df['Label']
        estimator = XGBClassifier(random_state=0)
        x_train, x_test, y_train, y_test = train_test_split(df_X, df_Y, test_size=0.33,
                                                        random_state = 0)
        
        selector = RFE(estimator, n_features_to_select=n_features, step=1)
        selector = selector.fit(x_train, y_train)
        
        features_bool = np.array(selector.support_)
        features = np.array(df_X.columns)
        list_feat = list(features[features_bool])
#        list_to_keep = []
#        for i in range(len(list(selector.support_))):
#        	if list(selector.support_)[i] :
#        		list_to_keep.append(i)
#        list_feat = list(df_X.columns[list_to_keep])
#        list_delete = list(set(list(df_X.columns.values))-set(df_X.columns[list_to_keep]))
#        x_test = x_test.drop(list_delete  ,axis=1)
        
#        clf_xgb = XGBClassifier(random_state=0)
#        clf_xgb = clf_xgb.fit(x_train , y_train)
  
        score = selector.score(x_test,y_test)
#        score = compute_accuracy_score(x_test, y_test)
        self.selected_features = list_feat
        self.list_score = score
        return self.selected_features, self.list_score
Example #9
def demo():
    digits = load_digits()
    X = digits.images.reshape((len(digits.images), -1))
    y = digits.target

    # Create the RFE object and rank each pixel
    svc = SVC(kernel="linear", C=1)
    rfe = RFE(estimator=svc, n_features_to_select=None, step=1)
    rfe.fit(X, y)
    ranking = rfe.ranking_.reshape(digits.images[0].shape)

    print(rfe.score(X, y))
    # Plot pixel ranking
    plt.matshow(ranking, cmap=plt.cm.Blues)
    plt.colorbar()
    plt.title("Ranking of pixels with RFE")
    plt.show()
Example #10
def sonar_wrapper():
    sonar_data = load_sonar_data()
    sonar_values, sonar_labels = data_preprocessing(sonar_data)
    estimator = SGDClassifier(max_iter=1000)
    selector = RFE(estimator, n_features_to_select=5)
    selector.fit(sonar_values, sonar_labels)
    score, f1score = selector.score(sonar_values, sonar_labels), f1_score(sonar_labels, selector.predict(sonar_values))
    print('Sonar-wrapper -accuracy of TOP 5 features = %.4f, F1 score = %.4f' % (score,f1score))
Example #11
def wbc_wrapper():
    wbc_data = load_wbc_data()
    wbc_values, wbc_labels = data_preprocessing(wbc_data)
    estimator = SGDClassifier(max_iter=1000)
    selector = RFE(estimator, n_features_to_select=5)
    selector.fit(wbc_values, wbc_labels)
    score, f1score = selector.score(wbc_values, wbc_labels), f1_score(wbc_labels, selector.predict(wbc_values))
    print('WBC-wrapper -accuracy of TOP 5 features = %.4f, F1 score = %.4f' % (score,f1score))
Example #12
def elimination_feature():
    df = _load_data()
    X_train, X_test, y_train, y_test = _train_test(df, 'Milk')
    linear = LinearRegression()
    rfe = RFE(linear, n_features_to_select=3)
    rfe.fit(X_train, y_train)
    y_predict = rfe.predict(X_test)
    score = rfe.score(X_test, y_test)
    err = mean_squared_error(y_test, y_predict)
    return score, err, y_predict
Example #13
def rfe_selection(X, y, model):

    # Performs feature selection by means of recursive feature elimination.

    # Parameters:
    #       X (DataFrame): DataFrame with all the predictor variables.
    #       y (Series): pandas Series with the endogenous (target) variable.
    #       model: Any sklearn model to use as the reference estimator for the selection.

    # Returns:
    #       Nothing.

    rfe = RFE(model, n_features_to_select=12, step=1, verbose=1)
    rfe = rfe.fit(X, y)
    rfe.score(X, y)
    X = X * rfe.support_
    X = X.loc[:, (X != 0).any(axis=0)]
    X = pd.concat([X, y], axis=1, sort=False)
    print(X.columns)
    show_correlation(X)
Example #14
def findRFE():
    labels = []
    acc = []
    filteredFeat = []

    for i in range(6):
        model = sklearn.linear_model.LogisticRegression()

        rfe = RFE(model, n_features_to_select=i + 1)
        rfe = rfe.fit(xtr, ytr)

        print("\n", "rfe", i + 1)
        print(rfe.support_)
        labels.append(rfe.support_)
        print(rfe.score(xte, yte))
        acc.append(rfe.score(xte, yte))

        # prob = rfe.predict_proba(xte)
        # loss1 = log_loss(yte, prob)
        #
        # print("Loss is ", loss1, "\n")

    labels = np.asarray(labels)
    acc = np.asarray(acc)
    bestacc = np.argmax(acc)

    bestLabel = labels[bestacc]

    if bestLabel[0]:
        filteredFeat.append('Person A')
    if bestLabel[1]:
        filteredFeat.append('Person B')
    if bestLabel[2]:
        filteredFeat.append('Years of Knowing')
    if bestLabel[3]:
        filteredFeat.append('Interaction Duration')
    if bestLabel[4]:
        filteredFeat.append('Interaction Type')
    if bestLabel[5]:
        filteredFeat.append('Moon Phase During Interaction')
    return filteredFeat
def feature_selection(features_values_temp, rows_temp, columns_temp, prediction_values_temp, kernel, threshold):

	#kernel: linear, poly, rbf, sigmoid, precomputed

	# for whatever reason, I cannot directly use the parameters that are passed in to run in the feature selection functions
	# because of this, the next several lines are essentially redefining the parameters and storing them in another variable name
	rows = 0
	while rows_temp > 0:
		rows = rows + 1
		rows_temp = rows_temp - 1

	columns = 0
	while columns_temp > 0:
		columns = columns + 1
		columns_temp = columns_temp - 1

	features_values = [x for x in features_values_temp]
	prediction_values = [y for y in prediction_values_temp]
	# end of defining parameters

	rotated = convert_list_to_matrix(features_values, rows, columns)
	scores = np.array(prediction_values)
	threshold = float(threshold)

	estimator = SVR(kernel=kernel)

	#Running binary search to help find the correct features that meet the specified threshold.
	lower_bound = 0
	upper_bound = columns

	while(upper_bound - lower_bound > 1):
		current_selector = (lower_bound + upper_bound) // 2
		selector = RFE(estimator, n_features_to_select=current_selector, step=1)
		selector = selector.fit(rotated, scores)
		if selector.score(rotated, scores) > threshold:
			upper_bound = current_selector
		else:
			lower_bound = current_selector

	 print "second threshold: "
	 print selector.score(rotated, scores)
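The function above binary-searches the number of features kept by RFE until the training score clears the threshold. A self-contained sketch of the same idea, using synthetic data, an arbitrary threshold, and integer midpoints so the feature count is always valid for RFE:

from sklearn.datasets import make_friedman1
from sklearn.feature_selection import RFE
from sklearn.svm import SVR

X, y = make_friedman1(n_samples=200, n_features=20, random_state=0)
threshold = 0.5                              # arbitrary target R^2 on the training data
lower, upper = 0, X.shape[1]
best = None
while upper - lower > 1:
    mid = (lower + upper) // 2               # integer feature count for RFE
    selector = RFE(SVR(kernel="linear"), n_features_to_select=mid, step=1)
    selector = selector.fit(X, y)
    if selector.score(X, y) > threshold:
        upper = mid                          # score is good enough, try fewer features
        best = selector
    else:
        lower = mid                          # score too low, keep more features
if best is not None:
    print(best.n_features_, best.score(X, y))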
Example #16
def linrfe():
    """
    To finish the computation quickly, step=xx needs to be set fairly large.

    ridge : 0.28+
    ridge + RFE: 0.28+
    The online score is only 0.045, so this offline test appears to be completely unreliable.
    """
    X, y = load_svmlight_file('train.txt')
    X = X.toarray()
    scaler = StandardScaler().fit(X)
    X = scaler.transform(X)
    reg = linear_model.Ridge(alpha=0.5)
    reg.fit(X, y)
    print('r^2=', reg.score(X, y))
    print('train mse = ', mean_squared_error(y, reg.predict(X)))

    rfe = RFE(estimator=reg, n_features_to_select=500, step=1000, verbose=2)
    rfe.fit(X, y)
    print('rfe r^2 = ', rfe.score(X, y))
    print('rfe mse =', mean_squared_error(y, rfe.predict(X)))

    X_rfe = rfe.transform(X)
    poly = PolynomialFeatures(degree=2, interaction_only=True)
    X_poly = poly.fit_transform(X_rfe)  # transforming X directly raises a MemoryError

    param_grid = {'alpha': [0.5, 1, 10, 100, 1000, 1e4, 3e4]}
    gbm = GridSearchCV(reg,
                       param_grid,
                       verbose=2,
                       scoring='neg_mean_squared_error',
                       cv=5)
    gbm.fit(X_poly, y)
    logging.info('after rfe poly, best_result = {0}'.format(gbm.best_score_))
    logging.info('after rfe poly, best_param= {0}'.format(gbm.best_params_))
    #mse =  reg.score(X_poly, y)
    #print 'after poly ' ,mean_squared_error(y, reg.predict(X_poly))
    #logging.info('rfe r^2 score= ' + str(mse) )

    params = {
        'objective': 'mse',
        'num_leaves': 8,
        'learning_rate': 0.05,
        'min_child_samples': 60,  # this parameter is fairly critical for this problem
        # 'subsample': 0.9,
        'n_estimators': 100,
        'silent': False,
    }
    gbm = lgb.LGBMRegressor(**params)
    gbm.fit(X_poly, y, eval_metric='mse', eval_set=[(X_poly, y)])

    logging.info('train lgb of poly = {0}'.format(
        mean_squared_error(y, gbm.predict(X_poly))))
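The docstring's advice to use a large step follows from how RFE iterates: it fits the estimator once per elimination round and removes step features each round, so reaching n_features_to_select takes roughly ceil((n_features - n_features_to_select) / step) fits. A back-of-the-envelope check with illustrative sizes (not the actual train.txt dimensions):

import math

n_features, n_select = 30000, 500
for step in (1, 100, 1000):
    fits = math.ceil((n_features - n_select) / step)
    print("step=%d -> roughly %d Ridge fits" % (step, fits))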
Example #17
def choice_feature_nums(data_x, data_y, col_name):
    n = len(col_name)
    dic = {}
    for i in range(3, n + 1):
        rfe = RFE(estimator=LinearRegression(), n_features_to_select=i)
        rfe.fit(data_x, data_y)
        dic[i] = rfe.score(data_x, data_y)
    plt.xlabel('feature_num')
    plt.ylabel('score')
    plt.plot(list(dic.keys()), list(dic.values()))
    plt.show()
    return dic
Example #18
def rfe_feature_select(count, features, X, y, model_type="RF", model=RandomForestClassifier(random_state=42),
                       v=0, scores=pd.DataFrame(columns=['model_type', 'model','params', 'n_features',
                                                         'score', 'features','ranking', 'weighted_score'])):
    
    start = time.time()
    helicopter = [count, round(count*0.8), round(count*0.6), round(count*0.4), round(count*0.2)]
    weightings = [0.98, 0.985, 0.99, 0.995, 1]
    
    if v>0:
        print(model)
        
    score = pd.DataFrame()
    
    for n in range(len(helicopter)):
        eliminator = RFE(estimator=model, n_features_to_select=helicopter[n], step=1, verbose=v)
        eliminator.fit(X, y)
        feats = pd.Series(eliminator.support_, index=features)
        weight  = 100*eliminator.score(X, y)
        params = model.get_params()
        
        score = score.append({'model_type' : model_type,
                                'model' : model,
                                'params': params,
                                'n_features' : eliminator.n_features_,
                                'score' : round(weight, 2),
                                'features' : list(feats[feats==True].index),
                                'ranking' : list(eliminator.ranking_),
                                'weighted_score' : round(weightings[n]*weight, 2)},
                               ignore_index=True)
    if v>0:
        print("%d features took %d seconds, or %d minutes" % (count, round(time.time()-start), round((time.time()-start)/60)))
        print("Tested feature combinations: %s" % helicopter)
        print(score)
    
    df = score.sort_values(by='weighted_score', ascending=False)
    
    print("Selected feature combination: %d" % df.iloc[0].n_features)
    
    if helicopter[0]-helicopter[1]==1: # We've reached the end
        return scores
    elif df.iloc[0].n_features==count: # The highest feature count is the best so try -1 features
        tmp_features = df.iloc[0].features
        tmp_count  = len(tmp_features)-1
        
        scores = rfe_feature_select(tmp_count, tmp_features[0:tmp_count], X_train[tmp_features[0:tmp_count]],
                                y_train, df.iloc[0].model_type, df.iloc[0].model, v, scores)
    else:
        scores = rfe_feature_select(df.iloc[0].n_features, df.iloc[0].features, X_train[df.iloc[0].features],
                                y_train, df.iloc[0].model_type, df.iloc[0].model, v, scores)
    return scores
Example #19
    def reg_with_rfe(self):
        print("regression after RFE", "\n")

        reg = LinearRegression()
        selector = RFE(reg)
        selector.fit(self.x_train, self.y_train)

        train_score = selector.score(self.x_train, self.y_train)
        test_score = selector.score(self.x_test, self.y_test)
        coeff_used = selector.n_features_

        print("training score:", train_score)
        print("test score: ", test_score)
        print("number of features used: ", coeff_used)

        # Print the coefficient of each variable that was used
        # for var in selector.estimator_.coef_:
        print("Selected variables:")
        for var, coef in zip(self.x_train.columns, selector.estimator_.coef_):
            if (coef == 0):
                continue
            print(str(var) + ":", coef)
        print("-------------------------------------------------------")
Example #20
def rfe_scores(clf, X_train, y_train, X_test, y_test, n_nodes=None):

    if n_nodes is None:
        n_nodes = [1, 4, 6, 10, 15, 22]

    clfs = []
    for n in n_nodes:
        rclf = RFE(clf, n_features_to_select=n)
        rclf.fit(X_train, y_train)

        print("RFE {} selected nodes {}".format(n, rclf.get_support(True)))
        print("RFE {} score is {}".format(n, rclf.score(X_test, y_test)))
        clfs.append(rclf)

    return clfs
Example #21
def get_best_attr_with_score(model, x_test, y_test, attr_col, k):
    #     X, y = make_friedman1(n_samples=50, n_features=10, random_state=0)
    #     estimator = SVR(kernel="linear")
    selector = RFE(model, n_features_to_select=k, step=1)
    #     print("________________1")
    selector = selector.fit(x_train, y_train)
    #     selector.support_
    #     print("________________2")

    selector.ranking_
    our_selected_attr = selector.get_support()
    list_attr_se = []
    score = selector.score(x_test, y_test)
    for i, j in zip(our_selected_attr, attr_col):
        if (i == True):
            list_attr_se.append(j)
    return score, list_attr_se, our_selected_attr
Example #22
def q4():

    X = df.copy().drop(columns=["Overall"])
    y = df["Overall"]

    X_train, X_test, y_train, y_test = train_test_split(X,
                                                        y,
                                                        train_size=0.8,
                                                        shuffle=True)

    reg = LinearRegression()
    reg.fit(X_train, y_train)

    print("Model r2 score Linear regression:", reg.score(X_test, y_test))
    y_pred = reg.predict(X_test)
    print('MSE', mse(y_test, y_pred))
    print('RMSE', mse(y_test, y_pred, squared=False))
    print(
        pd.DataFrame.from_dict(dict(zip(X_train.columns, reg.coef_)),
                               orient='index',
                               columns=['coef']).sort_values(
                                   by='coef', ascending=False).head(5))

    selector = RFE(estimator=reg, n_features_to_select=5, step=1, verbose=0)
    selector = selector.fit(X_train, y_train)
    selected_features5 = list(X_train.columns[selector.get_support()])
    print('\nMost important features RFE', selected_features5)
    print("\nModel r2 score RFE Linear regression selected features:",
          selector.score(X_test, y_test))
    y_pred = selector.predict(X_test)
    print('MSE', mse(y_test, y_pred))
    print('RMSE', mse(y_test, y_pred, squared=False))

    X_train5 = selector.transform(X_train)
    reg.fit(X_train5, y_train)
    coefficients = reg.coef_
    print(
        pd.DataFrame.from_dict(dict(zip(selected_features5, reg.coef_)),
                               orient='index',
                               columns=['coef']).sort_values(
                                   by='coef', ascending=False).head(5))

    # plt.scatter(y_test,y_pred)
    # plt.show()
    return selected_features5
Example #23
def selected_feature_from_random_forrest(X_train, X_test, y_train, y_test):
    # Select 10 features with RFE on a RandomForestRegressor, drop 3 features on each step
    rfe_rf = RFE(estimator=RandomForestRegressor(),
                 n_features_to_select=10,
                 step=3,
                 verbose=1)
    rfe_rf.fit(X_train, y_train)

    # Calculate the R squared on the test set
    r_squared = rfe_rf.score(X_test, y_test)
    print(
        'The model can explain {0:.1%} of the variance in the test set'.format(
            r_squared))

    # Assign the support array to gb_mask
    rf_mask = rfe_rf.support_

    return rf_mask
Example #24
def selected_feature_by_rfe(X_train, X_test, y_train, y_test):
    # Select 10 features with RFE on a GradientBoostingRegressor, drop 3 features on each step
    rfe_gb = RFE(estimator=GradientBoostingRegressor(),
                 n_features_to_select=10,
                 step=3,
                 verbose=1)
    rfe_gb.fit(X_train, y_train)

    # Calculate the R squared on the test set
    r_squared = rfe_gb.score(X_test, y_test)
    print(
        'The model can explain {0:.1%} of the variance in the test set'.format(
            r_squared))

    # Assign the support array to gb_mask
    gb_mask = rfe_gb.support_

    return gb_mask
Example #25
def RFE_SVM(x, y, numFeats):
    np.random.seed(7)

    x_train, x_test, y_train, y_test = train_test_split(x, y, test_size=0.33)

    clf = SVC(kernel='linear', probability=True, max_iter=200)
    selector = RFE(estimator=clf, n_features_to_select=numFeats, step=5)

    s_time = time.perf_counter()

    selector.fit(x_train, y_train.iloc[:, 0])

    acc = selector.score(x_test, y_test.iloc[:, 0])

    e_time = time.perf_counter()
    print('\n Total Time: ', e_time - s_time)
    print(' Accuracy:', acc)
    return selector, acc
Example #26
    def rfe(self, X_train, y_train, X_test, y_test, n_features = 300, step = 0.2, kernel = "linear"):
        """
        - Recursive Feature Elimination - step < 1 is a percentage. Returns selected features.
        - Hyperparameter tuning was done in a different notebook.
        """
        # Create estimator and selector.
        estimator = SVR(kernel=kernel, C=0.01, gamma=1e-07)
        selector = RFE(estimator, n_features_to_select = n_features, step=step)
        selector = selector.fit(X_train.to_numpy(), y_train.to_numpy())

        # Print the test score.
        print('Accuracy of RFE: {:.3f}'.format(selector.score(X_test, y_test)))

        # Create dictionary with results.
        selected_features = X_train.columns[selector.support_].tolist()
        feature_importances = [1 for x in range(len(selected_features))]
        dictionary = {"Recursive Feature Elimination":[selected_features, feature_importances]}

        return dictionary
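As the docstring notes, a step below 1 is interpreted as a percentage of the features to remove per elimination round rather than an absolute count. A minimal sketch on synthetic data (arbitrary sizes, an SVR-style estimator as above) showing the fractional step in use:

from sklearn.datasets import make_regression
from sklearn.feature_selection import RFE
from sklearn.svm import SVR

X, y = make_regression(n_samples=100, n_features=50, noise=0.1, random_state=0)
# step=0.2 removes about 20% of the features at each elimination round
selector = RFE(SVR(kernel="linear"), n_features_to_select=10, step=0.2)
selector = selector.fit(X, y)
print(selector.n_features_, selector.score(X, y))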
Example #27
def RFE_AdaBoost(x, y, numFeats):
    np.random.seed(7)

    x_train, x_test, y_train, y_test = train_test_split(x, y, test_size=0.33)

    clf = AdaBoostClassifier(base_estimator=None,
                             n_estimators=50,
                             learning_rate=1.0)
    selector = RFE(estimator=clf, n_features_to_select=numFeats, step=5)

    s_time = time.perf_counter()

    selector.fit(x_train, y_train.iloc[:, 0])

    acc = selector.score(x_test, y_test.iloc[:, 0])

    e_time = time.perf_counter()
    print('\n Total Time: ', e_time - s_time)
    print(' Accuracy:', acc)
    return selector, acc
Example #28
def svm(train_set, label_set, test_set, ground_truth):
    train_set = Normalizer().fit_transform(train_set)
    test_set = Normalizer().fit_transform(test_set)
    svm_clf = SVC(C=0.2, kernel='linear')
    #svm_clf = SVC()
    #s = cross_validate(svm_clf,train_set,label_set)
    #print(s)
    #grid = GridSearchCV(svm_clf,param_grid={"C":[0.2,0.5,1.0,1.2,1.5,3,10],"kernel":['linear','rbf']},cv=10)
    #grid.fit(train_set,label_set)
    rfe = RFE(estimator=svm_clf, n_features_to_select=2, step=1)
    # n=5,0.6497 n=8,0.66358  n=12,0.66728  n=15,0.66635
    """"[True  True False  True False  True  True False  True  True  True  True
     True  True False  True]
    [1 1 5 1 2 1 1 3 1 1 1 1 1 1 4 1]"""

    rfe.fit(train_set, label_set)

    #print(rfe.support_)
    #print(rfe.ranking_)
    #svm_clf.fit(train_set,label_set)
    #y_score = svm_clf.decision_function(test_set)
    #y = svm_clf.predict(test_set)
    y = rfe.predict(test_set)
    #fpr, tpr, threshold = roc_curve(test_set, y_score)
    #roc_auc = auc(fpr, tpr)
    #y = grid.predict(test_set)
    print(rfe.score(test_set, ground_truth))

    #print(svm_clf.score(test_set,ground_truth))
    #print(svm_clf.score(train_set,label_set))
    #print(grid.score(test_set,ground_truth))
    #print(grid.score(train_set,label_set))
    #print(grid.best_params_)

    p = precision_score(ground_truth, y)
    r = recall_score(ground_truth, y)
    f = f1_score(ground_truth, y)

    return p, r, f
Example #29
gnb = GaussianNB()

parameters = {'kernel':('linear', 'rbf'), 'C':[0.001, 1, 100], 'gamma': [0.001, 0.0001]}
svr = svm.SVC()
clf = GridSearchCV(estimator=svr, param_grid=parameters, cv=2)

parameters = {'n_neighbors':[4, 12, 20], 'algorithm':('auto', 'ball_tree', 'kd_tree', 'brute')}
kvr = KNeighborsClassifier()
knn = GridSearchCV(estimator=kvr, param_grid=parameters)


svmselector = svm.SVC(kernel='linear')
selector = RFE(svmselector, n_features_to_select=10, step=1)
selector = selector.fit(xTrain, yTrain)
print "\tTraining accuracy: " + str(selector.score(xTrain, yTrain) * 100)
print "\tTesting accuracy: " + str(selector.score(xTest, yTest) * 100)
 

chosen = selector.support_
print(chosen)

for idx, val in enumerate(chosen.tolist()):
    if val:
        print(idx)

print(selector.ranking_)
# Training
gnb.fit(xTrain, yTrain)
clf.fit(xTrain, yTrain)
knn.fit(xTrain, yTrain)
X = sc_X.fit_transform(X)
y = sc_y.fit_transform(y.reshape(len(y),1)).reshape(len(y))


#Feature Elimination
from sklearn.linear_model import LinearRegression 
from sklearn.feature_selection import RFE
adj_R2 = []
feature_set = []
max_adj_R2_so_far = 0
n = len(X)
k = len(X[0])
for i in range(1,k+1):
    selector = RFE(LinearRegression(), n_features_to_select=i, verbose=1)
    selector = selector.fit(X, y)
    current_R2 = selector.score(X,y)
    current_adj_R2 = 1-(n-1)*(1-current_R2)/(n-i-1) 
    adj_R2.append(current_adj_R2)
    feature_set.append(selector.support_)
    if max_adj_R2_so_far < current_adj_R2:
        max_adj_R2_so_far = current_adj_R2
        selected_features = selector.support_
    print('End of iteration no. {}'.format(i))
        
X_sub = X[:,selected_features]


#split into train and test set
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(X_sub,y,random_state=0)
Example #31
    data_values = np.loadtxt("train.csv", delimiter=',', usecols=range(15,16))
    print "Number of training samples: " + str(data_features.shape[0])

    # Transform the training data
    #scaler = MinMaxScaler(feature_range=(1, 2)).fit(data_features)
    #data_features = scaler.transform(data_features)
    data_features = transform_data(data_features)

    # Fit the regression model
    alphas = np.logspace(-5,3,30)
    ridge_regressor = linear_model.RidgeCV(alphas=alphas, normalize=True, fit_intercept=True)
    rfe = RFE(estimator=ridge_regressor, n_features_to_select=14, step=1)
    rfe.fit(data_features[0:600], data_values[0:600])
    print "alpha: " + str(rfe.estimator_.alpha_)
    print "intercept: " + str(rfe.estimator_.intercept_)
    print "R-square: " + str(rfe.score(data_features[600:], data_values[600:]))

    # Compute MSE
    train_pred = rfe.predict(data_features)
    mse = ((train_pred[600:] - data_values[600:])**2).mean(axis=0)
    print "MSE: " + str(mse)

    # Visualize predicted values on train data
    plt.plot(train_pred[600:], label='Predicted')
    plt.plot(data_values[600:], label='Actual')
    plt.legend(loc='upper right')
    plt.show()

    # Load the test data
    validate_test = np.loadtxt("validate_and_test.csv", delimiter=',', usecols = range(1,15))
    validate_test_ind = np.loadtxt("validate_and_test.csv", delimiter=',', usecols = range(0,1))
X = [residual[:-1] for residual in data ]    
n_split = 1800    
X_train, X_test = X[:n_split], X[n_split:]
Y_train, Y_test = y[:n_split], y[n_split:]



numFeatures = 40  
    
model = ExtraTreesClassifier()

#model.fit(X_train, Y_train)
rfe = RFE(model, n_features_to_select=numFeatures)
rfe = rfe.fit(X_train,Y_train)

temp = rfe.score(X_test, Y_test)
predictionOfPrelim = rfe.predict(prelimData)

featureRanking = rfe.ranking_
#Best ExtraTrees Accuracy is:  [400, 0.98902777777777773, 40]            
print("ExtraTrees Accuracy is: ", temp)

prelimClasses = np.loadtxt("prelim-class.txt")
assert len(prelimClasses) == len(predictionOfPrelim)
h = []
for i in range(len(prelimClasses)):
    if prelimClasses[i] == predictionOfPrelim[i]:
        h.append(1)
    else:
        h.append(0)
Example #33
selector1 = RFECV(rf,cv=5);
selector1 = selector1.fit(X,y);
print("Features selected by RFECV ", selector1.n_features_);

# RFECV yields a model with low number (4~5) of features with K-fold cross validation with 5 folds.
# Since our sample size is small, additional features are selected to avoid  over-fitting on the model data.
# 8 Features were selected using RFE.

selector2 = RFE(rf, n_features_to_select=8); # Recursive feature elimination to select the 8 best features.
selector2 = selector2.fit(X,y);
print (selector2.n_features_)
for i,j in enumerate(selector2.support_):
    if j == True:
        print(features[i])
predictor = selector2.estimator_;
print('Variance score Train: %.2f' % selector2.score(X,y));
print('Variance score Test: %.2f' % selector2.score(Xtest,ytest));
print('Coeff of Test: ', selector2.ranking_);
print('No of Features selected by RFE = %.2f' %sum(selector2.support_));

plotfit(selector2,X,y, title = 'Training fit');
plotfit(selector2,Xtest,ytest,c='blue', title = 'Test fit');


# Forecast, 
# Since lagged variables are selected for our model, forecasting is done iteratively 
# by using the predicted values at time t as lagged variables for time t+1,t+2...
# In practice however, this is not required as the true price will be known before prediction.
yfcast = [];
for i in  list(range(len(Xfcast))):
    Xfcast_trim = selector2.transform(Xfcast);
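The comments above describe forecasting iteratively, feeding each prediction back in as a lagged input for the next step. The loop in the example is cut off here, so the snippet below is only a generic sketch of that idea on a synthetic series, using a plain LinearRegression on two lags (not the selector2/Xfcast objects used above):

import numpy as np
from sklearn.linear_model import LinearRegression

series = np.sin(np.arange(120) / 5.0)
X = np.column_stack([series[1:-1], series[:-2]])   # lag-1 and lag-2 features
y = series[2:]
model = LinearRegression().fit(X, y)

# Iterative forecast: each prediction becomes the lag-1 input of the next step
history = list(series[-2:])
forecast = []
for _ in range(10):
    x_next = np.array([[history[-1], history[-2]]])
    y_next = model.predict(x_next)[0]
    forecast.append(y_next)
    history.append(y_next)
print(forecast)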
from sklearn.metrics import mean_squared_error
features = X.columns.values
results = []
selector = RFE(estimator, n_features_to_select=3, step=1)   # recursive feature elimination
selector = selector.fit(X,y)
selector.support_
# Out[61]: array([ True,  True,  True, False, False, False])
# It seems the first predictors are okay but not the last three.
# One can check whether dropping the last three improves on the model with all 6.
selector.ranking_
# Out[65]: array([2, 1, 3, 6, 5, 4])

for i in range(1,len(X.iloc[0])+1):
    selector = RFE(estimator, n_features_to_select=i, step=1)
    selector.fit(X,y)
    r2 = selector.score(X,y)
    selected_features = features[selector.support_]
    msr = mean_squared_error(y, selector.predict(X))
    results.append([i, r2, msr, ','.join(selected_features)])
    
results 

'''
results 
Out[68]: 
[[1, 0.47017552557905884, 2.5448877365932985, 'Email'],
 [2, 0.8987844810699489, 0.48616503259788457, 'Internet,Email'],
 [3, 0.9008606156394956, 0.47619280658599406, 'Internet,Email,Blog'],
 [4,
  0.9051564044419049,
  0.45555899148299284,
Example #35
# very similar idea to forward selection but done recursively. This method is greedy,
# which means it tries one feature at a time
NUM_FEATURES = 16
# this is somewhat arbitrary, but the choice should come from observing the scatter plots and correlations.
model = LinearRegression()
rfe = RFE(model, n_features_to_select=NUM_FEATURES)
champsFit = rfe.fit(champsX, champsY)
print("For Champions:")
print("Num Features:", champsFit.n_features_)
print("Selected Features:", champsFit.support_)
print("Feature Ranking:", champsFit.ranking_)

runnersFit = rfe.fit(runnersX, runnersY)
print("For Runner Ups:")
print("Num Features:", runnersFit.n_features_)
print("Selected Features:", runnersFit.support_)
print("Feature Ranking:", runnersFit.ranking_)

# calculate the score for the selected features
champsScore = rfe.score(champsX, champsY)
runnersScore = rfe.score(runnersX, runnersY)
print("Model Champs Score with selected features is: ", champsScore)
print("Model Runer up Score with selected features is: ", runnersScore)
"""
Results:

Run in terminal to see results. Not very good because these 2 datasets
are more of a classification model. Linear Regression is not very good here.

"""
        # If the actual line for the home team is lower than the predicted line then you would take the away team, otherwise take the home team
        bet_vector = np.array(np.where(predicted_spreads > spreads,0,1))

        # Create the actual result vector where a tie counts as a loss for the home team
        game_result = np.array(np.where(home_score.iloc[:,0] + predicted_spreads[:] > away_score.iloc[:,0], 1, 0))

        # Check to see where the bet_vector equals the actual game result with the spread included
        result = np.array(np.where(bet_vector == game_result,1,0))

        prob_result = float(np.sum(result)) / len(result)

        # print 'Number of features =', feat, 'C =',c,'  Percent correct =',prob_result

        if prob_result > prob_val:
            prob_val = prob_result
            C_val = c
            feat_val = feat

print('Score =', selector.score(X_test, y_test))
# print prob_val, C_val, feat

clf = linear_model.LogisticRegression(C=C_val,random_state=42)
clf = clf.fit(X_train,y_train)
probabilities = clf.predict_proba(scaler.transform(matchups))
vfunc = np.vectorize(spread_conversion)
predicted_spreads = np.apply_along_axis(vfunc,0,probabilities[:,0])
bet_vector = np.array(np.where(predicted_spreads > spreads,0,1))
print(spreads)
print(predicted_spreads)
print(bet_vector)