def main():
    data = pd.read_csv("./data/processed_data.csv", index_col=0)
    target = pd.read_csv("./data/processed_target_data.csv", index_col=0)

    data_clean = data.drop(columns=['doc_key'])
    data_clean.fillna(0, inplace=True)

    target_clean = target.drop(columns=['doc_key'])
    target_clean.fillna(0, inplace=True)

    # Data Extraction
    X = data_clean.drop(columns=['estimate'])
    y = data_clean['estimate']

    bin_labels_2 = ['S', 'L']
    bin_labels_3 = ['S', 'M', 'L']
    bin_labels_4 = ['S', 'M', 'L', 'XL']
    bin_labels_5 = ['XS', 'S', 'M', 'L', 'XL']

    bin_labels = bin_labels_4
    y_bins = pd.qcut(y, q=len(bin_labels), labels=bin_labels)
    raw_bins = pd.qcut(y, q=len(bin_labels))

    X = drop_highly_correlated_features(X)

    from sklearn.model_selection import LeaveOneOut

    X_train, X_test, y_train, y_test = train_test_split(X,
                                                        y_bins,
                                                        test_size=0.05)

    model_cv = LeaveOneOut()
    model = MultinomialNB()
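    # Note: RFECV ranks features via the estimator's coef_ / feature_importances_.
    # Recent scikit-learn releases no longer expose coef_ on MultinomialNB, so on
    # those versions this setup may need an explicit importance_getter in RFECV.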
    selector = RFECV(estimator=model,
                     min_features_to_select=10,
                     step=1,
                     cv=model_cv,
                     scoring='accuracy')
    selector = selector.fit(X_train, y_train)
    y_pred = selector.predict(X_test)

    print("=== Results with Multinomial Naive Bayes ===")
    print('Bins: ', bin_labels)
    print('Bin Ranges: ', raw_bins.dtype.categories.to_tuples().to_numpy())
    print('Counts (training set):')
    print(y_train.value_counts())

    print('\nTotal number of features (aka Words) in data: %d' %
          (data_clean.shape[1] - 1))
    print('Total number of uncorrelated features (aka Words) in data: %d' %
          X.shape[1])
    print("Number of features (aka Words) in optimal model: %d" %
          selector.n_features_)

    print("\nTest doc: %s" % data.loc[list(X_test.index)[0]]['doc_key'])
    print("Expected vs Predicted Size: {} vs {}".format(
        y_test.iloc[0], y_pred[0]))
    print("Class Probabilities: ", selector.classes_,
          selector.predict_proba(X_test))

    plot_rfecv_selection(selector)
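
The helpers drop_highly_correlated_features and plot_rfecv_selection are defined elsewhere in this project and are not shown. A minimal sketch of what the correlation filter might look like, assuming a plain pairwise-correlation threshold (the 0.95 cutoff and the implementation are assumptions, not the project's actual code):

import numpy as np
import pandas as pd

def drop_highly_correlated_features(X, threshold=0.95):
    """Drop one column of every pair whose absolute correlation exceeds `threshold`."""
    corr = X.corr().abs()
    # Keep only the strict upper triangle so each pair is inspected once.
    upper = corr.where(np.triu(np.ones(corr.shape, dtype=bool), k=1))
    to_drop = [col for col in upper.columns if (upper[col] > threshold).any()]
    return X.drop(columns=to_drop)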
Example No. 2
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import scikitplot as skplt
from sklearn.ensemble import RandomForestClassifier
from sklearn.feature_selection import RFECV
from sklearn.model_selection import StratifiedKFold, train_test_split
from xgboost import XGBClassifier

np.seterr(divide='ignore', invalid='ignore')
df = pd.read_excel(r'Data/Null check.xlsx')

#clf_feature_selection = XGBClassifier(colsample_bytree= 0.1, gamma= 0.1, learning_rate= 0.01,   max_depth= 20, min_child_weight= 1, n_estimators= 20)
#clf_feature_selection = LogisticRegression()
#clf_feature_selection = XGBClassifier()

clf_feature_selection = RandomForestClassifier(bootstrap=False,
                                               criterion='entropy',
                                               max_depth=20,
                                               max_features='auto',  # 'auto' (== 'sqrt') was removed in newer scikit-learn; use 'sqrt' there
                                               min_samples_leaf=5,
                                               min_samples_split=5,
                                               n_estimators=300)
rfecv = RFECV(estimator=clf_feature_selection,
              step=1,
              cv=StratifiedKFold(2),
              scoring='roc_auc')

X = df.drop(columns = 'label')
y = df['label']
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.5, random_state=42)


rfecv.fit(X_train,y_train)
#rfecv.fit(X,y)
print("Best Features:", X_train.columns[rfecv.support_])
print("Optimal number of features : %d" % rfecv.n_features_)

predicted_probas = rfecv.predict_proba(X_test)
y_true = y_test
y_probas = predicted_probas
skplt.metrics.plot_roc_curve(y_true, y_probas)
plt.show()
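
scikit-plot's plot_roc_curve is flagged as deprecated in recent releases; if the library is unavailable, a similar plot for this binary problem can be drawn with scikit-learn itself. A sketch, assuming scikit-learn 1.0+ and that column 1 of predict_proba holds the positive class:

from sklearn.metrics import RocCurveDisplay

# ROC curve from the positive-class probabilities predicted above.
RocCurveDisplay.from_predictions(y_test, predicted_probas[:, 1])
plt.show()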
Example No. 3
K_range = [
    1, 10, 100, 500, 800, 1000, 2000, 3000, 4000, 5000, 6000, 7000, 10000
]
K_scores = []
log_Loss = []
for K in K_range:
    #    log_reg = LogisticRegression(solver='lbfgs', multi_class='multinomial'
    clf = linear_model.LogisticRegression(solver='lbfgs',
                                          multi_class='multinomial',
                                          C=K)
    rfecv = RFECV(clf, step=10, scoring='accuracy')
    rfecv.fit(X_trainf, y_train)
    train_predictions = rfecv.predict(X_valf)
    acc = accuracy_score(y_test, train_predictions)  # y_test is assumed to hold the labels for X_valf
    K_scores.append(acc)

    train_predictions = rfecv.predict_proba(X_valf)
    ll = log_loss(y_test, train_predictions)
    log_Loss.append(ll)

plt.figure(1)
plt.subplot(211)
plt.plot(K_range, K_scores)
plt.ylim([0.97, 1])
#plt.xlabel('Value of C')
plt.ylabel('Accuracy')

plt.subplot(212)
plt.plot(K_range, log_Loss)
plt.xlabel('Value of C')
plt.ylabel('log_loss')
plt.show()
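
To actually pick a value of C from this sweep, a short follow-up could read off the best points of the two curves; a sketch using the lists filled in the loop above (numpy import assumed):

import numpy as np

best_C_by_acc = K_range[int(np.argmax(K_scores))]  # C giving the highest accuracy
best_C_by_ll = K_range[int(np.argmin(log_Loss))]   # C giving the lowest log loss
print('Best C by accuracy: %s, by log loss: %s' % (best_C_by_acc, best_C_by_ll))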
Example No. 4
plt.figure(figsize=(12, 9))
plt.xlabel('Number of features tested x 2')
plt.ylabel('Cross-validation score (AUC)')
plt.plot(range(1, len(rfecv.grid_scores_) + 1), rfecv.grid_scores_)
plt.savefig('Porto-RFECV-01.png', dpi=150)
plt.show()
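
Note that grid_scores_ (used in the plot above and in the scoring line further down) was deprecated in scikit-learn 1.0 and removed in 1.2. On current versions the same curve can be drawn from cv_results_; a sketch, assuming scikit-learn 1.0 or newer:

mean_scores = rfecv.cv_results_['mean_test_score']
plt.figure(figsize=(12, 9))
plt.xlabel('Number of features tested x 2')
plt.ylabel('Cross-validation score (AUC)')
plt.plot(range(1, len(mean_scores) + 1), mean_scores)
plt.show()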

# Save sorted feature rankings.

# In[ ]:

ranking = pd.DataFrame({'Features': all_features})
ranking['Rank'] = np.asarray(rfecv.ranking_)
ranking.sort_values('Rank', inplace=True)
ranking.to_csv('Porto-RFECV-ranking-01.csv', index=False)

# Make a prediction. This is only a proof-of-principle as the prediction will likely be poor until more optimal parameters are used above.

# In[ ]:

score = round((np.max(rfecv.grid_scores_) * 2 - 1), 5)
test['target'] = rfecv.predict_proba(X_test)[:, 1]
test = test[['id', 'target']]
now = datetime.now()
sub_file = 'submission_5fold-RFECV-RandomForest-01_' + str(score) + '_' + str(
    now.strftime("%Y-%m-%d-%H-%M")) + '.csv'
print("\n Writing submission file: %s" % sub_file)
test.to_csv(sub_file, index=False)
timer(starttime)
Example No. 5
# Set the model parameters
xgb = XGBClassifier(n_estimators=300,
                    max_depth=5,
                    nthread=20,
                    scale_pos_weight=4,
                    learning_rate=0.07)
# Feature selection
rfecv = RFECV(estimator=xgb,
              step=10,
              cv=StratifiedKFold(3),
              n_jobs=20,
              scoring='roc_auc')
rfecv.fit(train_x, train_y)

pre_y = rfecv.predict_proba(test_x)[:, 1]
pre_y_categ = rfecv.predict(test_x)
# Compute the AUC
fpr, tpr, thresholds = metrics.roc_curve(test_y, pre_y)
auc = metrics.auc(fpr, tpr)
f1 = metrics.f1_score(test_y, pre_y_categ)
print("AUC score:")
print(auc)
print('f1-score:')
print(f1)
print("Feature ranking (ranking_):")
print(rfecv.ranking_)
print('n_features_')
print(rfecv.n_features_)
print('support_')
print(rfecv.support_)
Example No. 6
                                            learning_rate=0.02,
                                            nthread=4,
                                            subsample=0.95,
                                            colsample_bytree=0.85,
                                            seed=4242)
                    # bst = xgb.train(param,
                    #                 dtrain,
                    #                 num_round,
                    #                 evallist,
                    #                 early_stopping_rounds=10)   # If error doesn't decrease in n rounds, stop early
                    selector = RFECV(clf, step=1, cv=5)
                    selector = selector.fit(X_train, y_train)
                    print('Selector fit...')
                    # clf.dump_model('/home/rmendoza/Desktop/xgb_june_04_to_05_v2.txt')
                    # bst.save_model('/home/rmendoza/Desktop/xgbtemp.model')
                    y_pred = selector.predict_proba(test_data)
                    cut = 0.1

                    results = [0, 0, 0, 0, 0, 0, 0]
                    for cutoff in range(10, 15):
                        cut = cutoff/float(100)   # Cutoff in decimal form
                        y = y_pred[:, 1] > cut   # positive-class probability above the cutoff
                        recall = metrics.recall_score(test_label, y)
                        # true_negative_rate = sum(np.logical_not(np.logical_or(test_label, y)))/float(len(y_pred))
                        filter_rate = sum(np.logical_not(y))/float(len(y_pred))
                        if recall*6.7+filter_rate > results[0]:
                            timer = time.time() - start_time
                            results = evalModel(test_label, y_pred, start_time, cut)
                    print(results)

Example No. 7
        test[t, nfeatures + 2] = np.std(feature_array)
        test[t, nfeatures + 3] = np.max(feature_array)
        test[t, nfeatures + 4] = np.min(feature_array)
        sort_list = sorted(range(len(feature_array)), key=lambda x: feature_array[x], reverse=True)
        test[t, nfeatures + 5 : nfeatures + 5 + ntwobyte] = sorted(
            range(len(feature_array)), key=lambda x: sort_list[x]
        )

        #        test_fourbyte[t] = map(float, row[nfeatures+1:nfeatures+nfourbyte+1]) if six.PY2 else list(map(float, row[nfeatures+1:nfeatures+nfourbyte+1]))
        Ids.append(row[0])
        if (t + 1) % 1000 == 0:
            print(t + 1, "records loaded")
print("test set loaded")

# Predict for whole test set
# test[:,nfeatures+1:] = pca.transform(test_fourbyte)
# del test_fourbyte
y_pred = rfecv.predict_proba(test)
# y_pred = gscv2.predict_proba(test)
# y_pred = clf2.predict_proba(test)
# Writing results to file
with gzip.open(fsubmission, write_mode) as f:
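    # write_mode is defined elsewhere in this script; under Python 3 it needs to be a
    # text mode such as 'wt', because csv.writer expects to write strings, not bytes.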
    fw = writer(f)
    # Header preparation
    header = ["Id"] + ["Prediction" + str(i) for i in range(1, 10)]
    fw.writerow(header)
    for t, (Id, pred) in enumerate(zp(Ids, y_pred.tolist())):
        fw.writerow([Id] + pred)
        if (t + 1) % 1000 == 0:
            print(t + 1, "prediction written")
Example No. 8
data_train = pd.read_csv('data_analysis/data_train.csv', encoding='gb2312')

targets = data_train['TARGET']
train_data = data_train.drop(labels=['EID', 'TARGET'], axis=1)

# Split into training and test sets
train_x, test_x, train_y, test_y = train_test_split(train_data, targets, test_size=0.5, random_state=66)

# Set the model parameters
xgb = XGBClassifier(n_estimators=300, max_depth=5, nthread=20, scale_pos_weight=4, learning_rate=0.07)
# Feature selection
rfecv = RFECV(estimator=xgb, step=10, cv=StratifiedKFold(3), n_jobs=20,
              scoring='roc_auc')
rfecv.fit(train_x, train_y)

pre_y = rfecv.predict_proba(test_x)[:, 1]
pre_y_categ = rfecv.predict(test_x)
# Compute the AUC
fpr, tpr, thresholds = metrics.roc_curve(test_y, pre_y)
auc = metrics.auc(fpr, tpr)
f1 = metrics.f1_score(test_y, pre_y_categ)
print("AUC score:")
print(auc)
print('f1-score:')
print(f1)
print("Feature ranking (ranking_):")
print(rfecv.ranking_)
print('n_features_')
print(rfecv.n_features_)
print('support_')
print(rfecv.support_)
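
support_ is printed above as a raw boolean mask; to see which columns were actually kept, the mask can be mapped back onto the feature frame's column names, as in Example No. 2:

print('Selected features:')
print(list(train_x.columns[rfecv.support_]))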
Example No. 9
        test[t, nfeatures + 3] = np.max(feature_array)
        test[t, nfeatures + 4] = np.min(feature_array)
        sort_list = sorted(range(len(feature_array)),
                           key=lambda x: feature_array[x],
                           reverse=True)
        test[t, nfeatures + 5:nfeatures + 5 + ntwobyte] = sorted(
            range(len(feature_array)), key=lambda x: sort_list[x])

        #        test_fourbyte[t] = map(float, row[nfeatures+1:nfeatures+nfourbyte+1]) if six.PY2 else list(map(float, row[nfeatures+1:nfeatures+nfourbyte+1]))
        Ids.append(row[0])
        if (t + 1) % 1000 == 0:
            print(t + 1, 'records loaded')
print('test set loaded')

# Predict for whole test set
#test[:,nfeatures+1:] = pca.transform(test_fourbyte)
#del test_fourbyte
y_pred = rfecv.predict_proba(test)
#y_pred = gscv2.predict_proba(test)
#y_pred = clf2.predict_proba(test)
# Writing results to file
with gzip.open(fsubmission, write_mode) as f:
    fw = writer(f)
    # Header preparation
    header = ['Id'] + ['Prediction' + str(i) for i in range(1, 10)]
    fw.writerow(header)
    for t, (Id, pred) in enumerate(zp(Ids, y_pred.tolist())):
        fw.writerow([Id] + pred)
        if (t + 1) % 1000 == 0:
            print(t + 1, 'prediction written')