Example #1
def kfold_cv(X_train, y_train,idx,k):

    kf = StratifiedKFold(n_splits=k)
    xx=[]
    count=0
    for train_index, test_index in kf.split(X_train, y_train):
        count+=1
        X_train_cv, X_test_cv = X_train[train_index,:],X_train[test_index,:]
        gc.collect()
        y_train_cv, y_test_cv = y_train[train_index],y_train[test_index]
        y_pred=np.zeros(X_test_cv.shape[0])
        m = 0  # number of bagged xgb rounds; 0 disables the averaging loop below

        for j in range(m):
            clf=xgb_classifier(eta=0.05,min_child_weight=20,col=0.5,subsample=0.7,depth=7,num_round=400,seed=j*77,gamma=0.1)
            y_pred+=clf.train_predict(X_train_cv,(y_train_cv),X_test_cv,y_test=(y_test_cv))
            yqq=y_pred*(1.0/(j+1))

            print(j, llfun(y_test_cv, yqq))

        #y_pred/=m;
        clf=XGBClassifier(max_depth=10,colsample_bytree=0.8,learning_rate=0.02,n_estimators=500,nthread=-1)
        #clf=RandomForestClassifier(n_jobs=-1,n_estimators=100,max_depth=100)
        clf.fit(X_train_cv,(y_train_cv),eval_metric="logloss",eval_set=[(X_test_cv, y_test_cv)])
        y_pred=clf.predict_proba(X_test_cv).T[1]
        print(y_pred.shape)
        xx.append(llfun(y_test_cv,(y_pred)))
        ypred=y_pred
        yreal=y_test_cv
        idx=idx[test_index]
        print(xx[-1])  # ,y_pred.shape
        break

    print(xx, 'average:', np.mean(xx), 'std', np.std(xx))
    return ypred, yreal, idx  # np.mean(xx)
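llfun is not defined in the snippet; it is presumably a log-loss helper. A minimal sketch of one plausible definition:

import numpy as np

def llfun(act, pred, eps=1e-15):
    # binary log loss, with probabilities clipped away from exact 0/1
    pred = np.clip(np.asarray(pred, dtype=float), eps, 1 - eps)
    act = np.asarray(act, dtype=float)
    return -np.mean(act * np.log(pred) + (1 - act) * np.log(1 - pred))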
Example #2
ohe_feats = ['gender', 'signup_method', 'signup_flow', 'language', 'affiliate_channel', 'affiliate_provider', 'first_affiliate_tracked', 'signup_app', 'first_device_type', 'first_browser']
for f in ohe_feats:
    df_all_dummy = pd.get_dummies(df_all[f], prefix=f)
    df_all = df_all.drop([f], axis=1)
    df_all = pd.concat((df_all, df_all_dummy), axis=1)

# split df into test and training data
vals = df_all.values
X = vals[:piv_train]
le = LabelEncoder()
y = le.fit_transform(labels)   
X_test = vals[piv_train:]

# use xgboost XGBClassifier 
xgb = XGBClassifier(max_depth=8, learning_rate=0.075, n_estimators=250,
                    objective='multi:softprob', subsample=0.75, colsample_bytree=0.85, seed=13)                  
xgb.fit(X, y)
y_pred = xgb.predict_proba(X_test)  

# select the 5 highest probability classes
ids = []  # list ids
cts = []  # list countries
for i in range(len(id_test)):
    idx = id_test[i]
    ids += [idx] * 5
    cts += le.inverse_transform(np.argsort(y_pred[i])[::-1])[:5].tolist()

# generate output 'pysub.csv'
sub = pd.DataFrame(np.column_stack((ids, cts)), columns=['id', 'country'])
sub.to_csv('/Users/ianmurray/Documents/kaggle/airbnb/output/pysub.csv',index=False)
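The per-row loop above can also be vectorized; a sketch under the same assumptions (y_pred, le, and id_test as defined in the example):

top5 = np.argsort(-y_pred, axis=1)[:, :5]   # indices of the 5 most probable classes per row
ids = np.repeat(id_test, 5)                 # each test id repeated once per predicted class
cts = le.inverse_transform(top5.ravel())    # decode class indices back to country labels
sub = pd.DataFrame({'id': ids, 'country': cts})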
Example #3
def xgboostcv(max_depth,
              learning_rate,
              n_estimators,
              subsample,
              colsample_bytree,
              gamma,
              min_child_weight,
              silent=True,
              nthread=-1,
              seed=1234):

    clf = XGBClassifier(max_depth=int(max_depth),
                        learning_rate=learning_rate,
                        n_estimators=int(n_estimators),
                        silent=silent,
                        nthread=nthread,
                        subsample=subsample,
                        colsample_bytree=colsample_bytree,
                        gamma=gamma,
                        min_child_weight=min_child_weight,
                        seed=seed,
                        objective="binary:logistic")

    clf.fit(x0, y0, eval_metric="logloss", eval_set=[(x1, y1)], early_stopping_rounds=25)
    # negated so that a maximizing optimizer tunes toward lower log loss
    ll = -log_loss(y1, clf.predict_proba(x1))
    return ll
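Because the log loss is negated, xgboostcv works directly as the objective of a maximizer. A sketch using the bayes_opt package (the pbounds ranges are illustrative assumptions, and x0/y0/x1/y1 must exist as above):

from bayes_opt import BayesianOptimization

optimizer = BayesianOptimization(
    f=xgboostcv,
    pbounds={'max_depth': (3, 10),
             'learning_rate': (0.01, 0.3),
             'n_estimators': (100, 1000),
             'subsample': (0.5, 1.0),
             'colsample_bytree': (0.5, 1.0),
             'gamma': (0.0, 1.0),
             'min_child_weight': (1, 20)},
    random_state=1234)
optimizer.maximize(init_points=5, n_iter=25)
print(optimizer.max)  # best negated log loss and the parameters that achieved it

The int() casts inside xgboostcv exist precisely because the optimizer proposes floats for every dimension.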
def main():
    # Set seed for reproducibility
    np.random.seed(0)

    print("Loading data...")
    # Load the data from the CSV files
    
    training_data = pd.read_csv('/home/vipin/Videos/train.csv', header=0)
    prediction_data = pd.read_csv('/home/vipin/Videos/test.csv', header=0)
     
     
    # countrycode is a single character, so ord() gives a numeric code;
    # myfunc is an encoding helper defined elsewhere in the source.
    training_data['countrycode'] = training_data['countrycode'].apply(lambda x: ord(x))
    training_data['browserid'] = training_data['browserid'].apply(lambda x: myfunc(x) if np.all(pd.notnull(x)) else myfunc("unknown"))
    training_data['devid'] = training_data['devid'].apply(lambda x: myfunc(x) if np.all(pd.notnull(x)) else myfunc("none"))
    
    
    #pd.to_csv('/home/vipin/Videos/train11.csv', sep=',', encoding='utf-8')
    #exit(0)
    prediction_data['countrycode'] = prediction_data['countrycode'].apply(lambda x: ord(x))
    prediction_data['browserid'] = prediction_data['browserid'].apply(lambda x: myfunc(x) if np.all(pd.notnull(x)) else myfunc("unknown"))
    prediction_data['devid'] = prediction_data['devid'].apply(lambda x: myfunc(x) if np.all(pd.notnull(x)) else myfunc("none"))
    
    
    features=['siteid','offerid','category','merchant','countrycode','browserid','devid']
    target="click"
    X = training_data[features]
    x_prediction = prediction_data[features]
    Y= training_data[target]
    ids = prediction_data["ID"]
    model = XGBClassifier()
    # alternative: linear_model.LogisticRegression(n_jobs=-1)

    print("Training...")
    # The model is trained on the full training_data
    model.fit(X, Y)
        
    print("Predicting...")
    
    seed =7
    test_size=0.33
    X_train,X_test,y_train,y_test=train_test_split(X,Y,test_size=test_size,random_state=seed)
    y_prediction = model.predict_proba(x_prediction)
    results = y_prediction[:, 1]
    results_df = pd.DataFrame(data={'probability':results})
    joined = pd.DataFrame(ids).join(results_df)
        
    y_pred=model.predict(X_test)
    accuracy=accuracy_score(y_test,y_pred)
    

    print("Accuracy: %.2f%%" % (accuracy * 100.0))
    print("Writing predictions to predictions.csv")
    # Save the predictions out to a CSV file
    joined.to_csv("/home/vipin/Videos/predictions.csv", index=False)
Example #5
def train_model_xgb(train_x, train_y, xgb_features):

    # StratifiedShuffleSplit keeps the class balance in the held-out eval fold
    train_ind = StratifiedShuffleSplit(n_splits=1, random_state=1, test_size=0.1)

    for train_index, test_index in train_ind.split(train_x, train_y):
        x_train = train_x.iloc[train_index, :]
        y_train = train_y.iloc[train_index]

        x_eval = train_x.iloc[test_index, :]
        y_eval = train_y.iloc[test_index]

    #Classifier
    xgb = XGBClassifier(max_depth=xgb_features['max_depth'], learning_rate=xgb_features['learning_rate'], n_estimators=int(xgb_features['n_estimators']), objective='binary:logistic',
                        subsample=xgb_features['subsample'], colsample_bytree=xgb_features['colsample_bytree'], min_child_weight=xgb_features['min_child_weight'])
    # gives 0.458
    xgb = xgb.fit(x_train, y_train, verbose=True, eval_metric='logloss',  eval_set=[(x_eval, y_eval)], early_stopping_rounds=10)

    predictions = pd.Series(xgb.predict_proba(x_train, ntree_limit=xgb.best_iteration)[:, 1], name='PredictedProb')

    return xgb, predictions
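A usage sketch (the parameter values are illustrative; train_x must be a DataFrame and train_y a Series for the indexing above):

xgb_features = {'max_depth': 5, 'learning_rate': 0.05, 'n_estimators': 1000,
                'subsample': 0.9, 'colsample_bytree': 0.8, 'min_child_weight': 1}
model, train_probs = train_model_xgb(train_x, train_y, xgb_features)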
Example #6
def cv_BDT(input, output, params, show, channel, selection, names):

    # model = XGBClassifier()

    cvscores = []
    AUC = []

    cvscores_train = []
    AUC_train = []
    kfold = StratifiedKFold(n_splits=5, shuffle=True, random_state=3456)
    for train, test in kfold.split(input, output):
        model = XGBClassifier(**params)
        X_train, X_test, y_train, y_test = (
            input[train],
            input[test],
            output[train],
            output[test],
        )
        model.fit(X_train, y_train)

        y_prob = model.predict_proba(X_test)
        y_pred = model.predict(X_test)
        prediction = [round(value) for value in y_pred]
        auc = roc_auc_score(y_test, y_prob[:, 1])
        accuracy = accuracy_score(y_test, prediction)
        print("Accuracy: %.2f%%; AUC = %.4f%%" % (accuracy * 100, auc))
        cvscores.append(accuracy * 100)
        AUC.append(auc)

        y_prob = model.predict_proba(X_train)
        y_pred = model.predict(X_train)
        prediction = [round(value) for value in y_pred]
        auc = roc_auc_score(y_train, y_prob[:, 1])
        accuracy = accuracy_score(y_train, prediction)
        print("Accuracy train: %.2f%%; AUC = %.4f%%" % (accuracy * 100, auc))
        cvscores_train.append(accuracy * 100)
        AUC_train.append(auc)

    print("Accuracy test = %.2f%% (+/- %.2f%%); AUC = %.4f (+/- %.4f)" %
          (np.mean(cvscores), np.std(cvscores), np.mean(AUC), np.std(AUC)))
    print("Accuracy train = %.2f%% (+/- %.2f%%); AUC = %.4f (+/- %.4f)" % (
        np.mean(cvscores_train),
        np.std(cvscores_train),
        np.mean(AUC_train),
        np.std(AUC_train),
    ))
    if show:

        name = "channel_" + str(channel) + "_BDT"
        name = "%s_%s" % (name, selection)
        modelname = "models/%s.h5" % name
        print("Save to %s" % modelname)
        plotter.plot_separation(model, X_test, y_test, name, False)
        plotter.plot_ROC(model, X_test, y_test, name, False)
        model.get_booster().feature_names = names
        mp.rc("figure", figsize=(5, 5))

        plot_importance(model.get_booster(),
                        max_num_features=15,
                        importance_type="gain")
        plt.subplots_adjust(left=0.3)
        plt.show()
bst = XGBClassifier(objective='binary:logistic',
                    max_depth=4,
                    learning_rate=0.01,
                    subsample=0.8,
                    colsample_bytree=0.4,
                    n_estimators=1650,
                    min_child_weight=1,
                    silent=False)

bst.fit(trainingSet[feature_names], np.array(trainingSet['TARGET']),
        eval_metric='auc',
        eval_set=[(trainingSet[feature_names], trainingSet['TARGET']),
                  (validationSet[feature_names], validationSet['TARGET'])],
        verbose=100)

preds = bst.predict_proba(validationSet[feature_names])[:, 1]
tmp = pd.DataFrame({'ID': validationSet['ID'], pred_name: preds})
eval_matrix = eval_matrix.append(tmp, ignore_index=True)

del trainingSet, validationSet, bst, val_ids, idx
gc.collect()

bst = XGBClassifier(objective='binary:logistic',
                    max_depth=4,
                    learning_rate=0.01,
                    subsample=0.8,
                    colsample_bytree=0.4,
                    n_estimators=1650,
                    min_child_weight=1,
                    silent=False,
                    nthread=-1)
# Fit Random Forest classifier
RF = RandomForestClassifier(bootstrap=True, class_weight=None, criterion='gini',max_depth=8, max_features=6, max_leaf_nodes=None,min_impurity_decrease=0.0,
                            min_impurity_split=None,min_samples_leaf=1, min_samples_split=3,min_weight_fraction_leaf=0.0, n_estimators=50, n_jobs=None,
                            oob_score=False, random_state=None, verbose=0,warm_start=False)
RF.fit(df_train.loc[:, df_train.columns != 'Exited'],df_train.Exited)
#########################################################################################
# Fit Extreme Gradient Boost Classifier
XGB = XGBClassifier(base_score=0.5, booster='gbtree', colsample_bylevel=1,colsample_bytree=1, gamma=0.01, learning_rate=0.1, max_delta_step=0,max_depth=7,
                    min_child_weight=5, missing=None, n_estimators=20,n_jobs=1, nthread=None, objective='binary:logistic', random_state=0,reg_alpha=0,
                    reg_lambda=1, scale_pos_weight=1, seed=None, silent=True, subsample=1)
XGB.fit(df_train.loc[:, df_train.columns != 'Exited'],df_train.Exited)
#########################################################################################
print(classification_report(df_train.Exited, log_primal.predict(df_train.loc[:, df_train.columns != 'Exited'])))
print(classification_report(df_train.Exited,  log_pol2.predict(df_train_pol2)))
print(classification_report(df_train.Exited,  SVM_RBF.predict(df_train.loc[:, df_train.columns != 'Exited'])))
print(classification_report(df_train.Exited,  SVM_POL.predict(df_train.loc[:, df_train.columns != 'Exited'])))
print(classification_report(df_train.Exited,  RF.predict(df_train.loc[:, df_train.columns != 'Exited'])))
print(classification_report(df_train.Exited,  XGB.predict(df_train.loc[:, df_train.columns != 'Exited'])))

y = df_train.Exited
X = df_train.loc[:, df_train.columns != 'Exited']
X_pol2 = df_train_pol2
auc_log_primal, fpr_log_primal, tpr_log_primal = get_auc_scores(y, log_primal.predict(X),log_primal.predict_proba(X)[:,1])
auc_log_pol2, fpr_log_pol2, tpr_log_pol2 = get_auc_scores(y, log_pol2.predict(X_pol2),log_pol2.predict_proba(X_pol2)[:,1])
auc_SVM_RBF, fpr_SVM_RBF, tpr_SVM_RBF = get_auc_scores(y, SVM_RBF.predict(X),SVM_RBF.predict_proba(X)[:,1])
auc_SVM_POL, fpr_SVM_POL, tpr_SVM_POL = get_auc_scores(y, SVM_POL.predict(X),SVM_POL.predict_proba(X)[:,1])
auc_RF, fpr_RF, tpr_RF = get_auc_scores(y, RF.predict(X),RF.predict_proba(X)[:,1])
auc_XGB, fpr_XGB, tpr_XGB = get_auc_scores(y, XGB.predict(X),XGB.predict_proba(X)[:,1])
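get_auc_scores is not shown here; given how it is called, a plausible minimal definition is:

from sklearn.metrics import roc_auc_score, roc_curve

def get_auc_scores(y_actual, y_pred, y_prob):
    # AUC from the predicted probabilities, plus the ROC curve points for plotting
    auc_score = roc_auc_score(y_actual, y_prob)
    fpr, tpr, _ = roc_curve(y_actual, y_prob)
    return auc_score, fpr, tpr

(y_pred, the hard predictions, is accepted for symmetry with the call sites but unused in this sketch.)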


gamma_loss = [0.0, 0.1, 0.2, 0.3, 0.4, 0.5]
train_results_A = []  # collects the ROC AUC of hard predictions per gamma value

for n in gamma_loss:
    model = XGBClassifier(max_depth=6,
                          n_estimators=459,
                          min_child_weight=1,
                          gamma=n,
                          learning_rate=0.1,
                          n_jobs=40)

    model.fit(X_train, y_train, sample_weight=w_train)

    train_pred_A = model.predict(X_train)
    print('train_pred =', train_pred_A)

    train_pred_proba_B = model.predict_proba(X_train)[:, 1]
    print('train_pred_proba =', train_pred_proba_B)

    false_positive_rate_A, true_positive_rate_A, thresholds_A = roc_curve(
        y_train, train_pred_A, sample_weight=w_train)
    roc_auc_A = auc(false_positive_rate_A, true_positive_rate_A)

    false_positive_rate_B, true_positive_rate_B, thresholds_B = roc_curve(
        y_train, train_pred_proba_B, sample_weight=w_train)
    roc_auc_B = auc(false_positive_rate_B, true_positive_rate_B)

    print('Train results---------predict')
    train_results_A.append(roc_auc_A)
    print(n, train_results_A)

    print('train results--------predict_proba')
Example #10
    scores = np.empty(n_splits)
    for i_fold, (fold_train_index,
                 fold_test_index) in enumerate(kf.split(X, y)):
        print("Training for " + early_stopping_string + " fold " +
              str(i_fold + 1) + "/" + str(n_splits))
        # Split train_index into train set and eval set for early stopping.
        fold_X = X.loc[fold_train_index, :]
        fold_y = y[fold_train_index]
        X_train, X_test, y_train, y_test = train_test_split(fold_X,
                                                            fold_y,
                                                            test_size=0.2,
                                                            stratify=fold_y)
        # eval_set: A list of (X, y) pairs to use as a validation set for early stopping
        model.fit(X_train, y_train, eval_set=[(X_test, y_test)], **fit_params)
        if early_stopping:
            # Get best iteration based on eval_set.
            sorted_iteration_scores = np.argsort(
                model.evals_result()['validation_0']['auc'])
            best_round = sorted_iteration_scores[-1]
            # Evaluate on test_index.
            proba = model.predict_proba(X.loc[fold_test_index, :],
                                        ntree_limit=best_round)[:, 1]
        else:
            proba = model.predict_proba(X.loc[fold_test_index, :])[:, 1]
        y_true = y[fold_test_index]
        scores[i_fold] = eval_gini(y_true, proba)
    # Report error.
    print('For ' + early_stopping_string +
          ', Gini score mean (standard deviation): ' + str(np.mean(scores)) +
          ' (' + str(np.sqrt(np.var(scores))) + ')')
                eval_set=[(trainingSet[feature_names], np.array(trainingSet["TARGET"])), (validationSet[feature_names], np.array(validationSet["TARGET"]))],
                         early_stopping_rounds=200,verbose=20)    
                          
        ll = gbm.best_score
        best_iter = gbm.best_iteration
        cv.append(ll)
        biter.append(best_iter)
        print "---auc : %0.6f\n" %ll
        print "---best_iter: %d\n" %best_iter
        gc.collect()
    
    gbm = XGBClassifier(max_depth=4,
                            learning_rate = 0.01,
                            n_estimators=370,
                            subsample=0.8,
                            colsample_bytree=0.5,
                            objective="binary:logistic",
                            silent = False,
                            min_child_weight=5,                       
                            nthread=-1)
                            
    gbm.fit(train[feature_names], np.array(train["TARGET"]),
            eval_metric="auc",
            eval_set=[(train[feature_names], np.array(train["TARGET"]))],
            verbose=20)
                        
    tpreds = gbm.predict_proba(test[feature_names])[:, 1]
    df = pd.DataFrame({"ID" : test["ID"], "TARGET" : tpreds })
    submission_name = "stacked_xgb_3.csv"
    df.to_csv(os.path.join(output_dir, submission_name), index = False)
Example #12
X = train_df[main_cols]
y = train_df.target.astype(int)

# Split data into train and test sets
X_train, X_test, y_train, y_test = train_test_split(X,y,test_size=0.3, random_state=42)

model = XGBClassifier()
model.fit(X_train, y_train)

# Import GridSearchCV
from sklearn.model_selection import GridSearchCV

# Optimize model parameters
# This grid search was run in Google Colab for speed; the best params are reused in the next code block
param_grid = {'min_child_weight': [1, 5, 10],
              'gamma': [0.5, 1],
              'subsample': [0.6, 0.8, 1.0],
              'max_depth': [3, 5]}
model = GridSearchCV(model, param_grid,n_jobs=-1,verbose=2,cv=5)
model.fit(X_train, y_train)
print(model.best_params_)   

# Make predictions
y_pred = model.predict_proba(X_test)[:, 1]

# Check the auc score of the model
print(f'XGB AUC score on the X_test is: {roc_auc_score(y_test, y_pred)}\n')

# print classification report
#print(classification_report(y_test, [1 if x >= 0.5 else 0 for x in y_pred]))
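After the search, GridSearchCV exposes the refit winner directly; for example:

best_model = model.best_estimator_  # refit on the full training split by default
print(model.best_score_)            # mean CV score of the best parameter set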
Example #13
    def predict(self, X):
        return XGBClassifier.predict_proba(self, X)[:, 1]
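For context, a minimal sketch of the wrapper class this method presumably belongs to (the class name is an assumption): overriding predict to return the positive-class probability lets downstream code that calls predict consume scores instead of hard labels.

from xgboost import XGBClassifier

class ProbaXGBClassifier(XGBClassifier):
    def predict(self, X):
        # return P(y == 1) rather than 0/1 labels
        return XGBClassifier.predict_proba(self, X)[:, 1]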
Example #14
def leave_one_trial_out(x,
                        y,
                        log,
                        label_type,
                        normalization,
                        show_roc,
                        num_fold=15,
                        seed=0,
                        verbose=True,
                        select_top_k_feature=None):
    """Normalization and Leave one subject out cross validation

    Args:
        :param x: # people x # trials x (# channels x # features)
        :param y: # people x # trials
        :param log: (# channels x # features)
        :param label_type: {'rating', 'thought', 'withhold'}
        :param normalization: normalize feature or not
        :param show_roc: show ROC curve or not
        :param num_fold: do num_fold cross validation
        :param seed: use to fix the training and testing set
        :param verbose: show the result of each fold or not
        :param select_top_k_feature: select top k feature from training set, if None: use all features
    """
    if normalization:
        x = normalize(x)

    x = x.reshape(x.shape[0] * x.shape[1], -1)
    y = y.reshape(y.shape[0] * y.shape[1])

    x, y = convert_to_binary_label_and_remove_threshold(x,
                                                        y,
                                                        label_type=label_type)
    x, y = shuffle(x, y, random_state=seed)

    clf = XGBClassifier(
        n_estimators=300,
        learning_rate=0.05,
        max_depth=3,
        min_child_weight=2,
        gamma=0,
        subsample=0.8,
        colsample_bytree=0.8,
        scale_pos_weight=1,
        reg_alpha=1,
    )

    test_fold_len = len(y) // num_fold
    precision_list, recall_list, f1_list = list(), list(), list()
    tprs, aucs, mean_fpr = [], [], np.linspace(0, 1, 100)

    print(
        'Start {}-fold leave one trial out cross validation'.format(num_fold))
    for fold in range(num_fold):
        # x.shape: # data x # features, y.shape: # data
        start_idx, end_idx = test_fold_len * fold, test_fold_len * (
            fold + 1)  # start and end idx of testing fold
        x_train, x_test = np.delete(x,
                                    np.arange(start_idx, end_idx, 1),
                                    axis=0), x[np.arange(
                                        start_idx, end_idx, 1)]
        y_train, y_test = np.delete(y,
                                    np.arange(start_idx, end_idx, 1),
                                    axis=0), y[np.arange(
                                        start_idx, end_idx, 1)]

        if select_top_k_feature is not None:
            feature_ranking = rank_feature(x_train, y_train)
            x_train = x_train[:, feature_ranking[:select_top_k_feature]]
            x_test = x_test[:, feature_ranking[:select_top_k_feature]]

        # train and predict
        clf.fit(x_train, y_train)
        y_pred = clf.predict(x_test)

        # plot roc curve of each fold (subject)
        probas_ = clf.predict_proba(
            x_test)  # shape: len x 2 (prob of neg, prob of pos)
        fpr, tpr, _ = roc_curve(y_test, probas_[:, 1])
        roc_auc = auc(fpr, tpr)
        plt.plot(fpr,
                 tpr,
                 lw=1,
                 alpha=0.3,
                 label='ROC fold {} (AUC={:.2f})'.format(fold, roc_auc))

        # be used to plot mean roc
        tprs.append(
            interp(mean_fpr, fpr,
                   tpr))  # append mean tpr (interp(mean_fpr, fpr, tpr))
        tprs[-1][0] = 0.0  # mean_tpr[0] = 0
        aucs.append(roc_auc)

        # confusion matrix
        tn, fp, fn, tp = confusion_matrix(y_test, y_pred, labels=[0,
                                                                  1]).ravel()
        precision = tp / (tp + fp)
        recall = tp / (tp + fn)
        f1 = 2 * (precision * recall) / (precision + recall)

        # used to calculate mean precision, recall, f1 score
        precision_list.append(precision)
        recall_list.append(recall)
        f1_list.append(f1)

        if verbose:
            print(
                'Test on fold {}: Precision->{:.2f}, Recall->{:.2f}, F1->{:.2f}'
                .format(fold + 1, precision, recall, f1))
    print('---------------------------')
    print('Average: Precision->{:.2f}, Recall->{:.2f}, F1->{:.2f}'.format(
        np.mean(precision_list), np.mean(recall_list), np.mean(f1_list)))

    # plot mean auc
    mean_tpr = np.mean(tprs, axis=0)
    mean_tpr[-1] = 1.0
    mean_auc = auc(mean_fpr, mean_tpr)
    std_auc = np.std(aucs)
    plt.plot(mean_fpr,
             mean_tpr,
             color='b',
             label=r'Mean ROC (AUC = %0.2f $\pm$ %0.2f)' % (mean_auc, std_auc),
             lw=2,
             alpha=.8)

    # plot chance level roc
    plt.plot([0, 1], [0, 1], linestyle='--', lw=2, color='r',
             alpha=.8)  # plot chance level ROC
    plt.legend()
    if show_roc:
        plt.show()

    return np.mean(precision_list), np.mean(recall_list), np.mean(f1_list)
Example #15
import numpy as np
import pandas as pd
from sklearn.ensemble import ExtraTreesClassifier
from sklearn.feature_selection import SelectFromModel
from sklearn.model_selection import train_test_split
from xgboost import XGBClassifier

data = pd.read_csv(r'C:\mldata\V2.csv', header=None, encoding="ISO-8859-1")
data[9] = data[9].fillna(1)
y = data.loc[:,9].values
x = data.loc[:,0:7].values

test_size = 0.3
X_train, X_test, y_train, y_test = train_test_split(x, y, test_size=test_size, random_state=42)

model1 = XGBClassifier(n_estimators=5000, max_depth=3, n_jobs=-1, seed=1)
model1.fit(X_train, y_train)
# make predictions for test data
y_pred1 = model1.predict_proba(X_test)


model2 = XGBClassifier(n_estimators=5000, max_depth=4, n_jobs=-1, seed=2)
model2.fit(X_train, y_train)
# make predictions for test data
y_pred2 = model2.predict_proba(X_test)

model3 = XGBClassifier(n_estimators=5000, max_depth=5, n_jobs=-1, seed=3)
model3.fit(X_train, y_train)
# make predictions for test data
y_pred3 = model3.predict_proba(X_test)


model4 = XGBClassifier(n_estimators=5000, max_depth=2, n_jobs=-1, seed=4)
model4.fit(X_train, y_train)
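The snippet trains four seed- and depth-varied models but stops before combining them; a plausible finishing step (not in the original) is to average the positive-class probabilities:

y_pred4 = model4.predict_proba(X_test)

# simple average of the four models' P(y=1) estimates
avg_proba = (y_pred1[:, 1] + y_pred2[:, 1] + y_pred3[:, 1] + y_pred4[:, 1]) / 4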
Example #16
def leave_one_subject_out(x, y, log, label_type):
    """Normalization and Leave one subject out cross validation

    Args:
        :param x: # people x # trials x (# channels x # features)
        :param y: # people x # trials
        :param log: (# channels x # features)
    """
    x = normalize(x)

    clf = XGBClassifier(
        n_estimators=300,
        learning_rate=0.05,
        max_depth=3,
        min_child_weight=2,
        gamma=0,
        subsample=0.8,
        colsample_bytree=0.8,
        scale_pos_weight=1,
        reg_alpha=1,
    )
    precision_list, recall_list, f1_list = list(), list(), list()
    tprs, aucs, mean_fpr = [], [], np.linspace(0, 1, 100)
    for subject in range(len(x)):
        x_train, y_train = np.delete(x, subject, axis=0), np.delete(y,
                                                                    subject,
                                                                    axis=0)
        x_test, y_test = x[subject], y[subject]

        # reshape x and y, and convert label to binary and remove threshold
        x_train, y_train = convert_to_binary_label_and_remove_threshold(
            x_train.reshape(-1, x.shape[2]), y_train.reshape(-1), label_type)
        x_test, y_test = convert_to_binary_label_and_remove_threshold(
            x_test.reshape(-1, x.shape[2]), y_test.reshape(-1), label_type)

        # train and predict
        clf.fit(x_train, y_train)
        y_pred = clf.predict(x_test)

        # plot roc curve of each fold (subject)
        probas_ = clf.predict_proba(
            x_test)  # shape: len x 2 (prob of neg, prob of pos)
        fpr, tpr, _ = roc_curve(y_test, probas_[:, 1])
        roc_auc = auc(fpr, tpr)
        plt.plot(fpr,
                 tpr,
                 lw=1,
                 alpha=0.3,
                 label='ROC fold {} (AUC={:.2f})'.format(subject, roc_auc))

        # be used to plot mean roc
        tprs.append(
            interp(mean_fpr, fpr,
                   tpr))  # append mean tpr (interp(mean_fpr, fpr, tpr))
        tprs[-1][0] = 0.0  # mean_tpr[0] = 0
        aucs.append(roc_auc)

        # confusion matrix
        tn, fp, fn, tp = confusion_matrix(y_test, y_pred).ravel()
        precision = tp / (tp + fp)
        recall = tp / (tp + fn)
        f1 = 2 * (precision * recall) / (precision + recall)

        # used to calculate mean precision, recall, f1 score
        precision_list.append(precision)
        recall_list.append(recall)
        f1_list.append(f1)

        print(
            'Test on subject {}: Precision->{:.2f}, Recall->{:.2f}, F1->{:.2f}'
            .format(subject + 1, precision, recall, f1))

    print('---------------------------')
    print('Average: Precision->{:.2f}, Recall->{:.2f}, F1->{:.2f}'.format(
        np.mean(precision_list), np.mean(recall_list), np.mean(f1_list)))

    # plot mean auc
    mean_tpr = np.mean(tprs, axis=0)
    mean_tpr[-1] = 1.0
    mean_auc = auc(mean_fpr, mean_tpr)
    std_auc = np.std(aucs)
    plt.plot(mean_fpr,
             mean_tpr,
             color='b',
             label=r'Mean ROC (AUC = %0.2f $\pm$ %0.2f)' % (mean_auc, std_auc),
             lw=2,
             alpha=.8)

    # plot chance level roc
    plt.plot([0, 1], [0, 1], linestyle='--', lw=2, color='r',
             alpha=.8)  # plot chance level ROC

    plt.show()
Example #17
    def cv_depth_weight(self):
        """
        Runs cross-validation by grid-searching through depth and child_weight values.
        """
        from xgboost import XGBClassifier

        for depth in self.depth_range:
            for child_weight in self.child_weight_range:
                all_predicted_probs = pd.DataFrame()
                all_testing_y = pd.Series()
                dates = []
                self.log_loss_weights = []
                for test_name in range(1, self.test_name + 1):
                    self.cv_start = self.cv_params[test_name]['cv_start']
                    self.cv_end = self.cv_params[test_name]['cv_end']
                    self.get_cv_indices()
                    training_x = self.full_df.loc[:(self.cv_indices[0] - 1),
                                                  self.feature_names]
                    self.training_y = self.full_df.loc[:(self.cv_indices[0] -
                                                         1), self.output_name]
                    scaler = StandardScaler()
                    scaler.fit(training_x)
                    training_x_scaled = scaler.transform(training_x)
                    testing_x = self.full_df[self.feature_names].loc[
                        self.cv_indices]
                    testing_x_scaled = scaler.transform(testing_x)
                    xgboost = XGBClassifier(max_depth=depth,
                                            min_child_weight=child_weight,
                                            gamma=0,
                                            learning_rate=0.1,
                                            n_estimators=100,
                                            reg_lambda=0.01,
                                            reg_alpha=0,
                                            subsample=1,
                                            colsample_bytree=1,
                                            objective='binary:logistic',
                                            booster='gbtree',
                                            silent=True,
                                            random_state=123)
                    xgboost.fit(X=training_x_scaled, y=self.training_y)

                    self.testing_y = self.full_df[self.output_name].loc[
                        self.cv_indices]
                    self.calculate_log_loss_weights()
                    predicted_probs = pd.DataFrame(
                        xgboost.predict_proba(testing_x_scaled))
                    all_predicted_probs = all_predicted_probs.append(
                        predicted_probs, ignore_index=True)
                    all_testing_y = all_testing_y.append(self.testing_y)
                    dates.extend(self.full_df['Dates'].loc[self.cv_indices])

                log_loss_score = log_loss(y_true=all_testing_y,
                                          y_pred=all_predicted_probs,
                                          sample_weight=self.log_loss_weights)
                if log_loss_score < self.best_cv_score:
                    self.best_cv_score = log_loss_score
                    self.optimal_depth = depth
                    self.optimal_child_weight = child_weight
                    self.xgboost_cv_predictions['Dates'] = dates
                    self.xgboost_cv_predictions[
                        'True'] = all_testing_y.to_list()
                    self.xgboost_cv_predictions[
                        'Predicted'] = all_predicted_probs[1].to_list()
Example #18
    def run_xgboost_prediction(self):
        """
        Performs prediction on the hold-out sample.
        """
        from xgboost import XGBClassifier

        self.optimal_depth = self.xgboost_optimal_params['Depth']
        self.optimal_child_weight = self.xgboost_optimal_params[
            'Min Child Weight']
        self.optimal_lambda = self.xgboost_optimal_params['Lambda']
        all_predicted_probs = pd.DataFrame()
        all_testing_y = pd.Series()
        dates = []
        self.log_loss_weights = []
        training_x = self.full_df.loc[:(self.pred_indices[0] - 1),
                                      self.feature_names]
        self.training_y = self.full_df.loc[:(self.pred_indices[0] - 1),
                                           self.output_name]
        scaler = StandardScaler()
        scaler.fit(training_x)
        training_x_scaled = scaler.transform(training_x)
        xgboost = XGBClassifier(max_depth=self.optimal_depth,
                                min_child_weight=self.optimal_child_weight,
                                gamma=0,
                                learning_rate=0.1,
                                n_estimators=100,
                                reg_lambda=self.optimal_lambda,
                                reg_alpha=0,
                                subsample=1,
                                colsample_bytree=1,
                                objective='binary:logistic',
                                booster='gbtree',
                                silent=True,
                                random_state=123)
        xgboost.fit(X=training_x_scaled, y=self.training_y)
        self.importances = pd.DataFrame(xgboost.feature_importances_).T
        self.importances.rename(columns=self.feature_dict, inplace=True)

        testing_x = self.full_df[self.feature_names].loc[self.pred_indices]
        testing_x_scaled = scaler.transform(testing_x)
        self.testing_y = self.full_df[self.output_name].loc[self.pred_indices]
        self.calculate_log_loss_weights()
        predicted_probs = pd.DataFrame(xgboost.predict_proba(testing_x_scaled))
        all_predicted_probs = all_predicted_probs.append(predicted_probs,
                                                         ignore_index=True)
        all_testing_y = all_testing_y.append(self.testing_y)
        dates.extend(self.full_df['Dates'].loc[self.pred_indices])

        self.xgboost_pred_error = log_loss(y_true=all_testing_y,
                                           y_pred=all_predicted_probs,
                                           sample_weight=self.log_loss_weights)
        self.xgboost_predictions['Dates'] = dates
        self.xgboost_predictions['True'] = all_testing_y.to_list()
        self.xgboost_predictions['Predicted'] = all_predicted_probs[1].to_list(
        )
        self.metadata['Importances'] = self.importances.to_dict()


#MIT License
#
#Copyright (c) 2019 Terrence Zhang
#
#Permission is hereby granted, free of charge, to any person obtaining a copy
#of this software and associated documentation files (the "Software"), to deal
#in the Software without restriction, including without limitation the rights
#to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
#copies of the Software, and to permit persons to whom the Software is
#furnished to do so, subject to the following conditions:
#
#The above copyright notice and this permission notice shall be included in all
#copies or substantial portions of the Software.
#
#THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
#IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
#FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
#AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
#LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
#OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
#SOFTWARE.
bst = XGBClassifier(max_depth=8,
                    learning_rate=0.01,
                    n_estimators=2100,
                    subsample=0.9,
                    colsample_bytree=0.45,
                    objective="binary:logistic",
                    silent=False,
                    min_child_weight=1,
                    nthread=-1)

bst.fit(X_train, y_train, eval_metric="logloss",
        eval_set=[(X_train, y_train), (X_valid, y_valid)],
        verbose=200)

preds = bst.predict_proba(X_valid)[:, 1]
ll = log_loss(validationSet["target"], preds)
df = pd.DataFrame({"ID": validationSet["ID"], pred_name: preds})
eval_matrix = eval_matrix.append(df, ignore_index=True)
print("fold : {} | logloss: {}".format(i + 1, ll))
del trainingSet, validationSet, bst, preds, ll, X_train, X_valid, y_train, y_valid
gc.collect()

X_train = train[feature_names].copy()
y_train = np.array(train["target"].copy())
bst = XGBClassifier(max_depth=8,
                    learning_rate=0.01,
                    n_estimators=2100,
                    subsample=0.9,
                    colsample_bytree=0.45,
                    objective="binary:logistic",
Example #20
cm = confusion_matrix(real, pred)
print(cm)

from sklearn.metrics import cohen_kappa_score
kappa = cohen_kappa_score(real, pred)

fpr, tpr, thresholds = metrics.roc_curve(y_test, pred, pos_label=1)
roc_auc = metrics.auc(fpr, tpr)

print('Accuracy = ', float(cm[0][0] + cm[1][1]) / len(real))
print('kappa score = ', kappa)
print('AUC Score = ', roc_auc)
print('recall = ', tpr[1])
print('precision = ', float(cm[1][1]) / (cm[1][1] + cm[0][1]))

#Getting the probability scores
predictions = model.predict_proba(X_test)
print(predictions)

addresses = housenum + ' ' + address

#Addresses with fire and risk score
risk = []
for row in predictions:
    risk.append(row[1])

cols = {
    "Address": addresses,
    "Fire": pred,
    "RiskScore": risk,
    "state_desc": state_desc,
    "school_desc": school_desc,
n_estimators = 600
max_depth = 6
subsample = 0.9
colsample_bytree = 0.85
min_child_weight = 1  # default

eval_metrics = ['auc']
eval_sets = [(X_train, y_train), (X_test, y_test)]
xgb = XGBClassifier(seed=0, learning_rate=learning_rate, n_estimators=n_estimators,
                    min_child_weight=min_child_weight, max_depth=max_depth,
                    colsample_bytree=colsample_bytree, subsample=subsample)
print("Fitting the model")
xgb = xgb.fit(X_train, y_train, eval_metric=eval_metrics, eval_set=eval_sets, verbose=False)
    
print("Predicting Probabilities")
probs['xgb'] = xgb.predict_proba(X_test)[:, -1]

print("Computing AUC")
auc_test = [xgb.evals_result_['validation_%d' % i]['auc'] for i in range(len(eval_sets))]
auc_test = np.array(auc_test, dtype=float).T

auc_best_round = np.argmax(auc_test, axis=0)
auc_best = [auc_test[auc_best_round[0], 0], auc_test[auc_best_round[1], 1]]

print('Best AUC train=%f (round=%d), test=%f (round=%d)' % (auc_best[0], auc_best_round[0], auc_best[1], auc_best_round[1]))
print('Validation')
test_probs = pd.DataFrame()
test_probs['xgb_valid'] = xgb.predict_proba(df_test)[:,-1]
print(test_probs['xgb_valid'].head())

fpr, tpr, thresholds = metrics.roc_curve(df_test_target, test_probs['xgb_valid'], pos_label=1)
Example #22
                        subsample=0.4,
                        subsample_freq=1,
                        colsample_bytree=0.4,
                        random_state=2019,
                        num_leaves=10,
                        min_child_samples=20,
                        max_depth=3)
clf_xgb.fit(train_X, train_y,
            eval_set=[(train_X, train_y), (val_X, val_y)],
            early_stopping_rounds=10)
# save the fitted xgboost model (`clf` is apparently a LightGBM model fit earlier)
joblib.dump(clf_xgb, 'treemodel/xgb.model')

# predict
print('predict...')
test_pred_prob_1 = clf.predict_proba(test_X, num_iteration=clf.best_iteration_)
test_pred_prob_2 = clf_xgb.predict_proba(test_X)
test_pred_prob = (test_pred_prob_1 + test_pred_prob_2) / 2
sub = pd.read_csv(path_data + file_test, parse_dates=['due_date'])
prob_cols = ['prob_{}'.format(i) for i in range(33)]
for i, f in enumerate(prob_cols):
    sub[f] = test_pred_prob[:, i]
sub_example = pd.read_csv('../result/submission_sample.csv',
                          parse_dates=['repay_date'])
sub_example = pd.merge(sub_example, sub, how='left', on='listing_id')
sub_example['days'] = (sub_example['due_date'] -
                       sub_example['repay_date']).dt.days
test_prob = sub_example[prob_cols].values
test_labels = sub_example['days'].values
test_prob = [test_prob[i][test_labels[i]] for i in range(test_prob.shape[0])]
sub_example['repay_amt'] = sub_example['due_amt'] * test_prob
sub_example[['listing_id', 'repay_date', 'repay_amt']].to_csv('sub.csv',
Example #23
print(roc_auc_score(y, preds))


# pick the best threshold out-of-fold
thresholds = np.linspace(0.01, 0.99, 50)
mcc = np.array([matthews_corrcoef(y, preds > thr) for thr in thresholds])
best_threshold = thresholds[mcc.argmax()]
print(mcc.max())

# load test data
test_X = np.concatenate([
    pd.read_csv("../Data/test_date.csv", index_col=0, dtype=np.float32,
                usecols=np.concatenate([[0], important_indices[important_indices<1156]+1])).values,
    pd.read_csv("../Data/test_numeric.csv", index_col=0, dtype=np.float32,
                usecols=np.concatenate([[0], important_indices[important_indices>=1156] +1 - 1156])).values
], axis=1)

# generate predictions at the chosen threshold
preds = (clf.predict_proba(test_X)[:,1] > best_threshold).astype(np.int8)


# and submit
location = '../Submission/'
import time
timestr = time.strftime("%Y%m%d-%H%M%S")
filename = location + 'sub-' + timestr + '.csv.gz'

sub = pd.read_csv("../Data/sample_submission.csv", index_col=0)
sub["Response"] = preds
sub.to_csv(filename, compression="gzip")
Example #24
#Only two guys to a fight
train = pd.read_csv('train.csv')
test = pd.read_csv('test.csv')

#Someone yells stop, goes limp, taps out, the fight is over
train.isnull().sum()
train = KNN(k=3).complete(train)
test = KNN(k=3).complete(test)

#One fight at a time
le = LabelEncoder()
cat = ['genre', 'certificate', 'distributor']
for col in cat:
    train[col] = le.fit_transform(train[col])
    # NB: refitting on test can assign different codes to the same category;
    # transforming with the train-fitted encoder would be safer.
    test[col] = le.fit_transform(test[col])

#no shirts, no shoes
train_X = train.drop(['year', 'oscar', 'movie_name', 'actor_name', 'href'],
                     axis=1)
test_X = test.drop(['year', 'oscar', 'movie_name', 'actor_name', 'href'],
                   axis=1)
train_Y = train['oscar']

#Fights will go on as long as they want to
model = XGBClassifier()
model.fit(train_X, train_Y)

#If this is your first night at Fight Club, you have to fight.
pred_xgb = model.predict_proba(test_X)[:, 1]
xgb_prediction = pd.DataFrame(pred_xgb, test['movie_name'])
Example #25
class XGBClassifierCV(object):
    """cross_val_predict"""
    def __init__(self, params=None, cv=5, random_state=None, n_repeats=None):
        self.clf = XGBClassifier()
        if params:
            self.clf.set_params(**params)
        if n_repeats:
            self._kf = RepeatedStratifiedKFold(n_splits=cv,
                                               n_repeats=n_repeats,
                                               random_state=random_state)
            self._num_preds = cv * n_repeats
        else:
            self._kf = StratifiedKFold(n_splits=cv, shuffle=True,
                                       random_state=random_state)
            self._num_preds = cv

    def fit(self,
            X,
            y,
            X_test=None,
            feval=roc_auc_score,
            sample_weight=None,
            eval_metric='auc',
            early_stopping_rounds=100,
            verbose=100,
            xgb_model=None,
            sample_weight_eval_set=None,
            callbacks=None):
        """输入数组"""
        if X_test is None:
            X_test = X[:1]

        self.oof_train = np.zeros(len(X))
        self.oof_test = np.zeros((len(X_test), self._num_preds))
        for n_fold, (train_index,
                     valid_index) in enumerate(self._kf.split(X, y)):
            if verbose:
                print("\033[94mFold %s started at %s\033[0m" %
                      (n_fold + 1, time.ctime()))
            X_train, y_train = X[train_index], y[train_index]
            X_valid, y_valid = X[valid_index], y[valid_index]
            eval_set = [(X_train, y_train), (X_valid, y_valid)]

            ########################################################################
            self.clf.fit(X_train, y_train, sample_weight, eval_set,
                         eval_metric, early_stopping_rounds, verbose,
                         xgb_model, sample_weight_eval_set)

            self.oof_train[valid_index] = self.clf.predict_proba(X_valid)[:, 1]
            self.oof_test[:, n_fold] = self.clf.predict_proba(X_test)[:, 1]
            ########################################################################

        # rank-average and mean-average the test-set out-of-fold predictions
        self.oof_test_rank = pd.DataFrame(self.oof_test).rank().mean(1) / len(
            self.oof_test)
        self.oof_test = self.oof_test.mean(1)

        # score the train-set out-of-fold predictions
        if feval:
            oof_train_score = feval(y, self.oof_train)
            print(
                f"\n\033[94mCV Score: {oof_train_score} ended at {time.ctime()}\033[0m"
            )
            return oof_train_score

    def oof_save(self, file='./oof_train_and_test.csv'):
        assert isinstance(file, str)
        _ = np.append(self.oof_train, self.oof_test)
        pd.DataFrame(_, columns=['oof_train_and_test']).to_csv(file, index=False)
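A minimal usage sketch with synthetic data (assumes an xgboost version whose fit() accepts the positional arguments used above):

from sklearn.datasets import make_classification

X, y = make_classification(n_samples=1000, random_state=42)
cv_clf = XGBClassifierCV(params={'max_depth': 4, 'n_estimators': 200}, cv=5, random_state=42)
score = cv_clf.fit(X, y)  # returns the out-of-fold AUC on the training data
print(score, cv_clf.oof_test[:5])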
Example #26
def learn(title,
          data_loader,
          allow_printing,
          calculate_rhos,
          SVM,
          XGBOOST,
          NN,
          cross_validation,
          create_coeff_plots,
          check_all_parameters,
          svm_parameters,
          xgb_parameters,
          create_pca_plots,
          test_size,
          edge_percent,
          BINARY=True):
    # create a folder for the task
    if not os.path.exists(title):
        os.makedirs(title)
    os.chdir(os.path.join(os.path.abspath(os.path.curdir), title))

    print("learning..." + title)
    ids, tag_map, task_name = data_loader.get_learning_data(title)
    id_to_features_map = data_loader.get_id_to_features_map
    X = [
        id_to_features_map[id] for id in ids
        if id in id_to_features_map.keys()
    ]
    y = [tag_map[id] for id in ids if id in id_to_features_map.keys()]

    # ----------------------------------------------! calculate_rhos ------------------------------------------------
    if calculate_rhos:
        print("calculating rho")
        draw_rhos_calculation_figure(tag_map,
                                     data_loader.get_preproccessed_data,
                                     title,
                                     data_loader._taxnomy_level,
                                     ids_list=ids,
                                     save_folder="rhos")

    # ----------------------------------------------! PCA ------------------------------------------------
    if create_pca_plots:
        PCA_t_test(group_1=[x for x, y in zip(X, y) if y == 0],
                   group_2=[x for x, y in zip(X, y) if y == 1],
                   title="T test for PCA dimentions on " + task_name,
                   save=True,
                   folder="PCA")
        plot_data_3d(X,
                     y,
                     data_name=task_name.capitalize(),
                     save=True,
                     folder="PCA")
        plot_data_2d(X,
                     y,
                     data_name=task_name.capitalize(),
                     save=True,
                     folder="PCA")

    # ----------------------------------------------! SVM ------------------------------------------------
    # Set the parameters by cross-validation
    # multi_class =”crammer_singer”
    if SVM:
        if not os.path.exists("SVM"):
            os.makedirs("SVM")
        os.chdir(os.path.join(os.path.abspath(os.path.curdir), "SVM"))
        print("SVM...")

        # update each classifier results in a mutual file
        svm_results_file = Path("all_svm_results.csv")
        if not svm_results_file.exists():
            all_svm_results = pd.DataFrame(columns=[
                'KERNEL', 'GAMMA', 'C', 'TRAIN-AUC', 'TRAIN-ACC', 'TEST-AUC',
                'TEST-ACC'
            ])
            all_svm_results.to_csv(svm_results_file, index=False)

        optional_classifiers = []

        if check_all_parameters:
            svm_tuned_parameters = {
                'kernel': ['linear'],  # other candidates: 'rbf', 'poly', 'sigmoid'
                'gamma': ['auto', 'scale'],
                'C': [0.01, 0.1, 1, 10, 100, 1000]
            }
            # create all possible classifiers
            weights = data_loader.get_weights()
            for kernel in svm_tuned_parameters['kernel']:
                for gamma in svm_tuned_parameters['gamma']:
                    for C in svm_tuned_parameters['C']:
                        clf = svm.SVC(
                            kernel=kernel,
                            C=C,
                            gamma=gamma,
                            class_weight=weights)  # class_weight='balanced')
                        optional_classifiers.append(clf)
        else:  # use the wanted classifier
            clf = svm.SVC(kernel=svm_parameters['kernel'],
                          C=svm_parameters['C'],
                          gamma=svm_parameters['gamma'],
                          class_weight='balanced')
            optional_classifiers.append(clf)

        for clf in optional_classifiers:
            all_svm_results = pd.read_csv(svm_results_file)
            clf_folder_name = "k=" + clf.kernel + "_c=" + str(
                clf.C) + "_g=" + clf.gamma
            if not os.path.exists(clf_folder_name):
                os.makedirs(clf_folder_name)
            # Split the data set
            X_trains, X_tests, y_trains, y_tests, svm_coefs = [], [], [], [], []
            svm_y_test_from_all_iter, svm_y_score_from_all_iter = np.array(
                []), np.array([])
            svm_y_pred_from_all_iter, svm_class_report_from_all_iter = np.array(
                []), np.array([])
            train_accuracies, test_accuracies, confusion_matrixes, y_train_preds, y_train_scores,\
            y_test_preds = [], [], [], [], [], []

            for i in range(cross_validation):
                X_train, X_test, y_train, y_test = train_test_split(
                    X, y, test_size=test_size, shuffle=True)
                X_trains.append(X_train)
                X_tests.append(X_test)
                y_trains.append(y_train)
                y_tests.append(y_test)

            bacteria_coeff_average = []

            for iter_num in range(cross_validation):
                print('------------------------------\niteration number ' +
                      str(iter_num))
                # FIT
                clf.fit(X_trains[iter_num], y_trains[iter_num])
                # GET RESULTS
                y_score = clf.decision_function(
                    X_tests[iter_num])  # margin scores, used for the ROC/AUC below
                y_pred = clf.predict(X_tests[iter_num])
                y_test_preds.append(y_pred)
                svm_class_report = classification_report(
                    y_tests[iter_num], y_pred).split("\n")
                train_pred = clf.predict(X_trains[iter_num])
                train_score = clf.decision_function(X_trains[iter_num])
                y_train_preds.append(train_pred)
                y_train_scores.append(train_score)
                # SAVE RESULTS
                train_accuracies.append(
                    accuracy_score(y_trains[iter_num], train_pred))
                test_accuracies.append(
                    accuracy_score(y_tests[iter_num], y_pred))
                confusion_matrixes.append(
                    confusion_matrix(y_tests[iter_num], y_pred))
                # AUC
                if BINARY:
                    _, _, _, svm_roc_auc = roc_auc(y_tests[iter_num],
                                                   y_pred,
                                                   visualize=False,
                                                   graph_title='SVM\n' +
                                                   str(iter_num),
                                                   save=True,
                                                   folder=task_name)
                # SAVE y_test AND y_score
                svm_y_test_from_all_iter = np.append(
                    svm_y_test_from_all_iter, y_tests[iter_num])  # .values)
                svm_y_pred_from_all_iter = np.append(svm_y_pred_from_all_iter,
                                                     list(y_pred))
                svm_class_report_from_all_iter = np.append(
                    svm_class_report_from_all_iter, svm_class_report)
                if svm_y_score_from_all_iter.size > 0:
                    svm_y_score_from_all_iter = np.concatenate(
                        (svm_y_score_from_all_iter, y_score), axis=0)
                else:
                    svm_y_score_from_all_iter = y_score
                # --------------------------------------------! COEFF PLOTS -----------------------------------------
                if create_coeff_plots:
                    svm_coefs, bacterias, coefficients, bacteria_coeff_average = svm_calc_bacteria_coeff_average(
                        data_loader, clf, svm_coefs, bacteria_coeff_average)

            # --------------------------------------------! AUC -----------------------------------------
            all_y_train, all_predictions_train, all_test_real_tags, all_test_pred_tags, train_auc, test_auc, train_rho, \
            test_rho = calc_auc_on_joined_results(cross_validation, y_trains, y_train_preds, y_tests, y_test_preds)

            # ----------------------------------------! CONFUSION MATRIX -------------------------------------
            print("\n------------------------------")
            names = data_loader.get_confusin_matrix_names()
            # binary = len(names) == 2
            confusion_matrix_average, confusion_matrix_acc = edit_confusion_matrix(
                confusion_matrixes, "SVM", names, BINARY=BINARY)
            if BINARY:
                _, _, _, svm_roc_auc = roc_auc(
                    svm_y_test_from_all_iter.astype(int),
                    svm_y_score_from_all_iter,
                    visualize=True,
                    graph_title='SVM\n' + task_name.capitalize() +
                    " AUC on all iterations",
                    save=True,
                    folder=clf_folder_name)
                res_path = os.path.join(clf_folder_name,
                                        str(round(svm_roc_auc, 5)))
            else:
                svm_roc_auc = 0
                res_path = clf_folder_name

            if not os.path.exists(res_path):
                os.mkdir(res_path)

            if create_coeff_plots:
                plot_bacteria_coeff_average(bacteria_coeff_average, len(names),
                                            data_loader, title, task_name,
                                            bacterias, cross_validation, "SVM",
                                            res_path, BINARY, edge_percent)

            # if allow_printing:
            print_confusion_matrix(confusion_matrix_average, names,
                                   confusion_matrix_acc, "SVM", task_name,
                                   res_path)

            t = np.array(y_trains).astype(int)
            t = t.flatten()
            s = np.array(y_train_scores)
            s = s.flatten()

            if BINARY:
                _, _, _, svm_train_roc_auc = roc_auc(t,
                                                     s,
                                                     visualize=False,
                                                     graph_title="train auc",
                                                     save=False,
                                                     folder=res_path)
            else:
                svm_train_roc_auc = 0
                multi_class_roc_auc(svm_y_test_from_all_iter.astype(int),
                                    svm_y_score_from_all_iter,
                                    names,
                                    graph_title='SVM\n' +
                                    task_name.capitalize() +
                                    " AUC on all iterations",
                                    save=True,
                                    folder=res_path)
            # ----------------------------------------! SAVE RESULTS -------------------------------------
            save_results(task_name, train_auc, test_auc, train_rho, test_rho,
                         confusion_matrix_average, confusion_matrix_acc,
                         train_accuracies, test_accuracies,
                         svm_y_score_from_all_iter, svm_y_pred_from_all_iter,
                         svm_y_test_from_all_iter, "SVM", res_path)

            all_svm_results.loc[len(all_svm_results)] = [
                clf.kernel, clf.C, clf.gamma, svm_train_roc_auc,
                np.mean(train_accuracies), svm_roc_auc,
                np.mean(test_accuracies)
            ]
            if BINARY:
                all_svm_results = all_svm_results.sort_values(by=['TEST-AUC'],
                                                              ascending=False)
            else:
                all_svm_results = all_svm_results.sort_values(by=['TEST-ACC'],
                                                              ascending=False)

            all_svm_results.to_csv(svm_results_file, index=False)

    # ----------------------------------------------! XGBOOST ------------------------------------------------
    if XGBOOST:
        if SVM:
            os.chdir("..")
        if not os.path.exists("XGBOOST"):
            os.makedirs("XGBOOST")

        os.chdir(os.path.join(os.path.abspath(os.path.curdir), "XGBOOST"))

        print("XGBOOST...")

        # update each classifier results in a mutual file
        xgb_results_file = Path("all_xgb_results.csv")
        if not xgb_results_file.exists():
            all_xgb_results = pd.DataFrame(columns=[
                'LR', 'MAX-DEPTH', 'N-ESTIMATORS', 'OBJECTIVE', 'GAMMA',
                'MIN-CHILD-WEIGHT', 'BOOSTER', 'TRAIN-AUC', 'TRAIN-ACC',
                'TEST-AUC', 'TEST-ACC'
            ])
            all_xgb_results.to_csv(xgb_results_file, index=False)

        optional_classifiers = []

        if check_all_parameters:
            """
            xgboost_tuned_parameters = {'learning_rate': [0.01, 0.05, 0.1],
                                         'objective': ['binary:logistic'],
                                         'n_estimators': [1000],
                                         'max_depth': range(3, 10),
                                         'min_child_weight': range(1, 12),
                                         'gamma': [0.0, 0.1, 0.2, 0.3, 1, 3, 6, 9]}
            """
            xgboost_tuned_parameters = {
                'learning_rate': [0.01, 0.05, 0.1],
                'objective': ['binary:logistic'],
                'n_estimators': [1000],
                'max_depth': [3, 5, 7, 9],
                'min_child_weight': [1, 5, 9],
                'gamma': [0.0, 0.5, 1, 5, 9]
            }
            # create all possible classifiers
            for max_depth in xgboost_tuned_parameters['max_depth']:
                for learning_rate in xgboost_tuned_parameters['learning_rate']:
                    for n_estimators in xgboost_tuned_parameters[
                            'n_estimators']:
                        for objective in xgboost_tuned_parameters['objective']:
                            for gamma in xgboost_tuned_parameters['gamma']:
                                for min_child_weight in xgboost_tuned_parameters[
                                        'min_child_weight']:
                                    clf = XGBClassifier(
                                        max_depth=max_depth,
                                        learning_rate=learning_rate,
                                        n_estimators=n_estimators,
                                        objective=objective,
                                        gamma=gamma,
                                        min_child_weight=min_child_weight,
                                        booster='gblinear')
                                    optional_classifiers.append(clf)
        else:  # use the specified classifier
            clf = XGBClassifier(
                max_depth=xgb_parameters['max_depth'],
                learning_rate=xgb_parameters['learning_rate'],
                n_estimators=xgb_parameters['n_estimators'],
                objective=xgb_parameters['objective'],
                gamma=xgb_parameters['gamma'],
                min_child_weight=xgb_parameters['min_child_weight'],
                booster='gblinear')
            optional_classifiers.append(clf)
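        # The nested loops above build the full Cartesian product of the grid
        # by hand; sklearn's ParameterGrid is a shorter equivalent (a sketch,
        # assuming the same xgboost_tuned_parameters dict):
        #
        #   from sklearn.model_selection import ParameterGrid
        #   optional_classifiers = [XGBClassifier(booster='gblinear', **params)
        #                           for params in ParameterGrid(xgboost_tuned_parameters)]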

        for clf in optional_classifiers:
            all_xgb_results = pd.read_csv(xgb_results_file)
            clf_folder_name = "d=" + str(clf.max_depth) + "_lr=" + str(clf.learning_rate) + "_e=" +\
                              str(clf.n_estimators) + "_o=" + clf.objective + "_g=" + str(clf.gamma) + "_m=" +\
                              str(clf.min_child_weight) + "_b=" + clf.booster
            if not os.path.exists(clf_folder_name):
                os.makedirs(clf_folder_name)

            # Split the data set
            X_trains, X_tests, y_trains, y_tests, xgb_coefs = [], [], [], [], []
            xgb_y_test_from_all_iter, xgb_y_score_from_all_iter = np.array(
                []), np.array([])
            xgb_y_pred_from_all_iter, xgb_class_report_from_all_iter = np.array(
                []), np.array([])
            xgb_coefs, bacteria_coeff_average, y_train_scores = [], [], []
            train_accuracies, test_accuracies, confusion_matrixes, y_train_preds, y_test_preds = [], [], [], [], []

            for i in range(cross_validation):
                X_train, X_test, y_train, y_test = train_test_split(
                    X, y, test_size=test_size, shuffle=True, stratify=y)
                X_trains.append(X_train)
                X_tests.append(X_test)
                y_trains.append(y_train)
                y_tests.append(y_test)

            for iter_num in range(cross_validation):
                print("------------------------------\niteration number " +
                      str(iter_num))

                classes_sum = [
                    np.sum(np.array(y_trains[iter_num]) == unique_class)
                    for unique_class in np.unique(np.array(y_trains[iter_num]))
                ]
                classes_ratio = [
                    1 - (a / sum(classes_sum)) for a in classes_sum
                ]
                weights = [
                    classes_ratio[a] for a in np.array(y_trains[iter_num])
                ]  # classes_ratio
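                # sklearn's 'balanced' heuristic gives a similar (not
                # identical) per-sample weighting in one call, e.g.:
                #   from sklearn.utils.class_weight import compute_sample_weight
                #   weights = compute_sample_weight('balanced', np.array(y_trains[iter_num]))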

                clf.fit(np.array(X_trains[iter_num]),
                        np.array(y_trains[iter_num]),
                        sample_weight=weights)
                y_score = clf.predict_proba(X_tests[iter_num])
                y_pred = clf.predict(X_tests[iter_num])
                y_test_preds.append(y_pred)
                xgb_class_report = classification_report(
                    y_tests[iter_num], y_pred)
                train_pred = clf.predict(X_trains[iter_num])
                train_score = clf.predict_proba(X_trains[iter_num])
                y_train_preds.append(train_pred)
                y_train_scores.append(train_score)

                train_accuracies.append(
                    accuracy_score(y_trains[iter_num],
                                   clf.predict(X_trains[iter_num])))
                test_accuracies.append(
                    accuracy_score(
                        y_tests[iter_num],
                        y_pred))  # same as - clf.score(X_test, y_test)
                confusion_matrixes.append(
                    confusion_matrix(y_tests[iter_num], y_pred))
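                # note: the per-iteration AUC below is computed from hard
                # labels, while the all-iterations AUC later uses y_score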

                if BINARY:
                    _, _, _, xgb_roc_auc = roc_auc(y_tests[iter_num],
                                                   y_pred,
                                                   visualize=True,
                                                   graph_title='XGB\n' +
                                                   str(iter_num),
                                                   folder=task_name)
                else:
                    xgb_roc_auc = 0

                # save the y_test and y_score
                xgb_y_test_from_all_iter = np.append(xgb_y_test_from_all_iter,
                                                     y_tests[iter_num])
                xgb_y_pred_from_all_iter = np.append(xgb_y_pred_from_all_iter,
                                                     y_pred)
                xgb_class_report_from_all_iter = np.append(
                    xgb_class_report_from_all_iter, xgb_class_report)
                if xgb_y_score_from_all_iter.size > 0:
                    xgb_y_score_from_all_iter = np.concatenate(
                        (xgb_y_score_from_all_iter, y_score), axis=0)
                else:
                    xgb_y_score_from_all_iter = y_score
                # --------------------------------------! PLOT CORRELATION - XGBOOST -------------------------------
                # if create_coeff_plots:
                #     num_of_classes, bacterias = xgb_calc_bacteria_coeff_average(data_loader, clf, xgb_coefs,
                #                                                             bacteria_coeff_average)
            # if create_coeff_plots:
            #     plot_bacteria_coeff_average(bacteria_coeff_average, num_of_classes, data_loader, title, task_name,
            #                                 bacterias, cross_validation, "XGB")

            all_y_train, all_predictions_train, all_test_real_tags, all_test_pred_tags, train_auc, test_auc, train_rho, \
            test_rho = calc_auc_on_joined_results(cross_validation, y_trains, y_train_preds, y_tests, y_test_preds)

            names = data_loader.get_confusin_matrix_names()
            confusion_matrix_average, confusion_matrix_acc = \
                edit_confusion_matrix(title, confusion_matrixes, data_loader, "XGB", names, BINARY=BINARY)

            if BINARY:
                _, _, _, xgb_roc_auc = roc_auc(
                    xgb_y_test_from_all_iter.astype(int),
                    xgb_y_score_from_all_iter[:, 1],
                    visualize=True,
                    graph_title='XGB\n' + task_name.capitalize() +
                    " AUC on all iterations",
                    save=True,
                    folder=clf_folder_name)
                res_path = os.path.join(clf_folder_name,
                                        str(round(xgb_roc_auc, 5)))

            else:
                xgb_roc_auc = 0
                res_path = clf_folder_name

            if not os.path.exists(res_path):
                os.mkdir(res_path)

            # if allow_printing:
            print_confusion_matrix(confusion_matrix_average, names,
                                   confusion_matrix_acc, "XGB", task_name,
                                   res_path)

            t = np.array(y_trains).astype(int)
            t = t.flatten()
            s = np.array(y_train_scores)
            s = s.flatten()
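            # s alternates predict_proba's two columns after flattening, so
            # s[::2] picks column 0 (s[1::2] would be the positive class)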
            c = s[::2]

            if BINARY:
                _, _, _, xgb_train_roc_auc = roc_auc(t,
                                                     c,
                                                     visualize=False,
                                                     graph_title="",
                                                     save=False,
                                                     folder=res_path)
            else:
                xgb_train_roc_auc = 0
                multi_class_roc_auc(xgb_y_test_from_all_iter.astype(int),
                                    xgb_y_score_from_all_iter,
                                    names,
                                    graph_title='XGB\n' +
                                    task_name.capitalize() +
                                    " AUC on all iterations",
                                    save=True,
                                    folder=res_path)
            # ----------------------------------------! SAVE RESULTS -------------------------------------

            save_results(task_name, train_auc, test_auc, train_rho, test_rho,
                         confusion_matrix_average, confusion_matrix_acc,
                         train_accuracies, test_accuracies,
                         xgb_y_score_from_all_iter, xgb_y_pred_from_all_iter,
                         xgb_y_test_from_all_iter, "XGB", res_path)

            all_xgb_results.loc[len(all_xgb_results)] = [
                clf.learning_rate, clf.max_depth, clf.n_estimators,
                clf.objective, clf.gamma, clf.min_child_weight, clf.booster,
                xgb_train_roc_auc,
                np.mean(train_accuracies), xgb_roc_auc,
                np.mean(test_accuracies)
            ]
            if BINARY:
                all_xgb_results = all_xgb_results.sort_values(by=['TEST-AUC'],
                                                              ascending=False)
            else:
                all_xgb_results = all_xgb_results.sort_values(by=['TEST-ACC'],
                                                              ascending=False)

            all_xgb_results.to_csv(xgb_results_file, index=False)

    # ----------------------------------------------! NN ------------------------------------------------

    if NN:
        if SVM or XGBOOST:
            os.chdir("..")
        if not os.path.exists("NN"):
            os.makedirs("NN")
        os.chdir(os.path.join(os.path.abspath(os.path.curdir), "NN"))  # mirror the SVM/XGBOOST branches

        param_dict = {
            "lr": [0.005],
            "test_size": [0.2],
            "batch_size": [16],
            "shuffle": [True],
            "num_workers": [4],
            "epochs": [100]
        }

        for lr in param_dict['lr']:
            for test_size in param_dict['test_size']:
                for batch_size in param_dict['batch_size']:
                    for shuffle in param_dict['shuffle']:
                        for num_workers in param_dict['num_workers']:
                            for epochs in param_dict['epochs']:
                                clf_folder_name = "lr=" + str(lr) + "_t=" + str(test_size) + "_bs=" +\
                                                  str(batch_size) + "_s=" + str(shuffle) + "_nw=" +\
                                                  str(num_workers) + "_e=" + str(epochs)
                                if not os.path.exists(clf_folder_name):
                                    os.makedirs(clf_folder_name)
                                nn_main(X, y, title, clf_folder_name, 46, 200,
                                        100, 1, lr, test_size, batch_size,
                                        shuffle, num_workers, epochs)
    os.chdir("../..")
    biter.append(best_iter)
    print "---log_loss: %0.6f\n" % ll
    print "---best_iter: %d\n" % best_iter
    gc.collect()

best_i = int(np.mean(biter)) + 50  # n_estimators expects an int
# train on whole data
gbm = XGBClassifier(max_depth=8,
                    learning_rate=0.01,
                    n_estimators=best_i,
                    subsample=0.9,
                    colsample_bytree=0.45,
                    objective="binary:logistic",
                    silent=False,
                    min_child_weight=1,
                    nthread=-1)

gbm.fit(train_processed, target, eval_metric="logloss",
        eval_set=[(train_processed, target)],
        verbose=20)

tid = test_processed["ID"].copy()
assert (len(tid) == 114393), "test length does not match!"
test_processed.drop(["ID", "target", "train_flag"], axis=1, inplace=True)
tpreds = gbm.predict_proba(test_processed)[:, 1]
sub = pd.DataFrame({"ID": tid, "PredictedProb": tpreds})
submission_file = os.path.join(submission_dir, "xgb_denormalized.csv")
sub.to_csv(submission_file, index=False)

end_time = datetime.now()
print 'elapsed time: {}'.format(end_time - start_time)

# a further train/validation split produces X_trainVal/X_testVal for eval_set
X_train, X_test, y_train, y_test = train_test_split(X, y,
                                                    test_size=1 / 3,
                                                    random_state=0)

for subsample in np.arange(0.5, 1, 0.05):
    i = i + 1
    xgb = XGBClassifier(subsample=subsample)
    xgb.fit(X_train,
            y_train,
            early_stopping_rounds=100,
            eval_metric="auc",
            eval_set=[(X_trainVal, y_trainVal), (X_testVal, y_testVal)],
            verbose=100)
    y_pred_rm_xgb = xgb.predict(X_test)
    # get ROC/AUC info on the held-out test set
    Y_score = xgb.predict_proba(X_test)[:, 1]
    fpr, tpr, _ = roc_curve(y_test, Y_score)
    auc_fit = auc(fpr, tpr)

    roc_auc_df_subs.loc[i] = [subsample, auc_fit]

# The best result comes from subsample = 1
roc_auc_df_subs = roc_auc_df_subs.drop_duplicates()
plt.style.use('seaborn-pastel')
plt.plot(roc_auc_df_subs.subsample, roc_auc_df_subs.auc_fit)
plt.title('AUCROC vs Subsample')
plt.xlabel('Subsample value')
plt.ylabel('AUCROC')
plt.show()
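
The same sweep can be phrased as a one-parameter grid search; a minimal sketch
assuming the X_train and y_train from the split above (note it scores on CV
folds rather than on the fixed test split used in the loop):

from sklearn.model_selection import GridSearchCV
from xgboost import XGBClassifier
import numpy as np

grid = GridSearchCV(XGBClassifier(),
                    {'subsample': np.arange(0.5, 1, 0.05)},
                    scoring='roc_auc', cv=3)
grid.fit(X_train, y_train)
print(grid.best_params_, grid.best_score_)
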
class XGBoostModel:
    def __init__(self, use_rfc=True):
        self.use_rfc = use_rfc
        if self.use_rfc:
            # Instantiate Random Forest Classifier
            self.rfc = RFCModel()
            self.rfc.unpickle()

    def load_train_data(self):
        self.df, y, _ = clean_df('data/data.json', training=True)

        if self.use_rfc:
            # Include results from random forest classifier as new column
            rfc_probs = self.rfc.predict_proba_all()
            self.df['rfc_proba'] = rfc_probs

        X = self.df.values

        self.features = self.df.columns
        self.X_train, self.X_test, self.y_train, self.y_test = train_test_split(
            X, y, test_size=0.20, stratify=y, random_state=42)

    def load_test_data(self):
        self.df, _, oid = clean_df('data/data_point.json', training=False)

        if self.use_rfc:
            # Include results from random forest classifier as new column
            rfc_probs = self.rfc.predict_proba('data/data_point.json')
            self.df['rfc_proba'] = rfc_probs

        return self.df.values, oid

    def load_one(self, one_json):
        # with open('one.json', 'w') as f:
        #     temp = '[' + one_json + ']'
        #     f.write(temp)

        self.df, _, oid = clean_df('[' + one_json + ']', training=False)

        if self.use_rfc:
            # Include results from random forest classifier as new column
            rfc_probs = self.rfc.predict_proba('data/data_point.json')
            self.df['rfc_proba'] = rfc_probs

        return self.df.values, oid

    def fit(self):
        self.model = XGBClassifier(max_depth=8,
                                   # reg_alpha=.8,
                                   n_estimators=200,
                                   scale_pos_weight=10.13,
                                   learning_rate=0.1)

        self.model.fit(self.X_train, self.y_train)

    @property
    def feature_importances_(self):
        #I couldn't call the master class, so just copy-n-pasted
        #See https://github.com/dmlc/xgboost/commit/dd477ac903eb6f658d6fb2984763c3f8a4516389#diff-2c197a11c1b576e821f5942be9eab74c
        b = self.model.booster()
        fs = b.get_fscore()
        all_features = [fs.get(f, 0.) for f in b.feature_names]
        all_features = np.array(all_features, dtype=np.float32)
        return all_features / all_features.sum()
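
    # Note: newer xgboost versions expose feature_importances_ directly on
    # XGBClassifier and rename the Booster accessor to get_booster(); the
    # property above reproduces that behaviour for the older booster() API.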

    def plot_features(self, save_img_dir=None, img_name_prefix='', ext='svg'):
        '''
        Use ext='svg' for web!
        Pass save_img_dir to save images; it takes NO trailing slash,
        e.g. 'static/images'.
        To keep multiple saved images, pass a prefix string via
        img_name_prefix; it is prepended to the image file name.

        '''

        # this is needed to fix label clipping in saved files
        from matplotlib import rcParams
        rcParams.update({'figure.autolayout': True})

        # severely modified from https://gist.github.com/light94/6c42df29f3232ae31e52
        b = self.model.booster()
        fs = b.get_fscore()
        #print('feature...')
        #print(b.feature_names)
        #all_features = {f:fs.get(f, 0.) for f in b.feature_names}
        #need to add real feature names
        all_features = {
            self.features[i]: float(fs.get('f' + str(i), 0.))
            for i in range(len(b.feature_names))
        }
        importance = sorted(all_features.items(), key=itemgetter(1))

        ff = pd.DataFrame(importance, columns=['feature', 'fscore'])
        ff['fscore'] = ff['fscore'] / ff['fscore'].sum()

        #"plot 1"
        ax = ff.fscore.plot(xticks=ff.index, rot=65)
        ax.set_xticklabels(ff.feature)
        plt.title('XGBoost F-scores by feature')

        if save_img_dir is not None:
            plt.savefig('{}/{}feature_fscores.{}'.format(
                save_img_dir, img_name_prefix, ext))
        plt.show()

        #"plot 2"
        ff.plot(kind='barh',
                x='feature',
                y='fscore',
                legend=False,
                figsize=(6, 10))
        plt.title('XGBoost Feature Importance')
        plt.xlabel('relative importance')
        if save_img_dir is not None:
            plt.savefig('{}/{}features_barh.{}'.format(save_img_dir,
                                                       img_name_prefix, ext))
        plt.show()
        plt.close()

    def pickle(self):
        _pickle(self.model, 'data/XGBoostModel.pkl')

    def unpickle(self):
        self.model = _unpickle('data/XGBoostModel.pkl')

    def score(self):
        y_pred = self.model.predict(self.X_test)
        probs = self.model.predict_proba(self.X_test)[:, 1]
        accuracy = accuracy_score(self.y_test, y_pred)
        f1 = f1_score(self.y_test, y_pred)
        print("Accuracy: %.2f%%" % (accuracy * 100.0))
        print("f1: %.2f" % f1)
        print('Confusion matrix')
        print(np.array([['TN', 'FP'], ['FN', 'TP']]))
        print(confusion_matrix(self.y_test, y_pred))

    def predict(self, X):
        return self.model.predict(X)

    def predict_proba(self, X):
        prob = self.model.predict_proba(X)
        return prob[:, 1]
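
A minimal usage sketch for the class above, assuming the project's helpers
clean_df, RFCModel, _pickle and _unpickle are importable and data/data.json
exists:

model = XGBoostModel(use_rfc=False)  # skip the random-forest feature column
model.load_train_data()
model.fit()
model.score()    # prints accuracy, f1 and the confusion matrix
model.pickle()   # persists the model to data/XGBoostModel.pkl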
Example #30
0
    thresh = cm.max() / 2.
    for i, j in itertools.product(range(cm.shape[0]), range(cm.shape[1])):
        plt.text(j, i, format(cm[i, j], fmt),
                 horizontalalignment="center",
                 color="white" if cm[i, j] > thresh else "black")

    plt.tight_layout()
    plt.ylabel('True label')
    plt.xlabel('Predicted label')

# Compute confusion matrix
cnf_matrix = confusion_matrix(y_valid, predictions)
np.set_printoptions(precision=2)

# Plot non-normalized confusion matrix
plt.figure()
plot_confusion_matrix(cnf_matrix, classes=le.classes_,
                      title='Confusion matrix, without normalization')
plt.show()


predictions_test = clfr.predict_proba(test_X)

predictions_test = pd.DataFrame(predictions_test).reset_index(drop=True)

submission_df = pd.concat([test_df['id'],predictions_test], axis=1).reset_index(drop=True)

submission_df = submission_df.rename(columns = le_name_mapping)

submission_df.to_csv('xgb1.csv', index=False)
Example #31
0
                      colsample_bytree=1,
                      subsample=1)


# In[20]:


model_A.fit(X_train_A, y_train_A)
model_B.fit(X_train_B, y_train_B)
model_C.fit(X_train_C, y_train_C)


# In[21]:


a_preds = model_A.predict_proba(X_test_A)
b_preds = model_B.predict_proba(X_test_B)
c_preds = model_C.predict_proba(X_test_C)


# In[ ]:


def make_country_sub(preds, test_feat, country):
    # make sure we code the country correctly
    country_codes = ['A', 'B', 'C']
    
    # get just the poor probabilities
    country_sub = pd.DataFrame(data=preds[:, 1],  # proba p=1
                               columns=['poor'], 
                               index=test_feat.index)
Example #32
0
nikkei_pred_lag = model_lag.predict(poly_nyse_test_lag)

# In[29]:

plt.scatter(return_datas_test['NYSE'], return_datas_test['NIKKEI'])
plt.plot(nyse_new[:, 0], nikkei_pred, 'r')
plt.plot(nyse_test_new[:, 0], nikkei_test_pred, 'g')
plt.legend(['Predicted line', 'Test data', 'Observed data'])
plt.show()

# In[ ]:

from xgboost import XGBClassifier
# XGBoost is an implementation of gradient boosted decision trees
xgmodel = XGBClassifier(max_depth=6,
                        learning_rate=0.1,
                        n_estimators=100,
                        n_jobs=16,
                        scale_pos_weight=4,
                        missing=np.nan,
                        gamma=16,
                        eval_metric="auc",
                        reg_lambda=40,
                        reg_alpha=40)
xgmodel.fit(nikkei_train, nyse_train)

# In[ ]:

from sklearn.metrics import roc_auc_score
# predict_proba needs the feature matrix; reuse the training data
y_train_predicted = xgmodel.predict_proba(nikkei_train)[:, 1]
# L1 stacking would be improved by actually doing another proper kfold

# gbm w/ same params
std = StandardScaler()
dataset_blend_train = std.fit_transform(dataset_blend_train)
dataset_blend_test = std.transform(dataset_blend_test)
X_train_l1 = np.hstack([X_train, dataset_blend_train])
X_test_l1 = np.hstack([X_test, dataset_blend_test])

print 'GBM L1'
gbm_l1 = XGBClassifier(seed=0, learning_rate=gbm_learning_rate, n_estimators=gbm_n_estimators,
                       min_child_weight=gbm_min_child_weight, max_depth=gbm_max_depth,
                       colsample_bytree=gbm_colsample_bytree, subsample=gbm_subsample)
gbm_l1.fit(X_train_l1, y_train)
print 'GBM L1 AUC: %f' % roc_auc_score(y_train, gbm_l1.predict_proba(X_train_l1)[:, -1])

# nn w/ same params
nn_l1 = Sequential()
nn_l1.add(Dense(32, input_shape=(X_train_l1.shape[1],), activation='sigmoid'))
nn_l1.add(Dropout(0.25))
nn_l1.add(Dense(32, activation='sigmoid'))
nn_l1.add(Dropout(0.25))
nn_l1.add(Dense(1, activation='sigmoid'))

opt = SGD(lr=nn_sgd_lr, decay=nn_sgd_decay, momentum=nn_sgd_momentum, nesterov=True)
nn_l1.compile(loss='binary_crossentropy', optimizer=opt)

print 'NN L1'
nn_l1.fit(X_train_l1, y_train, verbose=0, nb_epoch=100)
print 'NN L1 AUC: %f' % roc_auc_score(y_train, nn_l1.predict_proba(X_train_l1)[:, -1])
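
As the comment at the top of this fragment notes, the level-1 features would
ideally be built out-of-fold; a minimal OOF sketch, assuming X_train and
y_train are numpy arrays and XGBClassifier is already imported:

from sklearn.model_selection import KFold
import numpy as np

oof = np.zeros(len(X_train))
for tr_idx, va_idx in KFold(n_splits=5, shuffle=True, random_state=0).split(X_train):
    base = XGBClassifier(seed=0)
    base.fit(X_train[tr_idx], y_train[tr_idx])
    oof[va_idx] = base.predict_proba(X_train[va_idx])[:, 1]
# oof would then replace the in-sample columns of dataset_blend_train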
#model = linear_model.LogisticRegression(C=1e5)
model.fit(X_train, y_train)

pred_type = input('Predict on avr or ind? ')
if pred_type == 'ind':

    y_pred = model.predict(X_test)
    predictions = [round(value) for value in y_pred]
    # evaluate predictions
    accuracy = accuracy_score(y_test, predictions)
    print("Accuracy: %.2f%%" % (accuracy * 100.0))
    print("###########################################")
    print("###########################################")

    from sklearn.metrics import roc_curve, auc
    probs = model.predict_proba(X_test)
    preds = probs[:, 1]  # column 1 holds the positive-class probability
    fpr, tpr, threshold = roc_curve(y_test, preds)
    roc_auc = auc(fpr, tpr)

    # method I: plt
    import matplotlib.pyplot as plt
    plt.title('Receiver Operating Characteristic')
    plt.plot(fpr, tpr, 'b', label='AUC = %0.2f' % roc_auc)
    plt.legend(loc='lower right')
    plt.plot([0, 1], [0, 1], 'r--')
    plt.xlim([0, 1])
    plt.ylim([0, 1])
    plt.ylabel('True Positive Rate')
    plt.xlabel('False Positive Rate')
    plt.ion()
#                    'xgb__learning_rate': (0.01, 0.03, 0.05),
#                    'xgb__colsample_bytree': (0.8, 0.85)
#                 }
#
#    grid_search = GridSearchCV(pipeline, parameters, n_jobs=4, verbose=1, scoring='roc_auc', cv=3)
#    grid_search.fit(scaled_X_train, y_train)
#    print 'Best score: %.3f'%grid_search.best_score_
#    print 'Best parameters set:'
#    best_parameters = grid_search.best_estimator_.get_params()
#    for param_name in sorted(parameters.keys()):
#        print '\t%s: %r' %(param_name, best_parameters[param_name])
#
#    predictions = grid_search.predict(scaled_X_test)
#    print classification_report(y_test, predictions)
#
#    for param_name in parameters.keys():
#        xgb_args[param_name[5:]] = best_parameters[param_name]
#
#    print 'xgb_args:', xgb_args

    final_scaler = preprocessing.StandardScaler()
    scaled_final_train_df = final_scaler.fit_transform(final_train_df)
    scaled_final_test_df = final_scaler.transform(final_test_df)

    classifier = XGBClassifier(**xgb_args)
    classifier.fit(scaled_final_train_df, final_targets_df)
    output = classifier.predict_proba(scaled_final_test_df)[:,1]

    S = Series(output, index=Ids)
    S.to_csv('Santander_xgboost_results_1.csv', header=True, index_label=['ID', 'TARGET'])
xgb = XGBClassifier()
xgb.fit(X_train, y_train)

print(xgb.score(X_test, y_test))
print(log_loss(y_test, xgb.predict_proba(X_test)))  # log_loss needs probabilities
print(f1_score(y_test, xgb.predict(X_test)))

importance2 = xgb.feature_importances_

for i, v in enumerate(importance2):
    print('Feature: %0d, Score: %.5f' % (i, v))


import matplotlib.pyplot as plt

plt.bar([x for x in range(len(importance2))], importance2)
plt.show()
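
xgboost also ships a built-in importance chart that produces a similar figure;
a sketch reusing the fitted xgb model above:

from xgboost import plot_importance

plot_importance(xgb)
plt.show()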


XGB = XGBClassifier()
XGB.fit(X,y)
y_pred = XGB.predict_proba(df_test)

Y = pd.DataFrame(y_pred)

Y.to_excel("output1.xlsx",index=False)



Example #37
0
max_score = 0
for i in range(5,10):
    model = XGBClassifier(max_depth=i)
    kf = KFold(len(y),n_folds=5,random_state=42, shuffle=True)
    #Using accuracy because of final table using it measure
    score = cross_val_score(model, X, y, cv=kf, scoring='accuracy').mean()
    print('Cross validation score =', score)
    print('max_depth =', i)
    if score > max_score:
        max_score = score
        max_n = i
print('Max Cross validation score =',max_score)
print('Max max_depth =', max_n)
model = XGBClassifier(max_depth=max_n)
model.fit(X,y)
prediction = model.predict_proba(test_pred)

#Just to see what features are important and what are not
print(model.feature_importances_)

#Step 3. Save data to file.
submission = pd.DataFrame({
    "ID": test["ID"],
    "Adoption": prediction[:,0],
    "Died": prediction[:,1],
    "Euthanasia": prediction[:,2],
    "Return_to_owner": prediction[:,3],
    "Transfer": prediction[:,4]

})
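
The KFold call above uses the pre-0.18 sklearn API; with sklearn.model_selection
the equivalent cross-validation reads (a sketch):

from sklearn.model_selection import KFold, cross_val_score

kf = KFold(n_splits=5, shuffle=True, random_state=42)
score = cross_val_score(model, X, y, cv=kf, scoring='accuracy').mean()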
Example #38
0
CV_accuracy = accuracies.mean()
CV_std = accuracies.std()

# Applying Grid Search to find the best model and the best parameters
from sklearn.model_selection import GridSearchCV
parameters = [{'reg_lambda' : [0.1, 0.5, 1, 2, 5, 10, 30, 50],
               'n_estimators' : [50, 75, 100, 300, 301],
               'learning_rate' : [0.01, 0.02, 0.05, 0.1, 0.5, 1],
               'max_depth' : [3, 4, 5, 6, 8, 10],
               'subsample' : [0.1, 0.2, 0.5, 0.75, 0.85, 1]}
             ]

grid_search = GridSearchCV(estimator = classifier1, 
                           param_grid = parameters,
                           scoring = "neg_log_loss",
                           cv = 10, n_jobs = -1)
grid_search = grid_search.fit(X_train, y_train)
best_metric = grid_search.best_score_
best_params = grid_search.best_params_
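
# NOTE: best_params is not fed back into classifier1 below; to predict with
# the tuned model one would typically use the refit best estimator, e.g.
#   y_pred_best = grid_search.best_estimator_.predict_proba(X_test)[:, 1]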

# Predicting the Test Set results
y_pred1 = classifier1.predict_proba(X_test)[:,1]
y_pred2 = classifier2.predict_proba(X_test)[:,1]
y_pred_NN = classifier_NN.predict(X_test)

# Creating predictions from ensemble models
ensemble1 = ((0.25*y_pred1) + (0.75*y_pred_NN).T).T

# Writing the results to a csv file
np.savetxt('results.csv', ensemble1)
Example #39
0
                 Xg_train,
                 num_boost_round=clf.get_params()['n_estimators'],
                 nfold=5,
                 show_progress=True,
                 early_stopping_rounds=100)
clf.set_params(n_estimators=cvresult.shape[0])
clf.fit(X_train, y_train)
best_outcome_params = clf.get_params()
best_outcome_score = cvresult.min()

try:
    # predict with the fitted grid search if one exists
    y_pred = grid.predict_proba(X_test)
except NameError:
    # otherwise fall back to the plain classifier
    y_pred = clf.predict_proba(X_test)


# Create a data frame
column_names = possible_outcomes[:]
idx = pd.Int64Index(np.arange(1,11457, dtype='int64'))
idx.rename('ID', inplace=True)
df = pd.DataFrame(index = idx, data=y_pred, columns=column_names)

# write it to file, timestamp it
import time, datetime
ts = time.time()
submission_time_stamp = datetime.datetime.fromtimestamp(ts).strftime('%Y.%m.%d.%H.%M.%S')
df.to_csv('./Data/xgb_submission_'+submission_time_stamp+'.csv',header=True)

# save parameters to file:
Example #40
0
            train.loc[train_series.isnull(), train_name] = -9999 #train_series.mean()
        #and Test
        tmp_len = len(test[test_series.isnull()])
        if tmp_len>0:
            test.loc[test_series.isnull(), test_name] = -9999 #train_series.mean()  #TODO

X_train = train
X_test = test

extc = XGBClassifier(max_depth=10, colsample_bytree=0.8, learning_rate=0.02,
                     n_estimators=500, nthread=-1)
# leftover RandomForest params: max_features=50, criterion='entropy',
# min_samples_split=4, max_depth=50, min_samples_leaf=4
y_test=pd.read_csv('good/xgb4.csv')['real'].values

extc.fit(X_train,target,eval_metric="logloss",eval_set=[(X_test, y_test)]) 

print('Predict...')
y_pred = extc.predict_proba(X_test)
#print y_pred

pd.DataFrame({"ID": id_test, "PredictedProb": y_pred[:,1]}).to_csv('mycv1.csv',index=False)
y=pd.read_csv('good/xgb4.csv')['real'].values
yp=y_pred[:,1]
score=str(llfun(y,yp))[2:]
print sys.argv[0],score
import subprocess
cmd='cp mycv1.csv vabackup/mycv%s.csv'%score
subprocess.call(cmd,shell=True)
cmd='cp mycv.py vabackup/mycv%s.py'%score
subprocess.call(cmd,shell=True)

Example #41
0
model_cnt = 0
XGBmodels = []
seeds = [0, 1000]
for one in seeds:
    for max_depth in [3]:
        for learning_rate in [0.05]:
            model_cnt += 1
            model = XGBClassifier(max_depth=max_depth, learning_rate=learning_rate, n_estimators=500, silent=True, \
                      objective='binary:logistic', nthread=-1, gamma=0, min_child_weight=1, \
                      max_delta_step=0, subsample=1, colsample_bytree=0.8, colsample_bylevel=0.8, \
                      reg_alpha=0, reg_lambda=1, scale_pos_weight=1, base_score=0.5, seed=one, missing=None)
            XGBmodels.append([model, 50, 5, 'xgb'+str(model_cnt)])
            model_cnt += 1
            XGBmodels.append([model, 20, 5, 'xgb'+str(model_cnt)])
            layer_2_valid['xgb_fe'] += model.predict_proba(valid_data[cols].as_matrix())[:, 0]
            
model_cnt += 1
model = RandomForestClassifier(n_estimators=500, criterion='entropy', max_depth=None, \
           min_samples_split=2, min_samples_leaf=1, min_weight_fraction_leaf=0.0, \
           max_features=0.2, max_leaf_nodes=None, min_impurity_split=1e-07, \
           bootstrap=True, oob_score=False, n_jobs=30, random_state=None, verbose=0, \
           warm_start=False, class_weight=None)
XGBmodels.append([model, 50, 5, 'rf'+str(model_cnt)])

LRmodels = []
seeds = [0]
Cs = [0.15]
tols = [0.0001]

model_cnt = 0
Example #42
0
def main():
    args = parse_args()
    config = parse_config(args.config_file)
    if config is None:
        print('No configuration file is defined. '
              'Define one with `--config-file`.')
        sys.exit(1)

    if args.plot_dir is not None:
        if not os.path.isdir(args.plot_dir):
            os.mkdir(args.plot_dir)

    index_cols = config['index_features']
    event_cols = config['unique_event_features']

    # this will be the training dataframe
    if args.input_file:
        merged_training_df = read_root(args.input_file, stop=args.stop)
        merged_training_df.set_index(index_cols, inplace=True)
        # duplicates may have ended up in the root file
        len_before = len(merged_training_df)
        merged_training_df.drop_duplicates(inplace=True)
        print(f'Dropped {(1 - len(merged_training_df) / len_before) * 100:.5f}%'
              ' duplicated entries in dataframe')
    else:
        merged_training_df = read_full_files(args, config)

    # in every case, define a proper target
    merged_training_df['target'] = merged_training_df.eval(config['target_eval'])

    # sort for performance
    merged_training_df.sort_index(inplace=True)

    print_avg_tagging_info(merged_training_df, config)

    mva_features = config['mva_features']
    total_event_number = get_event_number(config)
    selected_event_number = (merged_training_df.groupby(
        event_cols).SigYield_sw.head(1).sum())

    # build BDT model and train the classifier nBootstrap x 3 times
    xgb_kwargs = config['xgb_kwargs']
    n_jobs = config['n_jobs']

    sorting_feature = config['sorting_feature']

    bootstrap_roc_aucs = []
    bootstrap_scores = []
    bootstrap_d2s = []
    bootstrap_roc_curves = []
    bootstrap_calibration_params = []
    nBootstrap = args.n_bootstrap or config['n_bootstrap']
    print('Starting bootstrapping.')
    pbar = tqdm(total=nBootstrap * 6)
    for _ in range(nBootstrap):
        # yield 3-fold split for CV
        df_sets = [merged_training_df.iloc[indices]
                   for indices in NSplit(merged_training_df)]
        # try to compensate for slow subset creation
        pbar.update(3)

        for i in range(3):
            df1, df2, df3 = (df_sets[i % 3],
                             df_sets[(i + 1) % 3],
                             df_sets[(i + 2) % 3])
            model = XGBClassifier(nthread=n_jobs, **xgb_kwargs)
            model.fit(df1[mva_features], df1.target,
                      sample_weight=df1.SigYield_sw)
            roc1 = roc_auc_score(df1.target,
                                 model.predict_proba(df1[mva_features])[:, 1])

            probas = model.predict_proba(df2[mva_features])[:, 1]
            roc2 = roc_auc_score(df2.target, probas)

            # calibrate
            calibrator = PolynomialLogisticRegression(power=3,
                                                      solver='lbfgs',
                                                      n_jobs=n_jobs)
            calibrator.fit(probas.reshape(-1, 1), df2.target,
                           sample_weight=df2.SigYield_sw)
            bootstrap_calibration_params.append(calibrator.lr.coef_)

            probas = model.predict_proba(df3[mva_features])[:, 1]
            calib_probas = calibrator.predict_proba(probas)[:, 1]
            roc3 = roc_auc_score(df3.target, calib_probas)

            # concatenating here, since df3 is a view on the main df and will
            # throw warnings when adding any columns to it
            df3 = pd.concat([
                    df3.reset_index(),
                    pd.Series(calib_probas, name='calib_probas'),
                ], axis=1)
            best_indices = df3.groupby(event_cols)[sorting_feature].idxmax()
            best_particles = df3.loc[best_indices]

            bootstrap_roc_aucs.append([roc1, roc2, roc3])
            score = tagging_power_score(best_particles, config,
                efficiency=selected_event_number/total_event_number,
                etas='calib_probas')
            if args.plot_dir is not None:
                fpr, tpr = roc_curve(best_particles.target,
                    best_particles.calib_probas,
                    sample_weight=best_particles.SigYield_sw)[:2]
                bootstrap_roc_curves.append([fpr, tpr])

            bootstrap_scores.append(score)
            bootstrap_d2s.append(d2_score(best_particles.calib_probas,
                    sample_weight=best_particles.SigYield_sw))
            pbar.update(1)
    pbar.close()

    # pickle bootstrap results
    with open('crossval_training_dump.pkl', 'bw') as f:
        pickle.dump(dict(
            roc_curves=bootstrap_roc_curves,
            tagging_power_scores=bootstrap_scores,
            d2_scores=bootstrap_d2s,
            ), f)

    # plot roc curve on request
    if args.plot_dir is not None:
        print('Plotting ROC curves...', end=' ')
        curve_points = np.array(bootstrap_roc_curves)

        # hacky test for correct roc curve shapes
        min_roc_shape = np.min([len(a[0]) for a in curve_points])
        fprs, tprs = [], []
        for fpr, tpr in curve_points:
            fprs.append(fpr[:min_roc_shape])
            tprs.append(tpr[:min_roc_shape])
        fprs = np.array(fprs)
        tprs = np.array(tprs)
        plt.style.use('ggplot')
        plt.rcParams['figure.figsize'] = (6, 6)
        plt.rcParams['font.size'] = 12
        plt.plot([0, 1], '--', label='random')
        plt.plot(fprs.mean(axis=0), tprs.mean(axis=0), label='Mean ROC curve')
        plt.fill_between(fprs.mean(axis=0),
                         tprs.mean(axis=0) - tprs.std(axis=0),
                         tprs.mean(axis=0) + tprs.std(axis=0),
                         label=r'$\pm 1 \sigma$ area',
                         alpha=0.4)
        plt.xlim(-0.05, 1.05)
        plt.ylim(0, 1.05)
        plt.text(1, 0.05, 'LHCb unofficial',
                 verticalalignment='bottom', horizontalalignment='right')
        plt.legend(loc='best')
        plt.xlabel('false positive rate')
        plt.ylabel('true positive rate')
        filename = os.path.join(args.plot_dir, 'ROC-curves.pdf')
        plt.savefig(filename, bbox_inches='tight')
        print('done.')

    d2 = 100 * ufloat(np.mean(bootstrap_d2s), np.std(bootstrap_d2s))
    eff = 100 * ufloat(np.mean(noms(bootstrap_scores)),
                       np.std(noms(bootstrap_scores)))
    print(dedent(f"""
          CalibrationParams:
          {np.array(bootstrap_calibration_params).mean(axis=0)}
          {np.array(bootstrap_calibration_params).std(axis=0)}
          ROC AUCs:
          {np.array(bootstrap_roc_aucs).mean(axis=0)}
          {np.array(bootstrap_roc_aucs).std(axis=0)}
          Final {nBootstrap}-fold bootstrap performance
             D2 = {d2}%
          ε_eff = {eff}%"""))
                               n_jobs=-1,
                               cv=kfold)
    result = grid_search.fit(np.array(features), labels)
    # summarize results
    print("Best: %f using %s" % (result.best_score_, result.best_params_))
    means, stdevs = [], []
    for params, mean_score, scores in result.grid_scores_:
        stdev = scores.std()
        means.append(mean_score)
        stdevs.append(stdev)
        print("%f (%f) with: %r" % (mean_score, stdev, params))

    ### final training
    #features,labels = get_training_data()
    model = XGBClassifier(learning_rate=0.1,
                          max_depth=8,
                          min_child_weight=200,
                          gamma=0,
                          subsample=0.8,
                          colsample_bytree=0.8,
                          objective='binary:logistic',
                          scale_pos_weight=1.0,
                          seed=27)
    model.fit(np.array(features), labels)

    ### final prediction
    ids, test_x = get_testing_data()
    predicted_y = model.predict_proba(np.array(test_x))
    predicted_is = predicted_y[:, 1]
    write_results(ids, predicted_is, fname='rs4.csv')
Example #44
0
          sample_weight=w_train,                                # instance weights
          eval_set = [(x_train,y_train), (x_val,y_val)],        # a list of (X,y) tuple pairs to use as validation sets ---> validation_0=train, validation_1=validation
          sample_weight_eval_set = [w_train, w_val],            # list of arrays storing instances weights for the i-th validation set
          eval_metric = ['auc', 'error'],                       # list of parameters under eval_metric: https://xgboost.readthedocs.io/en/latest/parameter.html#learning-task-parameters
          early_stopping_rounds=50,                             # validation metric needs to improve at least once in every early_stopping_rounds round(s)
          verbose=100)

results = model.evals_result()                            # takes the results from the BDT training above
n_estimators = len(results['validation_0']['error'])      # number of rounds used for the BDT training
auc_train = results['validation_0']['auc']                # subsample: auc for training
auc_val = results['validation_1']['auc']                  # subsample: auc for validation
error_train = results['validation_0']['error']            # subsample: error for training
error_val = results['validation_1']['error']              # subsample: error for validation

# save the bdt result to our dataframe
df.loc[:,'bdt_score'] = model.predict_proba(df[variables])[:,1]
df_overlay.loc[:,'bdt_score'] = model.predict_proba(df_overlay[variables])[:,1]

# ==================== #
#      MAKE PLOTS      #
# ==================== #

printtitle('Making plots...')

# --- plot auc and error for training and validation

plt.figure(figsize=(15,5))

plt.subplot(121)
plt.plot(range(0,n_estimators), auc_train, c='blue', label='train')
plt.plot(range(0,n_estimators), auc_val, c='orange', label='validation')
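
The matching error panel can be drawn the same way; a sketch reusing
n_estimators, error_train and error_val from above:

plt.subplot(122)
plt.plot(range(0, n_estimators), error_train, c='blue', label='train')
plt.plot(range(0, n_estimators), error_val, c='orange', label='validation')
plt.xlabel('n_estimators')
plt.ylabel('error')
plt.legend()
plt.show()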
Example #45
0
def main():
    args = parse_args()
    config = parse_config(args.config_file)
    if config is None:
        print('No configuration file is defined. '
              'Define one with `--config-file`.')
        sys.exit(1)

    # read dataset
    files = config['files']
    if 'filepath' in config:
        files = [config['filepath'] + f for f in files]
    kwargs = config['pandas_kwargs']

    print('Reading ', end='')
    entries = 0
    for f in files:
        rootfile = ROOT.TFile(f)
        tree = rootfile.Get(kwargs['key'])
        entries += tree.GetEntries()
    maxslices = args.max_slices
    chunksize = kwargs['chunksize']
    total = (maxslices
             if maxslices is not None and maxslices < (entries / chunksize)
             else (entries / chunksize))
    print(total * chunksize, 'events.')
    df = pd.concat([
        df for df in tqdm(
            islice(
                read_root(files, flatten=True, **kwargs), maxslices),
            total=total)])

    # rename the tagging particle branches
    df.rename(columns=dict(zip(df.columns,
        [c.replace(config['tagging_particle_prefix'], 'tp').replace('-', '_')
            for c in df.columns])),
        inplace=True)
    df['event_id'] = df.runNumber.apply(str) + '_' + df.eventNumber.apply(str)
    if 'invert_target' in config and config['invert_target']:
        df['target'] = np.sign(df.B_ID) != np.sign(df.tp_ID)
    else:
        df['target'] = np.sign(df.B_ID) == np.sign(df.tp_ID)

    # read features and selections
    try:
        if 'inclusive_mva_features' in config:
            mva_features = ['tp_' + f for f in config['inclusive_mva_features']]
        else:
            mva_features = ['tp_' + f.split(' ')[0] for f in config['selections']]
    except:
        raise ValueError('Tried to parse features for the BDT.'
                         ' Either provide well-formatted `selections` or'
                         ' define a `inclusive_mva_features` set.')

    # build BDT model and train the classifier n_cv x 3 times
    xgb_kwargs = config['xgb_kwargs']
    n_jobs = config['n_jobs']

    bootstrap_scores = []
    bootstrap_d2s = []
    nfold = (args.bootstrap_folds
             if args.bootstrap_folds is not None
             else config['n_cv'])
    print('Starting bootstrapping.')
    pbar = tqdm(total=nfold * 3)
    for _ in range(nfold):
        # yield 3-fold split for CV
        df_sets = [df.iloc[indices] for indices in NSplit(df)]

        cv_scores = []
        for i in range(3):
            df1, df2, df3 = (df_sets[i % 3].copy(),
                             df_sets[(i + 1) % 3].copy(),
                             df_sets[(i + 2) % 3].copy())
            model = XGBClassifier(nthread=n_jobs, **xgb_kwargs)
            sample_weight = (df1.target
                             if 'training_weights' in config
                                and config['training_weights']
                             else None)
            model.fit(df1[mva_features], df1.target,
                      sample_weight=df1.SigYield_sw)

            df2['probas'] = model.predict_proba(df2[mva_features])[:, 1]
            df2.reset_index(inplace=True, drop=True)
            df2_max = df2.iloc[df2.groupby('event_id')['probas'].idxmax()].copy()
            df3['probas'] = model.predict_proba(df3[mva_features])[:, 1]
            df3.reset_index(inplace=True, drop=True)
            df3_max = df3.iloc[df3.groupby('event_id')['probas'].idxmax()].copy()

            # calibrate
            calibrator = PolynomialLogisticRegression(power=4,
                                                      solver='lbfgs',
                                                      n_jobs=n_jobs)
            calibrator.fit(df2_max.probas.reshape(-1, 1), df2_max.target,
                           sample_weight=df2_max.SigYield_sw)

            df3_max['calib_probas'] = calibrator.predict_proba(df3_max.probas)[:, 1]

            score = tagging_power_score(df3_max.calib_probas,
                                        tot_event_number=get_event_number(df3_max),
                                        sample_weight=df3_max.SigYield_sw)
            bootstrap_scores.append(score)
            bootstrap_d2s.append(d2_score(df3_max.calib_probas,
                                          sample_weight=df3_max.SigYield_sw))
            pbar.update(1)

    pbar.close()
    print(dedent("""\
          Final {}-fold bootstrap performance
             D2 = {:<6}%
          ε_eff = {:<6}%""")
          .format(nfold,
                  100 * ufloat(np.mean(bootstrap_d2s),
                               np.std(bootstrap_d2s)),
                  100 * ufloat(np.mean(noms(bootstrap_scores)),
                               np.std(noms(bootstrap_scores)))))
Example #46
0
print('We have %d classes and %d models TOTAL so in resulting arrays \
we expect to see %d columns.' % (n_classes, len(models_1) + len(models_2), 
                                 n_classes * (len(models_1) + len(models_2))))

# Create empty arrays
S_train_all = np.zeros((X_train.shape[0], 0))
S_test_all = np.zeros((X_test.shape[0], 0))

# Load results
for name in sorted(glob('*.npy')):
    print('Loading: %s' % name)
    S = np.load(name)
    S_train_all = np.c_[S_train_all, S[0]]
    S_test_all = np.c_[S_test_all, S[1]]
    
print('\nS_train_all shape:', S_train_all.shape)
print('S_test_all shape: ', S_test_all.shape)

# Initialize 2nd level model
model = XGBClassifier(random_state=0, n_jobs=-1, learning_rate=0.1, 
                      n_estimators=100, max_depth=3)
    
# Fit 2nd level model
model = model.fit(S_train_all, y_train)

# Predict
y_pred = model.predict_proba(S_test_all)

# Final prediction score
print('Final prediction score: %.8f' % log_loss(y_test, y_pred))
Example #47
0
 def saveTrainingsDev(self, df, imp_features, save_pred=False):
     logging.debug(
         "inside saveTrainingsDev Module of XgbSelection Class . dictionary is :  {}"
         .format(self.dictionary))
     print('Iterating on different hyper parameters..')
     version = self.version
     out = df.loc[:, self.dictionary['id'] + self.dictionary['performance']]
     out['actual'] = df[self.dictionary['target'][0]]
     summary_df = pd.DataFrame()
     identifier = str(len(imp_features)) + 'var'
     alias = {
         'n_estimators': 'est',
         'max_depth': 'max_dep',
         'subsample': 'sub_s',
         'learning_rate': 'learn_r',
         'colsample_bytree': 'col_samp',
         'reg_lambda': 'lambda',
         'gamma': 'gamma',
         'min_child_weight': 'mcw'
     }
     for idx, row in self.params_df.astype(object).iterrows():
         print('Iteration {0} of {1}'.format(idx + 1,
                                             self.params_df.shape[0]))
         tup = [
             i for i in zip([
                 alias.get(row.index[j])
                 for j in range(len(self.params_df.columns))
             ], row.values.astype(str))
         ]
         params_str = [''.join(t) for t in tup]
         identifier = identifier + '_'.join(params_str) + '_' + version
         param = row.to_dict()
         #model = XGBClassifier(seed = 10, **params, nthread = 10)
         model = XGBClassifier(seed=10,
                               learning_rate=param['learning_rate'],
                               colsample_bytree=param['colsample_bytree'],
                               n_estimators=param['n_estimators'],
                               subsample=param['subsample'],
                               max_depth=param['max_depth'],
                               gamma=param['gamma'],
                               min_child_weight=param['min_child_weight'],
                               nthread=10)
         model.fit(df.loc[:, imp_features],
                   df[self.dictionary['target'][0]])
         joblib.dump(
             model, self.dictionary['path'] + '/' + 'saved_objects/xgb_' +
             identifier)
         feature_imp = pd.DataFrame({
             'feature_names': imp_features,
             'importance': model.feature_importances_
         })
         feature_imp.to_csv(self.dictionary['path'] + '/' +
                            'results/feature_importance_' + identifier +
                            '.csv',
                            index=False)
         score = model.predict_proba(df.loc[:, imp_features])
         if save_pred:
             out['pred'] = score[:, 1]
             out.to_csv(self.dictionary['path'] + '/' +
                        'results/pred_dev_' + identifier + '.csv',
                        index=False)
         ks = self.ksTable(score[:, 1], df[self.dictionary['target'][0]],
                           'dev_xgb_' + identifier)
         breaks = np.diff(ks['No.Res']) > 0
         dec_break = (np.diff(ks['No.Res']) > 0).any()
         ks_val = ks.KS.max()
         ks_decile = ks.KS.idxmax() + 1
         capture = ks['percent_cum_res'][3]
         if dec_break:
             break_dec = min([idx for idx, x in enumerate(breaks) if x]) + 2
             summary_df = summary_df.append(
                 pd.DataFrame([
                     list(row.values) +
                     [ks_val, break_dec, ks_decile, capture]
                 ],
                              columns=list(row.index) + [
                                  'dev_ks', 'dev_ro_break', 'dev_ks_decile',
                                  'dev_capture'
                              ]))
         else:
             break_dec = np.nan
             summary_df = summary_df.append(
                 pd.DataFrame([
                     list(row.values) +
                     [ks_val, break_dec, ks_decile, capture]
                 ],
                              columns=list(row.index) + [
                                  'dev_ks', 'dev_ro_break', 'dev_ks_decile',
                                  'dev_capture'
                              ]))
         identifier = str(len(imp_features)) + 'var'
     summary_df.to_csv(self.dictionary['path'] + '/' +
                       'results/summary_df_params_xgb_' + version + '.csv',
                       index=False)
     logging.debug(
         "saveTrainingsDev module of XgbSelection Class executed successfully. summary is :{} "
         .format(summary_df))
     logging.debug(" dictionary is :{} ".format(self.dictionary))
Example #48
0
# ada = CustomizedAdaBoostClassifier(n_estimators=100)
# ada.fit(X, y)
# result0_tmp = ada.predict(X_test)

d_tree = DecisionTreeClassifier(max_depth=8)
d_tree.fit(X, y)
result1 = d_tree.predict_proba(X_test)

G = GradientBoostingClassifier(max_depth=6, n_estimators=150)
G.fit(X, y)
result2 = G.predict_proba(X_test)

xg = XGBClassifier(max_depth=8, n_estimators=100)
xg.fit(X, y)
result3 = xg.predict_proba(X_test)

threshold = 0.1
threshold_dict = {}
while threshold < 0.95:
    print('===========\nthreshold: ', threshold)
    result1_tmp = list(map(lambda x: 0 if x[0] > threshold else 1, result1))
    result2_tmp = list(map(lambda x: 0 if x[0] > threshold else 1, result2))
    result3_tmp = list(map(lambda x: 0 if x[0] > threshold else 1, result3))
    final_result_list = [result1_tmp, result2_tmp, result3_tmp]
    train_profit, tpp, opf, ofp, off = customize_acc(y_test, ensemble(final_result_list))
    print(threshold, train_profit, tpp, opf, ofp, off, tpp / ofp, (tpp + ofp) / (tpp + opf + ofp + off))
    final_df = pandas.DataFrame({'predict_y': ensemble(final_result_list)})
    final_df.to_csv(str(threshold) + '_jan_pred_result.csv',index=None)
    threshold = threshold + 0.05
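
ensemble() and customize_acc() are defined outside this fragment; a plausible
majority-vote reading of ensemble, offered only as an assumption:

def ensemble(result_lists):
    # per-sample majority vote over the classifiers' 0/1 predictions
    return [int(sum(votes) * 2 > len(votes)) for votes in zip(*result_lists)]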
Example #49
0
X=np.hstack([train[good+goodx].as_matrix(),train1.as_matrix()])
Xt=np.hstack([test[good+goodx].as_matrix(),test1.as_matrix()])

from sklearn.feature_extraction import DictVectorizer
vec = DictVectorizer()
names_categorical = []
cand=['v40','v63','v109']
for name in train.columns.values :
    if train[name].value_counts().shape[0]<1000 or name in cand:# and name not in good:
        train[name] = map(str, train[name])
        test[name] = map(str, test[name])
        names_categorical.append(name)
        print name,train[name].value_counts().shape[0] 
X_sparse = vec.fit_transform(train[names_categorical].T.to_dict().values())
Xt_sparse = vec.transform(test[names_categorical].T.to_dict().values())

idx=np.array(train.index)
del train
gc.collect()
X=sparse.hstack([X,X_sparse],format='csr')#.toarray()
Xt=sparse.hstack([Xt,Xt_sparse],format='csr')
print X.shape,y.shape,Xt.shape
clf=XGBClassifier(max_depth=11,colsample_bytree=0.5,learning_rate=0.01,n_estimators=1200,nthread=-1)
clf.fit(X,y)
idx=np.array(test.index)#id_test
yp=clf.predict_proba(Xt).T[1]
s=pd.DataFrame({idname:idx,'PredictedProb':yp})
s.to_csv('xgb10.csv',index=False)

Example #50
0
    def pdpVarReduction(self, devset, valsets, valnames, feature_names,
                        params):
        logging.debug("inside pdpVarReduction Module of XgbSelection Class .")
        print('Reducing variable using partial dependency plots ..')
        y = self.dictionary['target'][0]
        version = self.version
        save_loc = self.dictionary['path']
        num_flat = len(feature_names)
        nonflat = feature_names
        summary_df_pdp = pd.DataFrame()
        dct = collections.OrderedDict(params)
        identifier = '_'.join([
            list(dct.keys())[i] + str(list(dct.values())[i])
            for i in range(len(dct.keys()))
        ])
        X_train = devset

        while num_flat > 0:
            summary_df = pd.DataFrame()
            curr_X = X_train[nonflat]
            target = X_train[y]
            #model = XGBClassifier(seed=10, **params, nthread=10)
            model = XGBClassifier(seed=10,
                                  learning_rate=params['learning_rate'],
                                  colsample_bytree=params['colsample_bytree'],
                                  n_estimators=params['n_estimators'],
                                  subsample=params['subsample'],
                                  max_depth=params['max_depth'],
                                  gamma=params['gamma'],
                                  min_child_weight=params['min_child_weight'],
                                  nthread=10)
            model.fit(curr_X, target)
            joblib.dump(model,
                        self.dictionary['path'] + '/' +
                        'saved_objects/xgb_nonflat_pdp_' + version + '_' +
                        identifier + '_' + str(len(nonflat)) + '.joblib',
                        compress=1)
            feature_imp = pd.DataFrame({
                'feature_names': nonflat,
                'importance': model.feature_importances_
            })
            feature_imp.to_csv(self.dictionary['path'] + '/' +
                               'results/feature_importance_nonflat_pdp_' +
                               version + '_' + identifier + '_' +
                               str(len(nonflat)) + '.csv',
                               index=False)
            score = model.predict_proba(curr_X)
            ks = self.ksTable(
                score[:, 1], target, 'dev' + '_xgb_nonflat_pdp_' + version +
                '_' + identifier + '_' + str(len(nonflat)))
            # A rank-ordering break occurs wherever the responder count rises
            # from one decile to the next.
            breaks = np.diff(ks['No.Res']) > 0
            dec_break = breaks.any()
            ks_val = ks.KS.max()
            ks_decile = ks.KS.idxmax() + 1
            # Top 3 decile capture
            capture = ks['percent_cum_res'][3]
            if dec_break:
                break_dec = min(idx for idx, x in enumerate(breaks) if x) + 2
            else:
                break_dec = np.nan
            # DataFrame.append was removed in pandas 2.0; use pd.concat.
            summary_df = pd.concat([
                summary_df,
                pd.DataFrame(
                    [[len(nonflat), ks_val, break_dec, ks_decile, capture]],
                    columns=[
                        'feature_count', 'dev_ks', 'dev_ro_break',
                        'dev_ks_decile', 'dev_capture'
                    ])
            ])

            for X_test, dset in zip(valsets, valnames):
                summary_df_test = pd.DataFrame()
                curr_X = X_test[nonflat]
                target = X_test[y]
                score = model.predict_proba(curr_X)
                ks = self.ksTable(
                    score[:, 1], target, dset + '_xgb_nonflat_pdp_' + version +
                    '_' + identifier + '_' + str(len(nonflat)))
                breaks = np.diff(ks['No.Res']) > 0
                dec_break = breaks.any()
                ks_val = ks.KS.max()
                ks_decile = ks.KS.idxmax() + 1
                capture = ks['percent_cum_res'][3]
                if dec_break:
                    break_dec = min(idx
                                    for idx, x in enumerate(breaks) if x) + 2
                else:
                    break_dec = np.nan
                summary_df_test = pd.concat([
                    summary_df_test,
                    pd.DataFrame([[
                        len(nonflat), ks_val, break_dec, ks_decile, capture
                    ]],
                                 columns=[
                                     'feature_count', dset + '_ks',
                                     dset + '_ro_break',
                                     dset + '_ks_decile', dset + '_capture'
                                 ])
                ])

                summary_df_test.reset_index(drop=True, inplace=True)
                for col in ['_ks', '_ro_break', '_ks_decile', '_capture']:
                    summary_df[dset + col] = summary_df_test[dset + col]
                # Percentage drop in KS from dev to this validation set.
                summary_df['dev_' + dset + '_ks_diff'] = (
                    summary_df['dev_ks'] -
                    summary_df[dset + '_ks']) * 100 / summary_df['dev_ks']

            summary_df_pdp = pd.concat([summary_df_pdp, summary_df])

            nonflat_prev = nonflat
            if not os.path.exists(self.dictionary['path'] + '/' + 'PDP/' +
                                  version + '_' + identifier + '_' +
                                  str(len(nonflat))):
                os.makedirs(self.dictionary['path'] + '/' + 'PDP/' + version +
                            '_' + identifier + '_' + str(len(nonflat)))
            nonflat = self.generatePDP(
                model, X_train, nonflat,
                os.path.join(
                    save_loc, self.dictionary['path'] + '/' + 'PDP/' +
                    version + '_' + identifier + '_' + str(len(nonflat))))
            num_flat = len(set(nonflat_prev) - set(nonflat))
        summary_df_pdp.to_csv(self.dictionary['path'] + '/' +
                              'results/summary_df_nonflat_pdp_xgb_' + version +
                              '_' + identifier + '.csv',
                              index=False)
        logging.debug(
            "pdpVarReduction module of XgbSelection Class executed successfully."
        )
        return nonflat
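
# The ksTable() helper used throughout this class is not shown. Below is a
# minimal sketch of a decile-based KS table, assuming it exposes the columns
# the code above reads ('No.Res', 'percent_cum_res', 'KS'); this is an
# illustrative stand-in, not the original implementation:
import numpy as np
import pandas as pd

def ks_table_sketch(score, target):
    df = pd.DataFrame({'score': score, 'target': target})
    df = df.sort_values('score', ascending=False).reset_index(drop=True)
    df['decile'] = pd.qcut(df.index, 10, labels=False)   # 10 equal score bands
    g = df.groupby('decile')['target'].agg(['sum', 'count'])
    g['No.Res'] = g['sum']                                # responders per decile
    g['No.NonRes'] = g['count'] - g['sum']                # non-responders
    g['percent_cum_res'] = g['No.Res'].cumsum() * 100 / g['No.Res'].sum()
    g['percent_cum_nonres'] = g['No.NonRes'].cumsum() * 100 / g['No.NonRes'].sum()
    g['KS'] = g['percent_cum_res'] - g['percent_cum_nonres']
    return g.reset_index(drop=True)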
Example #51
0
    loo = LeaveOneOut()
    y_pred_list = []
    auc = np.nan
    auc_train = []
    # With leave-one-out CV each fold holds a single test sample, so AUC
    # cannot be computed per fold; pool the held-out predictions and score
    # them against the full label vector afterwards.
    for train_index, test_index in loo.split(X):
        train_index = list(train_index)
        X_train, X_test = X.iloc[train_index, :], X.iloc[test_index, :]
        y_train, y_test = y[train_index], y[test_index]
        model = XGBClassifier(max_depth=3, n_estimators=250, learning_rate=0.15,
                              objective='binary:logistic',
                              scale_pos_weight=(np.sum(y_train == -1) / np.sum(y_train == 1)),
                              reg_lambda=250)
        model.fit(X_train, y_train)
        pred_train = model.predict_proba(X_train)[:, 1]
        auc_train.append(metrics.roc_auc_score(y_train, pred_train))
        y_pred = model.predict_proba(X_test)[:, 1]
        y_pred_list.append(y_pred[0])
    try:
        auc = metrics.roc_auc_score(y, y_pred_list)
    except ValueError:
        pass  # y may contain a single class; leave auc as NaN
    scores = round(auc, 2)
    scores_train = round(np.array(auc_train).mean(), 2)
    train_accuracy.append(scores_train)
    test_accuracy.append(scores)

train_accuracy_all = []
test_accuracy_all = []
def pca_graph(max_num_of_pcas=max_num_of_pcas):
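
# Side note on the scale_pos_weight setting in Example #51 above: with labels
# encoded as -1/1, the negatives-to-positives ratio up-weights the positive
# (minority) class. A standalone sketch with made-up labels:
#
#   import numpy as np
#   y_toy = np.array([-1, -1, -1, -1, 1])            # 4:1 imbalance
#   spw = np.sum(y_toy == -1) / np.sum(y_toy == 1)   # 4.0
#   model = XGBClassifier(scale_pos_weight=spw)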
Example #52
0
import pickle

import numpy as np
from sklearn.metrics import accuracy_score

# summarize history for accuracy (plotting left commented out in the original)
# import matplotlib.pyplot as plt
# plt.plot(eval_metric['error'])
# plt.title('Error')
# plt.ylabel('error')
# plt.xlabel('n_trees')
# plt.legend(['train', 'test'], loc='upper left')
# plt.show()
# plt.savefig('/home/vljchr004/msc-hpc/feedforward_python/fig/feed_forward_2_history1.png', bbox_inches='tight')
# plt.close()
# make predictions for test data: keep the positive-class probability column,
# then threshold at 0.5 via round() to get hard labels
y_pred = model.predict_proba(x_test)[:, 1]
predictions = [round(value) for value in y_pred]

# evaluate predictions
accuracy = accuracy_score(y_test, predictions)
print("Accuracy: %.2f%%" % (accuracy * 100.0))

# save model to file
pickle.dump(model, open("C:/Users/gerhard/Documents/msc-hpc/xgb/xgb0.pkl", "wb"))

np.savetxt("C:/Users/gerhard/Documents/msc-hpc/xgb/xgb0_preds.csv",y_pred,delimiter=", ")
np.savetxt("C:/Users/gerhard/Documents/msc-hpc/xgb/xgb0_y_test.csv",y_test,delimiter=", ")

Example #53
0
import numpy as np
from sklearn.decomposition import PCA
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression
from xgboost import XGBClassifier

del df['TARGET']
# del df['ID']
test_id = df_test['ID']  # renamed from `id` to avoid shadowing the builtin
# del df_test['ID']

pca = PCA(n_components=250)
train_pcaed = pca.fit_transform(df, target)

random_forest = RandomForestClassifier(n_estimators=30, max_depth=5, max_features=20)
random_forest.fit(train_pcaed, target)
forested = random_forest.predict_proba(train_pcaed)
# pipe = Pipeline(steps=[('pca', pca), ('random_forest', random_forest)])

m2_xgb = XGBClassifier(n_estimators=110, nthread=1, max_depth=4)
m2_xgb.fit(train_pcaed, target)
m2_xgbed = m2_xgb.predict_proba(train_pcaed)

# the l1 penalty requires the liblinear solver in current scikit-learn
logistic_regression = LogisticRegression(penalty='l1', solver='liblinear')
logistic_regression.fit(train_pcaed, target)
logistic_regressioned = logistic_regression.predict_proba(train_pcaed)

combined = np.concatenate([forested, m2_xgbed, logistic_regressioned], axis=1)


log_reg = LogisticRegression()
log_reg.fit(combined, target)

# sklearn.cross_validation was removed; use sklearn.model_selection instead
from sklearn.model_selection import cross_val_score

scores = cross_val_score(log_reg, combined, target, cv=5, scoring='roc_auc')
print(scores.mean(), scores)
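
# Caveat on the stacking above: the level-2 logistic regression is trained on
# base-model predictions made for the same rows the base models were fit on,
# which inflates the cross-validated AUC. The usual remedy is out-of-fold
# meta-features; a sketch using cross_val_predict:
from sklearn.model_selection import cross_val_predict

oof_rf = cross_val_predict(random_forest, train_pcaed, target,
                           cv=5, method='predict_proba')
oof_xgb = cross_val_predict(m2_xgb, train_pcaed, target,
                            cv=5, method='predict_proba')
oof_lr = cross_val_predict(logistic_regression, train_pcaed, target,
                           cv=5, method='predict_proba')
combined_oof = np.concatenate([oof_rf, oof_xgb, oof_lr], axis=1)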
Example #54
0
    print('-' * 53)

    print('Final Results')
    print('XGBOOST: %f' % xgboostBO.res['max']['max_val'])


    # Build and Run on the full data set K-fold times for bagging

    seeds = [1234, 5434, 87897, 123125, 88888]
    for seed_bag in seeds:
        X_train, X_valid, y_train, y_valid = train_test_split(train,
                                                              train_labels,
                                                              test_size=0.1,
                                                              random_state=seed_bag)
        clf = XGBClassifier(max_depth=int(xgboostBO.res['max']['max_params']['max_depth']),
                                               learning_rate=xgboostBO.res['max']['max_params']['learning_rate'],
                                               n_estimators=int(xgboostBO.res['max']['max_params']['n_estimators']),
                                               gamma=xgboostBO.res['max']['max_params']['gamma'],
                                               min_child_weight=xgboostBO.res['max']['max_params']['min_child_weight'],
                                               max_delta_step=xgboostBO.res['max']['max_params']['max_delta_step'],
                                               subsample=xgboostBO.res['max']['max_params']['subsample'],
                                               colsample_bytree=xgboostBO.res['max']['max_params']['colsample_bytree'],
                                               seed=seed_bag,
                                               objective="binary:logistic")

        clf.fit(X_train, y_train, eval_metric="auc", eval_set=[(X_valid, y_valid)], early_stopping_rounds=20)
        print('Prediction Complete')
        preds = clf.predict_proba(test)[:, 1]
        submission = pd.DataFrame(preds, index=test_labels, columns=['target'])
        outfile_seed = '../output/xgb_autotune' + str(seed_bag) + '.csv'
        submission.to_csv(outfile_seed)
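
    # The loop above writes one submission per seed. To turn the five files
    # into a single bagged submission, one option (not in the original, which
    # only saves the per-seed files) is to average the predictions:
    seed_preds = []
    for seed_bag in seeds:
        df_seed = pd.read_csv('../output/xgb_autotune' + str(seed_bag) + '.csv',
                              index_col=0)
        seed_preds.append(df_seed['target'].values)
    bagged = np.mean(seed_preds, axis=0)   # simple mean across the seed models
    pd.DataFrame(bagged, index=test_labels, columns=['target']) \
        .to_csv('../output/xgb_autotune_bagged.csv')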