def build_prediction_model(path, percentage, para_tuning_mark, last_mark):
    """Train and evaluate an XGBoost classifier on pre-split CSV data.

    Reads ``train_<percentage>``, ``dev_<percentage>`` and
    ``test_<percentage>`` (with a ``_last`` suffix when ``last_mark`` is
    truthy), then either runs the first parameter-tuning step or fits a
    fixed-parameter model and prints AUC, F1 and Cohen's kappa on the test
    split.

    Args:
        path: Prefix prepended verbatim to every file name (no path separator
            is inserted).
        percentage: Value appended to the file names via ``str``.
        para_tuning_mark: When truthy, call ``para_tuning_0`` instead of
            fitting the fixed-parameter model.
        last_mark: When truthy, read the ``*_last`` files; in that mode
            nothing is done when ``percentage == 1.0``.

    Returns:
        None.

    Raises:
        ZeroDivisionError: If the training labels contain no ``1``.
    """
    if last_mark and percentage == 1.0:
        return

    # Read data
    suffix = "_last" if last_mark else ""
    train = pandas.read_csv(path + "train_" + str(percentage) + suffix)
    dev = pandas.read_csv(path + "dev_" + str(percentage) + suffix)
    test = pandas.read_csv(path + "test_" + str(percentage) + suffix)

    # Keep only the columns that have at least one non-zero training value
    nonzero_colums = train.loc[:, (train != 0).any(axis=0)].columns

    # Class balance: ratio of negative to positive training labels.
    # value_counts replaces Series.iteritems(), which pandas 2.0 removed.
    label_counts = train['label'].value_counts()
    negatives = int(label_counts.get(0, 0))
    positives = int(label_counts.get(1, 0))
    scale_value = negatives / float(positives)

    # Build prediction model
    predictors = [x for x in nonzero_colums if x not in ['label']]

    if para_tuning_mark:
        # Parameter tuning guide:
        # https://www.analyticsvidhya.com/blog/2016/02/complete-guide-parameter-tuning-gradient-boosting-gbm-python/
        # https://www.analyticsvidhya.com/blog/2016/03/complete-guide-parameter-tuning-xgboost-with-codes-python/

        # Parameter: learning_rate
        para_tuning_0(train, dev, test, scale_value)
        # para_tuning_1(train, dev, test, scale_value)
        # para_tuning_2(train, dev, test, scale_value)
        # para_tuning_3(train, dev, test, scale_value)
        # para_tuning_4(train, dev, test, scale_value)

    else:

        xgb = XGBClassifier(learning_rate=0.015,
                            n_estimators=686,
                            max_depth=9,
                            min_child_weight=5,
                            gamma=0.0,
                            subsample=0.8,
                            colsample_bytree=0.8,
                            reg_alpha=0.01,
                            objective='binary:logistic',
                            nthread=4,
                            scale_pos_weight=scale_value,
                            seed=27)

        xgb.fit(train[predictors], train['label'], eval_metric='auc')
        # Probability of the positive class for every test row
        dtest_predprob = xgb.predict_proba(test[predictors])[:, 1]
        # Hard labels obtained by rounding the probability
        dtest_pred = dtest_predprob.round()

        print(
            "AUC/F1 Score/Kappa (Test):\t%f\t%f\t%f\t" %
            (metrics.roc_auc_score(test['label'], dtest_predprob),
             metrics.f1_score(test['label'], dtest_pred),
             metrics.cohen_kappa_score(test['label'], dtest_pred)))
Ejemplo n.º 2
0
def train(X_train, X_test, y_train, y_test):
    """Fit a fixed-parameter XGBoost classifier and report its metrics.

    Prints the shape of ``X_train``, fits the classifier on the training
    data, passes the fitted model and all four data sets to
    ``model_metrics`` and returns the fitted model.
    """
    # NOTE(review): `cv` does not look like an XGBClassifier hyper-parameter;
    # it is forwarded unchanged here - confirm it is intended.
    settings = dict(
        learning_rate=0.1,
        n_estimators=20,
        max_depth=4,
        min_child_weight=1,
        gamma=1,
        subsample=0.6,
        colsample_bytree=0.6,
        objective='binary:logistic',
        scale_pos_weight=1,
        nthread=4,
        max_delta_step=10,
        seed=27,
        cv=3,
        reg_alpha=0.01,
        eval_metric="error",
    )
    classifier = XGBClassifier(**settings)

    print(X_train.shape)
    classifier.fit(X_train, y_train)
    model_metrics(classifier, X_train, X_test, y_train, y_test)
    return classifier
Ejemplo n.º 3
0
def prediction():
    """Fit an XGBoost regressor and print its explained variance.

    Uses the module-level ``X_train``, ``y_train``, ``X_test`` and
    ``y_test``; nothing is returned.
    """
    model = xgboost.XGBRegressor(n_estimators=100, learning_rate=0.08, gamma=0, subsample=0.75,
                                 colsample_bytree=1, max_depth=7)
    model.fit(X_train, y_train)
    predictions = model.predict(X_test)
    # The metric is not symmetric: ground truth goes first, predictions second.
    print(explained_variance_score(y_test, predictions))
Ejemplo n.º 4
0
def XGBoost_classifier(X_train, train_target, X_test):
    """Fit a default XGBClassifier and return its predictions for X_test.

    ``X_test`` is converted through its ``.values`` attribute before being
    passed to ``predict``.
    """
    test_matrix = X_test.values

    classifier = XGBClassifier()
    classifier.fit(X_train, train_target)
    return classifier.predict(test_matrix)
Ejemplo n.º 5
0
def xgboost(train_x,train_y,test_x,test_y):
    """Fit an XGBoost classifier and print its evaluation on the test set.

    Prints the classification report, the confusion matrix and the accuracy
    of the predictions for ``test_x`` against ``test_y``.
    """
    from xgboost import XGBClassifier

    # NOTE(review): `min_samples_leaf` looks like a scikit-learn tree
    # parameter rather than an XGBoost one - confirm it has any effect.
    classifier = XGBClassifier(n_estimators=150, min_samples_leaf=3, max_depth=6)
    classifier.fit(train_x, train_y)
    predicted = classifier.predict(test_x)

    print(classification_report(test_y, predicted))
    print(confusion_matrix(test_y, predicted))
    print('gbdt accuracy is', accuracy_score(test_y, predicted))
Ejemplo n.º 6
0
def train(ite):
    """Fit the shared model on a resampled training set and store predictions.

    Draws 700 rows from the module-level ``train_target_0``, stacks all of
    ``train_target_1`` underneath, fits the module-level ``xgb`` model on the
    result and writes the positive-class probabilities for ``test_data`` into
    ``res[ite]``.

    Args:
        ite: Key/index under which the predictions are stored in ``res``.
    """
    import pandas as pd

    # The original printed an outer `i`; the iteration id passed in is what
    # identifies this run.
    print(ite)
    # Original author's note: the data shows a 1:0 class ratio of about 17:2.
    # concat replaces DataFrame.append, which pandas 2.0 removed.
    sample = pd.concat([train_target_0.sample(700), train_target_1])
    labels = sample.target
    features = sample.drop('target', axis=1)
    xgb.fit(features, labels)
    res[ite] = xgb.predict_proba(test_data)[:, 1]
Ejemplo n.º 7
0
    def XGBscore(self):
        """Fit a default XGBClassifier on the instance data and print its AUC.

        Reads ``self.x_train``, ``self.y_train``, ``self.x_test`` and
        ``self.y_test``. The AUC is computed from the output of ``predict``.
        """
        fit_x, fit_y, eval_x, eval_y = (self.x_train, self.y_train,
                                        self.x_test, self.y_test)

        classifier = XGBClassifier()
        classifier.fit(fit_x, fit_y)
        # NOTE(review): AUC is computed on hard predictions, not on
        # probabilities - confirm this is intended.
        auc_value = roc_auc_score(eval_y, classifier.predict(eval_x))
        print('GBDT + XGB auc: %.5f' % auc_value)
Ejemplo n.º 8
0
def test_xgboost_sklearn_gressor():
    """Fit a default XGBRegressor on the Boston data and print its predictions.

    The model is fitted and evaluated on the same data; the predictions and
    their type are printed.
    """
    # NOTE(review): load_boston is, as far as I know, no longer shipped by
    # recent scikit-learn releases - confirm the pinned version.
    from sklearn.datasets import load_boston

    dataset = load_boston()
    regressor = XGBRegressor()
    regressor.fit(dataset.data, dataset.target)

    predictions = regressor.predict(dataset.data)
    collected = []
    collected.extend(predictions.tolist())
    print(predictions)
    print(type(predictions))
def train_xgb(data):
    """Train an XGBoost classifier to predict the 'cause' column.

    Splits ``data`` 80/20 (``random_state=0``), fits a 300-tree classifier on
    the training part and prints the accuracy, in percent, on the held-out
    part.
    """
    features = data.drop(['cause'], axis=1).values
    target = data['cause'].values
    X_train, X_test, y_train, y_test = train_test_split(
        features, target, test_size=0.2, random_state=0)

    classifier = XGBClassifier(n_estimators=300)
    classifier.fit(X_train, y_train)
    preds = classifier.predict(X_test)

    # Share of exact matches, expressed as a percentage
    hits = (preds == y_test).sum()
    acc_xgb = hits.astype(float) / len(preds) * 100
    print("XGBoost's prediction accuracy is: %3.2f" % (acc_xgb))
Ejemplo n.º 10
0
def check_performance(g_z,
                      train_data,
                      test_data,
                      data_cols,
                      label_cols=None,
                      seed=0,
                      with_class=False,
                      data_dim=2):
    """Score generated samples by how much they help a failure classifier.

    The generated rows are labelled ``failure = 1``, appended to
    ``train_data``, and an XGBoost classifier trained on the combined set is
    evaluated on ``test_data``.

    Args:
        g_z: 2-D array of generated samples. When ``label_cols`` is non-empty
            its last column is dropped before use.
        train_data: DataFrame with the real training rows, including a
            ``failure`` column.
        test_data: DataFrame with the test rows, including a ``failure``
            column.
        data_cols: Column names given to the generated features.
        label_cols: Optional label column names; only its emptiness is used.
        seed, with_class, data_dim: Accepted for interface compatibility;
            not used by this function.

    Returns:
        The result of ``aps_cost(y_pred, y_test)``.
    """
    # `None` replaces the former mutable `[]` default; both mean "no labels".
    has_labels = label_cols is not None and len(label_cols) > 0
    generated = g_z[:, :-1] if has_labels else g_z
    gen_df = pd.DataFrame(generated, columns=data_cols)

    # Every generated row is treated as a positive (failure) example.
    gen_df['failure'] = 1.0
    combined_train_df = pd.concat([train_data, gen_df])
    print(train_data.shape, gen_df.shape, combined_train_df.shape)

    X_train = combined_train_df[combined_train_df.columns.drop(
        'failure')].values
    y_train = combined_train_df.failure

    X_test = test_data[test_data.columns.drop('failure')].values
    y_test = test_data.failure

    xgb = XGBClassifier(max_depth=3,
                        n_estimators=18,
                        n_jobs=-1,
                        scale_pos_weight=40,
                        min_child_weight=44)
    xgb.fit(X_train, y_train)
    y_pred = xgb.predict(X_test)

    print('Test performance confusion', confusion_matrix(
        y_test, y_pred))  # assumes balanced real and generated datasets
    return aps_cost(y_pred,
                    y_test)  # assumes balanced real and generated datasets
Ejemplo n.º 11
0
def main():
    """Run preprocessing, build the train/test split, fit and predict.

    Smoke test of the pipeline up to the model class: the predictions and
    targets are computed but not used further.
    """
    weather_columns = [
        'dt_iso', 'temp', 'pressure', 'humidity', 'wind_speed', 'wind_deg',
        'rain_1h', 'rain_3h', 'snow_1h', 'snow_3h', 'clouds_all'
    ]
    preprocessing = Preprocessing(config['weather_file_path'],
                                  config['fire_data_file_path'],
                                  ['Datetime'],
                                  weather_columns)
    preprocessing.start_preprocessing()

    dataset = Data(preprocessing.joined_data)
    train_x, train_y, test_x, test_y = dataset.get_train_test()

    model = Model((train_x, train_y, test_x, test_y))
    model.fit()
    predictions, target = model.make_predict()
Ejemplo n.º 12
0
def fonction_model_xgb(data):
    """Train an XGBoost classifier on the selected features and report on it.

    The frame returned by ``fonction_select_xgb(data)`` is split 70/30
    (``random_state=0``) with ``tx_rec_marg_Bin`` as the target. Feature
    columns with more than three distinct values are standardised using
    statistics from the training part only. The function prints the
    classification report, draws a SHAP bar summary plot for the training
    data, then prints the confusion matrix and the micro-averaged F1 score.

    Args:
        data: Input passed to ``fonction_select_xgb``.

    Returns:
        ``data``, unchanged.
    """
    df = fonction_select_xgb(data)

    X = df.drop("tx_rec_marg_Bin", axis=1)
    y = df["tx_rec_marg_Bin"]

    X_train, X_test, y_train, y_test = train_test_split(
        X, y, test_size=0.3, random_state=0)
    # Work on copies so the scaled values are not written into slices of X.
    X_train = X_train.copy()
    X_test = X_test.copy()

    # Pick numeric columns from the features only, so the target can never
    # end up in the list of columns to scale.
    num = [col for col in X.columns if X[col].nunique() > 3]

    if num:
        scaler = StandardScaler().fit(X_train[num])
        X_train[num] = scaler.transform(X_train[num])
        X_test[num] = scaler.transform(X_test[num])

    # `criterion` and `max_features` belong to scikit-learn forests, not to
    # XGBoost, so they are no longer passed.
    xgb = XGBClassifier(max_depth=7, n_estimators=500)
    xgb.fit(X_train, y_train)

    y_pred = xgb.predict(X_test)

    print(classification_report(y_test, y_pred))

    # The fitted model is reused for SHAP instead of training an identical
    # second one on the same data.
    shap_values = shap.TreeExplainer(xgb).shap_values(X_train)
    shap.summary_plot(shap_values, X_train, plot_type="bar")

    print(confusion_matrix(y_test, y_pred))
    print(f1_score(y_pred, y_test, average='micro'))

    return data
def XGBoost(X_train,X_test,Y_train,Y_test):
    """Fit a default XGBClassifier, print its evaluation and return it.

    Prints the train and test scores, accuracy, ROC-AUC, precision, recall
    and the confusion matrix with its four cells for the test predictions.

    Returns:
        The fitted model.
    """
    xgb = XGBClassifier()

    # Fitting the model
    xgb_model = xgb.fit(X_train, Y_train)

    # Predicting results
    y_pred = xgb_model.predict(X_test)

    # Evaluation
    model_train_score = xgb_model.score(X_train, Y_train)
    model_test_score = xgb_model.score(X_test, Y_test)
    conf_matrix = confusion_matrix(Y_test, y_pred)
    # This line reports the training score; it was previously labelled TEST.
    print ('XGBoost MODEL TRAIN SCORE: {0:.5f}'.format(model_train_score))
    print ('XGBoost MODEL TEST SCORE: {0:.5f}'.format(model_test_score))
    print("XGBoost ACCURACY: {0:.2f}".format(accuracy_score(Y_test, y_pred)))
    print("XGBoost ROC-AUC: {0:.2f}".format(roc_auc_score(Y_test, y_pred)))
    print("XGBoost PRECISION: {0:.2f}".format(precision_score(Y_test, y_pred)))
    print("XGBoost RECALL: {0:.2f}".format(recall_score(Y_test, y_pred)))
    print("XGBoost Confusion Matrix:\n",conf_matrix)
    # Rows of conf_matrix are true labels, columns are predicted labels.
    print ('\nXGBoost True Negatives: ', conf_matrix[0,0])
    print ('XGBoost False Negatives: ', conf_matrix[1,0])
    print ('XGBoost True Positives: ', conf_matrix[1,1])
    print ('XGBoost False Positives: ', conf_matrix[0,1])
    return xgb_model
    def runXGBoost(train_data_mix_n, train_Y, test_data_mix_n, test_Y):
        """Fit an XGBoost classifier and print test accuracy and confusion matrix."""
        hyperparams = dict(max_depth=10,
                           min_child_weight=6,
                           gamma=0.5,
                           colsample_bytree=0.7,
                           subsample=0.7,
                           reg_alpha=1)
        classifier = XGBClassifier(**hyperparams)
        classifier.fit(train_data_mix_n, train_Y)

        predicted_label = classifier.predict(test_data_mix_n)

        print("Test accuracy using XGBoost Classifier")
        print(accuracy_score(test_Y, predicted_label))
        print("Confusion Metrix for XGBoost Classifier..")
        print(confusion_matrix(test_Y, predicted_label))
Ejemplo n.º 15
0
def movie_model_save(variable, Data):
    """Train a view-count regressor on existing content and pickle it.

    Rows with ``New_Contents == 0`` are kept, rows whose rounded
    ``PROGRAM_TYPE`` equals the first distinct value are dropped, the
    remaining features are min-max scaled to ``[0, 1]`` and an XGBoost
    regressor is fitted against ``ViewCount``. The fitted model is written to
    ``./data/finalized_model_movie.sav``.

    Args:
        variable: Not used by this function.
        Data: Source passed to ``pd.DataFrame`` with a column list.

    Returns:
        None.
    """
    from xgboost import XGBRegressor

    # Silences pandas' chained-assignment warning for the whole process.
    pd.options.mode.chained_assignment = None

    # ---- features ----
    contents = pd.DataFrame(Data, columns = ['PAYMENT', 'PROGRAM_TYPE', 'New_Contents', 'genre_Label', 'target_age', 'playtime', 'channel_Label', 'contentnumber', 'episode_count', 'past_view'])
    x_train = contents[contents['New_Contents'] == 0]
    x_train.loc[:,'PROGRAM_TYPE'] = round(x_train.loc[:,'PROGRAM_TYPE'])
    x_train = x_train[x_train['PROGRAM_TYPE'] != x_train.PROGRAM_TYPE.unique()[0]]
    x_train.contentnumber = x_train.contentnumber.fillna(0)
    x_train.episode_count = x_train.episode_count.fillna(1)
    x_train = x_train.drop('New_Contents', axis = 1)
    x_train = x_train.drop('PROGRAM_TYPE', axis = 1)
    x_train = x_train.values
    x_train = x_train.astype('float32')
    scaler = MinMaxScaler(feature_range=(0, 1))
    x_train = scaler.fit_transform(x_train)

    # ---- target: same row filter as the features ----
    contents = pd.DataFrame(Data, columns = ['PROGRAM_TYPE', 'New_Contents', 'ViewCount'])
    y_train = contents[contents['New_Contents'] == 0]
    y_train = y_train.drop('New_Contents', axis = 1)
    y_train.loc[:,'PROGRAM_TYPE'] = round(y_train.loc[:,'PROGRAM_TYPE'])
    y_train = y_train[y_train['PROGRAM_TYPE'] != y_train.PROGRAM_TYPE.unique()[0]]
    y_train = y_train.drop('PROGRAM_TYPE', axis = 1)
    y_train = y_train.values
    y_train = y_train.astype('float32')

    # NOTE(review): 'reg:linear' is, as far as I know, a deprecated alias of
    # 'reg:squarederror' - confirm against the installed XGBoost version.
    model = XGBRegressor(colsample_bytree = 1,
     learning_rate =0.4,
     n_estimators=1000,
     max_depth=8,
     min_child_weight=1,
     max_delta_step = 2.5,
     gamma=1.0,
     subsample=0.8,
     objective = 'reg:linear',
     n_jobs=8,
     scale_pos_weight=1.8,
     random_state=27,
     base_score = 0.5)

    model.fit(x_train, y_train)

    filename = './data/finalized_model_movie.sav'
    # The context manager closes (and flushes) the file even if dump fails.
    with open(filename, 'wb') as model_file:
        pickle.dump(model, model_file)
Ejemplo n.º 16
0
def xgb_model_1(X_train,y_train,X_test,params=None):
    """Average the test predictions of an XGBoost regressor over two folds.

    For each fold of a shuffled 2-fold split (``random_state=1989``) the
    regressor is fitted with early stopping on the held-out part and its
    predictions for ``X_test`` are accumulated; the sum is then divided by the
    number of folds. ``params`` is not used.

    Returns:
        Array with one averaged prediction per row of ``X_test``.
    """
    fold_count = 2
    # train with the scikit-learn API
    regressor = XGBRegressor(n_estimators=1000, max_depth=13, min_child_weight=150,
                             subsample=0.7, colsample_bytree=0.3)
    averaged = np.zeros(len(X_test))
    splitter = KFold(n_splits=fold_count, shuffle=True, random_state=1989)

    for fold_no, (fit_idx, holdout_idx) in enumerate(splitter.split(X_train)):
        print("----------------------")
        print("Training model #%d" % fold_no)
        print("----------------------")
        holdout = [(X_train[holdout_idx], y_train[holdout_idx])]
        regressor.fit(X_train[fit_idx], y_train[fit_idx],
                      eval_set=holdout,
                      early_stopping_rounds=10, verbose=25)

        averaged += regressor.predict(X_test,
                                      ntree_limit=regressor.best_ntree_limit)
    averaged /= fold_count

    return averaged
Ejemplo n.º 17
0
def pred():
    """Fit the final classifier and write test predictions to a CSV file.

    Uses the module-level ``x_combined``, ``y_combined``, ``X_test``,
    ``task``, ``feat_type`` and ``le``. The output file holds the test file
    names taken from the label file and the decoded predictions.
    """
    classifier = XGBClassifier(
        booster='gbtree', gamma=0.0, max_depth=8, min_child_weight=3,
        learning_rate=0.03, n_jobs=-1, scale_pos_weight=1, reg_alpha=0.1,
        reg_lambda=1, colsample_bytree=0.9, subsample=0.8, n_estimators=370,
        objective="binary:hinge", tree_method='gpu_hist', gpu_id=0,
        random_state=5477113)
    classifier.fit(x_combined, y_combined)
    y_pred = classifier.predict(X_test)

    team_name = 'TeamFOSAI'
    submission_index = 2
    label_file = '/media/jose/hk-data/PycharmProjects/the_speech/data/mask/labels/labels.csv'
    df_labels = pd.read_csv(label_file)

    # Write out predictions to csv file (official submission format)
    pred_file_name = task + '.' + feat_type +'.test.' + team_name + '_' + str(submission_index) + '.csv'
    print('Writing file ' + pred_file_name + '\n')

    # Only the rows whose file name starts with 'test' are submitted
    names = df_labels['file_name']
    test_names = names[names.str.startswith('test')].values
    decoded = le.inverse_transform(y_pred).flatten()
    submission = pd.DataFrame(data={'file_name': test_names,
                                    'prediction': decoded},
                              columns=['file_name','prediction'])
    submission.to_csv(pred_file_name, index=False)

    print('Done.\n')
def train_xgboost(df_train, features, target, save_model=False, cv=False):
    """Fit an XGBoost classifier on ``features`` plus the 'Ad' column.

    Object-typed model columns are replaced in ``df_train`` (in place) by
    integer codes, numbered by first occurrence.

    Args:
        df_train: Training DataFrame; its object columns among the model
            columns are overwritten with integer codes.
        features: List of feature column names ('Ad' is appended).
        target: Name of the target column.
        save_model: Not used by this function.
        cv: When truthy, print 5-fold cross-validation scores and their mean.

    Returns:
        Tuple ``(model, feature_ids)`` where ``feature_ids`` maps each encoded
        column to its ``{category: code}`` dictionary.
    """
    model_columns = features + ['Ad']

    # Numerate feature strings for modeling and save feature_ids.
    feature_ids = {}
    for col in model_columns:
        if df_train[col].dtype == "object":
            # enumerate() over the unique values gives the same codes as the
            # former list.index() lookups, without the quadratic scan.
            codes = {cat: idx
                     for idx, cat in enumerate(df_train[col].unique())}
            df_train[col] = df_train[col].map(codes)
            feature_ids[col] = codes

    # `.values` replaces DataFrame.as_matrix(), which pandas 1.0 removed.
    X = df_train[model_columns].values
    y = df_train[target].values

    # Declare XGboost model.
    xgb = XGBClassifier(learning_rate=0.8,
                        n_estimators=54,
                        max_depth=5,
                        min_child_weight=1,
                        gamma=0.2,
                        subsample=0.8,
                        colsample_bytree=0.75,
                        reg_alpha=15.25,
                        objective='binary:logistic')

    # Fit the model on the data
    xgb.fit(X, y, eval_metric='auc')

    # Print 5-fold cross validation scores if cv=True.
    if cv:
        cv_scores = cross_val_score(xgb, X, y, cv=5)
        print('cross_val_scores:', cv_scores, cv_scores.mean())

    return xgb, feature_ids
def xgboost_param_solution():
    """Train the tuned 9-class model and write the submission file.

    Loads ``train.csv`` and ``test.csv`` through ``load_data``, label-encodes
    the ``target`` column, fits the classifier on every column except
    ``target`` and ``id`` and writes the predicted class probabilities to
    ``submissions/xgboost_param_solution_76.csv``.
    """
    tuned = dict(
        alpha=0, booster='gbtree', colsample_bytree=0.459971793632,
        early_stopping_rounds=30, eta=0.0305648288294,
        eval_metric='mlogloss', gamma=0.0669039612464, l=0, lambda_bias=0,
        max_delta_step=4, max_depth=14, min_child_weight=8, nthread=4,
        ntree_limit=0, num_class=9, num_round=1000,
        objective='multi:softprob', seed=84425, silent=0,
        subsample=0.972607582489, use_buffer=True)
    classifier = XGBoostClassifier(**tuned)

    train = load_data('train.csv')
    test = load_data('test.csv')

    encoder = preprocessing.LabelEncoder()
    encoder.fit(train['target'])
    train['target'] = encoder.transform(train['target'])

    excluded = ['target', 'id']
    feature_cols = [name for name in train.columns if name not in excluded]
    X_train = train[feature_cols]
    X_test = test[feature_cols]

    labels = train['target']
    test_ids = test['id']

    classifier.fit(X_train, labels)
    write_submission(test_ids,
                     classifier.predict_proba(X_test),
                     'submissions/xgboost_param_solution_76.csv')
Ejemplo n.º 20
0
def trainxgb(model_id,train_x,train_y,valid_x,valid_y,test_x):
    """Train one randomly parameterised model and keep it if it is good enough.

    The training data is shuffled, a classifier with randomly drawn
    hyper-parameters is fitted, and when ``test(valid_y, predictions)`` is
    below 0.450 the validation and test probabilities are saved under
    ``model_id``.
    """
    train_x, train_y = shuffle(train_x, train_y)

    random_state = random.randint(0, 1000000)
    print('random state: {state}'.format(state=random_state))

    # The random draws below happen in the same order as the keyword list.
    settings = dict(
        base_estimator='gbtree',
        objective='multi:softprob',
        metric='mlogloss',
        num_classes=9,
        learning_rate=random.uniform(0.01, 0.05),
        max_depth=random.randint(10, 20),
        max_samples=random.uniform(0.0, 1.0),
        max_features=random.uniform(0.0, 1.0),
        max_delta_step=random.randint(1, 10),
        min_child_weight=random.randint(1, 10),
        min_loss_reduction=1,
        l1_weight=0.0,
        l2_weight=0.0,
        l2_on_bias=False,
        gamma=0.02,
        inital_bias=random.uniform(0.0, 1.0),
        random_state=random_state,
        watchlist=[[valid_x, valid_y]],
        n_jobs=30,
        n_iter=3000,
    )
    classifier = XGBoostClassifier(**settings)
    classifier.fit(train_x, train_y)

    valid_predictions = classifier.predict_proba(valid_x)

    if not test(valid_y, valid_predictions) < 0.450:
        return
    test_predictions = classifier.predict_proba(test_x)
    data.saveData(valid_predictions,"../valid_results/valid_"+str(model_id)+".csv")
    data.saveData(test_predictions,"../results/results_"+str(model_id)+".csv")
Ejemplo n.º 21
0
def run_cv(x_train, x_test, y_train, y_test):
    """Fit an XGBooster on the training data, save it and predict on the test set.

    Prints the data shapes, the training time and the best AUC/round, then
    calls ``xgb_predict`` with a result path built from ``now`` and the best
    AUC.
    """
    conf.xgb_config()
    started = time.time()
    print('X_train.shape={}, X_test.shape = {}'.format(
        np.shape(x_train), np.shape(x_test)))

    booster = XGBooster(conf)
    best_auc, best_round, cv_rounds, best_model = booster.fit(x_train, y_train)
    print('Training time cost {}s'.format(time.time() - started))
    booster.save_model()
    print('best_auc = {}, best_round = {}'.format(best_auc, best_round))

    # NOTE(review): `now` is not assigned in this function (the local
    # assignment is commented out in the original) - presumably a
    # module-level value; confirm.
    result_saved_path = '../result/result_{}-{:.4f}.csv'.format(now, best_auc)
    xgb_predict(best_model, x_test, y_test, save_result_path=result_saved_path)
Ejemplo n.º 22
0
def run_cv(x_train, x_test, y_train, y_test, regress_conf):
    """Fit an XGBooster with ``regress_conf`` and predict on the test set.

    Logs the data shapes, the training time and the best score/round, then
    calls ``xgb_predict`` without a result path.
    """
    started = time.time()
    log.logger.info('X_train.shape={}, X_test.shape = {}'.format(
        np.shape(x_train), np.shape(x_test)))

    booster = XGBooster(regress_conf)
    best_score, best_round, best_model = booster.fit(x_train, y_train)
    log.logger.info('Training time cost {}s'.format(time.time() - started))
    log.logger.info('best_score = {}, best_round = {}'.format(
        best_score, best_round))

    # NOTE(review): the path is still built (from `now`, which is not
    # assigned in this function) although the prediction call below does not
    # use it - confirm whether it can go.
    result_saved_path = '../result/result_{}-{:.4f}.csv'.format(
        now, best_round)
    xgb_predict(best_model, x_test, y_test, result_save_path=None)
Ejemplo n.º 23
0
def select_features_from_xgb(features,labels,test_feature):
    """Select features by XGBoost importance and log the selected ranking.

    A small XGBoost classifier is fitted on ``features``/``labels`` and
    ``SelectFromModel`` reduces both ``features`` and ``test_feature`` with
    it. One line per retained feature, in decreasing importance order, is
    written to ``importance_features.txt`` under the module-level
    ``data_path``.

    Returns:
        Tuple ``(features_new, test_feature_new)`` with the reduced matrices.
    """
    print("\nStart selecting importance features")
    xgb = XGBClassifier(n_estimators=2, max_depth=4, learning_rate = 0.07, subsample = 0.8, colsample_bytree = 0.9)
    xgb = xgb.fit(features, labels)
    importances = xgb.feature_importances_
    # Feature indices sorted from most to least important
    indices = np.argsort(importances)[::-1]

    model = SelectFromModel(xgb, prefit=True)
    features_new = model.transform(features)
    test_feature_new = model.transform(test_feature)

    kept = features_new.shape[1]
    with open(data_path + "importance_features.txt" , "w") as report:
        for rank, idx in enumerate(indices[:kept], 1):
            report.write(str(rank) + "." +  " feature " +  str(idx) + "  " + str(importances[idx]) + "\n")
    print("Features selection done saved new data in data path")

    # The former VarianceThreshold fit was removed: its result was discarded
    # and never influenced the returned matrices.
    return features_new, test_feature_new
Ejemplo n.º 24
0
    x_test_stacking.iloc[i, 3] = x_test_stacking.iloc[i, 3] + pred_rf[i]
    x_test_stacking.iloc[i, 4] = x_test_stacking.iloc[i, 4] + pred_knn[i]
#------------------------------------------------------------------
##################### Average the accumulated test-set results #####
print(x_test_stacking)

# Divide each of the first five stacking columns by 3, row by row.
# Presumably 3 is the number of predictions summed into each cell earlier
# in the script - TODO confirm.
for i in range(len(x_test)):
    for j in range(5):
        x_test_stacking.iloc[i, j] = x_test_stacking.iloc[i, j] / 3

print(x_test_stacking)

################################### Second layer: XGBoost #############
# Fit the second-level regressor on the stacked training features and
# predict on the averaged stacked test features.
xgb = XGBRegressor(max_depth=4,
                   learning_rate=0.005,
                   n_estimators=500,
                   silent=True,
                   objective='reg:linear',
                   subsample=0.93,
                   base_score=y_mean,
                   seed=0,
                   missing=None)
xgb.fit(x_train_stacking, y_train)
pred = xgb.predict(x_test_stacking)
print(pred)

# Write the predictions next to the test IDs (cast to int32) as a CSV file.
output = pd.DataFrame({'id': test['ID'].astype(np.int32), 'y': pred})
output.to_csv(
    'C:\\Users\\Administrator\\Desktop\\benz\\new\\test_stacking.csv',
    index=False)
Ejemplo n.º 25
0
# ## XG BOOST ON SMOTE

# In[127]:

import xgboost as xgb

# In[128]:

from xgboost import XGBClassifier
# Sweep the number of trees and record train/test accuracy for each setting.
tree_range = range(2, 30, 5)
score1 = []  # accuracy on the SMOTE-resampled training set
score2 = []  # accuracy on the test set
for tree in tree_range:
    # NOTE: this rebinds `xgb`, which was imported above as the module alias.
    xgb = XGBClassifier(n_estimators=tree)
    xgb.fit(X_smote, y_smote)
    score1.append(xgb.score(X_smote, y_smote))
    score2.append(xgb.score(X_test, y_test))

# Notebook magic: render matplotlib figures inline.
get_ipython().run_line_magic('matplotlib', 'inline')
plt.plot(tree_range, score1, label='Accuracy on training set')
plt.plot(tree_range, score2, label='Accuracy on testing set')
plt.xlabel('Value of number of trees in XGboost')
plt.ylabel('Accuracy')
plt.legend()

# As we can see accuracy is increasing for the test and stabilizes at one point

# In[129]:

# Model with the tree count chosen from the sweep above (not fitted here).
xgb = XGBClassifier(n_estimators=18)
Ejemplo n.º 26
0
# Report the elapsed time of the preceding neural-network step
# (`tic6` is set earlier in the script).
toc6 = time.time()
print('Elapsed time for Neural network is %f seconds \n' % float(toc6 - tic6))

#--------------- XGBoost algorithm
import xgboost as xgb
from xgboost.sklearn import XGBClassifier
tic7 = time.time()
# NOTE: this rebinds `xgb` from the module alias to the classifier instance.
# NOTE(review): `n_fold` does not look like an XGBClassifier parameter -
# confirm it has any effect.
xgb = XGBClassifier(objective='multi:softmax',
                    num_class=4,
                    n_fold=4,
                    colsample_bytree=1,
                    learning_rate=0.15,
                    max_depth=5,
                    n_estimators=600,
                    subsample=0.3)
xgb.fit(X_train, y_train)

y_pred = xgb.predict(X_test)

# Get the accuracy score
acc = accuracy_score(y_test, y_pred)

# Get f1 score
f1_xgb = f1_score(y_test, y_pred, average='weighted')

# Append to the accuracy list
#accuracy_lst.append(acc)
#f1_lst.append(f1_xgb)

print("[XGBoost algorithm] accuracy_score: {:.3f}.".format(acc))
print("[XGBoost algorithm] f1_score: {:.3f}.".format(f1_xgb))
Ejemplo n.º 27
0
# Cross-validate with the native XGBoost API to pick the number of rounds.
# `xgb`, `xgb_params` and `dtrain` come from earlier in the script.
dtest = xgb.DMatrix(x_test)
cv_output = xgb.cv(xgb_params, dtrain, num_boost_round=1000, early_stopping_rounds=20,verbose_eval=50, show_stdv=False)
cv_output[['train-rmse-mean', 'test-rmse-mean']].plot()

# Train the final booster for as many rounds as the CV output has rows.
num_boost_rounds = len(cv_output)
model = xgb.train(dict(xgb_params, silent=0), dtrain, num_boost_round= num_boost_rounds)

xgb.plot_importance(model, height=0.5)

num_boost_round = model.best_iteration
xgb.plot_importance(model,  height=0.5)

from xgboost.sklearn import XGBRegressor
# NOTE: from here on `xgb` is the regressor instance, no longer the module.
xgb = XGBRegressor( nthread=-1,  missing= -1, n_estimators=300, learning_rate=0.02, max_depth=17, subsample=0.9
                   , min_child_weight=3, colsample_bytree=0.7, reg_alpha=100, reg_lambda=100, silent=False)
xgb.fit(x_train,y_train)
#print(x_train)
pred=xgb.predict(x_test)
# Round the regression outputs so they can be compared as class labels.
predictions = [round(value) for value in pred]
"""x_test['result']=pred
x_test['crop']=y_test
x_test.to_csv('pred.csv')"""
#print accuracy_score(y_test,pred)
accuracy = accuracy_score(y_test, predictions)
  
print("Accuracy: %.2f%%" % (accuracy * 100.0))
# Flask endpoint for POST/GET requests on /predictor.
# NOTE(review): only the start of this handler is visible here; the visible
# part reads two fields from the JSON body and returns nothing.
@app.route('/predictor',methods=['POST','GET'])
def predictor():
   # force=True parses the body as JSON regardless of the Content-Type header
   data=request.get_json(force=True)
   # Each field is converted with str(), so a missing key becomes 'None'
   a=str(data.get("rain"))
   b=str(data.get("temperature"))
Ejemplo n.º 28
0
    'Supermarket Type3': 2,
    'Supermarket Type2': 1
}
# Replace each Outlet_Type value by its code from the `gender` mapping
# (defined above this excerpt).
datatest.Outlet_Type = [gender[item] for item in datatest.Outlet_Type]

datatest.head()

# Using a random forest regressor
# Both models are trained on columns 1..5 of `data` against Item_Outlet_Sales.
regr = RandomForestRegressor(max_depth=2, random_state=0, n_estimators=500)
regr.fit(data[data.columns[1:6]], data["Item_Outlet_Sales"])
datat.dtypes
pred = regr.predict(datatest[datatest.columns[1:6]])

# Using XGBoost
# NOTE: this rebinds `xgb` from the module to the regressor instance, and
# `pred` below overwrites the random-forest predictions.
xgb = xgb.XGBRegressor(n_estimators=50,
                       learning_rate=0.09,
                       gamma=0,
                       subsample=0.85,
                       colsample_bytree=1,
                       max_depth=7)
xgb.fit(data[data.columns[1:6]], data["Item_Outlet_Sales"])
pred = xgb.predict(datatest[datatest.columns[1:6]])

# Store the XGBoost predictions and export the submission columns.
datat["Item_Outlet_Sales"] = pred
newdf = datat[['Item_Identifier', 'Outlet_Identifier', 'Item_Outlet_Sales']]
newdf.to_csv("D://24projects//Project 3//output.csv",
             encoding='utf-8',
             index=False)

datat["Item_Outlet_Sales"].isnull().sum()
# Scale the features, hold out 10% for testing and fit several classifiers
# on the same split.
features1 = scaler.fit_transform(features1)
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(features1, target, test_size=0.1, random_state=42)
from sklearn import svm
svm1 = svm.SVC()
svm1.fit(X_train, y_train)  
predictionssvm = svm1.predict(X_test)
dtc=DecisionTreeClassifier()
modeldtc = dtc.fit(X_train, y_train)
predictionsdtc = dtc.predict(X_test)
adb=AdaBoostClassifier()
modeladb=adb.fit(X_train, y_train)
predictionsadb = adb.predict(X_test)
from sklearn.ensemble import GradientBoostingClassifier
# NOTE: despite its name, `xgb` holds a scikit-learn
# GradientBoostingClassifier here, not an XGBoost model.
xgb= GradientBoostingClassifier()
modelxbg=xgb.fit(X_train, y_train)
predictionsxgb = xgb.predict(X_test)
import operator
from sklearn.neural_network import MLPClassifier
mlp=MLPClassifier(solver='adam',activation='tanh',random_state=0)
modelmlp=mlp.fit(X_train,y_train)
predictionmlp=mlp.predict(X_test)

#4. Stacked Classifier
# Base learners and the meta-learner for the stacked model.
X=features1
y=target
clf1 = adb
clf2 = dtc
clf3 = svm1
meta = LogisticRegression()
    def calculateRankMatrix(train_data_mix, train_Y, colnames, threshold):
        """Rank features by the mean of five model-based rankings.

        Five rankings are computed with ``YoungPeopleEmpathy.ranking``: the
        feature importances of an extra-trees classifier, an XGBoost
        classifier, an AdaBoost classifier and a random forest, plus the RFE
        ranking of a logistic regression. For every column the five scores
        are averaged and rounded to two decimals.

        Args:
            train_data_mix: Training feature matrix.
            train_Y: Training labels.
            colnames: Feature names, in column order.
            threshold: Not used by this function.

        Returns:
            DataFrame with columns ``'Feature'`` and ``'Mean Ranking'``,
            sorted by mean ranking in descending order.
        """
        ranks = {}

        clf = ExtraTreesClassifier()
        clf = clf.fit(train_data_mix, train_Y)

        ranks["tree"] = YoungPeopleEmpathy.ranking(clf.feature_importances_,
                                                   colnames)

        xgb = XGBClassifier(max_depth=10,
                            min_child_weight=8,
                            gamma=0.7,
                            colsample_bytree=0.7,
                            subsample=0.7,
                            reg_alpha=1)

        xgb = xgb.fit(train_data_mix, train_Y)

        ranks["xgb"] = YoungPeopleEmpathy.ranking(xgb.feature_importances_,
                                                  colnames)

        ada = AdaBoostClassifier(DecisionTreeClassifier(max_depth=2),
                                 n_estimators=600,
                                 learning_rate=1)
        ada = ada.fit(train_data_mix, train_Y)

        ranks["ada"] = YoungPeopleEmpathy.ranking(ada.feature_importances_,
                                                  colnames)

        model = LogisticRegression()
        # Create the RFE model and select 3 attributes. The count is passed
        # by keyword: newer scikit-learn rejects it as a positional argument.
        rfe = RFE(model, n_features_to_select=3)
        rfe = rfe.fit(train_data_mix, train_Y)
        # Column names sorted by ranking
        ranks["RFE"] = YoungPeopleEmpathy.ranking(list(map(
            float, rfe.ranking_)),
                                                  colnames,
                                                  order=-1)

        # Only non-default settings are spelled out. `min_impurity_split`
        # and `max_features='auto'` were removed from scikit-learn; 'sqrt'
        # is what 'auto' meant for classifiers.
        rf = RandomForestClassifier(max_depth=10,
                                    max_features='sqrt',
                                    n_estimators=100,
                                    n_jobs=1)
        rf.fit(train_data_mix, train_Y)
        ranks["RF"] = YoungPeopleEmpathy.ranking(rf.feature_importances_,
                                                 colnames)

        # Mean score of every feature across the five rankings
        r = {
            name: round(np.mean([ranks[method][name] for method in ranks]), 2)
            for name in colnames
        }

        # Put the mean scores into a Pandas dataframe
        meanplot = pd.DataFrame(list(r.items()),
                                columns=['Feature', 'Mean Ranking'])

        # Sort the dataframe
        meanplot = meanplot.sort_values('Mean Ranking', ascending=False)

        return meanplot
Ejemplo n.º 31
0
import xgboost as xgb
from xgboost import XGBRegressor

# Fit the XGBoost regressor on the training data.
# NOTE: this rebinds `xgb` from the module alias to the regressor instance.
xgb = XGBRegressor(learning_rate=0.01,
                   n_estimators=3460,
                   max_depth=3,
                   min_child_weight=0,
                   gamma=0,
                   subsample=0.7,
                   colsample_bytree=0.7,
                   objective='reg:linear',
                   nthread=4,
                   scale_pos_weight=1,
                   seed=27,
                   reg_alpha=0.00006)
xgb_fit = xgb.fit(x_train, y_train)

#----------------------svm
from sklearn import svm

svr_opt = svm.SVR(C=100000, gamma=1e-08)
svr_fit = svr_opt.fit(x_train, y_train)
# NOTE(review): `.mean` is referenced without being called, so no mean is
# computed here - presumably `.mean()` was intended; confirm.
cv_rmse(svr_fit).mean

#-----------------LGBMRegressor
from lightgbm import LGBMRegressor

lgbm_model = LGBMRegressor(objective='regression',
                           num_leaves=5,
                           learning_rate=0.05,
                           n_estimators=720,
'''
{'colsample_bytree': 0.5,
 'gamma': 0.15,
 'learning_rate': 0.1,
 'max_delta_step': 0,
 'max_depth': 6,
 'n_estimators': 27,
 'subsample': 0.45}
'''

# Score recorded by the author for the parameter set quoted above.
xgb.best_score_ # 0.83585339132974634

# Refit a classifier with those parameters on the full training data.
xgb = XGBClassifier(max_depth=6, learning_rate=0.1, n_estimators=27,
                    objective='multi:softprob', subsample=0.4, colsample_bytree=0.5, seed=0)  

xgb.fit(X, y)
xgb_predictions = xgb.predict_proba(X_test)

# Put these in a good form to spit out
# Flatten the per-row class probabilities into one long vector.
xgb_predictions = xgb_predictions.ravel()

# Have to ensure these are in the same order, yep, looks good
# Class labels repeated once per test row; ids repeated 12 times each.
# NOTE(review): 12 is presumably the number of classes - confirm it equals
# len(xgb.classes_).
classes = np.tile(xgb.classes_, X_test.shape[0])
ids = np.repeat(test["id"].values, 12)

print(xgb_predictions.shape)
print(classes.shape)
print(ids.shape)
print(test_users['id'].shape)
print(test['id'].shape)
Ejemplo n.º 33
0
import os
import time

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import xgboost as xgb
from sklearn.metrics import explained_variance_score
from sklearn.metrics import mean_squared_error
from sklearn.model_selection import train_test_split

# Load the data set and separate the COST target from the features.
dataset = pd.read_csv('C:/Users/Riahi/Desktop/PROJET_PATIE_ML/BD_Projet_NEW112- Copie.csv', delimiter=';')
price = dataset['COST']
Data = dataset.drop(['COST'], axis=1)

# Target as a column vector
x = np.array(price).reshape(-1, 1)

X_train, X_test, Y_train, Y_test = train_test_split(Data, x, test_size=0.33, random_state=0)

# The model gets its own name so the `xgb` module alias stays usable for
# plotting below. The former `objectives=` keyword was misspelled and never
# applied, so it is dropped and the library's default objective is used.
model = xgb.XGBRegressor(n_estimators=10, max_depth=5, learning_rate=0.3)

# Time the fit
dep = time.time()
model.fit(X_train, Y_train)
fin = time.time() - dep

predictions = model.predict(X_test)
rmse = np.sqrt(mean_squared_error(Y_test, predictions))
print("RMSE: %f" % (rmse))
EV = explained_variance_score(Y_test, predictions)
print("EV : %f" %(EV))

# Switch to the Graphviz binary directory before drawing the tree.
os.getcwd()
os.chdir('C:/Program Files (x86)/Graphviz2.38/bin')
# plot_tree is a function of the xgboost module and takes the fitted model.
xgb.plot_tree(model, num_trees=9)
plt.rcParams['figure.figsize'] = [50, 10]
# Native-API matrices for the sample split (`xgb` is the xgboost module here).
T_train_sample_xgb = xgb.DMatrix(X_train_sample, Y_train_sample)
X_test_sample_xgb = xgb.DMatrix(X_test_sample)


# NOTE: from here on `xgb` is the classifier instance, no longer the module.
xgb = XGBClassifier(max_depth=6, learning_rate=0.1, n_estimators=200,
                    objective='multi:softprob', subsample=0.5, colsample_bytree=0.5, seed=0)
#scores:  XGBClassifier(max_depth=6, learning_rate=0.1, n_estimators=50
#0.8183974444336767 121 rounds
# Encode the destination labels of the sample test set as numbers.
Y_test_sample = test_sample["country_destination"]
Y_test_sample = Y_test_sample.map(country_num_dic)

X_train_sample.isnull().sum()

# Fit with early stopping; both the training and the test sample are
# monitored with multi-class log loss.
eval_set  = [(X_train_sample, Y_train_sample), (X_test_sample, Y_test_sample)]
xgb.fit(X_train_sample, Y_train_sample, eval_set = eval_set, eval_metric = 'mlogloss', early_stopping_rounds= 10)
Y_pred_sample = xgb.predict_proba(X_test_sample)


# Numeric label arrays for each data set
y_le_train_sample = (train_sample['country_destination'].map(country_num_dic)).values
y_le_test_sample = (test_sample['country_destination'].map(country_num_dic)).values
y_le_train = (train['country_destination'].map(country_num_dic)).values

# Row identifiers for each data set
id_train = train['id'].values
id_train_sample = train_sample['id'].values
id_test_sample = test_sample['id'].values
id_test = test['id'].values


#------------- TRAIN SAMPLE PREDICTION --------------------------
ids = []  #list of ids