Example #1
def ModelPredict(para):

    # para=[7.16, 0.4, 0.31,0.9,2.7,1.48, 0.78, 0.86]
    data = pd.DataFrame(columns=('R', 'angle', 'occlusion', 'score'))
    print("Prediction started:")

    for i in range(int(len(para) / 4)):
        print(i * 4)
        data.loc[i] = para[i * 4:i * 4 + 4]
    print(data)
    y_test = data.pop('score')
    x_test = data
    print(x_test)
    print(y_test)
    cab, lgb, xgb, gbdt, stack_lr = LoadModel()

    print("加载完毕:")
    y_pred_cab_test = cab.predict(x_test)
    y_pred_lgb_test = lgb.predict(x_test)
    y_pred_xgb_test = xgb.predict(x_test)
    y_pred_gbdt_test = gbdt.predict(x_test)

    print("stack")
    stack_x_test = pd.DataFrame()
    stack_x_test['Method_1'] = y_pred_cab_test
    stack_x_test['Method_2'] = y_pred_lgb_test
    stack_x_test['Method_3'] = y_pred_xgb_test
    stack_x_test['Method_4'] = y_pred_gbdt_test
    stack_pred = stack_lr.predict(stack_x_test)
    print("stack_mae:",
          mean_absolute_error(y_test, stack_pred))  #mae:2.1501818709279975
    print(stack_pred.tolist())
    return stack_pred.tolist()
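Example #1 relies on a LoadModel() helper that is not shown. A minimal sketch of what it might look like, assuming the four base regressors and the stacking meta-model were saved with joblib under hypothetical file names:

def LoadModel():
    # Hypothetical helper: load the four base models and the stacking
    # meta-model that ModelPredict() expects (file paths are assumptions).
    import joblib
    cab = joblib.load('catboost.pkl')
    lgb = joblib.load('lightgbm.pkl')
    xgb = joblib.load('xgboost.pkl')
    gbdt = joblib.load('gbdt.pkl')
    stack_lr = joblib.load('stack_lr.pkl')
    return cab, lgb, xgb, gbdt, stack_lr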
Example #2
def prediction():
    xgb = xgboost.XGBRegressor(n_estimators=100, learning_rate=0.08, gamma=0, subsample=0.75,
                           colsample_bytree=1, max_depth=7)
    xgb.fit(X_train, y_train)
    predictions = xgb.predict(X_test)
    # explained_variance_score expects (y_true, y_pred)
    print(explained_variance_score(y_test, predictions))
Example #3
    def inference(self, spec):
        # Preprocess the data
        X = spec.drop(['price', '詳細情報'], axis=1).values  # as_matrix() was removed in pandas 1.0
        y = spec['price']
        indices = spec.index

        # Load the model
        xgb = self.load_model()

        # Inference
        prediction = xgb.predict(X)

        # Compare the results
        error = y - prediction
        error_per = abs(error) / y * 100
        result = pd.DataFrame(
            {
                'prediction': prediction,
                'error': error,
                'error_percent': error_per,
            },
            index=indices)
        result_all = pd.concat([spec, result], axis=1)
        treasure = result_all[(result_all['error_percent'] > 5)
                              & (result_all['error'] < 0)]

        # Print the results
        # print(result_all)
        return treasure
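The load_model() method used above is not shown. A minimal sketch, assuming the trained regressor was pickled to a hypothetical path:

    def load_model(self):
        # Hypothetical counterpart of self.load_model() in Example #3:
        # unpickle a previously trained XGBoost regressor.
        import pickle
        with open('xgb_model.pkl', 'rb') as f:  # path is an assumption
            return pickle.load(f)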
Example #4
def XGBoost_classifier(X_train, train_target, X_test):
    X_test = X_test.values

    xgb = XGBClassifier()
    xgb.fit(X_train, train_target)
    hyp = xgb.predict(X_test)
    return hyp
Example #5
def xgboost(train_x,train_y,test_x,test_y):
    import xgboost as xgb
    # min_samples_leaf is a scikit-learn tree parameter that XGBClassifier does not accept,
    # and binding the model to a new name avoids shadowing the xgb module
    clf = xgb.XGBClassifier(n_estimators=150, max_depth=6)
    clf.fit(train_x, train_y)
    y_pred = clf.predict(test_x)
    print(classification_report(test_y, y_pred))
    print(confusion_matrix(test_y, y_pred))
    print('xgboost accuracy is', accuracy_score(test_y, y_pred))
Example #6
    def XGBscore(self):

        X_train_leaves = self.x_train
        y_train = self.y_train
        X_test_leaves = self.x_test
        y_test = self.y_test
        xgb = XGBClassifier()
        xgb.fit(X_train_leaves, y_train)
        # use predicted probabilities rather than hard labels for a meaningful AUC
        Y_pred_xgb = xgb.predict_proba(X_test_leaves)[:, 1]
        xgb_auc = roc_auc_score(y_test, Y_pred_xgb)
        print('GBDT + XGB auc: %.5f' % xgb_auc)
Example #7
def test_xgboost_sklearn_regressor():
    l1 = []
    from sklearn.datasets import load_boston
    boston = load_boston()
    xgb = XGBRegressor()
    xgb.fit(boston.data, boston.target)

    predictions = xgb.predict(boston.data)
    l1 += predictions.tolist()
    print(predictions)
    print(type(predictions))
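Note: load_boston was removed in scikit-learn 1.2, so Example #7 only runs on older versions. A sketch of the same smoke test against the California housing dataset, assuming a recent scikit-learn:

def test_xgboost_sklearn_regressor_california():
    # Same check as Example #7, using a dataset that still ships with scikit-learn.
    from sklearn.datasets import fetch_california_housing
    housing = fetch_california_housing()
    model = XGBRegressor()
    model.fit(housing.data, housing.target)
    print(model.predict(housing.data))
Example #8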
def train_xgb(data):
    X = data.drop(['cause'], axis=1).values
    Y = data['cause'].values
    X_train, X_test, y_train, y_test = train_test_split(X,
                                                        Y,
                                                        test_size=0.2,
                                                        random_state=0)
    xgb = XGBClassifier(n_estimators=300)
    xgb.fit(X_train, y_train)
    preds = xgb.predict(X_test)
    acc_xgb = (preds == y_test).sum().astype(float) / len(preds) * 100
    print("XGBoost's prediction accuracy is: %3.2f" % (acc_xgb))
Example #9
def check_performance(g_z,
                      train_data,
                      test_data,
                      data_cols,
                      label_cols=[],
                      seed=0,
                      with_class=False,
                      data_dim=2):
    #train_data,test_data = load_preprocess_aps_data()
    if len(label_cols) > 0:
        gen_df = pd.DataFrame(g_z[:, :-1], columns=data_cols)
    else:
        gen_df = pd.DataFrame(g_z, columns=data_cols)

    gen_df['failure'] = np.ones((g_z.shape[0], 1))
    combined_train_df = pd.concat([train_data, gen_df])
    print(train_data.shape, gen_df.shape, combined_train_df.shape)
    xgb_params = {
        # 'tree_method': 'hist', # for faster evaluation
        'max_depth': 3,  # for faster evaluation
        'n_estimators': 18,
        #'objective': 'binary:logistic',
        'random_state': 0,
        #'eval_metric': 'auc',  # allows for balanced or unbalanced classes
        'scale_pos_weight': 40,
        'min_child_weight': 44,
        'silent': 1
    }

    X_train = combined_train_df[combined_train_df.columns.drop(
        'failure')].values
    y_train = combined_train_df.failure

    X_test = test_data[test_data.columns.drop('failure')].values
    y_test = test_data.failure

    xgb = XGBClassifier(max_depth=3,
                        n_estimators=18,
                        n_jobs=-1,
                        scale_pos_weight=40,
                        min_child_weight=44)
    xgb.fit(X_train, y_train)
    y_pred = xgb.predict(X_test)

    # dtrain = xgb.DMatrix(X_train, y_train, feature_names=data_cols + label_cols)
    # dtest = xgb.DMatrix(X_test, feature_names=data_cols + label_cols)
    # xgb_test = xgb.train(xgb_params, dtrain, num_boost_round=10)  # limit to ten rounds for faster evaluation
    #
    # y_pred = np.round(xgb_test.predict(dtest))
    print('Test performance confusion', confusion_matrix(
        y_test, y_pred))  # assumes balanced real and generated datasets
    return aps_cost(y_pred,
                    y_test)  # assumes balanced real and generated datasets
Example #10
def fonction_model_xgb(data):
   
    df = fonction_select_xgb(data)
   
    X = df.drop("tx_rec_marg_Bin",axis = 1)
    y = df["tx_rec_marg_Bin"]
   
    X_train, X_test,y_train, y_test = train_test_split(X,y,test_size=0.3,random_state = 0)

    kf = KFold(n_splits=3)  
    kf.get_n_splits(X)



    Quant=df[[col for col in df.columns.to_list() if df[col].nunique() > 3]]
   
    num = list(Quant.columns)
   

    scaler = StandardScaler().fit(X_train[num])
    X_train[num] = scaler.transform(X_train[num])
    X_test[num]  = scaler.transform(X_test[num])
   

    # criterion and max_features are scikit-learn tree parameters; XGBClassifier does not use them
    xgb = XGBClassifier(max_depth=7, n_estimators=500)
   

    #grid_xgb = GridSearchCV (estimator = xgb, param_grid=param_grid ,scoring="accuracy")

    #print(grid_rf.best_params_)

    xgb.fit(X_train, y_train)


    y_pred = xgb.predict(X_test)

    print(classification_report(y_test,y_pred))
    #print(grid_xgb.best_params_)
   
    xgb_shap = XGBClassifier(max_depth=7, n_estimators=500)
   
    xgb_shap.fit(X_train, y_train)
    shap_values = shap.TreeExplainer(xgb_shap).shap_values(X_train)

    shap.summary_plot(shap_values, X_train, plot_type="bar")
    print(confusion_matrix(y_test, y_pred))
    print(f1_score(y_test, y_pred, average='micro'))
    
    return data
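Example #11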
    def runXGBoost(train_data_mix_n, train_Y, test_data_mix_n, test_Y):
        xgb = XGBClassifier(max_depth=10,
                            min_child_weight=6,
                            gamma=0.5,
                            colsample_bytree=0.7,
                            subsample=0.7,
                            reg_alpha=1)

        xgb.fit(train_data_mix_n, train_Y)

        predicted_label = xgb.predict(test_data_mix_n)
        print("Test accuracy using XGBoost Classifier")
        print(accuracy_score(test_Y, predicted_label))
        print("Confusion Metrix for XGBoost Classifier..")
        cnf_matrix = confusion_matrix(test_Y, predicted_label)
        print(cnf_matrix)
Example #12
def xgb_model_1(X_train,y_train,X_test,params=None):
    # train with the scikit-learn API
    xgb = XGBRegressor(n_estimators=1000, max_depth=13, min_child_weight=150, 
                   subsample=0.7, colsample_bytree=0.3)
    y_test = np.zeros(len(X_test))
    for i, (train_ind, val_ind) in enumerate(KFold(n_splits=2, shuffle=True, 
                                            random_state=1989).split(X_train)):
        print("----------------------")
        print("Training model #%d" % i)
        print("----------------------")
        # XGBRegressor.fit 
        xgb.fit(X_train[train_ind], y_train[train_ind],
                eval_set=[(X_train[val_ind], y_train[val_ind])],
                early_stopping_rounds=10, verbose=25)
        
        y_test += xgb.predict(X_test, ntree_limit=xgb.best_ntree_limit)
    y_test /= 2

    return y_test
Example #13
def predict(data):
    """预测"""

    X_all = data.drop(['FTR'], axis=1)
    X_all = change_type(X_all)
    X_all = one_hot_encode(X_all)

    y_all = data['FTR']
    y_all = y_all.map({'NH': 0, 'H': 1})  # map the labels to 0 and 1

    # Load the model
    xgb = joblib.load('xgboost_model_demo.model')

    # Randomly sample 10 rows for prediction, keeping the matching labels
    random_x = X_all.sample(n=10)
    random_y = y_all.loc[random_x.index]
    # Run the prediction
    predict_result = xgb.predict(random_x)
    print("Actual: %s\nPredicted: %s" % (random_y.values, predict_result))
Example #14
def pred():
    xgb = XGBClassifier(booster='gbtree', gamma=0.0, max_depth=8, min_child_weight=3, learning_rate=0.03, n_jobs=-1,
                        scale_pos_weight=1, reg_alpha=0.1, reg_lambda=1, colsample_bytree=0.9, subsample=0.8,
                        n_estimators=370, objective="binary:hinge", tree_method='gpu_hist', gpu_id=0,
                        random_state=5477113)
    xgb.fit(x_combined, y_combined)
    y_pred = xgb.predict(X_test)
    team_name = 'TeamFOSAI'
    submission_index = 2
    label_file = '/media/jose/hk-data/PycharmProjects/the_speech/data/mask/labels/labels.csv'
    df_labels = pd.read_csv(label_file)
    # Write out predictions to csv file (official submission format)
    pred_file_name = task + '.' + feat_type +'.test.' + team_name + '_' + str(submission_index) + '.csv'
    print('Writing file ' + pred_file_name + '\n')
    df = pd.DataFrame(data={'file_name': df_labels['file_name'][df_labels['file_name'].str.startswith('test')].values,
                            'prediction': le.inverse_transform(y_pred).flatten()},
                      columns=['file_name','prediction'])
    df.to_csv(pred_file_name, index=False)

    print('Done.\n')
Example #15
def predictor():
    data = request.get_json(force=True)
    a = str(data.get("rain"))
    b = str(data.get("temperature"))
    c = str(data.get("season"))
    # c = data.get("humidity")
    d = str(data.get("state"))
    e = str(data.get("year"))
    f = str(data.get("P_r"))  # fetched but unused below: a literal 6 is passed for P_r
    array = []
    array.append([getIndex(state_enc, d), getIndex(enc_year, e), getIndex(enc_season, c), float(b), float(a), float(6)])
    array = np.array(array)
    df = pd.DataFrame(array, columns=['State_Name', 'Crop_Year', 'Season', 'temperature', 'Rainfall', 'P_r'])
    prediction = xgb.predict(df)
    prediction = np.round(prediction)
    prediction = int(prediction[0])
    ans = enc_crop[prediction]
    ans = [ans]
    ans = np.array(ans)
    df = pd.DataFrame(ans, columns=['crop'])
    return df.to_json(orient='records')
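getIndex() and the encoders (state_enc, enc_year, enc_season, enc_crop) are defined elsewhere in the source. A minimal sketch consistent with the usage above, assuming each encoder is a list of category labels:

def getIndex(enc, value):
    # Hypothetical helper: map a category label to its integer code.
    return enc.index(value)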
Example #16
def train_xgb():
    import xgboost as xgb
    train_set, evaluation_set = split_train_set(encoding())
    # train_set, evaluation_set = split_train_set()
    train_set.fillna(0, inplace=True)
    print(train_set.head())
    print('prepare for the training...')
    features = [x for x in train_set.columns if x not in ['label']]
    y_train = train_set['label']
    X_train = train_set[features]

    y_test = evaluation_set['label']
    X_test = evaluation_set[features]

    xgb_train = xgb.DMatrix(X_train, y_train)
    xgb_eval = xgb.DMatrix(X_test, y_test)  # labelled, so it can drive early stopping
    print('X_train shape')
    print(X_train.shape)
    print('y_train shape')
    print(y_train.shape)

    params = {
        'objective': 'binary:logistic',
        'eta': 0.1,
        'colsample_bytree': 0.886,
        'min_child_weight': 2,
        'max_depth': 10,
        'subsample': 0.886,
        # 'alpha': 10,
        # 'gamma': 30,
        # 'lambda': 50,
        'verbose_eval': True,
        'eval_metric': 'auc',
        'scale_pos_weight': 10,
        'seed': 201703,
        'missing': -1
    }
    # xgb.train needs an evals watchlist for early stopping; bind the result to a
    # new name so the xgb module is not shadowed
    bst = xgb.train(params, xgb_train, evals=[(xgb_eval, 'eval')],
                    early_stopping_rounds=20)
    pre = bst.predict(xgb_eval)
    print(type(pre))
Example #17
#--------------- XGBoost algorithm
import xgboost as xgb
from xgboost.sklearn import XGBClassifier
tic7 = time.time()
# n_fold is not an XGBClassifier parameter, so it is not passed
xgb = XGBClassifier(objective='multi:softmax',
                    num_class=4,
                    colsample_bytree=1,
                    learning_rate=0.15,
                    max_depth=5,
                    n_estimators=600,
                    subsample=0.3)
xgb.fit(X_train, y_train)

y_pred = xgb.predict(X_test)

# Get the accuracy score
acc = accuracy_score(y_test, y_pred)

# Get f1 score
f1_xgb = f1_score(y_test, y_pred, average='weighted')

# Append to the accuracy list
#accuracy_lst.append(acc)
#f1_lst.append(f1_xgb)

print("[XGBoost algorithm] accuracy_score: {:.3f}.".format(acc))
print("[XGBoost algorithm] f1_score: {:.3f}.".format(f1_xgb))
toc7 = time.time()
print('Elapsed time for XGBoost is %f seconds \n' % (toc7 - tic7))
Example #18
cv_output[['train-rmse-mean', 'test-rmse-mean']].plot()

num_boost_rounds = len(cv_output)
model = xgb.train(dict(xgb_params, silent=0), dtrain, num_boost_round= num_boost_rounds)

xgb.plot_importance(model, height=0.5)

num_boost_round = model.best_iteration
xgb.plot_importance(model,  height=0.5)

from xgboost.sklearn import XGBRegressor
xgb_reg = XGBRegressor(nthread=-1, missing=-1, n_estimators=300, learning_rate=0.02, max_depth=17, subsample=0.9,
                       min_child_weight=3, colsample_bytree=0.7, reg_alpha=100, reg_lambda=100, silent=False)
xgb_reg.fit(x_train, y_train)
#print(x_train)
pred = xgb_reg.predict(x_test)
predictions = [round(value) for value in pred]
"""x_test['result']=pred
x_test['crop']=y_test
x_test.to_csv('pred.csv')"""
# print(accuracy_score(y_test, pred))
accuracy = accuracy_score(y_test, predictions)
  
print("Accuracy: %.2f%%" % (accuracy * 100.0))
Example #19
    'Supermarket Type3': 2,
    'Supermarket Type2': 1
}
datatest.Outlet_Type = [gender[item] for item in datatest.Outlet_Type]

datatest.head()

# using Random Forest Regressor
regr = RandomForestRegressor(max_depth=2, random_state=0, n_estimators=500)
regr.fit(data[data.columns[1:6]], data["Item_Outlet_Sales"])
datat.dtypes
pred = regr.predict(datatest[datatest.columns[1:6]])

#using Xgboost
xgb = xgb.XGBRegressor(n_estimators=50,
                       learning_rate=0.09,
                       gamma=0,
                       subsample=0.85,
                       colsample_bytree=1,
                       max_depth=7)
xgb.fit(data[data.columns[1:6]], data["Item_Outlet_Sales"])
pred = xgb.predict(datatest[datatest.columns[1:6]])

datat["Item_Outlet_Sales"] = pred
newdf = datat[['Item_Identifier', 'Outlet_Identifier', 'Item_Outlet_Sales']]
newdf.to_csv("D://24projects//Project 3//output.csv",
             encoding='utf-8',
             index=False)

datat["Item_Outlet_Sales"].isnull().sum()
print('Time Taken: {:.2f} seconds'.format(time_taken))
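Example #20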

# In[ ]:

print(accuracy_score(preds1, y_test))
print(classification_report(preds1, y_test))
print(confusion_matrix(preds1, y_test))

# In[ ]:

import xgboost as xgb
time1 = time.time()

xgb = xgb.XGBClassifier(n_jobs=-1)
xgb.fit(X_train, y_train)
preds2 = xgb.predict(X_test)
time_taken = time.time() - time1
print('Time Taken: {:.2f} seconds'.format(time_taken))

# In[ ]:

# manual method to check accuracy, see first 100 predictions, around 70% correct prediction
for i in range(100):
    if preds2[i] == np.array(y_test)[i]:
        print('1', end=', ')  # correct prediction
    else:
        print('0', end=', ')  # wrong prediction

# In[ ]:

preds2[0:100:5]
Example #21
def run_benchmark(args):

    try:
        dtest = xgb.DMatrix('dtest.dm')
        dtrain = xgb.DMatrix('dtrain.dm')

        if not (dtest.num_col() == args.columns \
                and dtrain.num_col() == args.columns):
            raise ValueError("Wrong cols")
        if not (dtest.num_row() == args.rows * args.test_size \
                and dtrain.num_row() == args.rows * (1-args.test_size)):
            raise ValueError("Wrong rows")
    except Exception:

        print("Generating dataset: {} rows * {} columns".format(
            args.rows, args.columns))
        print("{}/{} test/train split".format(args.test_size,
                                              1.0 - args.test_size))
        tmp = time.time()
        X, y = make_classification(args.rows,
                                   n_features=args.columns,
                                   n_redundant=0,
                                   n_informative=args.columns,
                                   n_repeated=0,
                                   random_state=7)
        if args.sparsity < 1.0:
            rng = np.random.RandomState(7)  # rng is not defined in the original snippet; seed is an assumption
            X = np.array([[
                np.nan if rng.uniform(0, 1) < args.sparsity else x
                for x in x_row
            ] for x_row in X])

        X_train, X_test, y_train, y_test = train_test_split(
            X, y, test_size=args.test_size, random_state=7)
        print("Generate Time: %s seconds" % (str(time.time() - tmp)))

        #save to .csv file
        np.savetxt('train.csv',
                   np.concatenate(
                       (X_train, y_train.reshape((y_train.shape[0], 1))),
                       axis=1),
                   fmt='%.8f',
                   delimiter=',')
        np.savetxt('test.csv',
                   np.concatenate((X_test, y_test.reshape(
                       (y_test.shape[0], 1))),
                                  axis=1),
                   fmt='%.8f',
                   delimiter=',')

        tmp = time.time()
        print("DMatrix Start")
        dtrain = xgb.DMatrix(X_train, y_train)
        dtest = xgb.DMatrix(X_test, y_test, nthread=-1)
        print("DMatrix Time: %s seconds" % (str(time.time() - tmp)))

        dtest.save_binary('dtest.dm')
        dtrain.save_binary('dtrain.dm')

    param = {
        'max_depth': 6,
        'eta': 0.1,
        'silent': 1,
        'objective': 'binary:logistic'
    }
    if args.params != '':
        param.update(ast.literal_eval(args.params))

    param['tree_method'] = args.tree_method
    print("Training with '%s'" % param['tree_method'])
    tmp = time.time()
    param['nthread'] = 24
    param['eval_metric'] = 'auc'

    # bst = xgb.train(param, dtrain, args.iterations, evals=[(dtest, "test")])
    bst = xgb.train(param, dtrain, args.iterations)
    print("Train Time: %s seconds" % (str(time.time() - tmp)))

    # predictions come from the returned Booster, not the xgb module
    preds = bst.predict(dtest)
    y_test = dtest.get_label()
    auc_score = metrics.roc_auc_score(y_test, preds)
    logger.info('auc = %f', auc_score)
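Example #22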
per = fs_results['per'][0]

# re-run the model with the best parameters and features selected
fs = feature_selection.SelectPercentile(feature_selection.f_classif, percentile = per)
feature_model =  fs.fit(features_train,target_train)

features_train_new = feature_model.transform(features_train)
features_test_new = feature_model.transform(features_test)

# Create the model
xgb = xgboost.XGBClassifier(n_estimators=est, learning_rate=lr, gamma=0, subsample=subsample,
                           colsample_bytree=colsample_bytree, max_depth=depth)

# Fit the model
xgb.fit(features_train_new, target_train)
pred_test = xgb.predict(features_test_new)
pred_train = xgb.predict(features_train_new)

# predict the games
predictions_train = [round(value) for value in pred_train]
predictions_test =  [round(value) for value in pred_test]

# calculate the accuracy
train_accuracy = accuracy_score(target_train, predictions_train)
test_accuracy = accuracy_score(target_test, predictions_test)

print(train_accuracy)
print(test_accuracy)

# store the predictions
pred_df1 = pd.DataFrame(predictions_test) 
Example #23
def c(x):
    # Map the StateHoliday codes to integers (the mapping must be returned,
    # otherwise every value comes back as None)
    if x == '0':
        return 0
    elif x == 'a':
        return 3
    elif x == 'b':
        return 2
    elif x == 'c':
        return 1

StateHoliday = df_all.StateHoliday.astype(str).apply(c).values
df_all.drop(["StateHoliday"], axis=1)
df_all["StateHoliday"] = StateHoliday


vals = df_all.values

print(df_all.columns)

X = vals[:piv_train]
y = Sales
X_test = vals[piv_train:]


xgb = XGBRegressor(max_depth=6, learning_rate=0.3, n_estimators=25, subsample=0.5, colsample_bytree=0.5, seed=0)
xgb.fit(X, y)

y_pred = xgb.predict(X_test)
sub = pd.DataFrame(np.column_stack((ids, y_pred)), columns=['Id', 'Sales'])
sub.to_csv('sub.csv',index=False)
Example #24
                        ('txt2', pipeline.Pipeline([('s2', cust_txt_col(key='product_title')), ('tfidf2', tfidf), ('tsvd2', tsvd)])),
                        ('txt3', pipeline.Pipeline([('s3', cust_txt_col(key='product_description')), ('tfidf3', tfidf), ('tsvd3', tsvd)])),
                        ('txt4', pipeline.Pipeline([('s4', cust_txt_col(key='brand')), ('tfidf4', tfidf), ('tsvd4', tsvd)]))
                        ],
                    transformer_weights = {
                        'cst': 1.0,
                        'txt1': 0.5,
                        'txt2': 0.25,
                        'txt3': 0.0,
                        'txt4': 0.5
                        },
                n_jobs = -1
                )), 
        ('rfr', rfr))])
param_grid = {'rfr__max_features': [10], 'rfr__max_depth': [20]}
model = grid_search.GridSearchCV(estimator = clf, param_grid = param_grid, n_jobs = -1, cv = 2, verbose = 20, scoring=RMSE)
model.fit(X_train, y_train)

XGBmodel = grid_search.GridSearchCV(estimator = clfXGB, param_grid = param_grid, n_jobs = -1, cv = 2, verbose = 20, scoring=RMSE)

XGBmodel.fit(X_train,y_train)

print("Best parameters found by grid search:")
print(model.best_params_)
print("Best CV score:")
print(model.best_score_)

y_pred = 0.25*model.predict(X_test) + 0.75*XGBmodel.predict(X_test)
pd.DataFrame({"id": id_test, "relevance": y_pred}).to_csv('submission.csv',index=False)
print("--- Training & Testing: %s minutes ---" % round(((time.time() - start_time)/60),2))
Example #25
def measure_others(X_train, y_train, X_test, y_test):
    # Logistic Regression
    log_reg_params = {
        "penalty": ['none', 'l2'],
        'C': [0.001, 0.01, 0.1, 1, 10, 100, 1000]
    }

    grid_log_reg = GridSearchCV(LogisticRegression(), log_reg_params)
    grid_log_reg.fit(X_train, y_train)
    # We automatically get the logistic regression with the best parameters.
    log_reg = grid_log_reg.best_estimator_

    knears_params = {
        "n_neighbors": list(range(1, 10, 1)),
        'algorithm': ['auto', 'ball_tree', 'kd_tree', 'brute']
    }

    grid_knears = GridSearchCV(KNeighborsClassifier(), knears_params)
    grid_knears.fit(X_train, y_train)
    # KNears best estimator
    knears_neighbors = grid_knears.best_estimator_
    k = grid_knears.best_estimator_.n_neighbors  # k nearest

    # Support Vector Classifier
    svc_params = {
        'C': [0.5, 0.7, 0.9, 1],
        'kernel': ['rbf', 'poly', 'sigmoid', 'linear']
    }
    grid_svc = GridSearchCV(SVC(), svc_params)
    grid_svc.fit(X_train, y_train)

    # SVC best estimator
    svc = grid_svc.best_estimator_

    # DecisionTree Classifier
    tree_params = {
        "criterion": ["gini", "entropy"],
        "max_depth": list(range(2, 4, 1)),
        "min_samples_leaf": list(range(5, 7, 1))
    }
    grid_tree = GridSearchCV(DecisionTreeClassifier(), tree_params)
    grid_tree.fit(X_train, y_train)

    # tree best estimator
    tree_clf = grid_tree.best_estimator_
    ########################################################################################

    y_pred_log_reg = log_reg.predict(X_test)
    y_pred_knear = knears_neighbors.predict(X_test)
    y_pred_svc = svc.predict(X_test)
    y_pred_tree = tree_clf.predict(X_test)

    ###############################################################################################
    gnb = GaussianNB()
    y_pred_gnb = gnb.fit(X_train, y_train).predict(X_test)
    ########################################################################################
    import xgboost as xgb
    xgb_model = xgb.XGBClassifier(objective="binary:logistic")
    xgb_params = {
        "colsample_bytree": uniform(0.7, 0.3),
        "gamma": uniform(0, 0.5),
        "learning_rate": uniform(0.03, 0.3),  # default 0.1 
        "max_depth": randint(2, 6),  # default 3
        "n_estimators": randint(100, 150),  # default 100
        "subsample": uniform(0.6, 0.4)
    }

    rand_xgb = RandomizedSearchCV(xgb_model,
                                  param_distributions=xgb_params,
                                  random_state=42,
                                  n_iter=200,
                                  cv=3,
                                  verbose=1,
                                  n_jobs=1,
                                  return_train_score=True)

    rand_xgb.fit(X_train, y_train)

    # XGBoost best estimator
    xgb = rand_xgb.best_estimator_

    y_pred_xgb = xgb.predict(X_test)

    return k, y_pred_log_reg, y_pred_knear, y_pred_svc, y_pred_tree, y_pred_gnb, y_pred_xgb
Example #26
    x_test_stacking.iloc[i, 3] = x_test_stacking.iloc[i, 3] + pred_rf[i]
    x_test_stacking.iloc[i, 4] = x_test_stacking.iloc[i, 4] + pred_knn[i]
#------------------------------------------------------------------
##################### Average the test-set results #####################
print(x_test_stacking)

for i in range(len(x_test)):
    for j in range(5):
        x_test_stacking.iloc[i, j] = x_test_stacking.iloc[i, j] / 3

print(x_test_stacking)

################################### Second layer: xgboost #############
xgb = XGBRegressor(max_depth=4,
                   learning_rate=0.005,
                   n_estimators=500,
                   silent=True,
                   objective='reg:linear',
                   subsample=0.93,
                   base_score=y_mean,
                   seed=0,
                   missing=None)
xgb.fit(x_train_stacking, y_train)
pred = xgb.predict(x_test_stacking)
print(pred)

output = pd.DataFrame({'id': test['ID'].astype(np.int32), 'y': pred})
output.to_csv(
    'C:\\Users\\Administrator\\Desktop\\benz\\new\\test_stacking.csv',
    index=False)
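Example #27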
    # The start of this snippet is truncated in the source; the constructor head
    # below is assumed to match the identical K-fold loop in Example #12
    xgb = XGBRegressor(n_estimators=1000, max_depth=13, min_child_weight=150,
                       subsample=0.7, colsample_bytree=0.3)
    y_test = np.zeros(len(X_test))

    for i, (train_ind, val_ind) in enumerate(
            KFold(n_splits=2, shuffle=True, random_state=1989).split(X_train)):
        print('----------------------')
        print('Training model #%d' % i)
        print('----------------------')

        xgb.fit(X_train[train_ind],
                y_train[train_ind],
                eval_set=[(X_train[val_ind], y_train[val_ind])],
                early_stopping_rounds=10,
                verbose=25)

        y_test += xgb.predict(X_test, ntree_limit=xgb.best_ntree_limit)
    y_test /= 2

    ### ================================================ ###

    df_sub = pd.DataFrame({
        'id':
        df_all[df_all['trip_duration'].isnull()]['id'].values,
        'trip_duration':
        np.exp(y_test)
    }).set_index('id')

    print(df_sub)
    df_sub.to_csv('~/NYC_Taxi_Trip_Duration/output/0824_xgb_387.csv')
    """
    #print (auto_classifier.config_dict)
Example #28
        "colsample_bytree": 0.95,
        "alpha": 2e-05,
        "lambda": 10
    }
    ROUNDS = 151

    print('xgboost train:')
    bst = xgboost.train(params=xgb_params,
                        dtrain=d_train,
                        num_boost_round=ROUNDS,
                        verbose_eval=10)

    #lgb.plot_importance(lightGBM, figsize=(9,40))
    #plt.show()

    # save model to file
    joblib.dump(bst, "xgboost.model")

    df_test = load_from_hdfs('df_test')
    final_preds = bst.predict(xgboost.DMatrix(df_test[features_to_use]))  # Booster.predict takes a DMatrix
    df_final_pred = pd.DataFrame(df_test[['order_id', 'product_id']])
    df_final_pred['prediction'] = final_preds

    df_final_pred = df_final_pred.loc[df_final_pred.prediction > 0.01,
                                      ['order_id', 'prediction', 'product_id']]
    df_order = applyParallel(df_final_pred.groupby(df_final_pred.order_id),
                             create_products).reset_index()

    df_order[['order_id', 'products']].to_csv('../submission/submission.csv',
                                              index=False)
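Example #29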
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(features1, target, test_size=0.1, random_state=42)
from sklearn import svm
svm1 = svm.SVC()
svm1.fit(X_train, y_train)  
predictionssvm = svm1.predict(X_test)
dtc=DecisionTreeClassifier()
modeldtc = dtc.fit(X_train, y_train)
predictionsdtc = dtc.predict(X_test)
adb=AdaBoostClassifier()
modeladb=adb.fit(X_train, y_train)
predictionsadb = adb.predict(X_test)
from sklearn.ensemble import GradientBoostingClassifier
gbc = GradientBoostingClassifier()  # scikit-learn's GBDT, not XGBoost, despite the original 'xgb' name
modelgbc = gbc.fit(X_train, y_train)
predictionsgbc = gbc.predict(X_test)
import operator
from sklearn.neural_network import MLPClassifier
mlp=MLPClassifier(solver='adam',activation='tanh',random_state=0)
modelmlp=mlp.fit(X_train,y_train)
predictionmlp=mlp.predict(X_test)

#4. Stacked Classifier
X=features1
y=target
clf1 = adb
clf2 = dtc
clf3 = svm1
meta = LogisticRegression()

sclf = StackingClassifier(classifiers=[meta, clf1, clf3], 
Example #30
train_new.drop([ID], axis=1, inplace=True)
test_new.drop([ID], axis=1, inplace=True)
pca = PCA(.95)
pca.fit(train_new)
train_new2 = pca.transform(train_new)
test_new2 = pca.transform(test_new)
train_new3 = pd.DataFrame(train_new2)
test_new3 = pd.DataFrame(test_new2)
train_new4 = train_new
test_new4 = test_new
train_new4[ID] = train_ids
test_new4[ID] = test_ids
train_new4[target] = targets

predictors = [x for x in train_new4.columns if x not in [target,ID]]
xgb = xgb.XGBRegressor(n_estimators=1000, learning_rate=0.08, gamma=0, subsample=0.75,colsample_bytree=1, max_depth=7)
x_train,x_cv,y_train,y_cv = train_test_split(train_new4.loc[:,predictors],train_new4.loc[:,target],test_size=0.2)
xgb.fit(x_train,y_train)
print(metrics.r2_score(y_train,xgb.predict(x_train.loc[:,predictors])))
print(metrics.r2_score(y_cv,xgb.predict(x_cv.loc[:,predictors])))
predicted_xgb = xgb.predict(test_new4.loc[:,predictors]) 
print(len(predicted_xgb))
for i in range(len(predicted_xgb)):
    if(predicted_xgb[i] <0):
        predicted_xgb[i] = 0
     
predicted = pd.DataFrame()
predicted['Id'] = test[ID]
predicted['Yield'] = predicted_xgb
predicted.to_csv("/home/ubuntu/Hackathons/Yield Prediction/submission.csv",index = False)
Example #31
price=dataset['COST']
Data=dataset.drop(['COST'],axis=1)

y = np.array(price).reshape(-1, 1)

from sklearn.model_selection import train_test_split
X_train, X_test, Y_train, Y_test = train_test_split(Data, y, test_size=0.33, random_state=0)

import xgboost as xgb

xg_reg = xgb.XGBRegressor(n_estimators=10, max_depth=5, objective='reg:linear', learning_rate=0.3)
import time
dep = time.time()
xg_reg.fit(X_train, Y_train)
fin = time.time() - dep
predictions = xg_reg.predict(X_test)
from sklearn.metrics import mean_squared_error
rmse=np.sqrt(mean_squared_error(Y_test,predictions))
print("RMSE: %f" % (rmse))
from sklearn.metrics import explained_variance_score
EV=explained_variance_score(Y_test,predictions)
print("EV : %f" %(EV))

import matplotlib.pyplot as plt
import os
os.getcwd()
os.chdir('C:/Program Files (x86)/Graphviz2.38/bin')
xgb.plot_tree(xg_reg, num_trees=9)  # plot_tree lives on the xgb module, hence the model's separate name
plt.rcParams['figure.figsize'] = [50, 10]
plt.show()
Example #32
over_samples_X, over_samples_y = over_samples.fit_resample(X_train, y_train)  # fit_sample was renamed to fit_resample in newer imbalanced-learn
# over_samples_X, over_samples_y = over_samples.fit_resample(X_train.values, y_train.values.ravel())
# Class ratio before resampling
print(y_train.value_counts() / len(y_train))
# Class ratio after resampling
print(pd.Series(over_samples_y).value_counts() / len(over_samples_y))

# Import third-party packages
import xgboost
import numpy as np
# Build the XGBoost classifier
xgboost = xgboost.XGBClassifier()
# Fit the classifier on the resampled data
xgboost.fit(over_samples_X, over_samples_y)
# Apply the model to the test set
resample_pred = xgboost.predict(np.array(X_test))

# Report the model's performance
print('Model accuracy:\n', metrics.accuracy_score(y_test, resample_pred))
print('Model evaluation report:\n', metrics.classification_report(y_test, resample_pred))

# Compute the probability of fraudulent transactions, used to build the ROC curve
y_score = xgboost.predict_proba(np.array(X_test))[:, 1]
fpr, tpr, threshold = metrics.roc_curve(y_test, y_score)
# Compute the AUC value
roc_auc = metrics.auc(fpr, tpr)

# Draw the area plot
plt.stackplot(fpr, tpr, color='steelblue', alpha=0.5, edgecolor='black')
# Add the edge line
plt.plot(fpr, tpr, color='black', lw=1)