Code Example #1
File: yelp_vegas_models.py Project: monicasjsu/yelp
scores = cross_val_score(model, X_train_final, y_train_final, cv=5)
print(
    '***************************************Lasso (0.001)***************************************'
)
print('cross validation score', scores.mean())
lasso_model_0001_train_score = lasso_0001.score(X_train_final, y_train_final)
lasso_model_0001_test_score = lasso_0001.score(X_test_final, y_test_final)

mse = mean_squared_error(y_test_final, y_pred)
rmse = np.sqrt(mse)
print('RMSE: {}'.format(rmse))
print('Train Score: {}'.format(lasso_model_0001_train_score))
print('Test Score: {}'.format(lasso_model_0001_test_score))

# ******************* Random Forest ************************
rfc = RandomForestClassifier()
run_model(rfc, 'Random Forest')

# ******************* Gradient Boost Classifier ************************
gbc = GradientBoostingClassifier()
run_model(gbc, 'Gradient Boost Classifier')

# ******************* XG Boost Classifier ************************
xgb = XGBClassifier()
run_model(xgb, 'XG Boost Classifier')

# After analyzing the evaluation metrics, we decided to export the log_reg, gradient boost and xgboost classifiers
joblib.dump(log_reg, 'log_reg_stars_model.joblib')
joblib.dump(gbc, 'gbc_stars_model.joblib')
joblib.dump(xgb, 'xgb_stars_model.joblib')
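# A model exported this way can later be reloaded for prediction, e.g.:
# loaded_xgb = joblib.load('xgb_stars_model.joblib')
# loaded_xgb.predict(X_test_final)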
Code Example #2
File: latest.py Project: JeffMathew/Fraud_Detection
tree = rf.estimators_[5]  # arbitrarily taking the tree at index 5 of the 10 trees that were generated

# Export the image to a dot file
export_graphviz(tree, out_file='branch.dot', feature_names=feature_list)

# Use dot file to create a graph
(graph, ) = pydot.graph_from_dot_file('branch.dot')

# Write graph to a png file
graph.write_png('branch.png')  #displaying the 5th tree as a png image

#XGBOOST

clf = XGBClassifier(max_depth=3, n_jobs=4)
probabilities = clf.fit(trainX, trainTargets).predict_proba(
    testX)  #training the model and making predictions on the test data
preds = clf.predict(testX)

print "\nConfusion Matrix of XGBoost is:-\n"

print confusion(testTargets, preds)
cfmat(preds, 'XGBoost')  #Display Confusion Matrix

print "\nAUPRC score of XGBoost is:-\n"
print average_precision_score(testTargets,
                              probabilities[:, 1])  #Display the AUPRC score

skplt.metrics.plot_precision_recall(testTargets,
                                    probabilities)  #Plot AUPRC graph
Code Example #3
scores_4=[]
scores_5=[]
scores_6=[]
for i in range(100):
    print('###########Time############',i+1)
    x_train, x_test, y_train, y_test = train_test_split(x, y, test_size=0.2,stratify=y,random_state=i)
   
    ss=StandardScaler()
    x_train_ss=ss.fit_transform(x_train)
    x_test_ss=ss.transform(x_test)
    
    model = XGBClassifier(
        learning_rate =0.01,
        n_estimators=500,
        max_depth=3,
        min_child_weight=1,
        gamma=0,
        subsample=0.6,
        colsample_bytree=1.0,
        seed=1,
        nthread=6)
   
    model.fit(x_train_ss, y_train)  
    y_proba=model.predict_proba(x_test_ss)[:,1]
    y_pred = model.predict(x_test_ss)
	
    auc = metrics.roc_auc_score(y_test, y_proba) 
    acc = accuracy_score(y_test, y_pred)
    
    precision, recall, _thresholds = metrics.precision_recall_curve(y_test, y_proba)
    pr_auc = metrics.auc(recall, precision)
    mcc = matthews_corrcoef(y_test, y_pred)
Code Example #4
import pandas as pd
from xgboost.sklearn import XGBClassifier
import numpy as np
from sklearn.metrics import roc_auc_score
from sklearn.model_selection import KFold

clf = XGBClassifier(learning_rate=0.02,
                    n_estimators=400,
                    max_depth=7,
                    min_child_weight=1.5,
                    gamma=0.3,
                    subsample=0.8,
                    colsample_bytree=0.8,
                    objective='binary:logistic',
                    n_jobs=-1,
                    scale_pos_weight=1,
                    max_delta_step=5,
                    silent=False,
                    num_class=3)
fold_num = 5
df = pd.read_csv('test.csv')
df = df.fillna(-1)
data_array = df.values[:, 1:]

kf = KFold(n_splits=fold_num)
count = 0
aucs = []
for train_index, test_index in kf.split(data_array):
    print "count = %d" % count
    count += 1
    clf.fit(data_array[list(train_index), :-1], data_array[list(train_index),
Code Example #5
juiceboy = juiceboy.drop([
    u'Unnamed: 0', u'CustomerMD5Key', u'FirstDriverDrivingLicenseNumberY',
    u'CarParkingTypeId', u'FirstDriverDrivingLicenceType', u'CarDrivingEntitlement',
    u'CarTransmissionId', u'PolicyHolderResidencyArea', u'car_flag_1', u'car_flag_4',
    u'car_flag_3', u'vendor_log_sales', u'RatedDriverNumber', 'CarFuelld',
    'AllDriversNbConviction'], axis=1)

# In[ ]:
target = "labels"

xgbl = XGBClassifier(learning_rate=0.1,
                     n_estimators=1000,
                     max_depth=5,
                     min_child_weight=1,
                     gamma=0,
                     subsample=0.8,
                     colsample_bytree=0.8,
                     objective='binary:logistic',
                     nthread=4,
                     scale_pos_weight=1,
                     seed=27)

supplied_arr = juiceboy
predictors = [
    x for x in train.columns
    if x not in [target, 'Unnamed: 0', u'CustomerMD5Key']
]
dtrain = train
alg = xgbl
useTrainCV = True
cv_folds = 5
Code Example #6
    train_x,train_y = getTrain()
    test_x,res = getTest()


    #开始训练模型
    clf = XGBClassifier(silent=0 ,#设置成1则没有运行信息输出,最好是设置为0.是否在运行升级时打印消息。
                        #nthread=4,# cpu 线程数 默认最大
                        learning_rate= 0.3, # 如同学习率
                        min_child_weight=1,
                        # 这个参数默认是 1,是每个叶子里面 h 的和至少是多少,对正负样本不均衡时的 0-1 分类而言
                        #,假设 h 在 0.01 附近,min_child_weight 为 1 意味着叶子节点中最少需要包含 100 个样本。
                        #这个参数非常影响结果,控制叶子节点中二阶导的和的最小值,该参数值越小,越容易 overfitting。
                        max_depth=6, # 构建树的深度,越大越容易过拟合
                        gamma=0,  # 树的叶子节点上作进一步分区所需的最小损失减少,越大越保守,一般0.1、0.2这样子。
                        subsample=1, # 随机采样训练样本 训练实例的子采样比
                        max_delta_step=0,#最大增量步长,我们允许每个树的权重估计。
                        colsample_bytree=1, # 生成树时进行的列采样
                        reg_lambda=1,  # 控制模型复杂度的权重值的L2正则化项参数,参数越大,模型越不容易过拟合。
                        #reg_alpha=0, # L1 正则项参数
                        #scale_pos_weight=1, #如果取值大于0的话,在类别样本不平衡的情况下有助于快速收敛。平衡正负权重
                        #objective= 'multi:softmax', #多分类的问题 指定学习任务和相应的学习目标
                        #num_class=10, # 类别数,多分类与 multisoftmax 并用
                        n_estimators=100, #树的个数
                        seed=1000 #随机种子
                        #eval_metric= 'auc'
                      )
    clf.fit(train_x, train_y)
    #测试数据
    y_pre = clf.predict(test_x)
    print(y_pre)
    #获取打分值
Code Example #7
def optimize_max_depth_and_min_child_weight ( model, x, t ):
    
    # Local variables.
    
    indent = 3
    
    log ( 'Optimizing max depth and min child weight:', indent = indent )
    
    # Initialize search parameters.

    grid_resolution = 3        
        
    max_depth_min    = 1
    max_depth_max    = 9  
    max_depth_stride = grid_resolution
    
    min_child_weight_min    = 1
    min_child_weight_max    = 9    
    min_child_weight_stride = grid_resolution
    
    # Configure grid search.
    
    parameter_search_1 = {
        'max_depth'        : list ( range ( max_depth_min,        max_depth_max,        max_depth_stride        ) ),
        'min_child_weight' : list ( range ( min_child_weight_min, min_child_weight_max, min_child_weight_stride ) )
    }
     
    # Perform grid search.
    
    grid_search_1 = GridSearchCV (
    
        estimator = XGBClassifier (
            learning_rate    = Constant.Model.LEARNING_RATE,
            n_estimators     = Constant.Model.N_ESTIMATORS,
            max_depth        = Constant.Model.MAX_DEPTH,
            min_child_weight = Constant.Model.MIN_CHILD_WEIGHT,
            gamma            = Constant.Model.GAMMA,
            subsample        = Constant.Model.SUBSAMPLE,
            colsample_bytree = Constant.Model.COLSAMPLE_BYTREE,
            objective        = Constant.Model.OBJECTIVE,
            scale_pos_weight = Constant.Model.SCALE_POS_WEIGHT,        
            seed             = Constant.Model.SEED     
        ),
        
        param_grid = parameter_search_1,
        scoring    = Constant.Model.GridSearch.Tree.SCORING,
        cv         = Constant.Model.GridSearch.Tree.CV,
        verbose    = Constant.Model.GridSearch.Tree.VERBOSE,
        n_jobs     = 1,
        iid        = False
    )
    
    grid_search_1.fit ( x, t )
    
    # Report results.
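    # Note: grid_scores_ was removed in newer scikit-learn releases (use cv_results_ instead),
    # and the iid argument above no longer exists in current versions.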
    
    for e in grid_search_1.grid_scores_:
        log ( str (e) , indent = indent+1 )
    
    min_child_weight = grid_search_1.best_params_ [ 'min_child_weight' ]
    max_depth        = grid_search_1.best_params_ [ 'max_depth'        ]
    
    log ( 'Optimal min child weight = ' + str ( min_child_weight           ), indent = indent+1 )
    log ( 'Optimal max depth        = ' + str ( max_depth                  ), indent = indent+1 )
    log ( 'Best Score               = ' + str ( grid_search_1.best_score_  ), indent = indent+1 )
        
    # update model using optimized parameters.
    
    model.set_params ( min_child_weight = min_child_weight )
    model.set_params ( max_depth        = max_depth )
    
    # Return updated model.
    
    return model
Code Example #8
#data1_new_train.to_csv("data_train.csv")
data1_new_test = data1_new.iloc[11017:, :]
X_train, X_test, Y_train, Y_test = train_test_split(data1_new_train,
                                                    Y,
                                                    test_size=0.3,
                                                    random_state=42)

#训练xgboost模型
#设置初试参数
xgb_train1 = XGBClassifier(booster="gbtree",
                           learning_rate=0.02,
                           n_estimators=800,
                           max_depth=3,
                           min_child_weight=4,
                           gamma=0,
                           reg_alpha=10,
                           reg_lambda=10,
                           subsample=0.8,
                           colsample_bytree=0.8,
                           objective='binary:logistic',
                           nthread=8,
                           scale_pos_weight=4,
                           seed=27)
xgb_train1.fit(X_train, Y_train, eval_metric="auc")
from sklearn.feature_selection import SelectFromModel
model = SelectFromModel(xgb_train1, threshold=0.01, prefit=True)
X_new = model.transform(X_train)
X_new_test = model.transform(X_test)
xgb_train2 = XGBClassifier(booster="gbtree",
                           learning_rate=0.01,
                           n_estimators=1000,
                           max_depth=5,
Code Example #9
np.random.seed(seed)


params_grid = {
    'max_depth':[1, 3, 5, 7, 9],
    'n_estimators':[10, 100, 1000],
    'learning_rate':np.linspace(1e-16, 1, 3),
    'min_child_weight':[1, 3, 5, 7],
    'gamma':[0, 0.1, 0.3, 0.5]
}

params_fixed = {
    'learning_rate': 0.1,
    'silent': 1,
    'subsample': 0.8,
    'colsample_bytree': 0.8,
    'objective': 'multi:softprob',
    'nthread': 4,
    'scale_pos_weight': 1,
}

bst_grid = GridSearchCV(
    estimator=XGBClassifier(**params_fixed, seed = seed),
    param_grid=params_grid,
    cv = 5,
    scoring='accuracy'
    )
train_labels = train_labels.astype(str)
bst_grid.fit(train, train_labels)
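# note: grid_scores_ was removed in newer scikit-learn releases; use bst_grid.cv_results_ instead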

bst_grid.grid_scores_
Code Example #10
X = data_removed.iloc[:, :-1]
y = data_removed.iloc[:, -1]

X_train, X_test, y_train, y_test = train_test_split(X,
                                                    y,
                                                    test_size=0.3,
                                                    random_state=42)

clf = DecisionTreeClassifier()
clf.fit(X_train, y_train)
clf_pred = clf.predict(X_test)
print("***********Decision Tree Classifier************")
print("Train Accuracy :", round(clf.score(X_train, y_train), 3))
print("Test Accuracy ", round(metrics.accuracy_score(y_test, clf_pred), 3))

xgb = XGBClassifier()
xgb.fit(X_train, y_train)
xgb_pred = xgb.predict(X_test)
print("*************Xgboost Classifier************")
print("Train Accuracy of xgb:", round(xgb.score(X_train, y_train), 3))
print("Test Accuracy ", round(metrics.accuracy_score(y_test, xgb_pred), 3))

rfc = RandomForestClassifier()
rfc.fit(X_train, y_train)
rfc_pred = rfc.predict(X_test)
print("*************Random Forest Classifier************")
print("Train Accuracy of Random Forest C:",
      round(rfc.score(X_train, y_train), 3))
print("Test Accuracy ", round(metrics.accuracy_score(rfc_pred, xgb_pred), 3))
#%%   Regularization
from sklearn.model_selection import GridSearchCV
Code Example #11
from xgboost.sklearn import XGBClassifier

# Parameter setting
param = {}
param['objective'] = 'binary:logistic'
param['gamma'] = 0
param['max_depth'] = 6
param['min_child_weight'] = 1
param['max_delta_step'] = 0
param['subsample'] = 1
param['colsample_bytree'] = 1
param['silent'] = 1
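# note: 'silent' is deprecated in newer xgboost releases in favour of 'verbosity'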
param['seed'] = 0
param['base_score'] = 0.5

xclas = XGBClassifier(**param)
xclas.fit(x, y)
result = xclas.predict(x_test)

text = open(fileName, "w+")
s = csv.writer(text, delimiter=',', lineterminator='\n')
s.writerow(["id", "label"])
for i in range(len(result)):
    s.writerow([i + 1, int(result[i])])
text.close()

#    X_train_path = '/home/tappy/X_train'
#    Y_train_path ='/home/tappy/Y_train'
#    X_test_path ='/home/tappy/X_test'
#    fileName = '/home/tappy/predictdata.csv'
#    # X_train_path = 'C:/Users/TappyHsieh/Desktop/X_train'
Code Example #12
def xgb_model_cv(param_set):

	mean_roc_cv = 0
	mean_F1_score_cv = 0
	mean_recall_score_cv = 0

	# Initialize the XGBClassifier with the dictionary parameters
	clf = XGBClassifier(max_depth=param_set['max_depth'], learning_rate=param_set['learning_rate'], n_estimators=param_set['n_estimators'], silent=True, objective='binary:logistic', nthread=-1, gamma=param_set['gamma'], min_child_weight=param_set['min_child_weight'], max_delta_step=0, subsample=param_set['subsample'], colsample_bytree = param_set['colsample_bytree'])

	for i in range(5):

		#create the dataframe that will contain all the features and labels
		train_df = pd.read_csv("subsets/train_cv_set3"+str(i)+".csv")
		test_df = pd.read_csv("subsets/test_cv_set3"+str(i)+".csv")

		train_df.sort_values('index', axis = 0, inplace=True)
		train_df = train_df.set_index('index')
		train_label_df = train_df.pop('class')

		test_df.sort_values('index', axis = 0, inplace=True)
		test_df = test_df.set_index('index')
		test_label_df = test_df.pop('class')

		# Train the classifier on the train set
		clf.fit(train_df, train_label_df)

		# Make predictions on the train set
		train_pred = clf.predict(train_df)
		train_predprob = clf.predict_proba(train_df)[:,1]

		# Make predictions on the test set
		test_pred = clf.predict(test_df)
		test_predprob = clf.predict_proba(test_df)[:,1]

		# Display the metrics score on the train set
		print "ROC score", roc_auc_score(train_label_df, train_pred, average = 'macro')
		print "ROC score proba", roc_auc_score(train_label_df, train_predprob, average = 'macro')
		print "F1 score for training set: {:.4f}.".format(f1_score(train_label_df, train_pred, pos_label=1.0))
		print "Recall score for training set: {:.4f}.".format(recall_score(train_label_df, train_pred, pos_label=1.0, average='binary'))

		# Display the metrics score on the test set
		roc_score_cv = roc_auc_score(test_label_df, test_pred, average = 'macro')
		F1_score_cv = f1_score(test_label_df, test_pred, pos_label=1.0)
		recall_score_cv = recall_score(test_label_df, test_pred, pos_label=1.0, average='binary')
		print "ROC score cv", roc_score_cv
		print "ROC score proba cv", roc_auc_score(test_label_df, test_predprob, average = 'macro')
		print "F1 score for cv set: {:.4f}.".format(F1_score_cv)
		print "Recall score for cv set: {:.4f}.".format(recall_score_cv)

		# Calculate the average metrics scores over the 5-folds cv tests
		mean_roc_cv += roc_score_cv/5
		mean_F1_score_cv += F1_score_cv/5
		mean_recall_score_cv += recall_score_cv/5

	# Display the average metrics scores over the 5-folds cv tests
	print "mean_roc_cv", mean_roc_cv
	print "mean_F1_score_cv", mean_F1_score_cv
	print "mean_recall_score_cv", mean_recall_score_cv

	score = [mean_roc_cv, mean_F1_score_cv, mean_recall_score_cv]
	
	return score
Code Example #13
X = train
# test = test.as_matrix()
X_train, X_test, y_train, y_test = train_test_split(X,
                                                    Y,
                                                    test_size=0.3,
                                                    random_state=125)
# X_train = X_train.as_matrix()
# X_test = X_test.as_matrix()
# y_train = y_train.as_matrix()
# y_test = y_test.as_matrix()

# --------------------- Creating model

xgbclassifier = XGBClassifier(n_estimators=100,
                              nthread=-1,
                              silent=False,
                              seed=125,
                              learning_rate=0.2)
# xgbmodel = xgbclassifier.fit(X_train, y_train)
xgbmodel = xgbclassifier.fit(X, Y)

# pred = xgbmodel.predict(test)
xgbmodel.score(X_test, y_test)
pred = xgbmodel.predict_proba(test)[:, 1]

# --------------------- Writing results

pred_df['click'] = pred
file_name = "Predictions\\prediction_" + str(datetime.datetime.now().date()) + "_" +\
            str(datetime.datetime.now().strftime("%H%M%S")) + ".csv"
pred_df.to_csv(path_or_buf=file_name, index=False)
Code Example #14
File: stack2.py Project: OOmegaPPanDDa/ML2017
test = np.loadtxt("./stack1_test.txt")

target = pd.read_csv('./target.csv', index_col=0)

nfold = 10

outcome = target['status_group']

## Classifiers

# XGB
xgb_classifier = XGBClassifier(max_depth=7,
                               learning_rate=0.02,
                               n_estimators=200,
                               gamma=0.08,
                               min_child_weight=3,
                               subsample=0.5,
                               colsample_bytree=0.9,
                               reg_alpha=0.2,
                               objective='multi:softmax')

# logistic regr
log_regr_classifier = LogisticRegression(C=10**(3), max_iter=800)

classifiers = [xgb_classifier, log_regr_classifier]

print("Start Stacking Models")

log_loss_eval_rec = np.zeros((nfold, len(classifiers)))
acc_eval_rec = np.zeros((nfold, len(classifiers)))
Code Example #15
    df_all_dummy = pd.get_dummies(df_all[f], prefix=f)
    df_all = df_all.drop([f], axis=1)
    df_all = pd.concat((df_all, df_all_dummy), axis=1)

#Splitting train and test
vals = df_all.values
X = vals[:piv_train]
le = LabelEncoder()
y = le.fit_transform(labels)
X_test = vals[piv_train:]

#Classifier
xgb = XGBClassifier(max_depth=6,
                    learning_rate=0.25,
                    n_estimators=43,
                    objective='multi:softprob',
                    subsample=0.6,
                    colsample_bytree=0.6,
                    seed=0)
#xgb = gbc(n_estimators=1, learning_rate=.15, max_depth=)
#xgb = knn(n_neighbors=50)
#xgb = etc(n_estimators=60, n_jobs=2)

xgb.fit(X, y)

y_pred = xgb.predict_proba(X_test)

#Taking the 5 classes with highest probabilities
ids = []  #list of ids
cts = []  #list of countries
pred = []  #prediction prob
Code Example #16
File: all_models.py Project: modelmanagement/CAPM
    def __init__(self):
        self.model = XGBClassifier(max_depth=3, n_jobs=4, random_state=42)
        self.name = 'XGB'
Code Example #17
def mainLoop(df, data_dir, tissue_dir):
    data_dir = Path("data")
    tissue_dir = Path("tissue-specific")
    results = []
    algs = []
    TISSUE_list = df['SMTS'].unique()
    for TISSUE in TISSUE_list:
        cpm, cdat = loadData(TISSUE, data_dir, tissue_dir)
        #print(cpm.shape)
        #print(cdat.shape)
        all_age = df.loc[(df['SMTS'] == TISSUE), 'AGE']
        #print(all_age.shape)
        cpm_train, cpm_test, y_train, y_test = splitData(cpm, all_age)
        c_train, c_test, y_train, y_test = splitData(cdat, all_age)
        #y_train.map({'20-29':0,'30-39':1,'40-49':2,'50-59':3, '60-69':4,'70-79':5})
        #y_test.map({'20-29':0,'30-39':1,'40-49':2,'50-59':3, '60-69':4,'70-79':5})
        y_train.replace(
            {
                "20-29": 0,
                "30-39": 1,
                "40-49": 2,
                "50-59": 3,
                "60-69": 4,
                "70-79": 5
            },
            inplace=True)
        y_test.replace(
            {
                "20-29": 0,
                "30-39": 1,
                "40-49": 2,
                "50-59": 3,
                "60-69": 4,
                "70-79": 5
            },
            inplace=True)
        #print(y_train.shape)
        #print(cpm_train.shape)
        #print(c_train.shape)
        keep = simpleExpressionFilter(c_train, 10)
        cpm_train = cpm_train.loc[:, (keep)]
        cpm_test = cpm_test.loc[:, (keep)]

        selector = sklearn.feature_selection.VarianceThreshold(threshold=.1)
        selector.fit(cpm_train)
        keep = selector.get_support(indices=True)
        cpm_train = cpm_train.iloc[:, keep]
        cpm_test = cpm_test.iloc[:, keep]

        xgb1 = XGBClassifier(learning_rate=0.1,
                             n_estimators=1000,
                             max_depth=5,
                             min_child_weight=1,
                             gamma=0,
                             subsample=0.8,
                             colsample_bytree=0.8,
                             objective='multi:softprob',
                             nthread=30,
                             scale_pos_weight=1,
                             seed=1234)
        xgb1 = predOneTissue(xgb1, cpm_train, y_train)
        y_preds = xgb1.predict(cpm_test)
        score = scoreOneTissue(y_test, y_preds)
        #print(y_test,y_preds)
        results.append(score)
        algs.append(xgb1)
    results = pd.Series(results, index=TISSUE_list)
    return (algs, results)
Code Example #18
# Disabling other pipes because we don't need them and it'll speed up this part a bit
with nlp.disable_pipes():
    docs = list(nlp.pipe(tweets))
    doc_vectors = np.array([doc.vector for doc in docs])
    
print("doc vectors shape=", doc_vectors.shape)
    
X_train, X_test, y_train, y_test = train_test_split(doc_vectors, labels, test_size=0.3, random_state=1, stratify=labels)

xgb_model=None
xgb_model = XGBClassifier(
    n_estimators=100,
    scale_pos_weight=SCALE_FACTOR,
    objective='binary:logistic',
    colsample=0.9,
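    # note: 'colsample' is not a standard XGBoost parameter (colsample_bytree/bylevel/bynode is probably intended)
    # and may be ignored or rejected depending on the xgboost version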
    colsample_bytree=0.5,
    eta=0.1,
    max_depth=8,
    min_child_weight=6,
    subsample=0.9)

print("Training xgb model....")
xgb_model.fit(X_train, y_train)

print("score=", xgb_model.score)

preds = xgb_model.predict(X_test)

print("f1 score=", f1_score(preds, y_test))

# we get an f1 score of around 0.64
Code Example #19
def optimize_regularization_parameters ( model, x, t ):

    # Local variables.
    
    indent = 3
    
    log ( 'Optimizing regularization parameters:', indent = indent )
       
    # Initialize grid search parameters.

    reg_alpha_locus              =  5.0
    reg_alpha_min_index          = -3
    reg_alpha_max_index          =  3    
    reg_alpha_index_stride_scale =  0.5
    reg_alpha_index_range        = range ( reg_alpha_min_index, reg_alpha_max_index + 1, 1 )
    reg_alpha_search_domain      = [ reg_alpha_locus + ( x * reg_alpha_index_stride_scale ) for x in reg_alpha_index_range ]
    
    reg_lambda_locus              =  1.0
    reg_lambda_min_index          = -5
    reg_lambda_max_index          =  1    
    reg_lambda_index_stride_scale =  0.1
    reg_lambda_index_range        = range ( reg_lambda_min_index, reg_lambda_max_index + 1, 1 )
    reg_lambda_search_domain      = [ reg_lambda_locus + ( x * reg_lambda_index_stride_scale ) for x in reg_lambda_index_range ]
       
    # Configure grid search.
    
    parameter_search_1 = {
        'reg_alpha'  : reg_alpha_search_domain,
        'reg_lambda' : reg_lambda_search_domain
    }
     
    # Perform grid search.
    
    grid_search_1 = GridSearchCV (
    
        estimator = XGBClassifier (
            learning_rate    = Constant.Model.LEARNING_RATE,
            n_estimators     = Constant.Model.N_ESTIMATORS,
            max_depth        = Constant.Model.MAX_DEPTH,
            min_child_weight = Constant.Model.MIN_CHILD_WEIGHT,
            gamma            = Constant.Model.GAMMA,
            subsample        = Constant.Model.SUBSAMPLE,
            colsample_bytree = Constant.Model.COLSAMPLE_BYTREE,
            objective        = Constant.Model.OBJECTIVE,
            scale_pos_weight = Constant.Model.SCALE_POS_WEIGHT,        
            seed             = Constant.Model.SEED     
        ),
        
        param_grid = parameter_search_1,
        scoring    = Constant.Model.GridSearch.Tree.SCORING,
        cv         = Constant.Model.GridSearch.Tree.CV,
        verbose    = Constant.Model.GridSearch.Tree.VERBOSE,
        n_jobs     = 1,
        iid        = False
    )
    
    grid_search_1.fit ( x, t )
    
    # Report results.
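    # Note: grid_scores_ was removed in newer scikit-learn releases (use cv_results_ instead),
    # and the iid argument above no longer exists in current versions.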
    
    for e in grid_search_1.grid_scores_:
        log ( str (e) , indent = indent+1 )
    
    reg_alpha  = grid_search_1.best_params_ [ 'reg_alpha'  ]
    reg_lambda = grid_search_1.best_params_ [ 'reg_lambda' ]
    
    log ( 'Optimal regularization alpha  = ' + str ( reg_alpha                  ), indent = indent+1 )
    log ( 'Optimal regularization lambda = ' + str ( reg_lambda                 ), indent = indent+1 )
    log ( 'Best Score                    = ' + str ( grid_search_1.best_score_  ), indent = indent+1 )
        
    # update model using optimized parameters.
    
    model.set_params ( reg_alpha  = reg_alpha )
    model.set_params ( reg_lambda = reg_lambda )
    
    # Return updated model.
    
    return model
Code Example #20
File: hw3.py Project: HuaTsai/NCTU_Courses
    print('Accuracy:%.4g' % metrics.accuracy_score(df['label'], pred))
    print('AUC(train):%f' % metrics.roc_auc_score(df['label'], predprob))
    feat_imp = pd.Series(
        alg.get_booster().get_fscore()).sort_values(ascending=False)
    feat_imp.plot(kind='bar', title='feature importances')
    plt.ylabel('fm_score')


# %%
# 4.2. tuning parameters
predictors = [x for x in df.columns if x not in ['label']]
clf = XGBClassifier(learning_rate=0.1,
                    n_estimators=1000,
                    max_depth=5,
                    min_child_weight=1,
                    gamma=0,
                    subsample=0.8,
                    colsample_bytree=0.8,
                    objective='binary:logistic',
                    scale_pos_weight=1)
modelfit(clf, df2, predictors, cv_folds=10)

# %%
clf.set_params(n_estimators=193)

# %%
param1 = {'max_depth': range(3, 12, 2), 'min_child_weight': range(1, 6, 2)}
gs1 = GridSearchCV(estimator=clf,
                   param_grid=param1,
                   scoring='f1_weighted',
                   n_jobs=-1,
Code Example #21
def optimize_subsample_and_colsample_bytree ( model, x, t ):
    
    # Local variables.
    
    indent = 3
    
    log ( 'Optimizing sub-sample and column sample by tree:', indent = indent )
    
    # Initialize search parameters.
    
    subsample_min    = 0.1
    subsample_max    = 0.4  
    subsample_stride = 0.01
    
    colsample_bytree_min    = 0.05
    colsample_bytree_max    = 0.1    
    colsample_bytree_stride = 0.01
    
    # Configure grid search.
    
    parameter_search_1 = {
        'subsample'        : np.arange ( subsample_min,        subsample_max,        subsample_stride        ),
        'colsample_bytree' : np.arange ( colsample_bytree_min, colsample_bytree_max, colsample_bytree_stride )
    }
     
    # Perform grid search.
    
    grid_search_1 = GridSearchCV (
    
        estimator = XGBClassifier (
            learning_rate    = Constant.Model.LEARNING_RATE,
            n_estimators     = Constant.Model.N_ESTIMATORS,
            max_depth        = Constant.Model.MAX_DEPTH,
            min_child_weight = Constant.Model.MIN_CHILD_WEIGHT,
            gamma            = Constant.Model.GAMMA,
            subsample        = Constant.Model.SUBSAMPLE,
            colsample_bytree = Constant.Model.COLSAMPLE_BYTREE,
            objective        = Constant.Model.OBJECTIVE,
            scale_pos_weight = Constant.Model.SCALE_POS_WEIGHT,        
            seed             = Constant.Model.SEED     
        ),
        
        param_grid = parameter_search_1,
        scoring    = Constant.Model.GridSearch.Tree.SCORING,
        cv         = Constant.Model.GridSearch.Tree.CV,
        verbose    = Constant.Model.GridSearch.Tree.VERBOSE,
        n_jobs     = 1,
        iid        = False
    )
    
    grid_search_1.fit ( x, t )
    
    # Report results.
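    # Note: grid_scores_ was removed in newer scikit-learn releases (use cv_results_ instead),
    # and the iid argument above no longer exists in current versions.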
    
    if False:
        for e in grid_search_1.grid_scores_:
            log ( str (e) , indent = indent+1 )
    
    subsample        = grid_search_1.best_params_ [ 'subsample'        ]
    colsample_bytree = grid_search_1.best_params_ [ 'colsample_bytree' ]
    
    log ( 'Optimal subsample        = ' + str ( subsample                  ), indent = indent+1 )
    log ( 'Optimal colsample_bytree = ' + str ( colsample_bytree           ), indent = indent+1 )
    log ( 'Best Score               = ' + str ( grid_search_1.best_score_  ), indent = indent+1 )
        
    # update model using optimized parameters.
    
    model.set_params ( subsample        = subsample )
    model.set_params ( colsample_bytree = colsample_bytree )
    
    # Return updated model.
    
    return model
Code Example #22
    split_size = size_1 // 4
    print("split_size: " + str(split_size))

    df_All_1_a = df_All_1.iloc[0:split_size]
    df_All_1_b = df_All_1.iloc[(split_size+1):2*split_size]
    df_All_1_c = df_All_1.iloc[(2*split_size+1):3*split_size]
    df_All_1_d = df_All_1.iloc[(3*split_size+1):]

    #####################################################
    #####################################################
    print(df_All_1_a.shape[0])
    df_All_train =  pd.concat([df_All_0, df_All_1_a], axis=0)
    df_All_train = shuffle(df_All_train)
    X_train = df_All_train.drop(["certid", "label"], axis=1, inplace=False)
    y_train = df_All_train["label"]
    clf = XGBClassifier(learning_rate=0.1, n_estimators=1000, max_depth=5, gamma=0.01, subsample=0.8, colsample_bytree=0.8,
                        objective='binary:logistic', reg_alpha=0.1, reg_lambda=0.1, seed=27)
    clf = clf.fit(X_train, y_train)
    X_test = df_All_test.drop(["certid", "label"], axis=1, inplace=False)
    pred = clf.predict(X_test).T
    cerid_arr = np.array(df_All_test["certid"]).T
    cerid_arr = np.vstack((cerid_arr, pred))

    name_a = "test_a_" + str(i) + ".csv"
    np.savetxt(name_a,cerid_arr.T,delimiter=',', fmt = "%s")

    ###################################################################
    print(df_All_1_b.shape[0])
    df_All_train =  pd.concat([df_All_0, df_All_1_b], axis=0)
    df_All_train = shuffle(df_All_train)
    X_train = df_All_train.drop(["certid", "label"], axis=1, inplace=False)
    y_train = df_All_train["label"]
Code Example #23
clf_dt = clf_dt.fit(X_train, y_train)
clf_dt_file = open("clf_dt_adaboost_mdc1.pkl", "wb")
pickle.dump(clf_dt, clf_dt_file)
clf_dt_file.close()
test_accuracy = checkAccuracy(clf_dt)
print("Test Accuracy for Ada boost Classifier: ", test_accuracy)

# from sklearn.ensemble import GradientBoostingRegressor
# clf_dt = GradientBoostingRegressor(n_estimators=1000, learning_rate=0.1)
# print(clf_dt)
# test_accuracy = checkAccuracy(clf_dt)
# print("Test Accuracy for Gradient Boosting Classifier: ",test_accuracy)

from xgboost.sklearn import XGBClassifier

clf_dt = XGBClassifier(n_estimators=1000)
print(clf_dt)
clf_dt = clf_dt.fit(X_train, y_train)
clf_dt_file = open("clf_dt_xgb_mdc1.pkl", "wb")
pickle.dump(clf_dt, clf_dt_file)
clf_dt_file.close()
#Fit clf to the training data
test_accuracy = checkAccuracy(clf_dt)
print("Test Accuracy for XGB Classifier: ", test_accuracy)

# from hyperopt import fmin, tpe, hp, STATUS_OK,Trials
# space ={
#     'n_estimators':hp.quniform('n_estimators',5,10,1),
#     'learning_rate':hp.quniform('learning_rate',0.025,0.1,0.025),
#     'max_depth':hp.quniform('max_depth',1,13,1),
#     'min_child_weight': hp.quniform('min_child_weight',1,6,1),
Code Example #24
File: xgboost.py Project: akki8087/XGBoost
labelencoder_X_1 = LabelEncoder()
X[:, 1] = labelencoder_X_1.fit_transform(X[:, 1])
labelencoder_X_2 = LabelEncoder()
X[:, 2] = labelencoder_X_2.fit_transform(X[:, 2])
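# note: OneHotEncoder's categorical_features argument was removed in newer scikit-learn releases;
# use a ColumnTransformer to one-hot encode selected columns instead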
onehotencoder = OneHotEncoder(categorical_features = [1])
X = onehotencoder.fit_transform(X).toarray()
X = X[:, 1:]

# Splitting the dataset into the Training set and Test set
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.2, random_state = 0)


# Fitting XGBoost to the Training set
import xgboost as xgb
from xgboost.sklearn import XGBClassifier
classifier = XGBClassifier()
classifier.fit(X_train, y_train)

# Predicting the Test set results
y_pred = classifier.predict(X_test)

# Making the Confusion Matrix
from sklearn.metrics import confusion_matrix
cm = confusion_matrix(y_test, y_pred)

# Applying k-Fold Cross Validation
from sklearn.model_selection import cross_val_score
accuracies = cross_val_score(estimator = classifier, X = X_train, y = y_train, cv = 10)
print(accuracies.mean())
print(accuracies.std())
Code Example #25
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
from xgboost.sklearn import XGBClassifier

# NOTE: Make sure that the class is labeled 'target' in the data file
tpot_data = pd.read_csv('PATH/TO/DATA/FILE',
                        sep='COLUMN_SEPARATOR',
                        dtype=np.float64)
features = tpot_data.drop('target', axis=1).values
training_features, testing_features, training_target, testing_target = \
            train_test_split(features, tpot_data['target'].values, random_state=42)

# Score on the training set was:0.539663321713
exported_pipeline = XGBClassifier(learning_rate=0.1,
                                  max_depth=1,
                                  n_estimators=120,
                                  silent=1.0)

exported_pipeline.fit(training_features, training_target)
results = exported_pipeline.predict(testing_features)
Code Example #26
    """
    #%%

    #%%
    #Feature selection
    """
    xgb_fea = XGBClassifier( learning_rate =0.3, max_depth=4, min_child_weight=1, 
                         subsample=0.8, gamma=0, colsample_bytree=0.8, 
                         objective= 'binary:logistic', nthread=4, scale_pos_weight=1, 
                         seed=27)
    XGBFEA(xgb_fea, x,y,2000)
    """
    #%%
    #The best estimator
    xgbcv = XGBClassifier(learning_rate =0.05, n_estimators=1475, max_depth=4,reg_alpha=0.01,
                        min_child_weight=2, gamma=0, subsample=0.8, colsample_bytree=0.8,
                         objective= 'binary:logistic', nthread=4, scale_pos_weight=8, seed=27)
    xgbcv=xgbcv.fit(x_train,y_train)
    y_pred=xgbcv.predict(x_val)

    print("accuracy_score:"+str(accuracy_score(y_val,y_pred)))
    print(confusion_matrix(y_val,y_pred))
    print(classification_report(y_val,y_pred,digits=4))
    print(y_pred.sum()/len(y_pred))
    #%%
    """
    Extract 20 important features
    """

    imp_features=['TARGET','SK_ID_CURR','EXT_SOURCE_3', 'DAYS_BIRTH', 'AMT_CREDIT', 'EXT_SOURCE_2', 'AMT_ANNUITY', 'DAYS_ID_PUBLISH', 'DAYS_REGISTRATION', 'DAYS_EMPLOYED', 'DAYS_LAST_PHONE_CHANGE', 'AMT_INCOME_TOTAL' ,'REGION_POPULATION_RELATIVE', 'YEARS_BEGINEXPLUATATION_AVG', 'HOUR_APPR_PROCESS_START', 'FLOORSMAX_AVG', 'AMT_REQ_CREDIT_BUREAU_YEAR' ,'OBS_30_CNT_SOCIAL_CIRCLE','CODE_GENDER' , 'DEF_30_CNT_SOCIAL_CIRCLE','CODE_GENDER','NAME_FAMILY_STATUS','NAME_CONTRACT_TYPE']
    Imxytrain=PTrain_o.loc[:,imp_features].reindex()
Code Example #27
def perform_tuning(X_train, Y_train):
    print("Tuning max_depth ...")
    max_depth_search_space = np.linspace(4, 14, num=6, dtype=int)
    best_score_list = []
    for max_depth in max_depth_search_space:
        estimator = XGBClassifier(max_depth=max_depth,
                                  learning_rate=0.1,
                                  n_estimators=1000000,
                                  min_child_weight=5,
                                  subsample=0.8,
                                  colsample_bytree=0.8,
                                  objective=OBJECTIVE)
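        # note: n_estimators is set extremely high here, presumably relying on early stopping inside evaluate_estimator (not shown)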
        _, best_score = evaluate_estimator(estimator, X_train, Y_train)
        best_score_list.append(best_score)
    best_score_index = GET_BEST_SCORE_INDEX(best_score_list)
    optimal_max_depth = max_depth_search_space[best_score_index]
    print("The optimal max_depth is {:d}.".format(optimal_max_depth))

    print("Tuning subsample ...")
    subsample_search_space = np.linspace(0.6, 1, num=5)
    best_score_list = []
    for subsample in subsample_search_space:
        estimator = XGBClassifier(max_depth=optimal_max_depth,
                                  learning_rate=0.1,
                                  n_estimators=1000000,
                                  min_child_weight=5,
                                  subsample=subsample,
                                  colsample_bytree=0.8,
                                  objective=OBJECTIVE)
        _, best_score = evaluate_estimator(estimator, X_train, Y_train)
        best_score_list.append(best_score)
    best_score_index = GET_BEST_SCORE_INDEX(best_score_list)
    optimal_subsample = subsample_search_space[best_score_index]
    print("The optimal subsample is {:f}.".format(optimal_subsample))

    print("Tuning min_child_weight ...")
    min_child_weight_search_space = np.linspace(1, 9, num=5)
    best_score_list = []
    for min_child_weight in min_child_weight_search_space:
        estimator = XGBClassifier(max_depth=optimal_max_depth,
                                  learning_rate=0.1,
                                  n_estimators=1000000,
                                  min_child_weight=min_child_weight,
                                  subsample=optimal_subsample,
                                  colsample_bytree=0.8,
                                  objective=OBJECTIVE)
        _, best_score = evaluate_estimator(estimator, X_train, Y_train)
        best_score_list.append(best_score)
    best_score_index = GET_BEST_SCORE_INDEX(best_score_list)
    optimal_min_child_weight = min_child_weight_search_space[best_score_index]
    print("The optimal min_child_weight is {:f}.".format(
        optimal_min_child_weight))

    print("Tuning colsample_bytree ...")
    colsample_bytree_search_space = np.linspace(0.6, 1, num=5)
    best_score_list = []
    for colsample_bytree in colsample_bytree_search_space:
        estimator = XGBClassifier(max_depth=optimal_max_depth,
                                  learning_rate=0.1,
                                  n_estimators=1000000,
                                  min_child_weight=optimal_min_child_weight,
                                  subsample=optimal_subsample,
                                  colsample_bytree=colsample_bytree,
                                  objective=OBJECTIVE)
        _, best_score = evaluate_estimator(estimator, X_train, Y_train)
        best_score_list.append(best_score)
    best_score_index = GET_BEST_SCORE_INDEX(best_score_list)
    optimal_colsample_bytree = colsample_bytree_search_space[best_score_index]
    print("The optimal colsample_bytree is {:f}.".format(
        optimal_colsample_bytree))

    optimal_parameters = [
        optimal_max_depth, optimal_min_child_weight, optimal_subsample,
        optimal_colsample_bytree
    ]
    print("The optimal parameters are as follows:")
    print(optimal_parameters)
    return optimal_parameters
Code Example #28
print(target_vector.shape)

categorical_df = df[['Weekday', 'DepartmentDescription']]
categorical_df = categorical_df.fillna('-1')
df['Upc'] = df['Upc'].fillna(df['Upc'].mean())
df['FinelineNumber'] = df['FinelineNumber'].fillna(df['FinelineNumber'].mean())
df = df.drop(['Weekday', 'DepartmentDescription'], axis=1)
df = df.fillna(0)
#Apply LabelEncoder
encoder = LabelEncoder()
temp_df = categorical_df.apply(encoder.fit_transform)

#combine data
df1 = df[['VisitNumber']].values
df2 = df[['Upc']].values
df3 = df[['FinelineNumber']].values

#prints = [print(x.shape) for x in [df1,df2,df3]]

print(df)
print(temp_df.shape)
final_train_data = np.concatenate((df1,df2,df3,temp_df), axis=1)

#train the model
model = XGBClassifier(silent=False)
print("Training the model....")
model.fit(final_train_data,target_vector)
print("Dumping model....")
f = open('walmart.pkl', 'wb+')
pickle.dump(model, f)

Code Example #29
x_train, x_valid, y_train, y_valid = train_test_split(X, y, train_size=0.95, test_size=0.05, shuffle=False)


# In[75]:


x_test.shape, x_train.shape,test.shape, train.shape


# In[76]:


gbr = XGBClassifier(missing=np.nan, 
                    learning_rate = 0.15, 
                    gamma=1, 
                    colsample_bytree=0.8)


# In[77]:


gbr.fit(X, y)


# In[78]:


pred_cv = gbr.predict(x_valid)

Code Example #30
def create_and_save_model(X, y):

    clf1 = XGBClassifier(learning_rate=0.1,
                         n_estimators=1000,
                         max_depth=5,
                         min_child_weight=1,
                         gamma=0,
                         subsample=0.8,
                         colsample_bytree=0.8,
                         objective='binary:logistic',
                         nthread=4,
                         scale_pos_weight=0.2,
                         seed=27,
                         reg_alpha=0.4,
                         reg_lambda=1,
                         early_stopping_rounds=50,
                         show_progress=True)
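    # note: show_progress is not a standard XGBClassifier argument, and early_stopping_rounds is only
    # accepted by the constructor in recent xgboost releases (older versions expect it in fit())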

    clf2 = AdaBoostClassifier(n_estimators=150)

    # initialize the base classifier
    base_cls = DecisionTreeClassifier()

    # no. of base classifier
    num_trees = 200

    # bagging classifier
    clf3 = BaggingClassifier(base_estimator=base_cls,
                             n_estimators=num_trees,
                             random_state=8,
                             n_jobs=-1)

    clf4 = RandomForestClassifier(bootstrap=True,
                                  class_weight={
                                      0: 2.5,
                                      1: 1
                                  },
                                  criterion='entropy',
                                  max_depth=60,
                                  max_features="auto",
                                  max_leaf_nodes=50,
                                  min_impurity_decrease=0.0,
                                  min_impurity_split=None,
                                  min_samples_leaf=5,
                                  min_samples_split=6,
                                  min_weight_fraction_leaf=0.0,
                                  n_estimators=200,
                                  n_jobs=-1,
                                  oob_score=True,
                                  random_state=10,
                                  verbose=1,
                                  warm_start=False)

    params = {
        'n_estimators': 200,
        'max_depth': 20,
        'subsample': 0.6,
        'learning_rate': 0.01,
        'min_samples_leaf': 1,
        'random_state': 3,
        'loss': 'exponential',
        'max_features': 'auto',
        'verbose': 1
    }  #'ccp_alpha': 0.04
    clf5 = GradientBoostingClassifier(**params)

    estimators = [('xgb', clf1), ('abc', clf2), ('bc', clf3), ('rf', clf4),
                  ('gbc', clf5)]

    stack_estimator = XGBClassifier(learning_rate=0.1,
                                    n_estimators=300,
                                    max_depth=5,
                                    min_child_weight=1,
                                    gamma=0,
                                    subsample=0.8,
                                    colsample_bytree=0.8,
                                    objective='binary:logistic',
                                    nthread=40,
                                    scale_pos_weight=1,
                                    seed=27,
                                    reg_alpha=0,
                                    reg_lambda=1,
                                    early_stopping_rounds=50,
                                    show_progress=True)

    model = StackingClassifier(estimators=estimators,
                               final_estimator=stack_estimator,
                               n_jobs=-1,
                               cv=5,
                               verbose=1)

    model.fit(X, y)

    file_name = 'model_final.pkl'
    joblib.dump(model, file_name)

    return file_name