Example #1
def xgb_result(x, y, testx, testy, para):
    print("----- Working on 'xgb' method...")
    #dtrain = xgb.DMatrix(x, label=y)
    #dtest = xgb.DMatrix(testx, label=testy)
    xgb0 = XGBClassifier(**para)
    #    with open('xgb.pickle','rb') as f:
    #        xgb0 = pickle.load(f)
    time0 = time.time()
    #bst = xgb.train(dtrain=dtrain,**para)
    xgb0.fit(x, y)
    train_time = time.time() - time0
    confusion, test_time = Errmodel(xgb0,
                                    x,
                                    y,
                                    testx,
                                    testy,
                                    ntree_limit=xgb0.booster().best_iteration)
    print(confusion, '\n', train_time, '\n', test_time)
    importance = sorted(xgb0.booster().get_score().items(), key=lambda x: x[1])
    result = {
        'model': xgb0,
        'confusion': confusion,
        'train_time': train_time,
        'test_time': test_time,
        'importance': importance,
        'best_iter': xgb0.booster().best_iteration
    }
    print("best_iter", xgb0.booster().best_iteration)
    return result
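Note: the booster() accessor used throughout these examples was removed in later xgboost releases in favor of get_booster(). A minimal sketch of the same importance lookup, assuming a reasonably recent xgboost and the same x, y arrays passed to xgb_result:

from xgboost import XGBClassifier

clf = XGBClassifier(n_estimators=50)   # hypothetical small model for illustration
clf.fit(x, y)                          # x, y as passed to xgb_result above
# get_booster() replaces the removed booster(); get_score() is unchanged
importance = sorted(clf.get_booster().get_score().items(), key=lambda kv: kv[1])
print(importance)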
Example #2
def job_function(params):
	learning_rate = params[0]
	max_depth = params[1]
	ss_cs = params[2]
	gamma = params[3]
	min_child_weight = params[4]
	reg_lambda = params[5]
	reg_alpha = params[6]

	early_stopping_rounds = 25
	if learning_rate >= 0.3:
		early_stopping_rounds = 5
	if learning_rate <= 0.03:
		early_stopping_rounds = 50

	scores = []
	for i in range(iterations_per_job):
		X_train = Xy[i][0]
		X_test = Xy[i][1]
		y_train = Xy[i][2]
		y_test = Xy[i][3]
		
		y_train2 = le.transform(y_train)   
		y_test2 = le.transform(y_test)   

		clf = XGBClassifier(max_depth=max_depth, learning_rate=learning_rate, n_estimators=5000, objective='multi:softprob', subsample=ss_cs, colsample_bytree=ss_cs, gamma=gamma, min_child_weight=min_child_weight, seed=0, silent=True, reg_lambda=reg_lambda, reg_alpha=reg_alpha)      
		clf.fit(X_train, y_train, eval_set=[(X_test, y_test2)], eval_metric=calculate_score_2, early_stopping_rounds=early_stopping_rounds, verbose=False)
		y_predicted = clf.predict_proba(X_test, ntree_limit=clf.booster().best_ntree_limit)
		score = calculate_score(y_predicted, y_test2)
		scores.append(score)

	avg_score = np.array(scores).mean()
	print(avg_score, params)
	return avg_score
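job_function takes its hyper-parameters as a flat list and reuses the globals from this script (Xy, le, iterations_per_job, calculate_score), so a search driver only needs to enumerate candidate settings and compare the returned averages. A hypothetical driver over a tiny grid (the candidate values are illustrative, not from the original search, and whether a lower or higher average is better depends on how calculate_score is defined):

import itertools

# illustrative candidate values for the seven parameters job_function expects
grid = itertools.product([0.03, 0.1, 0.3],   # learning_rate
                         [4, 8],             # max_depth
                         [0.7, 0.9],         # subsample / colsample_bytree
                         [0.0],              # gamma
                         [1],                # min_child_weight
                         [1.0],              # reg_lambda
                         [0.0])              # reg_alpha
results = [(job_function(list(p)), list(p)) for p in grid]
for avg_score, params in sorted(results):
    print(avg_score, params)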
Example #3
def myThreadFunc(ThreadID):
	X_train = Xy[ThreadID][0]
	X_test = Xy[ThreadID][1]
	y_train = Xy[ThreadID][2]
	y_test = Xy[ThreadID][3]
		
	y_train2 = le.transform(y_train)   
	y_test2 = le.transform(y_test)   

	clf = XGBClassifier(max_depth=max_depth, learning_rate=learning_rate, n_estimators=5000, objective='multi:softprob', subsample=ss_cs, colsample_bytree=ss_cs, gamma=gamma, min_child_weight=min_child_weight, seed=0, silent=True, reg_lambda=reg_lambda, reg_alpha=reg_alpha)      
	clf.fit(X_train, y_train, eval_set=[(X_test, y_test2)], eval_metric=calculate_score_2, early_stopping_rounds=early_stopping_rounds, verbose=False)
	y_predicted = clf.predict_proba(X_test, ntree_limit=clf.booster().best_ntree_limit)
	score = calculate_score(y_predicted, y_test2)
	print(score, clf.booster().best_ntree_limit)
	
	train_and_test_scores[ThreadID] = score
Example #4
def modelfit(params, x, y):
    #Fit the algorithm on the data
    print("fit")
    alg = XGBClassifier(**params)
    alg.fit(x, y, verbose=True)
    feat_imp = pd.Series(
        alg.booster().get_fscore()).sort_values(ascending=False)
    print(feat_imp)
def extract_leaf_feature(features, targets, train_indexes, params):
    model = XGBClassifier(**params)
    model.fit(features[train_indexes], targets[train_indexes])
    booster = model.booster()
    dmatrix = xgb.DMatrix(features)
    leaf = booster.predict(dmatrix, pred_leaf=True)
    encoder = sklearn.preprocessing.OneHotEncoder()
    leaf_feature = encoder.fit_transform(leaf)
    return leaf_feature
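extract_leaf_feature returns a sparse one-hot encoding of the leaf index each sample falls into in every tree, the representation used in the classic GBDT-plus-linear-model scheme. A hypothetical follow-up, assuming the features, targets, train_indexes and params from above, feeds those leaf features to a logistic regression:

from sklearn.linear_model import LogisticRegression

leaf_feature = extract_leaf_feature(features, targets, train_indexes, params)
lr = LogisticRegression(max_iter=1000)
lr.fit(leaf_feature[train_indexes], targets[train_indexes])   # train rows only
probs = lr.predict_proba(leaf_feature)                        # all rows, leaf-encoded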
##feature importance
feature_importances = pd.DataFrame(xgb1.feature_importances_,index = x_train.columns, columns=['importance']).sort_values('importance',ascending=False)

pt = feature_importances.plot.bar()


r=xgb1.predict(x_test)
l=test_df["loan_id"]
results=pd.DataFrame({"loan_id":l,"m13":r})


results.to_csv(r"D:results_xgb1_200.csv",index=False)


feat_imp = pd.Series(xgb1.booster().get_fscore()).sort_values(ascending=False)
feat_imp.plot(kind='bar', title='Feature Importances')
plt.ylabel('Feature Importance Score')



print(xgb1.feature_importances_)
# plot
from matplotlib import pyplot
pyplot.bar(range(len(xgb1.feature_importances_)), xgb1.feature_importances_)
pyplot.show()


from xgboost import plot_importance
plot_importance(xgb1)
pyplot.show()
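get_fscore() and the default plot_importance both report 'weight' importance (how often a feature is used to split). plot_importance also accepts an importance_type argument, so a gain-based ranking, which often orders features quite differently, can be drawn the same way; a sketch assuming the same fitted xgb1:

from xgboost import plot_importance
from matplotlib import pyplot

plot_importance(xgb1, importance_type='gain', title='Feature importance (gain)')
pyplot.show()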
Example #7
def modelfit(train,
             labels,
             test,
             features,
             useTrainCV=True,
             cv_folds=5,
             early_stopping_rounds=50):
    model = XGBClassifier(learning_rate=0.2,
                          n_estimators=1000,
                          max_depth=5,
                          min_child_weight=1,
                          gamma=0,
                          subsample=0.8,
                          colsample_bytree=0.8,
                          objective='binary:logistic',
                          scale_pos_weight=1,
                          seed=27)

    test_percent = 0.2
    X_train, X_test, y_train, y_test = train_test_split(train,
                                                        labels,
                                                        test_size=test_percent,
                                                        random_state=23)

    xgb_param = model.get_xgb_params()
    xgtrain = xgb.DMatrix(X_train[features], y_train)
    xgcv = xgb.DMatrix(X_test[features])
    xgtest = xgb.DMatrix(test[features])
    cvresult = xgb.cv(xgb_param,
                      xgtrain,
                      num_boost_round=model.get_params()['n_estimators'],
                      nfold=cv_folds,
                      metrics='auc',
                      early_stopping_rounds=early_stopping_rounds)
    print("n_estimators=")
    print(cvresult.shape[0])
    model.set_params(n_estimators=cvresult.shape[0])

    #Fit the algorithm on the data
    model.fit(X_train, y_train)

    ##training predictions
    proba = model.predict_proba(X_test)
    preds = proba[:, 1]
    score = roc_auc_score(y_test, preds)
    print("Area under ROC {0}".format(score))

    #Print model report:
    #	print "\nModel Report"
    #	print "Accuracy : %.4g" % accuracy_score(y_train, preds)
    #	print "AUC Score (Train): %f" % roc_auc_score(y_train, preds)

    feat_imp = pd.Series(
        model.booster().get_fscore()).sort_values(ascending=False)
    feat_imp.plot(kind='bar', title='Feature Importances')
    plt.ylabel('Feature Importance Score')
    #	plt.show()

    ##test predictions
    test_proba = model.predict_proba(test)
    test_preds = test_proba[:, 1]

    return test_preds
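A hypothetical call of this modelfit, assuming df_train and df_test are pandas DataFrames whose columns are numeric features plus a binary label column named 'target' (the names are illustrative, not from the original script):

# illustrative column names; df_train / df_test are assumed pandas DataFrames
features = [c for c in df_train.columns if c != 'target']
test_preds = modelfit(df_train[features], df_train['target'], df_test[features], features)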
Example #8
File: test1.py Project: mircean/ML
def do_cell(task):
    df_train, df_test, x_start, y_start = task[0], task[1], task[2], task[3]
    #print('do_cell', df_train.shape, df_test.shape, x_start, y_start)

    #train
    n_places_th_local = n_places_th
    n_places_local = n_places

    if n_places != 0:
        tmp = df_train.shape[0]
        value_counts = df_train.place_id.value_counts()[0:n_places]
        df_train = pd.merge(df_train, pd.DataFrame(value_counts), left_on='place_id', right_index=True)[df_train.columns]
        n_places_th_local = value_counts.values[n_places - 1]
        percentage = df_train.shape[0]/tmp

    elif n_places_th != 0:
        value_counts = df_train.place_id.value_counts()
        n_places_local = value_counts[value_counts >= n_places_th_local].count()
        mask = value_counts[df_train.place_id.values] >= n_places_th_local
        percentage = mask.value_counts()[True]/df_train.shape[0]
        df_train = df_train.loc[mask.values]

    else:
        n_places_th_local = 2

        value_counts = df_train.place_id.value_counts()
        n_places_local = value_counts[value_counts >= n_places_th_local].count()
        mask = value_counts[df_train.place_id.values] >= n_places_th_local
        percentage = mask.value_counts()[True]/df_train.shape[0]

        while percentage > n_places_percentage:
            n_places_th_local += 1
            n_places_local = value_counts[value_counts >= n_places_th_local].count()
            mask = value_counts[df_train.place_id.values] >= n_places_th_local
            percentage = mask.value_counts()[True]/df_train.shape[0]

        n_places_th_local -= 1
        n_places_local = value_counts[value_counts >= n_places_th_local].count()
        mask = value_counts[df_train.place_id.values] >= n_places_th_local
        percentage = mask.value_counts()[True]/df_train.shape[0]

        df_train = df_train.loc[mask.values]


    #print(x_start, y_start, n_places_local, n_places_th_local, percentage)
        
    #test
    row_ids = df_test.index
    if 'place_id' in df_test.columns:
        df_test = df_test.drop(['place_id'], axis=1)

    le = LabelEncoder()
    y = le.fit_transform(df_train.place_id.values)
    
    X = df_train.drop(['place_id'], axis=1).values
    X_predict = df_test.values

    score = 0
    n_estimators = 0
    if xgb == 1:    
        if xgb_calculate_n_estimators == True:
            clf = XGBClassifier(max_depth=max_depth, learning_rate=learning_rate, n_estimators=5000, objective='multi:softprob', subsample=ss, colsample_bytree=cs, gamma=gamma, min_child_weight=min_child_weight, reg_lambda=reg_lambda, reg_alpha=reg_alpha)

            if train_test == 1:
                X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2)
   
                clf.fit(X_train, y_train, eval_set=[(X_test, y_test)], eval_metric=calculate_score, early_stopping_rounds=early_stopping_rounds, verbose=10 if one_cell == 1 else False)
                score = round(1 - clf.booster().best_score, 6)
                n_estimators = clf.booster().best_ntree_limit
            else:
                abc += 1
                xgb_options = clf.get_xgb_params()
                xgb_options['num_class'] = n_places + 1
                train_dmatrix = DMatrix(X, label=y)

                #some of the classes have less than n_folds, cannot use stratified KFold
                #folds = StratifiedKFold(y, n_folds=n_folds, shuffle=True)
                folds = KFold(len(y), n_folds=n_folds, shuffle=True)
                cv_results = cv(xgb_options, train_dmatrix, clf.n_estimators, early_stopping_rounds=early_stopping_rounds, verbose_eval=10 if one_cell == 1 else False, show_stdv=False, folds=folds, feval=calculate_score)

                n_estimators = cv_results.shape[0]
                score = round(1 - cv_results.values[-1][0], 6)
                std = round(cv_results.values[-1][1], 6)
        else:
            n_estimators = n_estimators_fixed

        clf = XGBClassifier(max_depth=max_depth, learning_rate=learning_rate, n_estimators=n_estimators, objective='multi:softprob', subsample=ss, colsample_bytree=cs, gamma=gamma, min_child_weight=min_child_weight, reg_lambda=reg_lambda, reg_alpha=reg_alpha)
    else:
        clf = RandomForestClassifier(n_estimators = 300, n_jobs = -1)
        if rf_calculate_score == True:
            if train_test == 1:
                X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2)
                y_train2 = le.transform(y_train)
                y_test2 = le.transform(y_test)
    
                clf.fit(X_train, y_train2)
                y_predict = clf.predict_proba(X_test)

                scores_local = []
                for i in range(X_test.shape[0]):
                    score = calculate_score_per_row(y_predict[i], y_test2[i])
                    scores_local.append(score)

                score = np.array(scores_local).mean()
            else:
                #some of the classes have less than n_folds, cannot use stratified KFold
                #folds = StratifiedKFold(y, n_folds=n_folds, shuffle=True)
                folds = KFold(len(y), n_folds=n_folds, shuffle=True)
                scores_cv = []
                for train, test in folds:
                    X_train, X_test, y_train, y_test = X[train], X[test], y[train], y[test]

                    y_train2 = le.transform(y_train)
                    y_test2 = le.transform(y_test)
    
                    clf.fit(X_train, y_train2)
                    y_predict = clf.predict_proba(X_test)

                    scores_local = []
                    for i in range(X_test.shape[0]):
                        score = calculate_score_per_row(y_predict[i], y_test2[i])
                        scores_local.append(score)

                    score = np.array(scores_local).mean()
                    print('  ', x_start, y_start, score)
                    scores_cv.append(score)

                score = np.array(scores_cv).mean()
    
    #if few_cells == 1 or grid_search == 1:
    #    return [score, None, None]

    clf.fit(X, y)
    y_predict = clf.predict_proba(X_predict)
    ##1
    labels_predict = le.inverse_transform(np.argsort(y_predict, axis=1)[:,::-1][:,:n_topx])    

    print(x_start, y_start, score, n_estimators, n_places_local, n_places_th_local, percentage)

    return [score, row_ids, labels_predict]
Example #9
nullEmbarkeds = combine[combine.Embarked.isnull()].index.values
combine['Embarked'].iloc[nullEmbarkeds] = 'C'

# Build the classification model

#### offline model ###
## cross-validation to obtain the offline training accuracy
'''
x_train=trainData['Fare','Age','Family','Embarked',
                'Sex','Pclass','AgeClass','SibSp','PSM',
                'Parch','FamilyBins']
'''
'''
x_train=np.concat(trainData['Fare'],trainData['Age'],trainData['Family'],
                  trainData['Embarked'],trainData['Sex'],trainData['Pclass'],
                  trainData[''])
'''
y_train = trainData['Survived']
x_train = trainData['Fare']
model = XGBClassifier(max_depth=6, n_estimators=1000, learning_rate=0.01)
scores = cross_val_score(model, x_train, y_train, cv=3)
print('accuracy:{0:.5f}'.format(np.mean(scores)))
# Use xgboost's get_fscore to obtain and sort the feature importances
model.fit(x_train, y_train)
importance = model.booster().get_fscore()
sort_importance = sorted(importance.items(),
                         key=operator.itemgetter(1),
                         reverse=True)
df = pd.DataFrame(sort_importance, columns=['feature', 'fscore'])
print(df)
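The commented-out multi-column selections in this example would not run as written; selecting several DataFrame columns takes a list of names. A minimal sketch, assuming those columns exist in trainData:

feature_cols = ['Fare', 'Age', 'Family', 'Embarked', 'Sex', 'Pclass',
                'AgeClass', 'SibSp', 'PSM', 'Parch', 'FamilyBins']
x_train = trainData[feature_cols]   # DataFrame with all selected features
y_train = trainData['Survived']
# string columns such as Sex/Embarked would still need numeric encoding before fitting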
Example #10
        reg_lambda=2,
        subsample=1.0,
        colsample_bytree=1.0,
        max_delta_step=1,
        scale_pos_weight=1,
        objective='multi:softprob',
        nthread=8,
        seed=0  # ,
        # silent = False
    )
    print('training...')
    xgb_model.fit(training, label)
    print('predicting...')
    predicted = xgb_model.predict_proba(testing)
    predicted = pandas.DataFrame(predicted)
    predicted.columns = xgb_model.classes_
    # Name index column.
    predicted.index.name = 'Id'
    # Write csv.
    print('Saving prediction...')
    predicted.to_csv('Prediction.csv')
    # feature importance
    feat_imp = pandas.Series(
        xgb_model.booster().get_fscore()).sort_values(ascending=False)
    feat_imp.plot(kind='bar', title='Feature Importances')
    matplotlib.pyplot.show()
    plot_importance(xgb_model, title='Feature importance')
    matplotlib.pyplot.show()
    plot_tree(xgb_model, num_trees=0)
    matplotlib.pyplot.show()
Example #11
                     subsample=0.7930,
                     colsample_bytree=0.4679)

#  0.43251 for 7 models stacking
#                                  'colsample_bytree' :(0.4679)
#                                  'gamma': 4.2599),
#                                  'learning_rate': (0.0685),
#                                  'max_depth': (3),
#                                  'min_child_weight': (21.8363),
#                                  'n_estimators': (449),
#                                  'subsample': ( 0.7930),

bclf.fit(cat_blend, cy)
cprob = bclf.predict_proba(cat_btest)

bimportances = bclf.booster().get_fscore()
bsorted_imp = sorted(bimportances.items(), key=operator.itemgetter(1))
bsorted_imp.reverse()

cclf = XGBClassifier(max_depth=1,
                     learning_rate=0.0607,
                     n_estimators=303,
                     objective='multi:softprob',
                     nthread=8,
                     gamma=3.4764,
                     min_child_weight=10.8559,
                     subsample=0.5598,
                     colsample_bytree=0.6374)

# 7 models: 0.87344
#                                  'colsample_bytree' :(0.6374)
Example #12
        for i in range(10):
            folds = StratifiedKFold(y_train, n_folds=5, shuffle=True)
            scores = []
            iterations = []
            for train_index, test_index in folds:
                X_train2, X_test2 = X_train.loc[train_index], X_train.loc[test_index]
                y_train2, y_test2 = y_train[train_index], y_train[test_index]

                X_train2, X_test2 = feature_engineering_extra(X_train2, X_test2, y_train2)

                X_train2 = csr_matrix(X_train2.values)
                X_test2 = csr_matrix(X_test2.values)

                clf.fit(X_train2, y_train2, eval_set=[(X_test2, y_test2)], eval_metric='mlogloss', early_stopping_rounds=early_stopping_rounds, verbose=False)
                #print(round(clf.booster().best_score, 6), int(clf.booster().best_ntree_limit))
                scores.append(round(clf.booster().best_score, 6))
                iterations.append(int(clf.booster().best_ntree_limit))

            scores = np.array(scores)
            iterations = np.array(iterations)
            score = scores.mean()
            scores2.append(score)
            print('score, std, iterations', score, scores.std(), iterations.mean())

        scores = np.array(scores2)
        scores = np.delete(scores, [scores.argmax(), scores.argmin()])
        print('score, std', scores.mean(), scores.std())

    if is_tt_rf == 1:
        X_train, X_test = feature_engineering(df_train, df_test, y_train)
    
Example #13
train = np.loadtxt("train_stage2.csv")
test = np.loadtxt("pred_stage2.csv")
target = pd.read_csv('target.csv', index_col=0)
submission = pd.read_csv('SubmissionFormat.csv')

est = XGBClassifier(max_depth=7,
                    learning_rate=0.02358,
                    n_estimators=189,
                    gamma=0.07479,
                    min_child_weight=3.0666,
                    subsample=0.4970,
                    colsample_bytree=0.9517,
                    reg_alpha=0.2065,
                    objective='multi:softmax')

est.fit(train, target['status_group'])
path = 'save/est.pickle'
file = open(path, 'wb')
pickle.dump(est, file)
pred = est.predict(test)
importances = est.booster().get_fscore()
sorted_imp = sorted(importances.items(), key=operator.itemgetter(1))

output = np.chararray(len(pred), itemsize=30)
output[pred == 0] = 'functional'
output[pred == 1] = 'functional needs repair'
output[pred == 2] = 'non functional'

submission['status_group'] = output
submission.to_csv('output.csv', index=False)
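The pickle file above is opened but never closed; a context manager does the same thing and releases the handle (a sketch, reusing the same path and fitted estimator):

import pickle

with open('save/est.pickle', 'wb') as f:   # same path as above
    pickle.dump(est, f)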
Example #14
                              reg_alpha=0.05,
                              reg_lambda=2,
                              subsample=1.0,
                              colsample_bytree=1.0,
                              max_delta_step=1,
                              scale_pos_weight=1,
                              objective='multi:softprob',
                              nthread=8,
                              seed=0  # ,
                              # silent = False
                              )
    print('training...')
    xgb_model.fit(training, label)
    print('predicting...')
    predicted = xgb_model.predict_proba(testing)
    predicted = pandas.DataFrame(predicted)
    predicted.columns = xgb_model.classes_
    # Name index column.
    predicted.index.name = 'Id'
    # Write csv.
    print('Saving prediction...')
    predicted.to_csv('Prediction.csv')
    # feature importance
    feat_imp = pandas.Series(xgb_model.booster().get_fscore()).sort_values(ascending=False)
    feat_imp.plot(kind='bar', title='Feature Importances')
    matplotlib.pyplot.show()
    plot_importance(xgb_model, title='Feature importance')
    matplotlib.pyplot.show()
    plot_tree(xgb_model, num_trees=0)
    matplotlib.pyplot.show()
Example #15
X_val = Xfold1
y_val = fold1.loc[:, 'Category']


# Now comes the time-consuming step of training xgb.

# In[3]:

xgb = XGBClassifier(**HYPER_PARAMS)
xgb.fit(X_train, y_train, eval_set = [(X_val, y_val)], eval_metric = SCORING, verbose = 10)


# Now, we can gaze at the important features.

# In[4]:

gbdt = xgb.booster()
importance = gbdt.get_fscore()
importance = sorted(importance.items(), key = operator.itemgetter(1), reverse = True)
df=pd.DataFrame(importance, columns = ['feature', 'fscore'])
print(df)


# This provides us with a good idea as to which features are particularly relevant. 
# 
# - clearly, the timing in terms of minute, hour and year are critical
# - the collocated-crime feature scores surprisingly high
# - the spatial coordinates are useful
# - the total number of crimes in a street is an important indicator, as well as some of the log-ratios
# - the month is not particularly essential, presumably as seasonal information can be recovered from the week
Example #16
# Vectorize
transformer = TfidfVectorizer()
sparse_featureset = transformer.fit_transform(train_set)
df_features = pd.DataFrame(sparse_featureset.todense(),
                           columns=transformer.get_feature_names())

# Add another feature
contains_7 = pd.Series([int(("7" in s)) for s in train_set])
df_features["Contains7"] = contains_7

# SKLearn API
cls = XGBClassifier(silent=True)

cls.fit(X=df_features, y=train_targets)
print(cls.booster().get_fscore())

df_features = df_features.drop(df_features.columns[1], axis=1)
train_data = xgb.DMatrix(df_features.values, label=train_targets)

# Generic parameters
param = {
    'max_depth': 5,
    'objective': 'reg:linear',
    #'objective':'multi:softprob','num_class':2,
    'eta': .3,
    'silent': 0,
    'colsample_bytree': .2,
    'nround': 100
}
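With the native API, the number of boosting rounds is an argument to xgb.train rather than a key in the parameter dict, so 'nround' above would otherwise just be ignored. A minimal training sketch, assuming the train_data DMatrix built above:

import xgboost as xgb

num_round = param.pop('nround')    # move the round count out of the parameter dict
booster = xgb.train(param, train_data, num_boost_round=num_round)
print(booster.get_score())         # split-count feature importances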