Example #1
def run_mod(train_X, test_X, train_Y):
    reg = GB(max_features="auto", n_estimators=300, random_state=1)
    reg.fit(train_X, train_Y)
    pred = reg.predict_proba(test_X)
    #pred=reg.predict(test_X)         # predict class
    imp = reg.feature_importances_
    return pred, imp
def getBoostingTree(data, target):
    Y = data[target]
    X = data.drop(target, axis=1)
    #aggressive pruning!!
    model = GB(max_depth=1)
    model.fit(X, Y)
    return model
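Across this listing, GB is an import alias rather than a scikit-learn class name. A minimal sketch of the assumed aliases (Examples #18 and #19 import the classifier this way, Example #22 the regressor):

from sklearn.ensemble import GradientBoostingClassifier as GB   # used by most examples
# from sklearn.ensemble import GradientBoostingRegressor as GB  # regression examples, e.g. Example #22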
Example #3
def model_GB(X, y):
    parameters = {
        'n_estimators': [250],
        'max_features': ['sqrt'],
        'max_depth': [25],
        'min_samples_split': [40],
        'min_samples_leaf': [225]
    }
    # f1_scorer is defined but not used below; the grid search is scored with ks_scorer
    f1_scorer = make_scorer(fbeta_score, beta=0.5, pos_label=1)
    ks_scorer = make_scorer(ks_score, needs_proba=True)
    gb = GB()
    cv = ShuffleSplit(n_splits=5, test_size=0.2, random_state=2)
    grid_obj = GridSearchCV(gb,
                            param_grid=parameters,
                            scoring=ks_scorer,
                            n_jobs=3,
                            cv=cv)
    grid_obj.fit(X, y)
    gb = grid_obj.best_estimator_

    title = "Learning Curves (GB)"
    # 3-fold cross-validation; train_sizes sweeps 10 points from 10% to 100% of the
    # training data to get smoother mean train and validation score curves.
    plot_learning_curve(gb,
                        title,
                        X,
                        y,
                        cv=3,
                        n_jobs=3,
                        train_sizes=np.linspace(.1, 1.0, 10))
    plt.savefig('learning_curve.png')  # save before show(), which clears the current figure
    plt.show()
    return gb
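ks_score, wrapped by ks_scorer above, is not defined anywhere in this listing. A minimal sketch, assuming the usual Kolmogorov-Smirnov definition (largest gap between TPR and FPR), which works with make_scorer(..., needs_proba=True) on binary targets:

from sklearn.metrics import roc_curve

def ks_score(y_true, y_proba):
    # KS statistic: maximum vertical distance between the TPR and FPR curves
    fpr, tpr, _ = roc_curve(y_true, y_proba)
    return (tpr - fpr).max()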
Example #4
def try_params(n_iterations, params):

    n_estimators = int(round(n_iterations * trees_per_iteration))
    print("n_estimators:", n_estimators)
    pprint(params)

    clf = GB(n_estimators=n_estimators, verbose=0, **params)
    clf.fit(x_train, y_train)

    p = clf.predict_proba(x_train)[:, 1]

    ll = log_loss(y_train, p)
    auc = AUC(y_train, p)
    acc = accuracy(y_train, np.round(p))

    print("\n# training | log loss: {:.2%}, AUC: {:.2%}, accuracy: {:.2%}".format(
        ll, auc, acc))

    #

    p = clf.predict_proba(x_test)[:, 1]

    ll = log_loss(y_test, p)
    auc = AUC(y_test, p)
    acc = accuracy(y_test, np.round(p))

    print("# testing  | log loss: {:.2%}, AUC: {:.2%}, accuracy: {:.2%}".format(
        ll, auc, acc))

    return {'loss': ll, 'log_loss': ll, 'auc': auc}
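The function above leans on module-level names that are not part of the snippet (trees_per_iteration, x_train, y_train, x_test, y_test) and on metric aliases. A plausible header, assuming the aliases map to the standard scikit-learn metrics; the value of trees_per_iteration is hypothetical:

from pprint import pprint
import numpy as np
from sklearn.metrics import log_loss, roc_auc_score as AUC, accuracy_score as accuracy

trees_per_iteration = 5  # hypothetical: how many trees one resource "iteration" buys
# x_train, y_train, x_test, y_test are assumed to be prepared elsewhere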
Example #5
File: gb.py  Project: j6e/hyperband
def try_params(n_iterations, params, data):
    n_estimators = int(round(n_iterations * trees_per_iteration))
    print("n_estimators:", n_estimators)
    pprint(params)

    clf = GB(n_estimators=n_estimators, verbose=0, **params)

    return train_and_eval_sklearn_regressor(clf, data)
Example #6
    def GBClassifier(cls, ):
        n_estimators_range = np.arange(50, 500, 50)
        param_grid = {'n_estimators': n_estimators_range}

        # learning_rate_range=np.array([0.1])
        # param_grid['learning_rate']=learning_rate_range

        return cls(GB(learning_rate=0.1), par_grid_dict=param_grid)
Example #7
def opt_model_GB(X, y):
    parameters = opt_GB(X, y)
    parameters = [int(i) if i > 2 else 2 for i in parameters]  # a list, so it can be indexed below (map() is not subscriptable in Python 3)
    gb = GB(max_depth=parameters[0],
            min_samples_split=parameters[1],
            min_samples_leaf=parameters[2])
    gb.fit(X, y)
    return gb
Example #8
def gdbt_select(train,train_y,a,b,step,c):
	from sklearn.feature_selection import SelectFromModel

	score=0
	index=0
	model1=GB(random_state=0).fit(train.values, train_y.values.reshape(-1,1))
	for i in range(a,b,step):
		model = SelectFromModel(model1,threshold=i/c)
		model.fit(train,train_y)
		train1=model.transform(train)
		model =GB(random_state=0)
		cv_score=cross_val_score(model, train1, train_y, cv=cv, scoring='recall').mean()
		if score<cv_score:
			score=cv_score
			index=i/c
		print(i / c, cv_score)
	print()
	print(index, score)
Example #9
def func_GB(parameters, *args):
    parameters = [int(i) if i > 2 else 2 for i in parameters]  # a list, so it can be indexed below
    gb = GB(max_depth=parameters[0],
            min_samples_split=parameters[1],
            min_samples_leaf=parameters[2])
    X = args[0]
    y = args[1]
    gb.fit(X, y)
    y_pred = pd.DataFrame(gb.predict_proba(X), index=X.index)[1]
    return ks_score(y, y_pred)
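Example #7 calls opt_GB(X, y), which is not shown in this listing. Since func_GB above scores a parameter vector [max_depth, min_samples_split, min_samples_leaf] by its KS statistic, one hedged sketch is a global optimizer that maximizes it; the bounds and iteration budget below are assumptions:

from scipy.optimize import differential_evolution

def opt_GB(X, y):
    # assumed search ranges for max_depth, min_samples_split, min_samples_leaf
    bounds = [(2, 10), (2, 200), (2, 200)]
    result = differential_evolution(lambda p, *args: -func_GB(p, *args),  # maximize KS = minimize -KS
                                    bounds, args=(X, y), maxiter=5, seed=0)
    return result.x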
Example #10
	def get_training_models():
		return [
				("MLP_RELU", MLP(hidden_layer_sizes=(100, ), alpha=0.0001,
					activation="relu", learning_rate_init=0.001,
					tol=0.0001, max_iter=200)),
				("GB_50", GB(n_estimators=250, learning_rate=0.1, subsample=1.0,
				   max_depth=3, min_samples_split=20)),
				 ("RF_FINAL", RF(n_estimators=250, max_depth=None, min_samples_split=2,
				   bootstrap=True, n_jobs=-1)),
				]
Example #11
 def test(self):
     """
     Test the model with best parameters found in randomSearch() or gridSearch()
     :return:
     """
     # self.clf = GB(random_state=40, n_estimators=40, max_features='sqrt', learning_rate=0.8, criterion='friedman_mse')
     self.clf = GB()
     self.clf.set_params(**self.best_parameter)
     print("*** Test Result for Gradient Boosting ***")
     ModelEvaluation.evaluateModelWithCV(self.clf,
                                         self.dataset_x,
                                         self.dataset_y,
                                         cv=10)
Example #12
def fit_models(data):

    features = data.drop('rings', axis=1)
    target = data.rings

    models = DT(), RF(), GB(max_depth=1)

    for model in models:
        cv_results = cross_val_score(model, features, target, cv=N_FOLDS)

        print('\n==========\n', model)
        print('\ncv results\n', cv_results)
        print('\nmean cv accuracy =', cv_results.mean())
        print('std cv accuracy = ', cv_results.std())
Example #13
def GB_classif():
    # GradientBoostingClassifier
    # http://scikit-learn.org/stable/modules/generated/sklearn.ensemble.GradientBoostingClassifier.html#sklearn.ensemble.GradientBoostingClassifier
    # sklearn.ensemble.GradientBoostingClassifier(loss='deviance', learning_rate=0.1,
    #     n_estimators=100, subsample=1.0, criterion='friedman_mse', min_samples_split=2,
    #     min_samples_leaf=1, min_weight_fraction_leaf=0.0, max_depth=3, min_impurity_decrease=0.0,
    #     min_impurity_split=None, init=None, random_state=None, max_features=None,
    #     verbose=0, max_leaf_nodes=None, warm_start=False, presort='auto')
    hypers = {
        'n_estimators': 400,
        'learning_rate': 0.05,
        # 'subsample': 0.4
        'max_depth': 4
    }
    return GB(**hypers)
Example #14
def f_classif_select(train,train_y):
	from sklearn.feature_selection import SelectKBest
	from sklearn.feature_selection import f_classif
	score=0
	index=1
	for i in range(1,train.shape[1]+1):
		model=SelectKBest(f_classif,k=i)
		train1=model.fit_transform(train,train_y)
		
		model = GB(random_state=0)
		cv_score=cross_val_score(model, train1, train_y, cv=cv, scoring='recall').mean()
		if score<cv_score:
			score=cv_score
			index=i
		print(i, round(cv_score, 4))
	print("______________________")
	print(index, score)
	model = SelectKBest(f_classif, k=index).fit(train, train_y)
	return train.columns[~model.get_support()]  # the columns dropped at the best k
Example #15
    data.Embarked = data.Embarked.fillna(data.Embarked.mode()[0])  # fill missing Embarked with the mode
    data.Age = data.Age.fillna(data.Age.mean())  # fill missing Age with the mean
    data.Fare = data.Fare.fillna(data.Fare.mean())  # fill missing Fare with the mean
    
    return data

data = pd.read_csv(r'D:\[DataSet]\1_Titanic\train.csv')
data = dataProcess(data)
feature = ['Pclass','Sex','Age','Fare','Embarked']
X = data[feature] #Feature
y = data.Survived  #Label

modelDict = {'DT':DT(),'SVC':SVC(),'GNB':GNB(),'KNN':KNN(n_neighbors=3),
             'MLP':MLP(hidden_layer_sizes=(500,)),
             'LogR':LogR(C=1.0,penalty='l1',tol=1e-6),
             'RF':RF(),'GB':GB(n_estimators=500)}

for model in modelDict.keys():
    clf = modelDict.get(model)
    scores = cross_val_score(clf, X, y, cv=5)
    print (model +' accuracy: '+'%.3f'%(scores.mean()*100)+'%')
    
clf_GB = GB(n_estimators=500)
clf_GB.fit(X, y)  # train the model
data_sub = pd.read_csv(r'D:\[DataSet]\1_Titanic\test.csv')  # load the test data
data_sub = dataProcess(data_sub)  # preprocess the test data
X_sub = data_sub[feature]  # extract the test features
y_sub = clf_GB.predict(X_sub)  # predict with the trained model
result = pd.DataFrame({'PassengerId': data_sub['PassengerId'].values,
                       'Survived': y_sub})  # build the required submission format
result.to_csv(r'D:\[DataSet]\1_Titanic\submission.csv', index=False)  # write to file
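The first lines of dataProcess are not shown above. For GB to fit on this feature list, Sex and Embarked must already be numeric, so the omitted part of the function presumably encodes them; a hedged sketch of that step (the mappings are assumptions):

# hypothetical start of dataProcess: encode the categorical columns before the fillna steps
data.Sex = data.Sex.map({'male': 0, 'female': 1})
data.Embarked = data.Embarked.map({'S': 0, 'C': 1, 'Q': 2})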
Example #16
    def __init__(self, dataset_x, dataset_y):
        self.dataset_x = dataset_x
        self.dataset_y = dataset_y

        self.clf = GB()
        self.best_parameter = {}
Example #17
gsRF = GridSearchCV(clf_RF,param_grid = rf_param_grid, cv=kfold, 
                    scoring="accuracy", n_jobs= 4, verbose = 1)
gsRF.fit(X,y)
rf_best = gsRF.best_estimator_


clf_SVC = SVC(probability=True)
svc_param_grid = {'kernel': ['rbf'], 
                  'gamma': [ 0.001, 0.01, 0.1, 1],
                  'C': [1, 10, 50, 100,200,300, 1000]}
gsSVC = GridSearchCV(clf_SVC,param_grid = svc_param_grid, cv=kfold, 
                    scoring="accuracy", n_jobs= 4, verbose = 1)
gsSVC.fit(X,y)
svm_best = gsSVC.best_estimator_

clf_GB = GB()
gb_param_grid = {'loss' : ['deviance'],
              'n_estimators' : [100,300,500],
              'learning_rate': [0.1, 0.05, 0.01],
              'max_depth': [4, 8],
              'min_samples_leaf': [100,150],
              'max_features': [0.3, 0.1]}
gsGB = GridSearchCV(clf_GB,param_grid = gb_param_grid, cv=kfold, 
                    scoring="accuracy", n_jobs= 4, verbose = 1)
gsGB.fit(X,y)
gb_best = gsGB.best_estimator_

clf_MLP = MLP()
mlp_param_grid = {'hidden_layer_sizes' : [100,200,300,400,500],
              'activation' : ['relu'],
              'solver' : ['adam'],
Example #18
test_data = test_data.dropna()
train_data = train_data.dropna()

#test_data["horsepower"]=test_data["horsepower"].apply(lambda x: 1 if x=="?" else 0)
#train_data["horsepower"]=train_data["horsepower"].apply(lambda x: 1 if x=="?" else 0)

train_X = train_data.drop(["id", "mpg", "horsepower", "car name"], axis=1)
train_y = train_data["mpg"]

val_X = test_data.drop(["id", "horsepower", "car name"], axis=1)

train_y = train_y.astype('int64')
print(train_y.dtype)  # check the dtype of the data
from sklearn.linear_model import LinearRegression as LR  # linear regression model
LR_model = LR()
LR_model.fit(train_X, train_y)
val_predictions = LR_model.predict(val_X)
print(LR_model.score(train_X, train_y))
from sklearn.ensemble import RandomForestClassifier as RF
RF_model = RF(n_estimators=1000, random_state=0)
RF_model.fit(train_X, train_y)
A_val_predictions = RF_model.predict(val_X)
print(RF_model.score(train_X, train_y))
from sklearn.ensemble import GradientBoostingClassifier as GB
GB_model = GB(random_state=0, learning_rate=0.01)
GB_model.fit(train_X, train_y)
val_predictions = GB_model.predict(val_X)
print(GB_model.score(train_X, train_y))
test_data["mpg"] = A_val_predictions
A_test = test_data[["id", "mpg"]]
A_test.to_csv("sample_submit.csv", index=False, header=False, encoding='cp932')
Example #19
# coding=utf-8

from sklearn.ensemble import GradientBoostingClassifier as GB
# from sklearn.ensemble import GradientBoostingRegressor as GB
model=GB(random_state=0)

# Cross-validation
from sklearn.model_selection import StratifiedKFold, cross_val_score
cv = StratifiedKFold(n_splits=5, shuffle=False)  # random_state only has an effect when shuffle=True
cross_val_score(model, train, train_y, cv=cv, scoring='precision').mean()  # or 'neg_mean_squared_error' for regression


# Classification and regression -- variance filtering (features should be discretized first)
# var_cols: column names sorted by variance, smallest to largest
var_cols = train.var().sort_values().index
train1=train.copy()
val1=val.copy()
i=-1
for col in var_cols:
    model=GB(random_state=0)
    model.fit(train1,train_y)
    pred=model.predict(val1)
    print(i, np.sqrt(metrics.mean_squared_error(val_y, pred)))
    print("_____________________________________")
    train1=train1.drop(col,axis=1)
    val1=val1.drop(col,axis=1)
    i=i+1


		
# Classification -- chi-square test (a hedged sketch follows)
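The chi-square section is cut off at this point. Following the same scan-over-k pattern as f_classif_select in Example #14, a hedged sketch of what it might look like (chi2 requires non-negative features):

from sklearn.feature_selection import SelectKBest, chi2

for k in range(1, train.shape[1] + 1):
    train1 = SelectKBest(chi2, k=k).fit_transform(train, train_y)
    cv_score = cross_val_score(GB(random_state=0), train1, train_y, cv=cv, scoring='recall').mean()
    print(k, round(cv_score, 4))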
Example #20
    'Sex_male', 'Pclass_1', 'Pclass_2', 'Pclass_3', 'Embarked_C', 'Embarked_Q',
    'Embarked_S'
]

X = data_all.loc[data.index][feature]
y = data.Survived

modelDict = {
    'DT': DT(),
    'SVC': SVC(),
    'GNB': GNB(),
    'KNN': KNN(n_neighbors=3),
    'MLP': MLP(hidden_layer_sizes=(500, )),
    'LogR': LogR(C=1.0, penalty='l1', tol=1e-6),
    'RF': RF(n_estimators=300),
    'GB': GB(n_estimators=500)
}

for model in modelDict.keys():
    clf = modelDict.get(model)
    scores = cross_val_score(clf, X, y, cv=5)
    print(model + ' accuracy: ' + '%.3f' % (scores.mean() * 100) + '%')

votingC = VotingClassifier(estimators=[('clf_GB', GB(n_estimators=500)),
                                       ('clf_RF', RF(n_estimators=300)),
                                       ('clf_SVC', SVC(probability=True)),
                                       ('clf_MLP',
                                        MLP(hidden_layer_sizes=(500, )))],
                           voting='soft',
                           n_jobs=4)
Example #21
# Random Forest with the tuned parameters
model = RF(n_estimators=gsearch1.best_params_['n_estimators'],
           max_depth=gsearch2.best_params_['max_depth'],
           min_samples_leaf=gsearch3.best_params_['min_samples_leaf'],
           min_samples_split=gsearch3.best_params_['min_samples_split'],
           max_features=gsearch4.best_params_['max_features'],
           random_state=0)
model.fit(train,train_y)
pred=model.predict(test)
metrics.recall_score(test_y,pred)




# GBDT parameter tuning
# scoring parameter: http://scikit-learn.org/0.18/modules/model_evaluation.html#scoring-parameter
from sklearn.model_selection import GridSearchCV

# Tune n_estimators
param_test1 = {'n_estimators':range(75,90,1)}
gsearch1 = GridSearchCV(estimator = GB(learning_rate=0.1,random_state=0), param_grid = param_test1, scoring='recall',iid=False,cv=cv)
gsearch1.fit(train,train_y)
gsearch1.grid_scores_,gsearch1.best_score_,gsearch1.best_params_  # grid_scores_ became cv_results_ in scikit-learn >= 0.20

# Tune max_depth and min_samples_split
param_test2 = {'max_depth':range(3,9,2), 'min_samples_split':range(2,503,100)}
gsearch2 = GridSearchCV(estimator =GB(learning_rate=0.1, n_estimators=gsearch1.best_params_['n_estimators'],random_state=0), param_grid = param_test2, scoring='recall',iid=False, cv=cv)
gsearch2.fit(train,train_y)
gsearch2.grid_scores_,gsearch2.best_score_,gsearch2.best_params_

# Tune min_samples_split and min_samples_leaf
param_test3 = {'min_samples_split':range(2,200,50), 'min_samples_leaf':range(1,100,10)}
gsearch3 = GridSearchCV(estimator = GB(learning_rate=0.1, n_estimators=gsearch1.best_params_['n_estimators'],max_depth=gsearch2.best_params_['max_depth'], random_state=0), param_grid = param_test3, scoring='recall',iid=False, cv=cv)
gsearch3.fit(train,train_y)
gsearch3.grid_scores_,gsearch3.best_score_,gsearch3.best_params_
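Unlike the Random Forest block at the top of this example, the GBDT tuning never assembles the final model. A sketch of that last step, built from the grid searches above (any further stages, e.g. a max_features search, are omitted here):

# GBDT with the tuned parameters, mirroring the RF construction above
model = GB(learning_rate=0.1,
           n_estimators=gsearch1.best_params_['n_estimators'],
           max_depth=gsearch2.best_params_['max_depth'],
           min_samples_split=gsearch3.best_params_['min_samples_split'],
           min_samples_leaf=gsearch3.best_params_['min_samples_leaf'],
           random_state=0)
model.fit(train, train_y)
metrics.recall_score(test_y, model.predict(test))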
Example #22
def generateColumns(start, end):
    # build the landmark column names '1X', '1Y', ..., '68X', '68Y';
    # l must be initialized here rather than relied on as a global
    l = []
    for i in range(start, end + 1):
        l.extend([str(i) + 'X', str(i) + 'Y'])
    return l


req = generateColumns(1, 68)

import pandas as pd
df = pd.read_csv('merge-mix.csv')

# selecting features and label as X & y respectively
X = df[req]
y = df['emotion']

from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(X,
                                                    y,
                                                    test_size=0.10,
                                                    random_state=42)

from sklearn.ensemble import GradientBoostingRegressor as GB
gb = GB()
gb.fit(X_train, y_train.values.ravel())

import matplotlib.pyplot as plt
plt.bar(range(X_train.shape[1]), gb.feature_importances_)
plt.xticks(range(X_train.shape[1]), req)
plt.show()
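The 10% hold-out split created above is never used; a quick sanity check on it could be:

# R^2 of the fitted regressor on the held-out split
print(gb.score(X_test, y_test))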
Example #23
def model_GB(X, y):
    gb = GB(n_estimators=300)
    gb.fit(X, y)
    return gb
Example #24
	def get_submission_models():
		return [("GB_Final", GB(n_estimators=250, learning_rate=0.1, subsample=1.0,
				   max_depth=3, min_samples_split=20)),
			   ]