Example #1
def run_model(read_data_datapath, save_model_path):

	# read data
	x_train, x_test, y_train, y_test = util.prepare_train_test_set(read_data_datapath, 0.001)

	# choose model
	clf = xgb.XGBRegressor(seed = 2017)

	# grid search for the best fit parameters
	param_grid = {

		'gamma': [0.0, 0.2, 0.4], # Minimum loss reduction required to make a further partition on a leaf node of the tree
		'max_depth': [3, 5, 7, 10], # in place of max_leaf_nodes
		'min_child_weight': [0.1, 1, 2], # Minimum sum of instance weight (hessian) needed in a child, roughly in place of min_samples_leaf
		'n_estimators': [50, 100, 200, 250, 300], # Number of boosted trees to fit
		'reg_alpha': [0.1, 0.5, 1.0], # L1 regularization term on weights
		'reg_lambda': [0.1, 0.5, 1.0] # L2 regularization term on weights

	}
	CV_clf = GridSearchCV(estimator=clf, param_grid=param_grid, cv=4, scoring='neg_mean_squared_error')

	#CV_clf.fit(x_train[1:100,:], y_train[1:100])
	CV_clf.fit(x_train, y_train)


	# save model to pickle
	pickle.dump(CV_clf, open(save_model_path, "wb"))
	print('The best parameters are: \n %s' %CV_clf.best_params_)


	# run model and return loss
	#train_loss, test_loss = util.quick_test_model(x_train[1:100,:], x_test[1:100,:], y_train[1:100], y_test[1:100], CV_clf, regression_loss)
	train_loss, test_loss = util.quick_test_model(x_train, x_test, y_train, y_test, CV_clf, regression_loss)
	print("Train loss is %s, \n Test loss is %s  " % (train_loss, test_loss))
Example #2
def run_model(read_data_datapath, save_model_path):
    # read data
    x_cv, x_test, y_cv, y_test = util.prepare_train_test_set(read_data_datapath,0.005)
    # choose model
    # wrap the tree in a Pipeline, so the grid-search keys below use the 'clf__' prefix
    clf = Pipeline([('clf', DecisionTreeRegressor(criterion='mse', random_state=0))])  # newer scikit-learn spells this criterion 'squared_error'
    #clf = DecisionTreeRegressor(criterion='mse',random_state=0)
    # grid search for the best fit parameters
    parameters = {
        #'clf__max_depth': [125, 100, 75, 50, 40, 30, 25, 20, 15, 10, 5],
        #'clf__min_samples_split': [2, 3, 4 ,5, 6],
        'clf__min_samples_leaf': [1, 2, 3, 4, 5, 6]
    }

    CV_clf = GridSearchCV(estimator=clf, param_grid = parameters, cv=3, scoring='neg_mean_squared_error')

    #CV_clf.fit(x_cv[1:100,:], y_cv[1:100])
    CV_clf.fit(x_cv, y_cv)
    print ('Best score: %0.3f' % CV_clf.best_score_)
    # save model to pickle
    pickle.dump(CV_clf, open(save_model_path, "wb"))
    print('The best parameter set is: \n %s' % CV_clf.best_estimator_.get_params())


    # run model and return loss
    train_loss, test_loss = util.quick_test_model(x_cv, x_test, y_cv, y_test, CV_clf, regression_loss)
    #train_loss, test_loss = util.quick_test_model(x_cv[1:100,:], x_test[1:100,:], y_cv[1:100], y_test[1:100], CV_clf, regression_loss)
    print("Train loss is %s, \n Test loss is %s  " % (train_loss, test_loss))
Example #3
def number_best_feature_set(read_data_datapath):
	# try several feature-selection thresholds and keep the one with the lowest training loss
	min_train_loss = 100
	threshold = 0
	for i in np.array([0.001, 0.002, 0.003, 0.004, 0.005, 0.006]):
		x_train, x_test, y_train, y_test = util.prepare_train_test_set(read_data_datapath, i)
		clf = xgb.XGBRegressor(seed = 2017)
		clf.fit(x_train, y_train)
		train_loss, test_loss = util.quick_test_model(x_train, x_test, y_train, y_test, clf, regression_loss)
		print(train_loss, i)
		if train_loss < min_train_loss:
			min_train_loss = train_loss
			threshold = i
	return threshold
import pickle

import xgboost as xgb
from sklearn.model_selection import GridSearchCV

import util  # project-local module providing prepare_train_test_set and quick_test_model
# regression_loss is assumed to be a project-local loss function defined or imported elsewhere

# read the data
datapath = './data/encoded_others.pkl'
x_train, x_test, y_train, y_test = util.prepare_train_test_set(datapath)

# choose model
clf = xgb.XGBRegressor()

# grid search for the best fit parameters
param_grid = {'max_depth': [2, 4, 6], 'n_estimators': [50, 100, 200]}

CV_clf = GridSearchCV(estimator=clf, param_grid=param_grid, cv=2)
CV_clf.fit(x_train[1:100, :], y_train[1:100])  # fit on a small slice (rows 1-99) for a quick run

# save model to pickle
pickle.dump(CV_clf, open("./model/xgboost_model.pkl", "wb"))

print('The best parameters are: \n %s' % CV_clf.best_params_)

# run model and return loss
train_loss, test_loss = util.quick_test_model(x_train[1:100, :],
                                              x_test[1:100, :], y_train[1:100],
                                              y_test[1:100], CV_clf,
                                              regression_loss)

print("Train loss is %s, \n Test loss is %s  " % (train_loss, test_loss))

# load model from file
# model = pickle.load(open("./model/xgboost_model.pkl", "rb"))
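
The util helpers and regression_loss are not shown in these examples; the sketch below is only an assumption about their shape, inferred from how they are called (a data path plus an optional feature-selection threshold in, train/test splits out; a fitted model plus a loss function in, train and test losses out):

# Hypothetical stand-ins for the project-local helpers used above, not the actual util module
import pickle
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error

def regression_loss(y_true, y_pred):
    # assumed to be a plain regression metric such as mean squared error
    return mean_squared_error(y_true, y_pred)

def prepare_train_test_set(datapath, threshold=None):
    # assumed to load a pickled (x, y) pair and return a train/test split;
    # the threshold argument presumably drives a feature-selection step not reproduced here
    x, y = pickle.load(open(datapath, "rb"))
    return train_test_split(x, y, test_size=0.2, random_state=2017)

def quick_test_model(x_train, x_test, y_train, y_test, model, loss_fn):
    # assumed to score an already-fitted model on both splits and return (train_loss, test_loss)
    return (loss_fn(y_train, model.predict(x_train)),
            loss_fn(y_test, model.predict(x_test)))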