def main():
    X = make_X(200)
    target = make_target(X)
    real_labels(X, target)
    clf_list = [
        # nn.MLPClassifier(hidden_layer_sizes=(2,), random_state=0),
        # nn.MLPClassifier(hidden_layer_sizes=(3,), random_state=0),
        # nn.MLPClassifier(hidden_layer_sizes=(4,), random_state=0),
        # nn.MLPClassifier(hidden_layer_sizes=(10,), random_state=0),
        lgbm.LGBMClassifier(n_estimators=200, random_state=0),
        xgb.XGBClassifier(n_estimators=200, max_depth=5, random_state=0),
        # nn.MLPClassifier(hidden_layer_sizes=(200,)),
        # nn.MLPClassifier(hidden_layer_sizes=(300,)),
        # nn.MLPClassifier(hidden_layer_sizes=(200, 100)),
        # xgb.XGBClassifier(n_estimators=30, max_depth=3),
        # xgb.XGBClassifier(n_estimators=5, max_depth=3),
        # ensemble.AdaBoostClassifier(n_estimators=30, random_state=0),
    ]
    for clf in clf_list:
        prd_labels(X, target, clf)
    plt.show()
def model_xgb(train, test, label):
    # Fit an XGBoost classifier on the selected feature columns and predict
    # hotel_cluster probabilities for the test rows.
    clf = sklearn.XGBClassifier(nthread=4, n_estimators=10)
    clf.fit(train[label], train['hotel_cluster'])
    prediction = clf.predict_proba(test[label])
    df = pd.DataFrame(prediction).transpose().tail(test[label].shape[0])
    # DataFrame.as_matrix() was removed from pandas; to_numpy() is the replacement.
    return util.best_proba(df.to_numpy()), clf
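
# A minimal sketch of what the probability post-processing above is doing, on the
# assumption that util.best_proba simply keeps the most likely clusters per row.
# The synthetic data and the helper name top_k_classes are illustrative, not part
# of the original code base.
import numpy as np
from sklearn.datasets import make_classification
from xgboost import XGBClassifier

def top_k_classes(proba, classes, k=5):
    # indices of the k highest-probability classes for each row, best first
    order = np.argsort(proba, axis=1)[:, ::-1][:, :k]
    return classes[order]

X, y = make_classification(n_samples=500, n_features=10, n_informative=6,
                           n_classes=4, random_state=0)
clf = XGBClassifier(n_estimators=10)
clf.fit(X, y)
proba = clf.predict_proba(X)  # shape: (n_samples, n_classes)
print(top_k_classes(proba, clf.classes_, k=3)[:5])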
def xgb_cl(x_train, x_test, y_train, y_test, max_depth):
    xg_cl = xgb.XGBClassifier(objective='binary:logistic', n_estimators=10,
                              seed=123, max_depth=max_depth)
    xg_cl.fit(x_train, y_train)

    # Compute the accuracy of the predictions
    preds = xg_cl.predict(x_test)
    accuracy = float(np.sum(preds == y_test)) / y_test.shape[0]
    print("xgb_cl Accuracy: %.2f%%" % (accuracy * 100.0))
    return accuracy * 100.0
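
# A quick, self-contained way to exercise xgb_cl, assuming the np and xgb imports
# it relies on are already in scope. The synthetic dataset and the 80/20 split are
# illustrative assumptions, not part of the original code.
from sklearn.datasets import make_classification
from sklearn.model_selection import train_test_split

X, y = make_classification(n_samples=1000, n_features=20, random_state=123)
x_train, x_test, y_train, y_test = train_test_split(X, y, test_size=0.2,
                                                    random_state=123)
for depth in (2, 4, 6):
    xgb_cl(x_train, x_test, y_train, y_test, max_depth=depth)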
def _xgboost_gridsearch_model(
    task,
    numeric_features,
    categoric_features,
    learning_rate,
    use_dask,
    n_iter,
    scoring,
):
    param_space = {
        'clf__max_depth': randint(2, 11),
        'clf__min_child_weight': randint(1, 11),
        'clf__subsample': uniform(0.5, 0.5),
        'clf__colsample_bytree': uniform(0.5, 0.5),
        'clf__colsample_bylevel': uniform(0.5, 0.5),
        'clf__gamma': uniform(0, 1),
        'clf__reg_alpha': uniform(0, 1),
        'clf__reg_lambda': uniform(0, 10),
        'clf__base_score': uniform(0.1, 0.9),
        'clf__scale_pos_weight': uniform(0.1, 9.9),
    }
    model = (xgbsk.XGBClassifier(learning_rate=learning_rate)
             if task == 'classification'
             else xgbsk.XGBRegressor(learning_rate=learning_rate))
    pipe = Pipeline([
        ('preprocessing',
         simple_proc_for_tree_algoritms(numeric_features, categoric_features)),
        ('clf', model),
    ])
    if use_dask:
        from dask_ml.model_selection import RandomizedSearchCV
    else:
        from sklearn.model_selection import RandomizedSearchCV
    return RandomizedSearchCV(pipe, param_space, n_iter=n_iter,
                              scoring=scoring, cv=5)
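
# A runnable sketch of the same randomized-search pattern using public APIs only;
# the StandardScaler preprocessing step and the small search budget (n_iter=5) are
# stand-ins for the repo-specific pieces above.
from scipy.stats import randint, uniform
from sklearn.datasets import make_classification
from sklearn.model_selection import RandomizedSearchCV
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler
from xgboost import XGBClassifier

X, y = make_classification(n_samples=400, n_features=15, random_state=0)
pipe = Pipeline([('preprocessing', StandardScaler()),
                 ('clf', XGBClassifier(learning_rate=0.1))])
param_space = {'clf__max_depth': randint(2, 11),
               'clf__subsample': uniform(0.5, 0.5),
               'clf__colsample_bytree': uniform(0.5, 0.5)}
search = RandomizedSearchCV(pipe, param_space, n_iter=5, scoring='roc_auc',
                            cv=5, random_state=0)
search.fit(X, y)
print(search.best_params_, search.best_score_)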
def train(train_data, train_args):
    train_data_new = []
    labels = []

    # split data into labels and non-labels
    for row in train_data[1:]:
        temp = []
        for x in row[15:]:
            try:
                temp.append(float(x))
            except ValueError:
                temp.append(-999.0)
        # create label based on relevant columns
        possible_labels = [
            row[0], row[6], row[7], row[8], row[9],
            row[10], row[11], row[12], row[13], row[14]
        ]
        labels.append(1 if any(int(label) > 0 for label in possible_labels) else 0)
        train_data_new.append(numpy.asarray(temp))

    train_data = numpy.asarray(train_data_new)
    labels = numpy.array(labels)

    # shuffle features and labels with the same permutation so they stay aligned
    # and the train/test split is random
    perm = numpy.random.permutation(len(train_data))
    train_data = train_data[perm]
    labels = labels[perm]

    # split data into train set and test set
    train_split = int(len(train_data) * 0.8)
    X_train, X_test = train_data[:train_split], train_data[train_split:]
    Y_train, Y_test = labels[:train_split], labels[train_split:]

    # create XGBClassifier model
    m = sklearn.XGBClassifier(max_depth=train_args[0], learning_rate=train_args[1],
                              silent=train_args[2], objective=train_args[3])
    # train model
    m.fit(X_train, Y_train)

    # predict labels of test set
    preds = m.predict(X_test)

    # calculate error rate of model
    error = sum(1 for i in range(len(preds))
                if int(float(preds[i]) > 0.5) != int(Y_test[i])) / float(len(preds))

    final_model = Model(m, error, X_train)
    return final_model
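
# An alternative to the manual permutation and slicing above: sklearn's
# train_test_split shuffles while keeping features and labels aligned. The array
# names below are illustrative only.
import numpy as np
from sklearn.model_selection import train_test_split

features = np.random.rand(100, 5)
targets = (features[:, 0] > 0.5).astype(int)
X_tr, X_te, y_tr, y_te = train_test_split(features, targets, test_size=0.2,
                                          shuffle=True, random_state=0)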
def run(self):
    self.output().makedirs()
    X = abhishek_feats.AbhishekFeatures().load('train', self.fold)
    y = xval_dataset.BaseDataset().load('train', self.fold).squeeze()
    cls = xgbsk.XGBClassifier(max_depth=self.max_depth.get(),
                              learning_rate=self.eta.get(),
                              n_estimators=self.n_est.get())
    X_tr, X_va, y_tr, y_va = model_selection.train_test_split(X, y, test_size=0.05)
    cls.fit(X_tr, y_tr,
            sample_weight=core.weight_from(y_tr),
            eval_set=[(X_va, y_va)],
            early_stopping_rounds=10)

    # score the held-out validation fold and cache its predictions
    validX = abhishek_feats.AbhishekFeatures().load('valid', self.fold)
    y = xval_dataset.BaseDataset().load('valid', self.fold).squeeze()
    y_pred = cls.predict_proba(validX)[:, 1]
    score = core.score_data(y, y_pred)
    scorestr = "{:s} = {:f}".format(repr(self), score)
    print(colors.green | colors.bold | scorestr)
    valid_fn = 'cache/abhishek/xgb/maxdepth_{:d}_eta_{:f}_nest_{:d}/{:d}/valid.npy'.format(
        self.max_depth.get(), self.eta.get(), self.n_est.get(), self.fold)
    np.save(valid_fn, y_pred)

    # predict the test set with the same model and cache those predictions too
    testX = abhishek_feats.AbhishekFeatures().load('test', None)
    pred = cls.predict_proba(testX)[:, 1]
    test_fn = 'cache/abhishek/xgb/maxdepth_{:d}_eta_{:f}_nest_{:d}/{:d}/test.npy'.format(
        self.max_depth.get(), self.eta.get(), self.n_est.get(), self.fold)
    np.save(test_fn, pred)

    # persist feature importances and the validation score
    with self.output().open('w') as f:
        cols = abhishek_feats.AbhishekFeatures().load('valid', self.fold,
                                                      as_df=True).columns
        v = pandas.Series(cls.feature_importances_, index=cols).sort_values()
        v.to_csv(f)
        f.write("\n\n")
        f.write(scorestr)
        f.write("\n")
    return score
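
# A stripped-down version of the early-stopping setup above, using public APIs and
# synthetic data only. Note: recent xgboost releases (>= 1.6) take
# early_stopping_rounds in the constructor; the fit-time argument used above is the
# older form.
from sklearn.datasets import make_classification
from sklearn.model_selection import train_test_split
from xgboost import XGBClassifier

X, y = make_classification(n_samples=2000, n_features=30, random_state=0)
X_tr, X_va, y_tr, y_va = train_test_split(X, y, test_size=0.05, random_state=0)
cls = XGBClassifier(max_depth=6, learning_rate=0.1, n_estimators=500,
                    early_stopping_rounds=10, eval_metric='logloss')
cls.fit(X_tr, y_tr, eval_set=[(X_va, y_va)])
print(cls.best_iteration)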
def train_save(pred_period=20, is_high=True, is_clf=False):
    data = gen_dataset(is_high=is_high, is_clf=is_clf, pred_period=pred_period)

    if is_clf:
        # weight the positive class by the observed class imbalance
        _, y_train = data["train"]
        scale_pos_weight = sum(y_train == 0) / sum(y_train == 1)
        models = [
            lgbm.LGBMClassifier(n_estimators=300, scale_pos_weight=scale_pos_weight,
                                num_leaves=100, max_depth=8, random_state=0),
            xgb.XGBClassifier(n_estimators=300, scale_pos_weight=scale_pos_weight,
                              max_depth=5, random_state=0),
        ]
    else:
        models = [
            lgbm.LGBMRegressor(n_estimators=300, num_leaves=100, max_depth=8,
                               random_state=0),
            xgb.XGBRegressor(n_estimators=300, max_depth=5, random_state=0),
        ]

    y_pred_list = train(data, models, is_clf=is_clf)

    # save models
    for model in models:
        save_model(model, pred_period, is_high)

    return y_pred_list
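
# How the scale_pos_weight ratio computed above counters class imbalance, shown on
# a synthetic skewed dataset (illustrative only, not the project's data).
import numpy as np
from sklearn.datasets import make_classification
from xgboost import XGBClassifier

X, y = make_classification(n_samples=2000, weights=[0.95, 0.05], random_state=0)
spw = np.sum(y == 0) / np.sum(y == 1)  # roughly 19 for a 95/5 split
clf = XGBClassifier(n_estimators=300, max_depth=5, scale_pos_weight=spw,
                    random_state=0)
clf.fit(X, y)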
## XGBClassifier performs the best
from model.tree import visualize
from sklearn import tree, linear_model
from sklearn import svm
# from sklearn import neural_network
from sklearn import ensemble
import xgboost
from xgboost import sklearn

# classifier = tree.DecisionTreeClassifier(criterion="entropy", max_depth=7)
# classifier = linear_model.LogisticRegression(C=1.0, solver="lbfgs", multi_class="multinomial")
# classifier = svm.SVC(C=0.1, kernel='rbf')
# classifier = neural_network.MLPClassifier()
# classifier = ensemble.GradientBoostingClassifier()
# classifier = ensemble.RandomForestClassifier(criterion="entropy", max_depth=15)
classifier = sklearn.XGBClassifier(base_score=0.5, learning_rate=0.05, gamma=1.5,
                                   max_depth=7, colsample_bytree=1, subsample=0.2,
                                   n_estimators=25, seed=0,
                                   objective="multi:softprob")

# params = {'max_depth': 6, 'colsample_bytree': 1, 'n_estimators': 25, 'objective': 'multi:softprob', 'num_class': 12}
# dtrain = xgboost.DMatrix(exclude_test_input, exclude_test_target)
# classifier = xgboost.train(params=params, dtrain=dtrain, num_boost_round=1)
# xgboost.plot_importance(classifier)

classifier.fit(exclude_test_input, exclude_test_target)

# visualize
# visualize(classifier, "tree")

print("The model has been created.")

# Model Assessment
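# A hedged sketch of one way to assess the fitted classifier; test_input and
# test_target are hypothetical held-out arrays, not variables defined above.
from sklearn.metrics import accuracy_score, classification_report

predictions = classifier.predict(test_input)
print("Accuracy: %.3f" % accuracy_score(test_target, predictions))
print(classification_report(test_target, predictions))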
# Logistic Regression
lr_pipe = Pipeline([('scaler', StandardScaler()),
                    ('classifier', LogisticRegression())])

'''
# List of Classifiers
classifiers = [('knn_classifier', KNeighborsClassifier(n_jobs=-1)),
               ('log_classifier', LogisticRegression(max_iter=1000)),

               # Tree-based methods
               ('dt_classifier', DecisionTreeClassifier()),
               ('rf_classifier', RandomForestClassifier()),
               ('ab_classifier', AdaBoostClassifier()),
               ('gb_classifier', GradientBoostingClassifier()),
               ('xg_classifier', xgb.XGBClassifier()),
               ('lgbm_classifier', LGBMClassifier()),
               ('cat_classifier', CatBoostClassifier()),

               # Support Vector Machines
               ('sv_classifier', LinearSVC()),
               ('svc_classifier', SVC(cache_size=1000, max_iter=5000))]

for i, c in enumerate(classifiers):
    skpipes[i].append(c)

# %% #############################################################################
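
# A self-contained sketch of the pipeline-per-classifier pattern the commented-out
# block above is building toward: pair each estimator with a scaler and compare
# them with cross-validation. Only a few sklearn estimators are used here so the
# sketch runs without the extra boosting libraries.
from sklearn.datasets import make_classification
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import cross_val_score
from sklearn.neighbors import KNeighborsClassifier
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler

X, y = make_classification(n_samples=500, n_features=20, random_state=0)
candidates = [('knn_classifier', KNeighborsClassifier()),
              ('log_classifier', LogisticRegression(max_iter=1000)),
              ('rf_classifier', RandomForestClassifier(random_state=0))]
for name, clf in candidates:
    pipe = Pipeline([('scaler', StandardScaler()), (name, clf)])
    scores = cross_val_score(pipe, X, y, cv=5)
    print("%s: %.3f" % (name, scores.mean()))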