def training(processed_train_csv_file):
    processed_train_samples = pd.read_csv(processed_train_csv_file)
    processed_train_samples = processed_train_samples.replace([np.inf, -np.inf], np.nan)
    processed_train_samples = processed_train_samples.fillna(value=0)
    processed_train_samples_index_lst = processed_train_samples.index.tolist()
    random.shuffle(processed_train_samples_index_lst)
    # .ix is deprecated; .loc works with an explicit list of index labels
    shuffled_train_samples = processed_train_samples.loc[processed_train_samples_index_lst]
    col_names = shuffled_train_samples.columns.tolist()
    col_names.remove("booking_bool")
    features = shuffled_train_samples[col_names].values
    labels = shuffled_train_samples["booking_bool"].values

    print("Training Random Forest Classifier")
    # RandomForestClassifier has no learning_rate parameter, so it is dropped here
    rf_classifier = RandomForestClassifier(n_estimators=150, verbose=2,
                                           min_samples_split=10)
    rf_classifier.fit(features, labels)
    print("Saving the Random Forest Classifier")
    data_io.save_model(rf_classifier, model_name="rf_classifier.pkl")

    print("Training Gradient Boosting Classifier")
    gb_classifier = GradientBoostingClassifier(n_estimators=150, verbose=2,
                                               learning_rate=0.1, min_samples_split=10)
    gb_classifier.fit(features, labels)
    print("Saving the Gradient Boosting Classifier")
    data_io.save_model(gb_classifier, model_name="gb_classifier.pkl")

    print("Training SGD Classifier")
    # the correct loss name is "modified_huber" (was misspelled "modifier_huber")
    sgd_classifier = SGDClassifier(loss="modified_huber", verbose=2, n_jobs=-1)
    sgd_classifier.fit(features, labels)
    print("Saving the SGD Classifier")
    data_io.save_model(sgd_classifier, model_name="sgd_classifier.pkl")
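# A minimal, self-contained sketch of the shuffle-then-split step used in
# training() above, using DataFrame.sample instead of manual index shuffling.
# The "booking_bool" column name matches the function; the toy data is made up.
import numpy as np
import pandas as pd

toy = pd.DataFrame({"f1": np.random.rand(10), "f2": np.random.rand(10),
                    "booking_bool": np.random.randint(0, 2, 10)})
shuffled = toy.sample(frac=1, random_state=42)          # one-call row shuffle
features = shuffled.drop(columns="booking_bool").values
labels = shuffled["booking_bool"].values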
def main(param=""):
    # obtain config
    if isinstance(param, str):
        param = JobConfig.load_from_file(param)
    data_guest = param["data_guest"]
    data_host = param["data_host"]
    data_test = param["data_test"]
    idx = param["idx"]
    label_name = param["label_name"]

    # prepare data
    df_guest = pd.read_csv(data_guest, index_col=idx)
    df_host = pd.read_csv(data_host, index_col=idx)
    df_test = pd.read_csv(data_test, index_col=idx)  # loaded but not used below
    df = pd.concat([df_guest, df_host], axis=0)
    y = df[label_name]
    X = df.drop(label_name, axis=1)
    X_guest = df_guest.drop(label_name, axis=1)
    y_guest = df_guest[label_name]

    clf = GradientBoostingClassifier(n_estimators=50, learning_rate=0.3)
    clf.fit(X, y)
    y_pred = clf.predict(X_guest)
    acc = accuracy_score(y_guest, y_pred)
    result = {"accuracy": acc}
    print(result)
    return {}, result
def classification(model_name, samples, labels, rangex, rangey):
    samples = np.array(samples)
    labels = np.array(labels)
    # build the model
    models = {
        "KNN": KNeighborsClassifier(),
        "LDA": LinearDiscriminantAnalysis(),
        "NB": GaussianNB(),
        "TREE": DecisionTreeClassifier(),
        "RF": RandomForestClassifier(n_estimators=20),
        "SVM": SVC(gamma='scale'),
        "PERC": Perceptron(max_iter=2000),
        "GB": GradientBoostingClassifier()
    }
    model = models.get(model_name)
    if model is None:
        # dict.get returns None for unknown keys; fail with a clear message
        raise ValueError("unknown model name: %s" % model_name)
    # train the model
    model.fit(samples, labels)
    print("classifier ", model, " - created")
    # build the matrix of results using the model
    result = np.zeros([rangex, rangey])
    for x in range(rangex):
        for y in range(rangey):
            sample = np.array([x, y])
            result[x][y] = model.predict(sample.reshape(1, -1))
    return result
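# Usage sketch for classification(): train on a few hand-made 2-D points and
# render a 20x20 decision map. The points and the "TREE" choice are invented
# for illustration.
samples = [[2, 3], [3, 4], [15, 16], [16, 17]]
labels = [0, 0, 1, 1]
decision_map = classification("TREE", samples, labels, 20, 20)
# The per-cell loop can also be vectorized: build all grid points once and
# call predict a single time on the stacked coordinates.
import numpy as np
xs, ys = np.meshgrid(np.arange(20), np.arange(20), indexing="ij")
grid = np.column_stack([xs.ravel(), ys.ravel()])
# decision_map_fast = model.predict(grid).reshape(20, 20)  # given a fitted model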
def grid_search(data_sets, label_sets):
    param_grid = [{
        'n_estimators': [10, 100],
        'learning_rate': np.arange(0.01, 1, 0.03)
    }]
    # parameters we do not want to search over are fixed here
    # (note: 'deviance' was renamed to 'log_loss' in scikit-learn 1.1)
    params = {
        'max_depth': 4,
        'min_samples_split': 2,
        'loss': 'deviance',
        'verbose': 0
    }
    gbc = GradientBoostingClassifier(**params)
    # hand the hyperparameter grid and the model to GridSearchCV for an automatic search
    clf = GridSearchCV(gbc, param_grid, cv=5)
    clf.fit(data_sets, label_sets)
    # the best model found by the search
    best_model = clf.best_estimator_
    # inspect the winning hyperparameter configuration
    print(clf.best_params_)
    # best_score_ is the mean cross-validated score of best_estimator_
    # (the estimator's default scorer, i.e. accuracy for classifiers)
    print(clf.best_score_)
    return best_model
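# A runnable sketch of grid_search() on synthetic data; make_classification and
# the sizes below are illustrative choices, not part of the original code. The
# grid is 2 x 33 configurations x 5 folds, so expect it to take a little while.
from sklearn.datasets import make_classification

X_demo, y_demo = make_classification(n_samples=200, n_features=10, random_state=0)
best = grid_search(X_demo, y_demo)
print(best.get_params()["n_estimators"], best.get_params()["learning_rate"])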
def main(param=""):
    # obtain config
    if isinstance(param, str):
        param = JobConfig.load_from_file(param)
    data_guest = param["data_guest"]
    data_host = param["data_host"]
    idx = param["idx"]
    label_name = param["label_name"]

    # prepare data
    df_guest = pd.read_csv(data_guest, index_col=idx)
    df_host = pd.read_csv(data_host, index_col=idx)
    df = df_guest.join(df_host, rsuffix='host')
    y = df[label_name]
    X = df.drop(label_name, axis=1)

    clf = GradientBoostingClassifier(
        random_state=0,
        n_estimators=120 if 'epsilon' in data_guest else 50,
        learning_rate=0.1)
    clf.fit(X, y)
    # ROC AUC needs scores, not hard labels, so use predict_proba
    y_prob = clf.predict_proba(X)[:, 1]
    try:
        auc_score = roc_auc_score(y, y_prob)
    except ValueError:
        print("no auc score available")
        return
    result = {"auc": auc_score}
    print(result)
    return {}, result
def model_pred(trainX, trainY, testX, model_type):
    if model_type == "rf":
        clf = RandomForestClassifier(n_estimators=500, n_jobs=20)
        clf.fit(trainX, trainY)
        pred = clf.predict(testX)
    if model_type == "gbdt":
        clf = GradientBoostingClassifier(n_estimators=6, learning_rate=0.9, random_state=0)
        clf.fit(trainX, trainY)
        pred = clf.predict(testX)
    if model_type == "fusion":
        prob = np.zeros(len(testX))
        params = [100, 200, 300, 400, 500]
        for param in params:
            clf = RandomForestClassifier(n_estimators=param, n_jobs=20, bootstrap=True)
            clf.fit(trainX, trainY)
            prob += clf.predict(testX)
        '''
        params = [1,2,3,4,5,6,7,8,9,10]
        for param in params:
            clf = GradientBoostingClassifier(n_estimators=param,learning_rate=0.9,random_state=0)
            clf.fit(trainX,trainY)
            prob += clf.predict(testX)
        '''
        # majority vote: positive if at least 3 of the 5 forests predict 1
        pred = list(prob >= 3)
    print("the pos rate is:", float(sum(pred)) / len(pred))
    return pred
def gbdt_lr_train(libsvmFileName):
    # load the samples
    X_all, y_all = load_svmlight_file(libsvmFileName)
    # train/test split
    X_train, X_test, y_train, y_test = train_test_split(X_all, y_all,
                                                        test_size=0.3, random_state=42)
    # define and fit the GBDT model
    gbdt = GradientBoostingClassifier(n_estimators=40, max_depth=3, verbose=0, max_features=0.5)
    gbdt.fit(X_train, y_train)
    # predict and evaluate with AUC
    y_pred_gbdt = gbdt.predict_proba(X_test.toarray())[:, 1]
    gbdt_auc = roc_auc_score(y_test, y_pred_gbdt)
    print('gbdt auc: %.5f' % gbdt_auc)

    # LR trained on the raw features
    lr = LogisticRegression()
    lr.fit(X_train, y_train)
    y_pred_test = lr.predict_proba(X_test)[:, 1]
    lr_test_auc = roc_auc_score(y_test, y_pred_test)
    print('LR AUC on raw features: %.5f' % lr_test_auc)

    # encode the raw features with the GBDT: apply() returns, per tree, the
    # index of the leaf each sample lands in
    X_train_leaves = gbdt.apply(X_train)[:, :, 0]
    X_test_leaves = gbdt.apply(X_test)[:, :, 0]
    # one-hot encode all leaf indices
    (train_rows, cols) = X_train_leaves.shape
    gbdtenc = OneHotEncoder()
    X_trans = gbdtenc.fit_transform(np.concatenate((X_train_leaves, X_test_leaves), axis=0))

    # LR trained on the GBDT-encoded features
    lr = LogisticRegression()
    lr.fit(X_trans[:train_rows, :], y_train)
    y_pred_gbdtlr1 = lr.predict_proba(X_trans[train_rows:, :])[:, 1]
    gbdt_lr_auc1 = roc_auc_score(y_test, y_pred_gbdtlr1)
    print('LR AUC on GBDT-encoded features: %.5f' % gbdt_lr_auc1)

    # LR trained on raw + encoded features combined
    lr = LogisticRegression(n_jobs=-1)
    X_train_ext = hstack([X_trans[:train_rows, :], X_train])
    X_test_ext = hstack([X_trans[train_rows:, :], X_test])
    print(X_train_ext.shape)
    lr.fit(X_train_ext, y_train)
    y_pred_gbdtlr2 = lr.predict_proba(X_test_ext)[:, 1]
    gbdt_lr_auc2 = roc_auc_score(y_test, y_pred_gbdtlr2)
    print('LR AUC on combined features: %.5f' % gbdt_lr_auc2)
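# Self-contained sketch of the GBDT leaf-encoding idea used in gbdt_lr_train():
# apply() maps each sample to the leaf it falls into in every tree, and one-hot
# encoding those leaf ids yields a sparse feature matrix for a linear model.
# Data and sizes are synthetic.
import numpy as np
from sklearn.datasets import make_classification
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.preprocessing import OneHotEncoder

X, y = make_classification(n_samples=500, random_state=0)
gbdt = GradientBoostingClassifier(n_estimators=10, max_depth=3).fit(X, y)
leaves = gbdt.apply(X)[:, :, 0]            # shape (n_samples, n_trees)
encoded = OneHotEncoder().fit_transform(leaves)
print(leaves.shape, encoded.shape)         # e.g. (500, 10) -> (500, n_distinct_leaves) sparse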
def __init__(self, stats, data_node, ensemble_size: int, task_type: int,
             metric: _BaseScorer, output_dir=None, meta_learner='lightgbm', kfold=5):
    super().__init__(stats=stats,
                     data_node=data_node,
                     ensemble_method='stacking',
                     ensemble_size=ensemble_size,
                     task_type=task_type,
                     metric=metric,
                     output_dir=output_dir)
    self.kfold = kfold
    try:
        from lightgbm import LGBMClassifier
    except ImportError:
        warnings.warn("Lightgbm is not imported! Stacking will use linear model instead!")
        meta_learner = 'linear'
    self.meta_method = meta_learner

    # We use LightGBM as the default meta-learner
    if self.task_type in CLS_TASKS:
        if meta_learner == 'linear':
            from sklearn.linear_model import LogisticRegression
            self.meta_learner = LogisticRegression(max_iter=1000)
        elif meta_learner == 'gb':
            from sklearn.ensemble import GradientBoostingClassifier
            self.meta_learner = GradientBoostingClassifier(learning_rate=0.05,
                                                           subsample=0.7,
                                                           max_depth=4,
                                                           n_estimators=250)
        elif meta_learner == 'lightgbm':
            from lightgbm import LGBMClassifier
            self.meta_learner = LGBMClassifier(max_depth=4, learning_rate=0.05,
                                               n_estimators=150, n_jobs=1)
    else:
        if meta_learner == 'linear':
            from sklearn.linear_model import LinearRegression
            self.meta_learner = LinearRegression()
        elif meta_learner == 'lightgbm':
            from lightgbm import LGBMRegressor
            self.meta_learner = LGBMRegressor(max_depth=4, learning_rate=0.05,
                                              n_estimators=70, n_jobs=1)
def init_gbdt(self):
    if self.gbdt_name == 'xgboost':
        gbdt = XGBClassifier()
    elif self.gbdt_name == 'gbdt':
        gbdt = GradientBoostingClassifier()
    elif self.gbdt_name == 'lgb':
        gbdt = LGBMClassifier()
    else:
        # previously this printed a message and returned an unbound gbdt (NameError)
        raise ValueError('no valid gbdt model: %s' % self.gbdt_name)
    return gbdt
def gradientBoostingClassifier(X_train, y_train, X_dev, y_dev):
    print("\nPerforming Gradient Boosting.")
    gb = GradientBoostingClassifier(n_estimators=50, learning_rate=0.25,
                                    max_depth=5, random_state=0)
    gb.fit(X_train, y_train)
    y_pred = gb.predict(X_dev)
    accuracy = np.mean(y_dev == y_pred)
    print("Accuracy", accuracy)
    return gb, accuracy
def test_categorical_gb(n_samples=100000, n_features=10, p=0.7):
    y = numpy.random.random(n_samples) > 0.5
    X = numpy.random.randint(40, size=[n_samples, n_features]) * 2
    X += numpy.random.random(size=[n_samples, n_features]) > p
    X += y[:, numpy.newaxis]

    # sklearn.cross_validation was removed; train_test_split lives in model_selection
    from sklearn.model_selection import train_test_split
    trainX, testX, trainY, testY = train_test_split(X, y)

    boosters = {
        'old': GradientBoostingClassifier(n_estimators=100, min_samples_split=50, max_depth=5),
        'cat': CommonGradientBoosting(loss=AdaLossFunction(), subsample=0.5, dtype=int,
                                      base_estimator=CategoricalTreeRegressor()),
        'cat2': TreeGradientBoostingClassifier(
            loss=BinomialDeviance(), dtype='int', update_tree=False,
            base_estimator=SimpleCategoricalRegressor(n_features=2, n_attempts=3, method='cv')),
        'cat3': TreeGradientBoostingClassifier(
            loss=BinomialDeviance(), dtype='int', update_tree=False,
            base_estimator=ObliviousCategoricalRegressor(n_features=10, n_categories_power=5,
                                                         splits=1, pfactor=0.5)),
        'cat2-2': TreeGradientBoostingClassifier(
            loss=BinomialDeviance(), dtype='int', update_tree=False, n_threads=2,
            base_estimator=SimpleCategoricalRegressor(n_features=2, n_attempts=1)),
        'cat-linear': CategoricalLinearClassifier(),
    }
    for name, booster in boosters.items():
        start = time.time()
        booster.fit(trainX, trainY)
        auc = roc_auc_score(testY, booster.predict_proba(testX)[:, 1])
        print(name, "spent:{:3.2f} auc:{}".format(time.time() - start, auc))
def classify_gbc(data_sets, label_sets):
    params = {
        'n_estimators': 100,
        'max_depth': 4,
        'min_samples_split': 2,
        'learning_rate': 0.01,
        'loss': 'deviance',
        'verbose': 0
    }
    clf = GradientBoostingClassifier(**params)
    clf.fit(data_sets, label_sets)
    # print(clf.score(data_sets, label_sets))
    return clf
def __init__(self, data, label, task, model_name='lgb', eval_metric=None,
             importance_threshold=0.0):
    '''
    :param data: DataFrame
    :param label: label name
    :param task: task type, one of [regression, classification]
    :param model_name: one of ['gbdt', 'xgb', 'lgb']
    :param importance_threshold: drop features whose importance falls below this value
    '''
    self.data = data
    self.label = label
    self.task = task
    self.model_name = model_name
    self._importance_threshold = importance_threshold
    self.model = None
    # pick the evaluation metric from the task and the label cardinality
    self.eval_metric = None
    if model_name == 'lgb':
        if self.task == 'classification':
            self.model = lgb.LGBMClassifier(**lgb_params)
            if self.data[self.label].unique().shape[0] == 2:
                self.eval_metric = 'logloss'
            else:
                # multiclass logloss (the original set 'logloss' in both branches)
                self.eval_metric = 'multi_logloss'
        elif self.task == 'regression':
            self.model = lgb.LGBMRegressor(**lgb_params)
            self.eval_metric = 'l2'
        else:
            raise ValueError('Task must be either "classification" or "regression"')
    elif model_name == 'xgb':
        if self.task == 'classification':
            self.model = xgb.XGBClassifier(**xgb_params)
            if self.data[self.label].unique().shape[0] == 2:
                self.eval_metric = 'logloss'
            else:
                self.eval_metric = 'mlogloss'
        elif self.task == 'regression':
            self.model = xgb.XGBRegressor(**xgb_params)
            self.eval_metric = 'rmse'
        else:
            raise ValueError('Task must be either "classification" or "regression"')
    else:  # gbdt
        if self.task == 'classification':
            self.model = GradientBoostingClassifier(**gbdt_params)
        elif self.task == 'regression':
            self.model = GradientBoostingRegressor(**gbdt_params)
        else:
            raise ValueError('Task must be either "classification" or "regression"')
    # an explicitly passed eval_metric overrides the default
    # (the original condition `if not eval_metric` discarded the argument)
    if eval_metric:
        self.eval_metric = eval_metric
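# Usage sketch for the wrapper __init__ above on a toy DataFrame. Only __init__
# is shown in this snippet, so the class name FeatureSelector and the module-level
# lgb_params dict are assumptions, hence the call is left commented out.
import numpy as np
import pandas as pd

toy = pd.DataFrame(np.random.rand(100, 3), columns=['a', 'b', 'c'])
toy['target'] = np.random.randint(0, 2, 100)
# selector = FeatureSelector(toy, label='target', task='classification', model_name='lgb')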
def gridsearch(params):
    # note: the iid argument was removed from GridSearchCV in scikit-learn 0.24
    tuning = GridSearchCV(estimator=GradientBoostingClassifier(),
                          param_grid=params,
                          scoring='accuracy',
                          n_jobs=4,
                          cv=5)
    X_train, X_test, y_train, y_test = dataset()
    tuning.fit(X_train, y_train)
    best_params = tuning.best_params_
    score = tuning.score(X_train, y_train)
    print(score)
    print(best_params)
def models():
    # Building and Cross-Validating the model
    algorithms = []
    names = []
    algorithms.append(('GB_Classifier', GradientBoostingClassifier()))
    algorithms.append(('Random_Forest', RandomForestClassifier()))
    algorithms.append(('ExtraTree_Classifier', ExtraTreesClassifier()))
    algorithms.append(('LDA_Classifier', LinearDiscriminantAnalysis()))
    algorithms.append(('KNN_Classification', KNeighborsClassifier()))
    algorithms.append(('ANN_Classification', MLPClassifier()))
    for name, algo in algorithms:
        names.append(name)
    return algorithms, names
def _create_estimator(self):
    return GradientBoostingClassifier(
        loss=self.loss,
        learning_rate=self.learning_rate,
        n_estimators=self.n_estimators,
        min_samples_split=self.min_samples_split,
        min_samples_leaf=self.min_samples_leaf,
        min_weight_fraction_leaf=self.min_weight_fraction_leaf,
        max_depth=self.max_depth,
        init=self.init,
        subsample=self.subsample,
        max_features=self.max_features,
        random_state=self.random_state,
        verbose=self.verbose,
        max_leaf_nodes=self.max_leaf_nodes)
def get_feature_ranking(X_train, y_train):
    print("feature ranking running....-> LogisticRegression")
    model1 = LogisticRegression(max_iter=500)
    rfe = RFECV(estimator=model1, step=1, cv=StratifiedKFold(2), scoring='accuracy')
    rfe = rfe.fit(X_train, y_train)
    logr_ranking = []
    for x, d in zip(rfe.ranking_, X_train.columns):
        logr_ranking.append([d, x])
    logr_ranking = pd.DataFrame(logr_ranking, columns=['features1', 'logr'])
    logr_ranking.sort_values('features1', inplace=True)

    print("feature ranking running....-> GradientBoostingClassifier")
    model2 = GradientBoostingClassifier()
    rfe = RFECV(estimator=model2, step=1, cv=StratifiedKFold(2), scoring='accuracy')
    rfe = rfe.fit(X_train, y_train)
    gboost_ranking = []
    for x, d in zip(rfe.ranking_, X_train.columns):
        gboost_ranking.append([d, x])
    gboost_ranking = pd.DataFrame(gboost_ranking, columns=['features2', 'gboost'])
    gboost_ranking.sort_values('features2', inplace=True)

    print("feature ranking running....-> AdaBoostClassifier")
    model3 = AdaBoostClassifier()
    rfe = RFECV(estimator=model3, step=1, cv=StratifiedKFold(2), scoring='accuracy')
    rfe = rfe.fit(X_train, y_train)
    adaboost_ranking = []
    for x, d in zip(rfe.ranking_, X_train.columns):
        adaboost_ranking.append([d, x])
    adaboost_ranking = pd.DataFrame(adaboost_ranking, columns=['features3', 'adaboost'])
    adaboost_ranking.sort_values('features3', inplace=True)

    # Series addition aligns on the original index, so the per-feature sums
    # line up even after the sorts above
    feature_sum = logr_ranking['logr'] + gboost_ranking['gboost'] + adaboost_ranking['adaboost']
    df_ranked = pd.concat([logr_ranking['features1'], feature_sum], axis=1)
    df_ranked.sort_values(0, inplace=True)
    return df_ranked
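# Usage sketch for get_feature_ranking() on the breast cancer toy dataset (an
# illustrative choice); a lower summed rank means the feature was kept longer
# by all three RFECV runs. Expect the three RFECV fits to take a while.
from sklearn.datasets import load_breast_cancer
import pandas as pd

data = load_breast_cancer()
X_demo = pd.DataFrame(data.data, columns=data.feature_names)
ranked = get_feature_ranking(X_demo, data.target)
print(ranked.head())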
def __init__(self):
    n_estimators = 600
    max_depth = 3
    learning_rate = 0.01
    self.classifier = GradientBoostingClassifier(**{
        'verbose': 1,
        'n_estimators': n_estimators,
        'max_depth': max_depth,
        'learning_rate': learning_rate
    })
    self.name = "gb_n{n}_md{md}_lr{lr}".format(**{
        "n": n_estimators,
        "md": max_depth,
        "lr": learning_rate
    })
def __init__(self, stats, ensemble_size: int, task_type: int, metric: _BaseScorer,
             output_dir=None, meta_learner='xgboost'):
    super().__init__(stats=stats,
                     ensemble_method='blending',
                     ensemble_size=ensemble_size,
                     task_type=task_type,
                     metric=metric,
                     output_dir=output_dir)
    try:
        from xgboost import XGBClassifier
    except ImportError:
        warnings.warn("Xgboost is not imported! Blending will use linear model instead!")
        meta_learner = 'linear'

    # We use Xgboost as the default meta-learner
    if self.task_type in CLS_TASKS:
        if meta_learner == 'linear':
            from sklearn.linear_model import LogisticRegression
            self.meta_learner = LogisticRegression(max_iter=1000)
        elif meta_learner == 'gb':
            from sklearn.ensemble import GradientBoostingClassifier
            self.meta_learner = GradientBoostingClassifier(learning_rate=0.05,
                                                           subsample=0.7,
                                                           max_depth=4,
                                                           n_estimators=250)
        elif meta_learner == 'xgboost':
            from xgboost import XGBClassifier
            self.meta_learner = XGBClassifier(max_depth=4, learning_rate=0.05,
                                              n_estimators=150)
    else:
        if meta_learner == 'linear':
            from sklearn.linear_model import LinearRegression
            self.meta_learner = LinearRegression()
        elif meta_learner == 'xgboost':
            from xgboost import XGBRegressor
            self.meta_learner = XGBRegressor(max_depth=4, learning_rate=0.05,
                                             n_estimators=70)
def __init__(self, model_info, ensemble_size, task_type, metric, evaluator,
             model_type='ml', meta_learner='xgboost', kfold=3, save_dir=None,
             random_state=None):
    super().__init__(model_info=model_info,
                     ensemble_size=ensemble_size,
                     task_type=task_type,
                     metric=metric,
                     evaluator=evaluator,
                     model_type=model_type,
                     save_dir=save_dir,
                     random_state=random_state)
    self.kfold = kfold

    # We use Xgboost as the default meta-learner
    if self.task_type == CLASSIFICATION:
        if meta_learner == 'logistic':
            from sklearn.linear_model import LogisticRegression
            self.meta_learner = LogisticRegression(max_iter=1000)
        elif meta_learner == 'gb':
            from sklearn.ensemble import GradientBoostingClassifier
            self.meta_learner = GradientBoostingClassifier(learning_rate=0.05,
                                                           subsample=0.7,
                                                           max_depth=4,
                                                           n_estimators=250)
        elif meta_learner == 'xgboost':
            from xgboost import XGBClassifier
            self.meta_learner = XGBClassifier(max_depth=4, learning_rate=0.05,
                                              n_estimators=150)
    elif self.task_type == REGRESSION:
        if meta_learner == 'linear':
            from sklearn.linear_model import LinearRegression
            self.meta_learner = LinearRegression()
        elif meta_learner == 'xgboost':
            from xgboost import XGBRegressor
            self.meta_learner = XGBRegressor(max_depth=4, learning_rate=0.05,
                                             n_estimators=70)
def __init__(self, verbose=1, n_estimators=5, max_depth=6, min_samples_leaf=100):
    self.classifier = GradientBoostingClassifier(**{
        'verbose': verbose,
        'n_estimators': n_estimators,
        'max_depth': max_depth,
        'min_samples_leaf': min_samples_leaf
    })
    self.name = "gb_n{n}_md{md}_ms{ms}".format(**{
        "n": n_estimators,
        "md": max_depth,
        "ms": min_samples_leaf
    })
def main(config="../../config.yaml", param="./gbdt_config_binary.yaml"):
    # obtain config
    if isinstance(param, str):
        param = JobConfig.load_from_file(param)
    data_guest = param["data_guest"]
    data_host = param["data_host"]
    idx = param["idx"]
    label_name = param["label_name"]
    print('config is {}'.format(config))
    if isinstance(config, str):
        config = JobConfig.load_from_file(config)
        data_base_dir = config["data_base_dir"]
        print('data base dir is', data_base_dir)
    else:
        data_base_dir = config.data_base_dir

    # prepare data
    df_guest = pd.read_csv(os.path.join(data_base_dir, data_guest), index_col=idx)
    df_host = pd.read_csv(os.path.join(data_base_dir, data_host), index_col=idx)
    df = pd.concat([df_guest, df_host], axis=0)
    y = df[label_name]
    X = df.drop(label_name, axis=1)
    X_guest = df_guest.drop(label_name, axis=1)
    y_guest = df_guest[label_name]

    clf = GradientBoostingClassifier(
        n_estimators=120 if 'epsilon' in data_guest else 50,
        learning_rate=0.1)
    clf.fit(X, y)
    # ROC AUC expects scores, not hard labels, so use predict_proba
    y_prob = clf.predict_proba(X_guest)[:, 1]
    try:
        auc_score = roc_auc_score(y_guest, y_prob)
    except ValueError:
        print("no auc score available")
        return
    result = {"auc": auc_score}
    import time
    print(result)
    print(data_guest)
    time.sleep(3)
    return {}, result
def defaultModels(df_xmat, df_ymat_cat):
    #### representative common classifiers in sklearn ####
    classifiers = [
        GaussianNB(),
        LogisticRegression(max_iter=500),
        DecisionTreeClassifier(),
        KNeighborsClassifier(),
        SVC(kernel='rbf'),
        AdaBoostClassifier(),
        BaggingClassifier(),
        ExtraTreesClassifier(),
        GradientBoostingClassifier(),
        RandomForestClassifier(),
    ]
    cv = StratifiedKFold(n_splits=10)
    res = []
    for clf in classifiers:
        print('processing...' + str(clf)[:10])
        metrics_cv = []
        for train_index, test_index in cv.split(df_xmat.values, df_ymat_cat):
            X_train = df_xmat.iloc[train_index, :].values
            X_test = df_xmat.iloc[test_index, :].values
            y_train = [df_ymat_cat[i] for i in train_index]
            y_test = [df_ymat_cat[i] for i in test_index]
            clf.fit(X_train, y_train)
            metrics_cv.append(clf.score(X_test, y_test))
        res.append([
            str(clf)[:10],
            np.array(metrics_cv).mean(axis=0),
            np.array(metrics_cv).std(axis=0)
        ])
    return res
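# Usage sketch for defaultModels() with the iris dataset (an illustrative
# choice); the shapes follow what the function expects: features as a
# DataFrame, labels as a positionally indexable sequence.
from sklearn.datasets import load_iris
import pandas as pd

iris = load_iris()
df_x = pd.DataFrame(iris.data, columns=iris.feature_names)
scores = defaultModels(df_x, list(iris.target))
for name, mean_acc, std_acc in scores:
    print(name, round(mean_acc, 3), round(std_acc, 3))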
def apply_gradient_boosting(X_train_preprocessed, X_test_preprocessed, y_train, y_test):
    ## TODO: Testing Hyper Parameters and Cross Validation
    print('Applying Gradient Boosting')
    # Training the classifier
    classifier = GradientBoostingClassifier(n_estimators=100)
    classifier = classifier.fit(X_train_preprocessed, y_train)
    # Testing the classifier on Test Data
    y_test_pred = classifier.predict(X_test_preprocessed)
    # Compute Accuracy Score
    acc = accuracy_score(y_test, y_test_pred, normalize=True)
    print('The accuracy achieved by the Gradient Boosting Classifier Model is:', acc)
    return classifier, acc
def main(config="../../config.yaml", param="./gbdt_config_multi.yaml"):
    # obtain config
    if isinstance(param, str):
        param = JobConfig.load_from_file(param)
    data_guest = param["data_guest"]
    data_host = param["data_host"]
    idx = param["idx"]
    label_name = param["label_name"]
    print('config is {}'.format(config))
    if isinstance(config, str):
        config = JobConfig.load_from_file(config)
        data_base_dir = config["data_base_dir"]
        print('data base dir is', data_base_dir)
    else:
        data_base_dir = config.data_base_dir

    # prepare data
    df_guest = pd.read_csv(os.path.join(data_base_dir, data_guest), index_col=idx)
    df_host = pd.read_csv(os.path.join(data_base_dir, data_host), index_col=idx)
    df = df_guest.join(df_host, rsuffix='host')
    y = df[label_name]
    X = df.drop(label_name, axis=1)

    clf = GradientBoostingClassifier(random_state=0, n_estimators=50, learning_rate=0.3)
    clf.fit(X, y)
    y_pred = clf.predict(X)
    # roc_auc_score raises on multiclass hard labels, so fall back to accuracy
    try:
        auc_score = roc_auc_score(y, y_pred)
    except ValueError:
        print("no auc score available")
    acc = accuracy_score(y, y_pred)
    result = {"accuracy": acc}
    print('multi result', result)
    return {}, result
def classify_gbc(data_sets, label_sets):
    # params = {'n_estimators': 100, 'max_depth': 4, 'min_samples_split': 2,
    #           'learning_rate': 0.01, 'loss': 'deviance', 'verbose': 0}
    # grid-search the best hyperparameters for the GBC
    grid_search(data_sets, label_sets)
    # the best configuration found by the grid search: n_estimators=100, learning_rate=0.52
    params = {
        'n_estimators': 100,
        'max_depth': 4,
        'min_samples_split': 2,
        'learning_rate': 0.52,
        'loss': 'deviance',
        'verbose': 0
    }
    clf = GradientBoostingClassifier(**params)
    clf.fit(data_sets, label_sets)
    print(clf.score(data_sets, label_sets))
    return clf
def main(config="../../config.yaml", param="./gbdt_config_multi.yaml"):
    # obtain config
    if isinstance(param, str):
        param = JobConfig.load_from_file(param)
    data_guest = param["data_guest"]
    data_host = param["data_host"]
    idx = param["idx"]
    label_name = param["label_name"]
    print('config is {}'.format(config))
    if isinstance(config, str):
        config = JobConfig.load_from_file(config)
        data_base_dir = config["data_base_dir"]
        print('data base dir is', data_base_dir)
    else:
        data_base_dir = config.data_base_dir

    # prepare data
    df_guest = pd.read_csv(os.path.join(data_base_dir, data_guest), index_col=idx)
    df_host = pd.read_csv(os.path.join(data_base_dir, data_host), index_col=idx)
    df = pd.concat([df_guest, df_host], axis=0)
    y = df[label_name]
    X = df.drop(label_name, axis=1)
    X_guest = df_guest.drop(label_name, axis=1)
    y_guest = df_guest[label_name]

    clf = GradientBoostingClassifier(n_estimators=50, learning_rate=0.3)
    clf.fit(X, y)
    y_pred = clf.predict(X_guest)
    acc = accuracy_score(y_guest, y_pred)
    result = {"accuracy": acc}
    print(result)
    return {}, result
def __init__(self):
    self.random_rate = 33
    clf1 = SVC(C=1.0, random_state=33)
    clf2 = XGBClassifier(n_estimators=220, learning_rate=0.2, min_child_weight=2.3)
    clf3 = RandomForestClassifier(n_estimators=80, random_state=330, n_jobs=-1)
    clf4 = BaggingClassifier(n_estimators=40, random_state=101)
    clf5 = AdaBoostClassifier(n_estimators=70, learning_rate=1.5, random_state=33)
    clf6 = GradientBoostingClassifier(n_estimators=250, learning_rate=0.23, random_state=33)
    clf7 = XGBClassifier(n_estimators=100, learning_rate=0.12, min_child_weight=1)
    base_model = [
        ['svc', clf1],
        ['xgbc', clf2],
        ['rfc', clf3],
        ['bgc', clf4],
        ['adbc', clf5],
        ['gdbc', clf6]
    ]
    self.base_models = base_model
    self.XGB = clf7  # second-level learner
def gbdt_lr_train(self, Train_tab, Train_libsvm):
    # load the samples (note: the path is hard-coded; Train_tab / Train_libsvm are unused)
    X_all, y_all = load_svmlight_file("sample_libsvm_data.txt")
    # train/test split
    X_train, X_test, y_train, y_test = train_test_split(X_all, y_all,
                                                        test_size=0.1, random_state=42)
    # define and fit the GBDT model
    gbdt = GradientBoostingClassifier(n_estimators=40, max_depth=3, verbose=0, max_features=0.5)
    gbdt.fit(X_train, y_train)
    # encode the raw features with the GBDT leaf indices
    X_train_leaves = gbdt.apply(X_train)[:, :, 0]
    X_test_leaves = gbdt.apply(X_test)[:, :, 0]
    # one-hot encode all leaf indices
    (train_rows, cols) = X_train_leaves.shape
    gbdtenc = OneHotEncoder()
    X_trans = gbdtenc.fit_transform(np.concatenate((X_train_leaves, X_test_leaves), axis=0))
    # define the LR model
    lr = LogisticRegression(n_jobs=-1)
    # combine raw and encoded features
    X_train_ext = hstack([X_trans[:train_rows, :], X_train])
    X_test_ext = hstack([X_trans[train_rows:, :], X_test])
    # fit LR on the combined features
    lr.fit(X_train_ext, y_train)
    # persist the model
    filename = 'finalized_model.sav'
    pickle.dump(lr, open(filename, 'wb'))
    # load the model from disk, then predict
    loaded_model = pickle.load(open(filename, 'rb'))
    y_pred_gbdtlr2 = loaded_model.predict_proba(X_test_ext)[:, 1]
    print(y_pred_gbdtlr2)
def main(param=""):
    # obtain config
    if isinstance(param, str):
        param = JobConfig.load_from_file(param)
    data_guest = param["data_guest"]
    data_host = param["data_host"]
    data_test = param["data_test"]  # not used below
    idx = param["idx"]
    label_name = param["label_name"]

    # prepare data
    df_guest = pd.read_csv(data_guest, index_col=idx)
    df_host = pd.read_csv(data_host, index_col=idx)
    df = pd.concat([df_guest, df_host], axis=0)
    y = df[label_name]
    X = df.drop(label_name, axis=1)
    X_guest = df_guest.drop(label_name, axis=1)
    y_guest = df_guest[label_name]

    clf = GradientBoostingClassifier(
        n_estimators=120 if 'epsilon' in data_guest else 50,
        learning_rate=0.1)
    clf.fit(X, y)
    # ROC AUC expects scores, not hard labels, so use predict_proba
    y_prob = clf.predict_proba(X_guest)[:, 1]
    try:
        auc_score = roc_auc_score(y_guest, y_prob)
    except ValueError:
        print("no auc score available")
        return
    result = {"auc": auc_score}
    import time
    print(result)
    print(data_guest)
    time.sleep(3)
    return {}, result