class MyGradientBoostingClassifier(BaseClassifier):
    def __init__(self, verbose=1, n_estimators=5, max_depth=6,
                 min_samples_leaf=100):
        self.classifier = GradientBoostingClassifier(**{
            'verbose': verbose,
            'n_estimators': n_estimators,
            'max_depth': max_depth,
            'min_samples_leaf': min_samples_leaf
        })
        self.name = "gb_n{n}_md{md}_ms{ms}".format(**{
            "n": n_estimators,
            "md": max_depth,
            "ms": min_samples_leaf
        })

    def get_name(self):
        return self.name

    def fit(self, X, y):
        return self.classifier.fit(X, y)

    def predict_proba(self, X):
        return self.classifier.predict_proba(X)

    def get_feature_importances(self, feat_names):
        ipts = dict(zip(feat_names, self.classifier.feature_importances_))
        return ipts
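# Illustrative usage of the wrapper above (an addition, not from the original
# source; assumes BaseClassifier and the sklearn GradientBoostingClassifier
# import are in scope, and uses synthetic data purely for demonstration).
from sklearn.datasets import make_classification

X_demo, y_demo = make_classification(n_samples=200, n_features=5, random_state=0)
feat_names = ["f%d" % i for i in range(X_demo.shape[1])]
demo_clf = MyGradientBoostingClassifier(n_estimators=5, max_depth=3,
                                        min_samples_leaf=10)
demo_clf.fit(X_demo, y_demo)
print(demo_clf.get_name())                   # "gb_n5_md3_ms10"
print(demo_clf.predict_proba(X_demo).shape)  # (200, 2)
print(demo_clf.get_feature_importances(feat_names))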
def main(param=""): # obtain config if isinstance(param, str): param = JobConfig.load_from_file(param) data_guest = param["data_guest"] data_host = param["data_host"] data_test = param["data_test"] idx = param["idx"] label_name = param["label_name"] # prepare data # prepare data df_guest = pd.read_csv(data_guest, index_col=idx) df_host = pd.read_csv(data_host, index_col=idx) df_test = pd.read_csv(data_test, index_col=idx) df = pd.concat([df_guest, df_host], axis=0) y = df[label_name] X = df.drop(label_name, axis=1) X_guest = df_guest.drop(label_name, axis=1) y_guest = df_guest[label_name] clf = GradientBoostingClassifier( n_estimators=50, learning_rate=0.3, ) clf.fit(X, y) y_pred = clf.predict(X_guest) acc = accuracy_score(y_guest, y_pred) result = {"accuracy": acc} print(result) return {}, result
def main(param=""): # obtain config if isinstance(param, str): param = JobConfig.load_from_file(param) data_guest = param["data_guest"] data_host = param["data_host"] idx = param["idx"] label_name = param["label_name"] # prepare data df_guest = pd.read_csv(data_guest, index_col=idx) df_host = pd.read_csv(data_host, index_col=idx) df = df_guest.join(df_host, rsuffix='host') y = df[label_name] X = df.drop(label_name, axis=1) clf = GradientBoostingClassifier( random_state=0, n_estimators=120 if 'epsilon' in data_guest else 50, learning_rate=0.1) clf.fit(X, y) y_prob = clf.predict(X) try: auc_score = roc_auc_score(y, y_prob) except: print(f"no auc score available") return result = {"auc": auc_score} print(result) return {}, result
def training(processed_train_csv_file):
    processed_train_samples = pd.read_csv(processed_train_csv_file)
    processed_train_samples = processed_train_samples.replace(
        [np.inf, -np.inf], np.nan)
    processed_train_samples = processed_train_samples.fillna(value=0)
    processed_train_samples_index_lst = processed_train_samples.index.tolist()
    random.shuffle(processed_train_samples_index_lst)
    # .loc replaces the deprecated .ix indexer
    shuffled_train_samples = processed_train_samples.loc[
        processed_train_samples_index_lst]
    col_names = shuffled_train_samples.columns.tolist()
    col_names.remove("booking_bool")
    features = shuffled_train_samples[col_names].values
    labels = shuffled_train_samples["booking_bool"].values

    print("Training Random Forest Classifier")
    rf_classifier = RandomForestClassifier(n_estimators=150, verbose=2,
                                           min_samples_split=10)
    rf_classifier.fit(features, labels)
    print("Saving the Random Forest Classifier")
    data_io.save_model(rf_classifier, model_name="rf_classifier.pkl")

    print("Training Gradient Boosting Classifier")
    gb_classifier = GradientBoostingClassifier(n_estimators=150, verbose=2,
                                               learning_rate=0.1,
                                               min_samples_split=10)
    gb_classifier.fit(features, labels)
    print("Saving the Gradient Boosting Classifier")
    data_io.save_model(gb_classifier, model_name="gb_classifier.pkl")

    print("Training SGD Classifier")
    sgd_classifier = SGDClassifier(loss="modified_huber", verbose=2, n_jobs=-1)
    sgd_classifier.fit(features, labels)
    print("Saving the SGD Classifier")
    data_io.save_model(sgd_classifier, model_name="sgd_classifier.pkl")
class Boosting():
    '''Thin wrapper around a gradient boosting classifier that predicts
    sparse input in chunks to limit the peak memory of densification.'''

    def __init__(self):
        self.clf = GB()

    def fit(self, X, y):
        '''Fit the underlying classifier.

        :param X: training features
        :param y: training labels
        '''
        self.clf.fit(X, y)

    def predict(self, X):
        '''Predict labels for X, densifying one chunk at a time.

        :param X: sparse feature matrix
        :return: list of predicted labels
        '''
        # split the rows into ~sqrt(n) chunks so that only one chunk is
        # converted to a dense array at a time
        m = int(X.shape[0] ** 0.5)
        pred = []
        for i in range(m):
            pred.extend(self.clf.predict(
                X[i * X.shape[0] // m:(i + 1) * X.shape[0] // m].toarray()))
        return pred
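# Sanity check for the chunked predict above (an addition, not from the
# original source; assumes GB aliases sklearn's GradientBoostingClassifier
# and that scipy is available): chunked predictions over a sparse matrix
# should equal a single dense predict.
from scipy import sparse
from sklearn.datasets import make_classification

X_dense, y_demo = make_classification(n_samples=100, n_features=8, random_state=0)
booster = Boosting()
booster.fit(X_dense, y_demo)
chunked = booster.predict(sparse.csr_matrix(X_dense))
assert list(chunked) == list(booster.clf.predict(X_dense))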
def gradientBoostingClassifier(X_train, y_train, X_dev, y_dev):
    print("\nPerforming Gradient Boosting.")
    gb = GradientBoostingClassifier(n_estimators=50, learning_rate=0.25,
                                    max_depth=5, random_state=0)
    gb.fit(X_train, y_train)
    y_pred = gb.predict(X_dev)
    accuracy = np.mean(y_dev == y_pred)
    print("Accuracy", accuracy)
    return gb, accuracy
class Boosting():
    # TODO: documentation

    def __init__(self):
        self.clf = GB()

    def fit(self, X, y):
        self.clf.fit(X, y)

    def predict(self, X):
        # predict in ~sqrt(n) chunks so the sparse matrix is densified
        # one slice at a time
        m = int(X.shape[0] ** 0.5)
        pred = []
        for i in range(m):
            pred.extend(self.clf.predict(
                X[i * X.shape[0] // m:(i + 1) * X.shape[0] // m].toarray()))
        return pred
def do_training(processed_train_csv_file):
    ## Processed train samples reading
    # read saved processed train samples from the given csv file
    processed_train_samples = pd.read_csv(processed_train_csv_file)
    # inf to nan
    processed_train_samples = processed_train_samples.replace(
        [np.inf, -np.inf], np.nan)
    # nan to 0
    processed_train_samples = processed_train_samples.fillna(value=0)
    processed_train_samples_index_lst = processed_train_samples.index.tolist()
    # the rows were sorted earlier; shuffling them here works better
    random.shuffle(processed_train_samples_index_lst)
    # organize new train samples and targets (.loc replaces the
    # deprecated .ix indexer)
    shuffled_train_samples = processed_train_samples.loc[
        processed_train_samples_index_lst]
    col_names = shuffled_train_samples.columns.tolist()
    col_names.remove("booking_bool")
    features = shuffled_train_samples[col_names].values
    labels = shuffled_train_samples['booking_bool'].values

    # Model training
    # 1 Random Forest Classifier
    print("Training Random Forest Classifier")
    rf_classifier = RandomForestClassifier(n_estimators=150, verbose=2,
                                           n_jobs=-1, min_samples_split=10)
    rf_classifier.fit(features, labels)
    print("Saving the Random Forest Classifier")
    data_io.save_model(rf_classifier, model_name='rf_classifier.pkl')

    # 2 Gradient Boosting Classifier
    print("Training Gradient Boosting Classifier")
    gb_classifier = GradientBoostingClassifier(n_estimators=150, verbose=2,
                                               learning_rate=0.1,
                                               min_samples_split=10)
    gb_classifier.fit(features, labels)
    print("Saving the Gradient Boosting Classifier")
    data_io.save_model(gb_classifier, model_name='gb_classifier.pkl')

    # 3 SGD Classifier
    print("Training SGD Classifier")
    sgd_classifier = SGDClassifier(loss="modified_huber", verbose=2, n_jobs=-1)
    sgd_classifier.fit(features, labels)
    print("Saving the SGD Classifier")
    data_io.save_model(sgd_classifier, model_name='sgd_classifier.pkl')
def GN(pth):
    train_desc = np.load(pth + '/training_features.npy')
    nbr_occurences = np.sum((train_desc > 0) * 1, axis=0)
    # IDF weighting; image_paths, train_labels and img_classes are
    # expected to be defined at module level
    idf = np.array(np.log((1.0 * len(image_paths) + 1) /
                          (1.0 * nbr_occurences + 1)), 'float32')

    # Scaling the words
    stdSlr = StandardScaler().fit(train_desc)
    train_desc = stdSlr.transform(train_desc)

    modelGN = GradientBoostingClassifier(n_estimators=100, learning_rate=1.0,
                                         max_depth=1, random_state=0)
    modelGN.fit(train_desc, np.array(train_labels))
    joblib.dump((modelGN, img_classes, stdSlr), pth + "/gn-bof.pkl", compress=3)
    test(pth, "gn-")
def classify_gbc(data_sets, label_sets):
    params = {
        'n_estimators': 100,
        'max_depth': 4,
        'min_samples_split': 2,
        'learning_rate': 0.01,
        'loss': 'deviance',
        'verbose': 0
    }
    clf = GradientBoostingClassifier(**params)
    clf.fit(data_sets, label_sets)
    # print(clf.score(data_sets, label_sets))
    return clf
def fit(self, X, y=None):
    self._sklearn_model = SKLModel(**self._hyperparams)
    if y is not None:
        self._sklearn_model.fit(X, y)
    else:
        self._sklearn_model.fit(X)
    return self
def classification(model_name, samples, labels, rangex, rangey):
    samples = np.array(samples)
    labels = np.array(labels)

    # build the model
    models = {
        "KNN": KNeighborsClassifier(),
        "LDA": LinearDiscriminantAnalysis(),
        "NB": GaussianNB(),
        "TREE": DecisionTreeClassifier(),
        "RF": RandomForestClassifier(n_estimators=20),
        "SVM": SVC(gamma='scale'),
        "PERC": Perceptron(max_iter=2000),
        "GB": GradientBoostingClassifier()
    }
    model = models.get(model_name)

    # train the model
    model.fit(samples, labels)
    print("classifier ", model, " - created")

    # build the matrix of results using the model
    result = np.zeros([rangex, rangey])
    for x in range(rangex):
        for y in range(rangey):
            sample = np.array([x, y])
            result[x][y] = model.predict(sample.reshape(1, -1))
    return result
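# Hypothetical call of classification() above (illustration only, not from
# the original source): label a 10x10 grid from four hand-made 2-D points
# with the gradient boosting model.
demo_samples = [[2, 3], [3, 2], [8, 9], [9, 8]]
demo_labels = [0, 0, 1, 1]
surface = classification("GB", demo_samples, demo_labels, rangex=10, rangey=10)
print(surface.shape)  # (10, 10) matrix of predicted labels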
def grid_search(data_sets, label_sets):
    param_grid = [{
        'n_estimators': [10, 100],
        'learning_rate': np.arange(0.01, 1, 0.03)
    }]
    # parameters that are not searched can be fixed here
    params = {
        'max_depth': 4,
        'min_samples_split': 2,
        'loss': 'deviance',
        'verbose': 0
    }
    gbc = GradientBoostingClassifier(**params)
    # hand the hyperparameter grid and the model to GridSearchCV for
    # automatic search
    clf = GridSearchCV(gbc, param_grid, cv=5)
    clf.fit(data_sets, label_sets)
    # retrieve the best model found
    best_model = clf.best_estimator_
    # inspect the chosen hyperparameter configuration
    print(clf.best_params_)
    # best_score_ is the mean cross-validated score of the best model
    # (default scoring for a classifier is accuracy)
    print(clf.best_score_)
    return best_model
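# Usage sketch for grid_search() above (an addition; toy data purely for
# illustration). GridSearchCV refits the best configuration on the full data
# by default, so the returned estimator is ready to use.
from sklearn.datasets import make_classification

X_gs, y_gs = make_classification(n_samples=300, random_state=0)
best_gbc = grid_search(X_gs, y_gs)
print(best_gbc.score(X_gs, y_gs))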
def model_pred(trainX, trainY, testX, model_type):
    if model_type == "rf":
        clf = RandomForestClassifier(n_estimators=500, n_jobs=20)
        clf.fit(trainX, trainY)
        pred = clf.predict(testX)
    if model_type == "gbdt":
        clf = GradientBoostingClassifier(n_estimators=6, learning_rate=0.9,
                                         random_state=0)
        clf.fit(trainX, trainY)
        pred = clf.predict(testX)
    if model_type == "fusion":
        prob = np.zeros(len(testX))
        params = [100, 200, 300, 400, 500]
        for param in params:
            clf = RandomForestClassifier(n_estimators=param, n_jobs=20,
                                         bootstrap=True)
            clf.fit(trainX, trainY)
            prob += clf.predict(testX)
        '''
        params = [1, 2, 3, 4, 5, 6, 7, 8, 9, 10]
        for param in params:
            clf = GradientBoostingClassifier(n_estimators=param,
                                             learning_rate=0.9,
                                             random_state=0)
            clf.fit(trainX, trainY)
            prob += clf.predict(testX)
        '''
        # majority vote: keep a positive label when at least 3 of the
        # 5 forests predicted it
        pred = list(prob >= 3)
    print("the pos rate is:", float(sum(pred)) / len(pred))
    return pred
def trainGBT(requestsQ, responsesQ):
    while True:
        args = requestsQ.get()
        if args[0] == 'KILL':
            break
        vectors = args[1]
        # hyperparameters are expected in the order:
        # learningRate, maxTrees, minSplitSize, maxDepth
        hyperparams = args[2]
        model = GradientBoostingClassifier(learning_rate=hyperparams[0],
                                           n_estimators=hyperparams[1],
                                           min_samples_split=hyperparams[2],
                                           max_depth=hyperparams[3])
        model.fit(vectors['Xtrain'], vectors['Ytrain'])
        score = accuracy_score(vectors['Ytest'], model.predict(vectors['Xtest']))
        responsesQ.put((model, score), True)
    return 0
def __init__(self, verbose=1, n_estimators=200, max_depth=8,
             min_samples_leaf=10000):
    self.classifier = GradientBoostingClassifier(**{
        'verbose': verbose,
        'n_estimators': n_estimators,
        'max_depth': max_depth,
        'min_samples_leaf': min_samples_leaf
    })
    self.name = "gb_n{n}_md{md}_ms{ms}".format(**{
        "n": n_estimators,
        "md": max_depth,
        "ms": min_samples_leaf
    })
def __init__(self, verbose=1, n_estimators=5, max_depth=6,
             min_samples_leaf=100):
    self.classifier = GradientBoostingClassifier(**{
        'verbose': verbose,
        'n_estimators': n_estimators,
        'max_depth': max_depth,
        'min_samples_leaf': min_samples_leaf
    })
    self.name = "gb_n{n}_md{md}_ms{ms}".format(**{
        "n": n_estimators,
        "md": max_depth,
        "ms": min_samples_leaf
    })
def main(config="../../config.yaml", param="./gbdt_config_binary.yaml"): # obtain config if isinstance(param, str): param = JobConfig.load_from_file(param) data_guest = param["data_guest"] data_host = param["data_host"] idx = param["idx"] label_name = param["label_name"] print('config is {}'.format(config)) if isinstance(config, str): config = JobConfig.load_from_file(config) data_base_dir = config["data_base_dir"] print('data base dir is', data_base_dir) else: data_base_dir = config.data_base_dir # prepare data df_guest = pd.read_csv(os.path.join(data_base_dir, data_guest), index_col=idx) df_host = pd.read_csv(os.path.join(data_base_dir, data_host), index_col=idx) df = pd.concat([df_guest, df_host], axis=0) y = df[label_name] X = df.drop(label_name, axis=1) X_guest = df_guest.drop(label_name, axis=1) y_guest = df_guest[label_name] clf = GradientBoostingClassifier( n_estimators=120 if 'epsilon' in data_guest else 50, learning_rate=0.1) clf.fit(X, y) y_prob = clf.predict(X_guest) try: auc_score = roc_auc_score(y_guest, y_prob) except: print(f"no auc score available") return result = {"auc": auc_score} import time print(result) print(data_guest) time.sleep(3) return {}, result
def gbdt_lr_train(libsvmFileName):
    # load the sample data
    X_all, y_all = load_svmlight_file(libsvmFileName)

    # train/test split
    X_train, X_test, y_train, y_test = train_test_split(X_all, y_all,
                                                        test_size=0.3,
                                                        random_state=42)

    # define the GBDT model
    gbdt = GradientBoostingClassifier(n_estimators=40, max_depth=3, verbose=0,
                                      max_features=0.5)
    # fit the model
    gbdt.fit(X_train, y_train)

    # predict and evaluate AUC
    y_pred_gbdt = gbdt.predict_proba(X_test.toarray())[:, 1]
    gbdt_auc = roc_auc_score(y_test, y_pred_gbdt)
    print('gbdt auc: %.5f' % gbdt_auc)

    # train LR on the raw features
    lr = LogisticRegression()
    lr.fit(X_train, y_train)
    # predict and evaluate AUC
    y_pred_test = lr.predict_proba(X_test)[:, 1]
    lr_test_auc = roc_auc_score(y_test, y_pred_test)
    print('LR AUC on the raw features: %.5f' % lr_test_auc)

    # encode the raw features with GBDT leaf indices
    X_train_leaves = gbdt.apply(X_train)[:, :, 0]
    X_test_leaves = gbdt.apply(X_test)[:, :, 0]

    # one-hot encode the leaf indices
    (train_rows, cols) = X_train_leaves.shape
    gbdtenc = OneHotEncoder()
    X_trans = gbdtenc.fit_transform(
        np.concatenate((X_train_leaves, X_test_leaves), axis=0))

    # define the LR model
    lr = LogisticRegression()
    # train LR on the GBDT-encoded features
    lr.fit(X_trans[:train_rows, :], y_train)
    # predict and evaluate AUC
    y_pred_gbdtlr1 = lr.predict_proba(X_trans[train_rows:, :])[:, 1]
    gbdt_lr_auc1 = roc_auc_score(y_test, y_pred_gbdtlr1)
    print('LR AUC on the GBDT-encoded features: %.5f' % gbdt_lr_auc1)

    # define the LR model
    lr = LogisticRegression(n_jobs=-1)
    # combine the encoded and raw features
    X_train_ext = hstack([X_trans[:train_rows, :], X_train])
    X_test_ext = hstack([X_trans[train_rows:, :], X_test])
    print(X_train_ext.shape)
    # train LR on the combined features
    lr.fit(X_train_ext, y_train)
    # predict and evaluate AUC
    y_pred_gbdtlr2 = lr.predict_proba(X_test_ext)[:, 1]
    gbdt_lr_auc2 = roc_auc_score(y_test, y_pred_gbdtlr2)
    print('LR AUC on the combined features: %.5f' % gbdt_lr_auc2)
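# Note on gbdt.apply() as used above (added for clarity): for a binary
# GradientBoostingClassifier it returns leaf indices with shape
# (n_samples, n_estimators, 1), which is why the [:, :, 0] slice is taken
# before one-hot encoding, e.g.:
#
#   leaves = gbdt.apply(X_train)   # shape (n_samples, 40, 1) for 40 trees
#   leaves = leaves[:, :, 0]       # one leaf index per sample per tree
#   X_enc = OneHotEncoder().fit_transform(leaves)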
def __init__(self, stats, data_node, ensemble_size: int, task_type: int,
             metric: _BaseScorer, output_dir=None,
             meta_learner='lightgbm', kfold=5):
    super().__init__(stats=stats,
                     data_node=data_node,
                     ensemble_method='stacking',
                     ensemble_size=ensemble_size,
                     task_type=task_type,
                     metric=metric,
                     output_dir=output_dir)
    self.kfold = kfold
    try:
        from lightgbm import LGBMClassifier
    except ImportError:
        warnings.warn(
            "Lightgbm is not imported! Stacking will use a linear model instead!"
        )
        meta_learner = 'linear'
    self.meta_method = meta_learner

    # LightGBM is the default meta-learner
    if self.task_type in CLS_TASKS:
        if meta_learner == 'linear':
            from sklearn.linear_model import LogisticRegression
            self.meta_learner = LogisticRegression(max_iter=1000)
        elif meta_learner == 'gb':
            from sklearn.ensemble import GradientBoostingClassifier
            self.meta_learner = GradientBoostingClassifier(learning_rate=0.05,
                                                           subsample=0.7,
                                                           max_depth=4,
                                                           n_estimators=250)
        elif meta_learner == 'lightgbm':
            from lightgbm import LGBMClassifier
            self.meta_learner = LGBMClassifier(max_depth=4,
                                               learning_rate=0.05,
                                               n_estimators=150,
                                               n_jobs=1)
    else:
        if meta_learner == 'linear':
            from sklearn.linear_model import LinearRegression
            self.meta_learner = LinearRegression()
        elif meta_learner == 'lightgbm':
            from lightgbm import LGBMRegressor
            self.meta_learner = LGBMRegressor(max_depth=4,
                                              learning_rate=0.05,
                                              n_estimators=70,
                                              n_jobs=1)
def init_gbdt(self):
    if self.gbdt_name == 'xgboost':
        gbdt = XGBClassifier()
    elif self.gbdt_name == 'gbdt':
        gbdt = GradientBoostingClassifier()
    elif self.gbdt_name == 'lgb':
        gbdt = LGBMClassifier()
    else:
        # fall back to None so the return below never raises UnboundLocalError
        print('no valid gbdt model')
        gbdt = None
    return gbdt
def apply_gradient_boosting(X_train_preprocessed, X_test_preprocessed,
                            y_train, y_test):
    # TODO: test hyperparameters and add cross-validation
    print('Applying Gradient Boosting')

    # Training the classifier
    classifier = GradientBoostingClassifier(n_estimators=100)
    classifier = classifier.fit(X_train_preprocessed, y_train)

    # Testing the classifier on test data
    y_test_pred = classifier.predict(X_test_preprocessed)

    # Compute accuracy score
    acc = accuracy_score(y_test, y_test_pred, normalize=True)
    print('The accuracy achieved by the Gradient Boosting Classifier Model is:', acc)
    return classifier, acc
def main(config="../../config.yaml", param="./gbdt_config_multi.yaml"): # obtain config if isinstance(param, str): param = JobConfig.load_from_file(param) data_guest = param["data_guest"] data_host = param["data_host"] idx = param["idx"] label_name = param["label_name"] print('config is {}'.format(config)) if isinstance(config, str): config = JobConfig.load_from_file(config) data_base_dir = config["data_base_dir"] print('data base dir is', data_base_dir) else: data_base_dir = config.data_base_dir # prepare data df_guest = pd.read_csv(os.path.join(data_base_dir, data_guest), index_col=idx) df_host = pd.read_csv(os.path.join(data_base_dir, data_host), index_col=idx) df = df_guest.join(df_host, rsuffix='host') y = df[label_name] X = df.drop(label_name, axis=1) clf = GradientBoostingClassifier(random_state=0, n_estimators=50, learning_rate=0.3) clf.fit(X, y) y_pred = clf.predict(X) try: auc_score = roc_auc_score(y, y_pred) except: print(f"no auc score available") acc = accuracy_score(y, y_pred) result = {"accuracy": acc} print('multi result', result) return {}, result
def classify_gbc(data_sets, label_sets):
    # params = {'n_estimators': 100, 'max_depth': 4, 'min_samples_split': 2,
    #           'learning_rate': 0.01, 'loss': 'deviance', 'verbose': 0}
    # grid-search the best GBC hyperparameters
    grid_search(data_sets, label_sets)
    # best parameters found by the grid CV search: n_estimators=100,
    # learning_rate=0.52
    params = {
        'n_estimators': 100,
        'max_depth': 4,
        'min_samples_split': 2,
        'learning_rate': 0.52,
        'loss': 'deviance',
        'verbose': 0
    }
    clf = GradientBoostingClassifier(**params)
    clf.fit(data_sets, label_sets)
    print(clf.score(data_sets, label_sets))
    return clf
class MyGradientBoostingClassifier(BaseClassifier):
    def __init__(self, verbose=1, n_estimators=200, max_depth=8,
                 min_samples_leaf=10000):
        self.classifier = GradientBoostingClassifier(**{
            'verbose': verbose,
            'n_estimators': n_estimators,
            'max_depth': max_depth,
            'min_samples_leaf': min_samples_leaf
        })
        self.name = "gb_n{n}_md{md}_ms{ms}".format(**{
            "n": n_estimators,
            "md": max_depth,
            "ms": min_samples_leaf
        })

    def get_name(self):
        return self.name

    def fit(self, X, y):
        return self.classifier.fit(X, y)

    def predict_proba(self, X):
        return self.classifier.predict_proba(X)

    def get_feature_importances(self):
        return self.classifier.feature_importances_
class GradientBoostingClassifierImpl():
    def __init__(self, loss='deviance', learning_rate=0.1, n_estimators=100,
                 subsample=1.0, criterion='friedman_mse', min_samples_split=2,
                 min_samples_leaf=1, min_weight_fraction_leaf=0.0, max_depth=3,
                 min_impurity_decrease=0.0, min_impurity_split=None, init=None,
                 random_state=None, max_features=None, verbose=0,
                 max_leaf_nodes=None, warm_start=False, presort='auto',
                 validation_fraction=0.1, n_iter_no_change=None, tol=0.0001):
        self._hyperparams = {
            'loss': loss,
            'learning_rate': learning_rate,
            'n_estimators': n_estimators,
            'subsample': subsample,
            'criterion': criterion,
            'min_samples_split': min_samples_split,
            'min_samples_leaf': min_samples_leaf,
            'min_weight_fraction_leaf': min_weight_fraction_leaf,
            'max_depth': max_depth,
            'min_impurity_decrease': min_impurity_decrease,
            'min_impurity_split': min_impurity_split,
            'init': init,
            'random_state': random_state,
            'max_features': max_features,
            'verbose': verbose,
            'max_leaf_nodes': max_leaf_nodes,
            'warm_start': warm_start,
            'presort': presort,
            'validation_fraction': validation_fraction,
            'n_iter_no_change': n_iter_no_change,
            'tol': tol
        }

    def fit(self, X, y=None):
        self._sklearn_model = SKLModel(**self._hyperparams)
        if y is not None:
            self._sklearn_model.fit(X, y)
        else:
            self._sklearn_model.fit(X)
        return self

    def predict(self, X):
        return self._sklearn_model.predict(X)

    def predict_proba(self, X):
        return self._sklearn_model.predict_proba(X)
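# Illustrative round-trip with the wrapper above (an addition; assumes
# SKLModel aliases sklearn's GradientBoostingClassifier and an older
# scikit-learn release where the presort/min_impurity_split defaults are
# still accepted).
from sklearn.datasets import make_classification

X_impl, y_impl = make_classification(n_samples=150, random_state=0)
impl = GradientBoostingClassifierImpl(n_estimators=20, max_depth=2)
impl.fit(X_impl, y_impl)
print(impl.predict(X_impl)[:5])
print(impl.predict_proba(X_impl).shape)  # (150, 2)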
def test_categorical_gb(n_samples=100000, n_features=10, p=0.7):
    y = numpy.random.random(n_samples) > 0.5
    X = numpy.random.randint(40, size=[n_samples, n_features]) * 2
    X += numpy.random.random(size=[n_samples, n_features]) > p
    X += y[:, numpy.newaxis]

    # sklearn.cross_validation was removed; use sklearn.model_selection
    from sklearn.model_selection import train_test_split
    trainX, testX, trainY, testY = train_test_split(X, y)
    boosters = {
        'old': GradientBoostingClassifier(n_estimators=100,
                                          min_samples_split=50, max_depth=5),
        'cat': CommonGradientBoosting(loss=AdaLossFunction(),
                                      subsample=0.5, dtype=int,
                                      base_estimator=CategoricalTreeRegressor()),
        'cat2': TreeGradientBoostingClassifier(
            loss=BinomialDeviance(), dtype='int', update_tree=False,
            base_estimator=SimpleCategoricalRegressor(n_features=2,
                                                      n_attempts=3,
                                                      method='cv')),
        'cat3': TreeGradientBoostingClassifier(
            loss=BinomialDeviance(), dtype='int', update_tree=False,
            base_estimator=ObliviousCategoricalRegressor(n_features=10,
                                                         n_categories_power=5,
                                                         splits=1,
                                                         pfactor=0.5)),
        'cat2-2': TreeGradientBoostingClassifier(
            loss=BinomialDeviance(), dtype='int', update_tree=False,
            n_threads=2,
            base_estimator=SimpleCategoricalRegressor(n_features=2,
                                                      n_attempts=1)),
        'cat-linear': CategoricalLinearClassifier(),
    }
    for name, booster in boosters.items():
        start = time.time()
        booster.fit(trainX, trainY)
        auc = roc_auc_score(testY, booster.predict_proba(testX)[:, 1])
        print(name, "spent:{:3.2f} auc:{}".format(time.time() - start, auc))
def main(config="../../config.yaml", param="./gbdt_config_multi.yaml"): # obtain config if isinstance(param, str): param = JobConfig.load_from_file(param) data_guest = param["data_guest"] data_host = param["data_host"] idx = param["idx"] label_name = param["label_name"] print('config is {}'.format(config)) if isinstance(config, str): config = JobConfig.load_from_file(config) data_base_dir = config["data_base_dir"] print('data base dir is', data_base_dir) else: data_base_dir = config.data_base_dir # prepare data df_guest = pd.read_csv(os.path.join(data_base_dir, data_guest), index_col=idx) df_host = pd.read_csv(os.path.join(data_base_dir, data_host), index_col=idx) df = pd.concat([df_guest, df_host], axis=0) y = df[label_name] X = df.drop(label_name, axis=1) X_guest = df_guest.drop(label_name, axis=1) y_guest = df_guest[label_name] clf = GradientBoostingClassifier( n_estimators=50, learning_rate=0.3, ) clf.fit(X, y) y_pred = clf.predict(X_guest) acc = accuracy_score(y_guest, y_pred) result = {"accuracy": acc} print(result) return {}, result
def __init__(self, data, label, task, model_name='lgb', eval_metric=None,
             importance_threshold=0.0):
    '''
    :param data: DataFrame
    :param label: label column name
    :param task: task type, one of [regression, classification]
    :param model_name: one of ['gbdt', 'xgb', 'lgb']
    :param importance_threshold: features whose importance falls below
        this threshold are dropped
    '''
    self.data = data
    self.label = label
    self.task = task
    self.model_name = model_name
    self._importance_threshold = importance_threshold
    self.model = None
    # choose the eval metric from the task and the label values
    self.eval_metric = None
    if model_name == 'lgb':
        if self.task == 'classification':
            self.model = lgb.LGBMClassifier(**lgb_params)
            if self.data[self.label].unique().shape[0] == 2:
                self.eval_metric = 'logloss'
            else:
                # multi-class log loss for more than two classes
                self.eval_metric = 'multi_logloss'
        elif self.task == 'regression':
            self.model = lgb.LGBMRegressor(**lgb_params)
            self.eval_metric = 'l2'
        else:
            raise ValueError('Task must be either "classification" or "regression"')
    elif model_name == 'xgb':
        if self.task == 'classification':
            self.model = xgb.XGBClassifier(**xgb_params)
            if self.data[self.label].unique().shape[0] == 2:
                self.eval_metric = 'logloss'
            else:
                self.eval_metric = 'mlogloss'
        elif self.task == 'regression':
            self.model = xgb.XGBRegressor(**xgb_params)
            self.eval_metric = 'rmse'
        else:
            raise ValueError('Task must be either "classification" or "regression"')
    else:  # gbdt
        if self.task == 'classification':
            self.model = GradientBoostingClassifier(**gbdt_params)
        elif self.task == 'regression':
            self.model = GradientBoostingRegressor(**gbdt_params)
        else:
            raise ValueError('Task must be either "classification" or "regression"')
    # an explicitly passed eval_metric overrides the defaults above
    if eval_metric:
        self.eval_metric = eval_metric
def models():
    # Building and Cross-Validating the model
    algorithms = []
    names = []
    algorithms.append(('GB_Classifier', GradientBoostingClassifier()))
    algorithms.append(('Random_Forest', RandomForestClassifier()))
    algorithms.append(('ExtraTree_Classifier', ExtraTreesClassifier()))
    algorithms.append(('LDA_Classifier', LinearDiscriminantAnalysis()))
    algorithms.append(('KNN_Classification', KNeighborsClassifier()))
    algorithms.append(('ANN_Classification', MLPClassifier()))
    for name, algo in algorithms:
        names.append(name)
    return algorithms, names
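# Sketch of how the (name, model) pairs from models() might be
# cross-validated (an addition; evaluate_models and its arguments are
# assumptions, not part of the original source).
from sklearn.model_selection import cross_val_score

def evaluate_models(X, y, cv=5):
    algorithms, names = models()
    for name, algo in algorithms:
        scores = cross_val_score(algo, X, y, cv=cv)
        print("%s: %.3f (+/- %.3f)" % (name, scores.mean(), scores.std()))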
def gridsearch(params):
    # iid was removed from GridSearchCV in scikit-learn 0.24
    tuning = GridSearchCV(estimator=GradientBoostingClassifier(),
                          param_grid=params, scoring='accuracy',
                          n_jobs=4, cv=5)
    X_train, X_test, y_train, y_test = dataset()
    tuning.fit(X_train, y_train)
    best_params = tuning.best_params_
    score = tuning.score(X_train, y_train)
    print(score)
    print(best_params)
def _create_estimator(self):
    return GradientBoostingClassifier(
        loss=self.loss,
        learning_rate=self.learning_rate,
        n_estimators=self.n_estimators,
        min_samples_split=self.min_samples_split,
        min_samples_leaf=self.min_samples_leaf,
        min_weight_fraction_leaf=self.min_weight_fraction_leaf,
        max_depth=self.max_depth,
        init=self.init,
        subsample=self.subsample,
        max_features=self.max_features,
        random_state=self.random_state,
        verbose=self.verbose,
        max_leaf_nodes=self.max_leaf_nodes)
def gbm_model_train(self, train, targets, run_gs):
    if not run_gs:
        gbm0 = GradientBoostingClassifier(random_state=0)
        gbm0.fit(train, targets)
        cv_result = cross_val_score(gbm0, train, targets, cv=5)
        print('gradient boosting cross validation score is', cv_result.mean())
    else:
        # grid search CV with a random forest classifier
        rf = RandomForestClassifier(random_state=0)
        parameters = {
            'max_depth': [6, 8, 10],
            'n_estimators': [50, 100, 200, 400],
            'max_features': ['sqrt', 'auto', 'log2'],
            'min_samples_split': [3, 5, 10],
            'min_samples_leaf': [5, 10, 15],
            'bootstrap': [True, False],
            'criterion': ['gini', 'entropy']
        }
        grid_sear = GridSearchCV(rf, param_grid=parameters,
                                 scoring='accuracy', cv=10)
        grid = grid_sear.fit(train, targets)
        print(grid.best_score_)
        print(grid.best_params_)
def main(param=""): # obtain config if isinstance(param, str): param = JobConfig.load_from_file(param) data_guest = param["data_guest"] data_host = param["data_host"] data_test = param["data_test"] idx = param["idx"] label_name = param["label_name"] # prepare data df_guest = pd.read_csv(data_guest, index_col=idx) df_host = pd.read_csv(data_host, index_col=idx) df = pd.concat([df_guest, df_host], axis=0) y = df[label_name] X = df.drop(label_name, axis=1) X_guest = df_guest.drop(label_name, axis=1) y_guest = df_guest[label_name] clf = GradientBoostingClassifier( n_estimators=120 if 'epsilon' in data_guest else 50, learning_rate=0.1) clf.fit(X, y) y_prob = clf.predict(X_guest) try: auc_score = roc_auc_score(y_guest, y_prob) except: print(f"no auc score available") return result = {"auc": auc_score} import time print(result) print(data_guest) time.sleep(3) return {}, result
def get_feature_ranking(X_train, y_train):
    print("feature ranking running....-> LogisticRegression")
    model1 = LogisticRegression(max_iter=500)
    rfe = RFECV(estimator=model1, step=1, cv=StratifiedKFold(2),
                scoring='accuracy')
    rfe = rfe.fit(X_train, y_train)
    logr_ranking = []
    for x, d in zip(rfe.ranking_, X_train.columns):
        logr_ranking.append([d, x])
    logr_ranking = pd.DataFrame(logr_ranking, columns=['features1', 'logr'])
    logr_ranking.sort_values('features1', inplace=True)

    print("feature ranking running....-> GradientBoostingClassifier")
    model2 = GradientBoostingClassifier()
    rfe = RFECV(estimator=model2, step=1, cv=StratifiedKFold(2),
                scoring='accuracy')
    rfe = rfe.fit(X_train, y_train)
    gboost_ranking = []
    for x, d in zip(rfe.ranking_, X_train.columns):
        gboost_ranking.append([d, x])
    gboost_ranking = pd.DataFrame(gboost_ranking,
                                  columns=['features2', 'gboost'])
    gboost_ranking.sort_values('features2', inplace=True)

    print("feature ranking running....-> AdaBoostClassifier")
    model3 = AdaBoostClassifier()
    rfe = RFECV(estimator=model3, step=1, cv=StratifiedKFold(2),
                scoring='accuracy')
    rfe = rfe.fit(X_train, y_train)
    adaboost_ranking = []
    for x, d in zip(rfe.ranking_, X_train.columns):
        adaboost_ranking.append([d, x])
    adaboost_ranking = pd.DataFrame(adaboost_ranking,
                                    columns=['features3', 'adaboost'])
    adaboost_ranking.sort_values('features3', inplace=True)

    # sum the three per-model rankings (alignment is on the shared index)
    feature_sum = (logr_ranking['logr'] + gboost_ranking['gboost'] +
                   adaboost_ranking['adaboost'])
    df_ranked = pd.concat([logr_ranking['features1'], feature_sum], axis=1)
    df_ranked.sort_values(0, inplace=True)
    return df_ranked
def trainModel(param, feat_folder, feat_name):
    # read data from folder
    print('now we read data from folder: %s' % feat_folder)
    # start cross-validation
    print('now we need to generate cross_validation')
    accuracy_cv = []
    for i in range(0, 2):
        print('this is run %d of the cross-validation' % (i + 1))
        testIndex = loadCVIndex("%s/test.run%d.txt" % ("../data/feat/combine", i + 1))
        # if we use xgboost to train the model, we need svmlight-format input
        if param['task'] in ['regression']:
            # with xgb we dump one file per CV run and read the data back
            train_data = xgb.DMatrix("%s/run%d/train.svm.txt" % (feat_folder, i + 1))
            valid_data = xgb.DMatrix("%s/run%d/test.svm.txt" % (feat_folder, i + 1))
            watchlist = [(train_data, 'train'), (valid_data, 'valid')]
            bst = xgb.train(param, train_data, int(param['num_round']), watchlist)
            pred = bst.predict(valid_data)
        elif param['task'] in ['clf_skl_lr']:
            train_data, train_label = load_svmlight_file("%s/run%d/train.svm.txt" % (feat_folder, i + 1))
            test_data, test_label = load_svmlight_file("%s/run%d/test.svm.txt" % (feat_folder, i + 1))
            train_data = train_data.tocsr()
            test_data = test_data.tocsr()
            clf = LogisticRegression()
            clf.fit(train_data, train_label)
            pred = clf.predict(test_data)
        elif param['task'] == "reg_skl_rf":
            ## regression with sklearn random forest regressor
            train_data, train_label = load_svmlight_file("%s/run%d/train.svm.txt" % (feat_folder, i + 1))
            test_data, test_label = load_svmlight_file("%s/run%d/test.svm.txt" % (feat_folder, i + 1))
            rf = RandomForestRegressor(n_estimators=param['n_estimators'],
                                       max_features=param['max_features'],
                                       n_jobs=param['n_jobs'],
                                       random_state=param['random_state'])
            rf.fit(train_data, train_label)
            pred = rf.predict(test_data)
        elif param['task'] == "reg_skl_etr":
            ## regression with sklearn extra trees regressor
            train_data, train_label = load_svmlight_file("%s/run%d/train.svm.txt" % (feat_folder, i + 1))
            test_data, test_label = load_svmlight_file("%s/run%d/test.svm.txt" % (feat_folder, i + 1))
            etr = ExtraTreesRegressor(n_estimators=param['n_estimators'],
                                      max_features=param['max_features'],
                                      n_jobs=param['n_jobs'],
                                      random_state=param['random_state'])
            etr.fit(train_data, train_label)
            pred = etr.predict(test_data)
        elif param['task'] in ['reg_skl_gbm']:
            train_data, train_label = load_svmlight_file("%s/run%d/train.svm.txt" % (feat_folder, i + 1))
            test_data, test_label = load_svmlight_file("%s/run%d/test.svm.txt" % (feat_folder, i + 1))
            gbm = GradientBoostingClassifier(n_estimators=int(param['n_estimators']),
                                             learning_rate=param['learning_rate'],
                                             max_features=param['max_features'],
                                             max_depth=param['max_depth'],
                                             subsample=param['subsample'],
                                             random_state=param['random_state'])
            gbm.fit(train_data, train_label)
            pred = gbm.predict(test_data)
        elif param['task'] in ['reg_skl_ridge']:
            train_data, train_label = load_svmlight_file("%s/run%d/train.svm.txt" % (feat_folder, i + 1))
            test_data, test_label = load_svmlight_file("%s/run%d/test.svm.txt" % (feat_folder, i + 1))
            train_data = train_data.tocsr()
            test_data = test_data.tocsr()
            ridge = Ridge(alpha=param["alpha"], normalize=True)
            ridge.fit(train_data, train_label)
            predraw = ridge.predict(test_data)
            print(predraw)
            predrank = predraw.argsort().argsort()
            trainIndex = loadCVIndex("%s/train.run%d.txt" % ("../data/feat/combine", i + 1))
            cdf = creatCDF(train, trainIndex)
            pred = getScore(predrank, cdf)
            print(pred)
        """
        DataFrame-based variants kept for reference:
        elif param['task'] in ['reg_skl_gbm']:
            gbm = GradientBoostingClassifier(n_estimators=int(param['n_estimators']),
                                             learning_rate=param['learning_rate'],
                                             max_features=param['max_features'],
                                             max_depth=param['max_depth'],
                                             subsample=param['subsample'],
                                             random_state=param['random_state'])
            feat_names.remove('cid')
            gbm.fit(train_data[feat_names], train_data['cid'])
            pred = gbm.predict(valid_data[feat_names])
        elif param['task'] in ['reg_skl_ridge']:
            feat_names.remove('cid')
            ridge = Ridge(alpha=param["alpha"], normalize=True)
            ridge.fit(train_data[feat_names], train_data['cid'])
            pred = ridge.predict(valid_data[feat_names])
        """
        # now we use accuracy to evaluate the model
        acc = accuracy_model(pred, train.iloc[testIndex]['cid'])
        print("the model accuracy: %s" % acc)
        accuracy_cv.append(acc)
    # aggregate the CV results
    accuracy_cv_mean = np.mean(accuracy_cv)
    accuracy_cv_std = np.std(accuracy_cv)
    print('mean cross-validation accuracy: %.6f' % accuracy_cv_mean)
    return {'loss': -accuracy_cv_mean,
            'attachments': {'std': accuracy_cv_std},
            'status': STATUS_OK}
import math

import matplotlib.pyplot as plt
import numpy as np
import pandas

import output_coursera as coursera
from sklearn.metrics import log_loss
from sklearn.model_selection import train_test_split
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.ensemble import RandomForestClassifier

data = pandas.read_csv('gbm-data.csv')
X = data[data.columns[1:]].values
y = data[data.columns[0]].values
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.8,
                                                    random_state=241)
clf = GradientBoostingClassifier(n_estimators=250, random_state=241,
                                 verbose=True)
sigmoid = np.vectorize(lambda x: 1 / (1 + math.exp(-x)))
coursera.output('overfitting.txt', 'overfitting')
looses = {}


def plot_score(test_predictions, y_test, train_predictions, y_train, color,
               learning_rate):
    test_loss = [log_loss(y_test, pred) for pred in test_predictions]
    train_loss = [log_loss(y_train, pred) for pred in train_predictions]
    plt.plot(test_loss, color, linewidth=2)
    plt.plot(train_loss, color + '--', linewidth=2)
    looses[learning_rate] = test_loss
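# Hedged sketch of how plot_score is presumably driven (an addition; the
# learning-rate list is an assumption): fit the classifier per learning rate
# and turn the staged decision-function values into probabilities with the
# sigmoid defined above. staged_decision_function yields one score array per
# boosting iteration, so each loss list has n_estimators entries.
for learning_rate, color in [(0.2, 'g')]:
    clf.set_params(learning_rate=learning_rate)
    clf.fit(X_train, y_train)
    test_predictions = [sigmoid(s) for s in clf.staged_decision_function(X_test)]
    train_predictions = [sigmoid(s) for s in clf.staged_decision_function(X_train)]
    plot_score(test_predictions, y_test, train_predictions, y_train,
               color, learning_rate)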
# <codecell>
df2 = df[selected]
# <codecell>
X, y = shuffle(df2[possible_features], df2.bad)
offset = int(X.shape[0] * 0.9)
X_train, y_train = X[:offset], y[:offset]
X_test, y_test = X[offset:], y[offset:]
# <codecell>
# 'bdeviance' and LogOddsEstimator come from older scikit-learn releases
params = {'init': LogOddsEstimator(), 'n_estimators': 5, 'max_depth': 6,
          'learning_rate': 0.1, 'loss': 'bdeviance'}
clf = GradientBoostingClassifier(**params)
# <codecell>
clf = clf.fit(X_train, y_train)
predicted = clf.predict(X_test)
# <codecell>
clf.feature_importances_
# <codecell>
print("Mean Squared Error")
mse = mean_squared_error(y_test, predicted)
print("MSE: %.4f" % mse)
features_data_count = X.count()
missing = features_data_count[features_data_count < matches_count]
missing = missing.apply(
    lambda x: "missing {} of {}".format(matches_count - x, matches_count))
print(missing)
X = X.fillna(0)

# ===================== GradientBoosting ==============================
size = 0
score = 0
for forest_size in [10, 20, 30, 50, 150, 300]:
    start_time = datetime.datetime.now()
    clf = GradientBoostingClassifier(n_estimators=forest_size)
    # old cross_validation-style KFold; in modern scikit-learn use
    # KFold(n_splits=5, shuffle=True) instead
    k_folder = KFold(X.shape[0], n_folds=5, shuffle=True)
    scores = cross_val_score(clf, X=X, y=y, cv=k_folder, scoring='roc_auc')
    current_score = np.mean(scores)
    print("for {} trees mean score has been {} and time elapsed {}".format(
        forest_size, current_score, datetime.datetime.now() - start_time))
    if score < current_score:
        score = current_score
        size = forest_size
print("best score was for {} forest size: {}".format(size, score))

# =================== LogisticRegression =================
features = X

def train_logistic(features, target, label):
    scaler = StandardScaler()
    features = scaler.fit_transform(features)