def __init__(self):
    """Initialise the parent class and assemble the estimator pipeline.

    The pipeline has two active steps: a ``CoV`` vectoriser configured with
    ``ngram_range=(2, 3)`` followed by a ``GBC`` with 10 estimators.
    """
    super().__init__()
    steps = [
        ('bow', CoV(ngram_range=(2, 3))),
        # ('tfidf', TFT()),  # optional step, currently disabled
        # The step label says 'naive bayes' but the estimator is a GBC; the
        # label is kept because it is part of the pipeline's parameter names.
        ('naive bayes', GBC(n_estimators=10)),
    ]
    self.pipeline = Pipeline(steps)
def profit_curve_main(filepath, cost_benefit):
    """Main function to test profit curve code.

    Computes profit curves for several classifiers, plots them, selects the
    best (model, threshold) pair and prints a short report.

    Parameters
    ----------
    filepath : str - path to find churn.csv
    cost_benefit : ndarray - 2D, with profit values corresponding to:
                                      -----------
                                      | TP | FP |
                                      -----------
                                      | FN | TN |
                                      -----------
    """
    X_train, X_test, y_train, y_test = get_train_test(filepath)
    models = [RF(), LR(), GBC(), SVC(probability=True)]

    model_profits = []
    for model in models:
        profits, thresholds = get_model_profits(model, cost_benefit,
                                                X_train, X_test,
                                                y_train, y_test)
        model_profits.append((model, profits, thresholds))

    plot_model_profits(model_profits)
    max_model, max_thresh, max_profit = find_best_threshold(model_profits)

    # predict_proba returns one column per class. Only the positive-class
    # column may be thresholded; comparing the whole matrix would average the
    # negative-class column into the "proportion positives" figure.
    positive_probs = max_model.predict_proba(X_test)[:, 1]
    max_labeled_positives = positive_probs >= max_thresh
    proportion_positives = max_labeled_positives.mean()

    reporting_string = ('Best model:\t\t{}\n'
                        'Best threshold:\t\t{:.2f}\n'
                        'Resulting profit:\t{}\n'
                        'Proportion positives:\t{:.2f}')
    print(reporting_string.format(max_model.__class__.__name__, max_thresh,
                                  max_profit, proportion_positives))
def create_model(self):
    """Define the hyper-parameter search grid and the base estimator.

    Stores the candidate values for ``learning_rate``, ``n_estimators`` and
    ``max_depth`` in ``self.tuned_parameters`` and a ``GBC`` configured with
    ``n_iter_no_change=15`` in ``self.model``.
    """
    self.tuned_parameters = dict(
        learning_rate=[0.1, 0.15, 0.05, 0.01],
        n_estimators=[100, 200, 300, 400],
        max_depth=[3, 4, 5, 6],
    )
    self.model = GBC(n_iter_no_change=15)
def retrain(data="raw_games.csv", learning_rate=.1, n_estimators=100, max_depth=100):
    """Train (or continue training) the game classifier and pickle it to disk.

    The CSV ``data/<data>`` is read, columns ``'0'``..``'13'`` are used as
    features and ``winner(...)`` applied to the ``'winner'`` column gives the
    target. If a previously saved model can be loaded it is refit with
    ``warm_start=True``; otherwise a fresh ``GBC`` is trained from the given
    hyper-parameters. The resulting model is written back to
    ``data/GBC_model_500.pkl``.

    Parameters
    ----------
    data : str
        File name of the CSV inside the ``data/`` directory.
    learning_rate, n_estimators, max_depth
        Hyper-parameters used only when a fresh model has to be built.
    """
    model_path = "data/GBC_model_500.pkl"

    df = pd.read_csv("data/" + data)
    X = df[[str(col) for col in range(14)]]
    y = df['winner'].apply(winner)
    # Only the 90% training part of the split is used here.
    X_train, _, y_train, _ = train_test_split(X, y, test_size=0.1)

    try:
        # The file must be opened for reading: opening it with 'wb' truncated
        # the saved model, so loading always failed and the model was always
        # rebuilt from scratch.
        # NOTE(security): pickle.load can execute arbitrary code; this is only
        # acceptable because the file is produced by this function itself.
        with open(model_path, 'rb') as f:
            model = pickle.load(f)
        model.set_params(warm_start=True)
        # NOTE(review): with warm_start=True scikit-learn only adds trees when
        # n_estimators exceeds the number already fitted -- confirm whether
        # n_estimators should be raised before refitting.
        model.fit(X_train, y_train)
    except Exception:
        # Best-effort fallback (missing/corrupt file, incompatible model):
        # train a new model instead of aborting.
        model = GBC(learning_rate=learning_rate,
                    n_estimators=n_estimators,
                    max_depth=max_depth,
                    warm_start=False,
                    verbose=1).fit(X_train, y_train)

    with open(model_path, 'wb') as f:
        pickle.dump(model, f)
def model(X_train, y_train, X_test=None, y_test=None, method="LR"):
    """Fit or cross-validate a classifier selected by name.

    Parameters
    ----------
    X_train
        Model inputs used for training.
    y_train
        Training targets; the first column (``y_train.iloc[:, 0]``) is used
        as the label vector.
    X_test
        Model inputs used for testing. ``None`` or a list means "no test set".
    y_test
        Testing targets. ``None`` or a list means "no test set".
    method : str
        Machine-learning model to use: ``"LR"``, ``"KNC"``, ``"RFC"``,
        ``"GBC"``, ``"DTC"``, ``"MLPClassifier"``, ``"LinearSVC"`` or
        ``"SVC"``.

    Returns
    -------
    dict, float or bool
        * With a test set: a dict holding the fitted model, macro F1 on the
          train and test data, both prediction vectors and one ``"cros<i>"``
          entry per cross-validation fold.
        * Without a test set: the mean 5-fold macro-F1 cross-validation score.
        * ``False`` when ``method`` is not recognised.
    """
    # Zero-argument factories keep the name lookup lazy, so only the
    # estimator that is actually requested has to be importable.
    factories = {
        "LR": lambda: LR(),
        "KNC": lambda: KNC(),
        "RFC": lambda: RFC(),
        "GBC": lambda: GBC(),
        "DTC": lambda: DTC(),
        "MLPClassifier": lambda: MLPClassifier(),
        "LinearSVC": lambda: LinearSVC(),
        "SVC": lambda: SVC(),
    }
    factory = factories.get(method)
    if factory is None:
        print("unknown method")
        return False
    lr = factory()

    def _absent(value):
        # Lists are still treated as "not supplied" for backward
        # compatibility with the former ``[]`` defaults.
        return value is None or isinstance(value, list)

    labels = y_train.iloc[:, 0]

    if _absent(X_test) or _absent(y_test):
        # No test set: compute the cross-validation score only.
        scores_train = cross_val_score(lr, X_train, labels, cv=5,
                                       scoring='f1_macro')
        return np.mean(scores_train)

    lr = lr.fit(X_train, labels)
    y_mod_train = lr.predict(X_train)
    y_mod_test = lr.predict(X_test)
    # ``average`` selects how the F-measure is aggregated
    # (micro, macro, weighted, samples).
    f1_train = f1_score(y_train, y_mod_train, average='macro')
    f1_test = f1_score(y_test, y_mod_test, average='macro')
    out = {
        "model": lr,
        "f1_train": f1_train,
        "f1_test": f1_test,
        "y_mod_train": y_mod_train,
        "y_mod_test": y_mod_test,
    }
    scores_train = cross_val_score(lr, X_train, labels, cv=5,
                                   scoring='f1_macro')
    for fold, fold_score in enumerate(scores_train):
        out["cros" + str(fold)] = fold_score
    return out
def gbmcv(n_estimators, max_depth, min_samples_leaf, min_samples_split, max_features='log2', seed=2):
    """Return the mean 10-fold cross-validated ROC AUC of a ``GBC``.

    The classifier is evaluated on the module-level ``X`` and ``y``. The four
    numeric hyper-parameters are truncated to ``int`` before use, so callers
    may pass floats.

    Parameters
    ----------
    n_estimators, max_depth, min_samples_leaf, min_samples_split
        Tree-ensemble hyper-parameters (converted with ``int``).
    max_features
        Passed through to the classifier unchanged.
    seed
        ``random_state`` of the classifier.
    """
    estimator = GBC(n_estimators=int(n_estimators),
                    max_depth=int(max_depth),
                    min_samples_split=int(min_samples_split),
                    min_samples_leaf=int(min_samples_leaf),
                    max_features=max_features,
                    random_state=seed)
    # scoring must be passed by keyword: in sklearn.model_selection the
    # fourth positional parameter of cross_val_score is ``groups``, not
    # ``scoring``, so a positional 'roc_auc' would not select the metric.
    return cross_val_score(estimator, X, y, scoring='roc_auc', cv=10).mean()
def train(xTrain, yTrain, metric):
    """Fit the module-level gradient boosting model and record train scores.

    Side effects: rebinds the globals ``gboost`` (the fitted ``GBC``) and
    ``trainResults`` (column 1 of ``predict_proba`` on the training data),
    then forwards ``trainResults`` and ``metric`` to ``i.setSuccess``.

    Parameters
    ----------
    xTrain, yTrain
        Training inputs and targets handed to ``GBC.fit``.
    metric
        Passed through unchanged to ``i.setSuccess``.
    """
    global gboost, trainResults
    # Call syntax instead of the Python 2 print statement: prints the same
    # text on Python 2 and is valid syntax on Python 3.
    print('goosting')
    gboost = GBC()
    gboost.fit(xTrain, yTrain)
    trainResults = gboost.predict_proba(xTrain)[:, 1]
    i.setSuccess(trainResults, metric)
def separateScale(self):
    """Build a pipeline that min-max scales four column groups, then boosts.

    Each group gets its own ``MinMaxScaler``; the column positions are
    translated through ``self.trueIndex``. The scaled output feeds a ``GBC``
    (``loss='deviance'``, ``learning_rate=0.3``, ``n_estimators=50``).

    Returns
    -------
    The pipeline produced by ``make_pipeline``.
    """
    word_cols = slice(self.trueIndex(0), self.trueIndex(48))
    char_cols = slice(self.trueIndex(48), self.trueIndex(54))
    capital_cols = slice(self.trueIndex(54), self.trueIndex(56))
    longest_col = [self.trueIndex(56)]

    scaler = ColumnTransformer([
        ("wordFreq", MinMaxScaler(), word_cols),
        ("charFreq", MinMaxScaler(), char_cols),
        ("continuousCapital", MinMaxScaler(), capital_cols),
        ("longestCapital", MinMaxScaler(), longest_col),
    ])
    booster = GBC(loss='deviance', learning_rate=0.3, n_estimators=50)
    return make_pipeline(scaler, booster)
def gradient_boost(train_examples, train_labels, test_examples, test_labels, verbose):
    """Train a ``GBC`` with fixed hyper-parameters and score it on test data.

    Parameters
    ----------
    train_examples, train_labels
        Data the classifier is fitted on.
    test_examples, test_labels
        Data handed to the classifier's ``score`` method.
    verbose
        Accepted but not used by this function.

    Returns
    -------
    The value returned by ``score`` on the test data.
    """
    hyper_params = dict(learning_rate=0.1,
                        n_estimators=500,
                        subsample=1.0,
                        min_samples_split=2,
                        max_depth=3)
    classifier = GBC(**hyper_params)
    classifier.fit(train_examples, train_labels)
    return classifier.score(test_examples, test_labels)
def __init__(self):
    """Initialise the bookkeeping fields and the underlying estimator.

    Sets the sample counter to 0, the feature and label counts to 1, marks
    the instance as untrained and stores a default ``GBC`` in ``self.mod``.
    """
    self.num_train_samples, self.num_feat, self.num_labels = 0, 1, 1
    self.is_trained = False
    # A MultinomialNB() estimator was used here previously.
    self.mod = GBC()
def GBCpredictor(X_train, y_train, X_test):
    """Grid-search a gradient boosting classifier and predict ``X_test``.

    A 5-fold ``GridSearchCV`` selects ``n_estimators``, ``learning_rate``,
    ``max_depth``, ``subsample`` and ``min_samples_split``. A classifier with
    the selected values is then fitted on the full training data.

    Side effects: prints the best estimator's parameters and stores the
    training accuracy and the test predictions in the module-level
    ``accModels`` and ``predictions`` dicts under the model's class name.
    Relies on the module-level ``seed`` and ``metrics``.

    Parameters
    ----------
    X_train, y_train
        Training data and target.
    X_test
        Data to predict.

    Returns
    -------
    tuple
        ``(y_pred, accuracy)``: the labels predicted for ``X_test`` (the
        output of ``predict``, not probabilities) and the accuracy measured
        on the training data.
    """
    from sklearn.ensemble import GradientBoostingClassifier as GBC
    from sklearn.model_selection import GridSearchCV

    param_grid = {
        "n_estimators": [50, 100, 200],
        "learning_rate": [0.05, 0.1, 0.2],
        "max_depth": [2, 3, 4],
        "subsample": [0.5, 1],
        "min_samples_split": [2, 4],
    }
    grid_search = GridSearchCV(GBC(random_state=seed), param_grid, cv=5)
    grid_search.fit(X_train, y_train)
    best_parameters = grid_search.best_estimator_.get_params()
    print(best_parameters)

    # Every tuned parameter is forwarded, including ``subsample``; leaving
    # one out would train a different model than the one the search chose.
    model = GBC(n_estimators=best_parameters['n_estimators'],
                learning_rate=best_parameters['learning_rate'],
                max_depth=best_parameters['max_depth'],
                subsample=best_parameters['subsample'],
                min_samples_split=best_parameters['min_samples_split'],
                random_state=seed)
    model.fit(X_train, y_train)

    accuracy = metrics.accuracy_score(y_train, model.predict(X_train))
    y_pred = model.predict(X_test)

    modelName = model.__class__.__name__
    accModels[modelName] = accuracy
    predictions[modelName] = y_pred
    return y_pred, accuracy
def gbc_objf(superParams, Xtrain=None, Ytrain=None, Xtest=None, Ytest=None):
    """Objective (fitness) function for tuning a GBDT classification model.

    Parameters
    ----------
    superParams
        Indexable hyper-parameter vector: ``[0]`` max_depth, ``[1]``
        subsample, ``[2]`` min_samples_leaf, ``[3]`` min_samples_split.
        Entries 0, 2 and 3 are converted with ``int``.
    Xtrain, Ytrain
        Data the classifier is fitted on.
    Xtest, Ytest
        Data the error is measured on.

    Returns
    -------
    ``1 - accuracy`` of the fitted classifier on the test data.
    """
    settings = {
        'max_depth': int(superParams[0]),
        'subsample': superParams[1],
        'min_samples_leaf': int(superParams[2]),
        'min_samples_split': int(superParams[3]),
    }
    fitted = GBC(**settings).fit(Xtrain, Ytrain)
    predicted = fitted.predict(Xtest)
    return 1 - metrics.accuracy_score(Ytest, predicted)
def fit(self, params):
    """Fit a ``GBC`` on the stored data and print feature importances.

    Parameters
    ----------
    params
        Mapping that must provide ``'learning_rate'``, ``'subsample'``,
        ``'n_estimators'``, ``'max_depth'`` and ``'random_state'``.

    The model is trained on ``self.data[self.attributes]`` against
    ``self.data['interest_level']`` and stored in ``self.model``.
    """
    wanted = ('learning_rate', 'subsample', 'n_estimators', 'max_depth',
              'random_state')
    self.model = GBC(**{key: params[key] for key in wanted})

    features = self.data[self.attributes]
    target = self.data['interest_level']
    self.model.fit(features, target)

    try:
        importance_table = pd.DataFrame({
            'attribute': self.feature_selector.attributes,
            'importance': self.model.feature_importances_
        }).sort_values('importance')
        print("Feature Importance:", importance_table)
    except AttributeError:
        # Best-effort report: skipped silently when an attribute it needs
        # is missing.
        pass
def gbr_objf(superParams, Xtrain=None, Ytrain=None, Xtest=None, Ytest=None, **kwargs):
    """Objective (fitness) function for tuning a GBDT regression model.

    NOTE(review): the name and description say "regression", but the
    estimator built below is ``GBC`` -- confirm whether a regressor was
    intended.

    Parameters
    ----------
    superParams
        Indexable hyper-parameter vector: ``[0]`` max_depth, ``[1]``
        subsample, ``[2]`` min_samples_leaf, ``[3]`` min_samples_split.
        Entries 0, 2 and 3 are converted with ``int``.
    Xtrain, Ytrain
        Data the estimator is fitted on.
    Xtest, Ytest
        Data the error is measured on.
    **kwargs
        Extra keyword arguments forwarded to the estimator constructor.

    Returns
    -------
    The value of ``mape(Ytest, predictions)``.
    """
    depth = int(superParams[0])
    row_fraction = superParams[1]
    leaf_min = int(superParams[2])
    split_min = int(superParams[3])

    estimator = GBC(max_depth=depth,
                    subsample=row_fraction,
                    min_samples_leaf=leaf_min,
                    min_samples_split=split_min,
                    **kwargs)
    predicted = estimator.fit(Xtrain, Ytrain).predict(Xtest)
    return mape(Ytest, predicted)
def r_search(x, y):
    """Run a randomized hyper-parameter search for four classifiers.

    The data is first passed through ``balanced_subsample``. For each of the
    LR, RF, SVM and GB estimators a ``random_search`` with 60 iterations,
    5-fold CV and ``'roc_auc'`` scoring is fitted, and its timing is printed.

    Parameters
    ----------
    x, y
        Inputs and targets handed to ``balanced_subsample``.

    Returns
    -------
    dict
        Maps ``'LR'``, ``'RF'``, ``'SVM'`` and ``'GB'`` to the corresponding
        search's ``cv_results_``.
    """
    # Search spaces, one per estimator.
    lr_params = {'penalty': ['l1', 'l2'], 'C': sp_rand(1e-5, .1)}
    svm_params = {'kernel': ['rbf', 'linear'], 'C': sp_rand(10, 1e5)}
    rf_params = {
        'criterion': ['gini', 'entropy'],
        'n_estimators': sp_randint(50, 200),
        'bootstrap': [True, False]
    }
    gbc_params = {
        'learning_rate': sp_rand(1e-6, 1e-1),
        'n_estimators': sp_randint(50, 200),
        'loss': ['deviance', 'exponential']
    }

    results = {}
    xs, ys = balanced_subsample(x, y)

    estimators = [LR(verbose=1), RF(verbose=1), SVM(verbose=True), GBC(verbose=1)]
    labels = ['LR', 'RF', 'SVM', 'GB']
    spaces = [lr_params, rf_params, svm_params, gbc_params]
    n_iter_search = 60

    for label, estimator, space in zip(labels, estimators, spaces):
        started = time.time()
        rsearch = random_search(estimator=estimator,
                                param_distributions=space,
                                n_iter=n_iter_search,
                                scoring='roc_auc',
                                fit_params=None,
                                n_jobs=1,
                                iid=True,
                                refit=True,
                                cv=5,
                                verbose=0,
                                random_state=8)
        rsearch.fit(xs, ys)
        results[label] = rsearch.cv_results_
        print(label + " results complete.")
        elapsed = time.time() - started
        print("RandomizedSearchCV took %.2f seconds for %d candidates"
              " parameter settings." % (elapsed, n_iter_search))
    return results