Example #1
def __init__(self):
    super().__init__()
    self.pipeline = Pipeline([
        ('bow', CoV(ngram_range=(2, 3))),      # bag-of-words over 2- and 3-grams
        # ('tfidf', TFT()),                    # tf-idf step currently disabled
        ('naive bayes', GBC(n_estimators=10))  # step name is historical; the estimator is gradient boosting
    ])
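The aliases in this snippet are not shown in the excerpt; a minimal sketch of the imports it presumably relies on (CoV, TFT, and GBC are assumptions inferred from the step names):

# Assumed aliases; not part of the original excerpt.
from sklearn.pipeline import Pipeline
from sklearn.feature_extraction.text import CountVectorizer as CoV
from sklearn.feature_extraction.text import TfidfTransformer as TFT
from sklearn.ensemble import GradientBoostingClassifier as GBC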
Example #2
def profit_curve_main(filepath, cost_benefit):
    """Main function to test profit curve code.

    Parameters
    ----------
    filepath     : str - path to find churn.csv
    cost_benefit : ndarray - 2D, with profit values corresponding to:
                                          -----------
                                          | TP | FP |
                                          -----------
                                          | FN | TN |
                                          -----------
    """
    X_train, X_test, y_train, y_test = get_train_test(filepath)
    models = [RF(), LR(), GBC(), SVC(probability=True)]
    model_profits = []
    for model in models:
        profits, thresholds = get_model_profits(model, cost_benefit, X_train,
                                                X_test, y_train, y_test)
        model_profits.append((model, profits, thresholds))
    plot_model_profits(model_profits)
    max_model, max_thresh, max_profit = find_best_threshold(model_profits)
    max_labeled_positives = max_model.predict_proba(X_test)[:, 1] >= max_thresh
    proportion_positives = max_labeled_positives.mean()
    reporting_string = ('Best model:\t\t{}\n'
                        'Best threshold:\t\t{:.2f}\n'
                        'Resulting profit:\t{}\n'
                        'Proportion positives:\t{:.2f}')
    print(
        reporting_string.format(max_model.__class__.__name__, max_thresh,
                                max_profit, proportion_positives))
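For context, a hypothetical call with an illustrative cost-benefit matrix, laid out as in the docstring ([[TP, FP], [FN, TN]]); the dollar values are made up:

import numpy as np

# Example economics: correctly flagging a churner is worth $79 minus a $20
# retention offer; offering to a non-churner wastes $20; doing nothing is free.
cost_benefit = np.array([[59, -20],
                         [0, 0]])
profit_curve_main('data/churn.csv', cost_benefit)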
Example #3
def create_model(self):
    # Candidate grid for a later hyperparameter search.
    self.tuned_parameters = {
        'learning_rate': [0.1, 0.15, 0.05, 0.01],
        'n_estimators': [100, 200, 300, 400],
        'max_depth': [3, 4, 5, 6]
    }
    # n_iter_no_change=15 enables early stopping once the validation score
    # stops improving for 15 consecutive iterations.
    self.model = GBC(n_iter_no_change=15)
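self.tuned_parameters suggests a later hyperparameter search; one plausible way it gets consumed (an assumption, the search itself is not in this excerpt):

# Hypothetical follow-up, e.g. inside another method of the same class:
from sklearn.model_selection import GridSearchCV

search = GridSearchCV(self.model, self.tuned_parameters, cv=5)
# search.fit(X, y) would then evaluate all 4 * 4 * 4 = 64 combinations.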
Example #4
def retrain(data="raw_games.csv",
            learning_rate=.1,
            n_estimators=100,
            max_depth=100):
    df = pd.read_csv("data/" + data)
    # Feature columns are stored under the string keys "0" through "13".
    X = df[[str(x) for x in range(14)]]
    y = df['winner'].apply(winner)
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.1)
    try:
        # Reuse a previously pickled model if one exists; the file must be
        # opened for reading ('rb') to load it.
        with open("data/GBC_model_500.pkl", 'rb') as f:
            model = pickle.load(f)
        model.set_params(warm_start=True)
        model.fit(X_train, y_train)
    except (OSError, EOFError, pickle.UnpicklingError):
        # No usable saved model: train one from scratch.
        model = GBC(learning_rate=learning_rate,
                    n_estimators=n_estimators,
                    max_depth=max_depth,
                    warm_start=False,
                    verbose=1).fit(X_train, y_train)
    with open("data/GBC_model_500.pkl", 'wb') as f:
        pickle.dump(model, f)
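A note on warm_start: refitting a GradientBoostingClassifier with warm_start=True only adds trees when n_estimators is raised between fits, so the reload branch above is most useful combined with something like this sketch:

# Grow the reloaded ensemble by 100 additional trees:
model.set_params(warm_start=True,
                 n_estimators=model.n_estimators + 100)
model.fit(X_train, y_train)  # fits only the new trees; earlier ones are kept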
Example #5
def model(X_train, y_train, X_test=None, y_test=None, method="LR"):
    # X_train - model inputs for training
    # X_test  - model inputs for testing
    # y_train - model targets for training (single-column DataFrame)
    # y_test  - model targets for testing (single-column DataFrame)
    # method  - which machine-learning model to use

    if method == "LR":
        lr = LR()
    elif method == "KNC":
        lr = KNC()
    elif method == "RFC":
        lr = RFC()
    elif method == "GBC":
        lr = GBC()
    elif method == "DTC":
        lr = DTC()
    elif method == "MLPClassifier":
        lr = MLPClassifier()
    elif method == "LinearSVC":
        lr = LinearSVC()
    elif method == "SVC":
        lr = SVC()
    else:
        print("unknown method")
        return False

    # Test data supplied: fit on the training split and score both splits.
    if X_test is not None and y_test is not None:
        lr = lr.fit(X_train, y_train.iloc[:, 0])
        y_mod_train = lr.predict(X_train)
        y_mod_test = lr.predict(X_test)
        # average - how the F-score is aggregated (micro, macro, weighted, samples)
        f1_train = f1_score(y_train.iloc[:, 0], y_mod_train, average='macro')
        f1_test = f1_score(y_test.iloc[:, 0], y_mod_test, average='macro')
        out = {
            "model": lr,
            "f1_train": f1_train,
            "f1_test": f1_test,
            "y_mod_train": y_mod_train,
            "y_mod_test": y_mod_test
        }
        scores_train = cross_val_score(lr,
                                       X_train,
                                       y_train.iloc[:, 0],
                                       cv=5,
                                       scoring='f1_macro')
        for i in range(len(scores_train)):
            out["cros" + str(i)] = scores_train[i]
        return out

    else:
        # Only training data supplied: return the mean cross-validated score.
        scores_train = cross_val_score(lr,
                                       X_train,
                                       y_train.iloc[:, 0],
                                       cv=5,
                                       scoring='f1_macro')
        return np.mean(scores_train)
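A hypothetical usage sketch; targets are wrapped in single-column DataFrames because the function indexes them with .iloc[:, 0]:

import pandas as pd
from sklearn.datasets import make_classification

X, y = make_classification(n_samples=200, random_state=0)
Xtr, ytr = pd.DataFrame(X[:150]), pd.DataFrame(y[:150])
Xte, yte = pd.DataFrame(X[150:]), pd.DataFrame(y[150:])
out = model(Xtr, ytr, Xte, yte, method="GBC")
print(out["f1_train"], out["f1_test"])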
Example #6
def gbmcv(n_estimators, max_depth, min_samples_leaf, min_samples_split,
          max_features='log2', seed=2):
    # Hyperparameters may arrive as floats (e.g. from an optimiser), so cast to int.
    return cross_val_score(GBC(n_estimators=int(n_estimators),
                               max_depth=int(max_depth),
                               min_samples_split=int(min_samples_split),
                               min_samples_leaf=int(min_samples_leaf),
                               max_features=max_features,
                               random_state=seed),
                           X, y, scoring='roc_auc', cv=10).mean()
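The float-to-int casts and the scalar return suggest this is an objective for a Bayesian optimiser; a sketch assuming the bayes_opt package (the actual optimiser is not shown in the excerpt):

from bayes_opt import BayesianOptimization

optimizer = BayesianOptimization(gbmcv, {
    'n_estimators': (50, 500),
    'max_depth': (2, 10),
    'min_samples_leaf': (1, 20),
    'min_samples_split': (2, 20),
})
optimizer.maximize(n_iter=25)  # proposes float settings; gbmcv casts them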
Example #7
def train(xTrain, yTrain, metric):
    print('boosting')
    global gboost
    gboost = GBC()
    gboost.fit(xTrain, yTrain)
    global trainResults
    # Probability of the positive class on the training set.
    trainResults = gboost.predict_proba(xTrain)[:, 1]
    i.setSuccess(trainResults, metric)
Example #8
def separateScale(self):
    # Min-max scale each feature family separately.
    minmaxCT = ColumnTransformer([
        ("wordFreq", MinMaxScaler(), slice(self.trueIndex(0), self.trueIndex(48))),
        ("charFreq", MinMaxScaler(), slice(self.trueIndex(48), self.trueIndex(54))),
        ("continuousCapital", MinMaxScaler(), slice(self.trueIndex(54), self.trueIndex(56))),
        ("longestCapital", MinMaxScaler(), [self.trueIndex(56)])
    ])
    return make_pipeline(minmaxCT,
                         GBC(loss='deviance',  # renamed to 'log_loss' in scikit-learn 1.1
                             learning_rate=0.3, n_estimators=50))
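The 48/6/2/1 column split matches the UCI Spambase feature layout (48 word frequencies, 6 character frequencies, capital-run statistics), which is presumably the dataset behind this class. ColumnTransformer accepts slice objects as column selectors, which is what makes the trueIndex bookkeeping work; a minimal self-contained sketch without the index indirection:

import numpy as np
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import MinMaxScaler

X = np.random.rand(10, 57)  # spambase-style: 57 numeric features
ct = ColumnTransformer([
    ("wordFreq", MinMaxScaler(), slice(0, 48)),
    ("charFreq", MinMaxScaler(), slice(48, 54)),
    ("capital", MinMaxScaler(), slice(54, 57)),
])
Xt = ct.fit_transform(X)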
Example #9
def gradient_boost(train_examples, train_labels, test_examples, test_labels,
                   verbose):
    # Fixed hyperparameters; the verbose flag is accepted but unused here.
    model = GBC(learning_rate=0.1,
                n_estimators=500,
                subsample=1.0,
                min_samples_split=2,
                max_depth=3)
    model.fit(train_examples, train_labels)
    # Mean accuracy on the held-out test split.
    score = model.score(test_examples, test_labels)
    return score
Example #10
    def __init__(self):
        '''
        This constructor is supposed to initialize data members.
        Use triple quotes for function documentation.
        '''
        self.num_train_samples = 0
        self.num_feat = 1
        self.num_labels = 1
        self.is_trained = False
        #self.mod = MultinomialNB()

        self.mod = GBC()
Example #11
def GBCpredictor(X_train, y_train, X_test):
    '''Input: training data, target, and test data.
    Output: predicted labels for the test data.'''
    from sklearn.ensemble import GradientBoostingClassifier as GBC
    from sklearn.model_selection import GridSearchCV

    # Tune the main boosting hyperparameters with 5-fold cross-validation.
    model = GBC(random_state=seed)
    param_grid = {
        "n_estimators": [50, 100, 200],
        "learning_rate": [0.05, 0.1, 0.2],
        "max_depth": [2, 3, 4],
        "subsample": [0.5, 1],
        "min_samples_split": [2, 4]
    }

    grid_search = GridSearchCV(model, param_grid, cv=5)
    grid_search.fit(X_train, y_train)
    best_parameters = grid_search.best_estimator_.get_params()
    print(best_parameters)

    # Rebuild with the best parameters, including subsample, which was part of
    # the grid as well.
    model = GBC(n_estimators=best_parameters['n_estimators'],
                learning_rate=best_parameters['learning_rate'],
                max_depth=best_parameters['max_depth'],
                subsample=best_parameters['subsample'],
                min_samples_split=best_parameters["min_samples_split"],
                random_state=seed)

    model.fit(X_train, y_train)
    y_pred = model.predict(X_train)
    accuracy = metrics.accuracy_score(y_train, y_pred)
    # log_loss expects class probabilities, not hard label predictions.
    logLoss = metrics.log_loss(y_train, model.predict_proba(X_train))

    y_pred = model.predict(X_test)
    modelName = model.__class__.__name__
    accModels[modelName] = accuracy
    predictions[modelName] = y_pred

    return y_pred, accuracy
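A design note: GridSearchCV with refit=True (the default) already refits the best configuration on the full training set, so the manual rebuild above could be shortened to:

# Equivalent shortcut to the manual re-instantiation:
model = grid_search.best_estimator_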
Example #12
def gbc_objf(superParams, Xtrain=None, Ytrain=None, Xtest=None, Ytest=None):
    '''
    Objective (fitness) function for a GBDT classification model.
    '''
    # superParams arrives as a flat array from the optimiser.
    max_depth = int(superParams[0])
    subsample = superParams[1]
    min_samples_leaf = int(superParams[2])
    min_samples_split = int(superParams[3])
    mdl = GBC(max_depth=max_depth,
              subsample=subsample,
              min_samples_leaf=min_samples_leaf,
              min_samples_split=min_samples_split)
    mdl = mdl.fit(Xtrain, Ytrain)
    Ypre = mdl.predict(Xtest)
    error = 1 - metrics.accuracy_score(Ytest, Ypre)
    return error
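The flat superParams array is the interface many optimisers expect; a sketch of minimising this objective with scipy's differential evolution (an assumption, the original optimiser is not shown; Xtr, ytr, Xte, yte are placeholder splits):

from functools import partial
from scipy.optimize import differential_evolution

objf = partial(gbc_objf, Xtrain=Xtr, Ytrain=ytr, Xtest=Xte, Ytest=yte)
bounds = [(2, 8),      # max_depth
          (0.5, 1.0),  # subsample
          (1, 20),     # min_samples_leaf
          (2, 20)]     # min_samples_split
result = differential_evolution(objf, bounds, maxiter=10)
print(result.x, 1 - result.fun)  # best settings and their accuracy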
Example #13
    def fit(self, params):
        self.model = GBC(learning_rate=params['learning_rate'],
                         subsample=params['subsample'],
                         n_estimators=params['n_estimators'],
                         max_depth=params['max_depth'],
                         random_state=params['random_state'])
        self.model.fit(self.data[self.attributes], self.data['interest_level'])

        try:
            print(
                "Feature Importance:",
                pd.DataFrame({
                    'attribute': self.feature_selector.attributes,
                    'importance': self.model.feature_importances_
                }).sort_values('importance'))
        except AttributeError:
            pass
Example #14
def gbr_objf(superParams, Xtrain=None, Ytrain=None, Xtest=None, Ytest=None,
             **kwargs):
    '''
    Objective (fitness) function for a GBDT regression model.
    '''
    max_depth = int(superParams[0])
    subsample = superParams[1]
    min_samples_leaf = int(superParams[2])
    min_samples_split = int(superParams[3])
    # This reuses the GBC alias; given the MAPE score below, the alias
    # presumably points at the gradient-boosting regressor in the source module.
    mdl = GBC(max_depth=max_depth,
              subsample=subsample,
              min_samples_leaf=min_samples_leaf,
              min_samples_split=min_samples_split, **kwargs)
    mdl = mdl.fit(Xtrain, Ytrain)
    Ypre = mdl.predict(Xtest)
    vMAPE = mape(Ytest, Ypre)
    return vMAPE
Example #15
def r_search(x, y):
    # Parameter distributions for randomized search.
    lr_params = {'penalty': ['l1', 'l2'], 'C': sp_rand(1e-5, .1)}
    svm_params = {'kernel': ['rbf', 'linear'], 'C': sp_rand(10, 1e5)}
    rf_params = {
        'criterion': ['gini', 'entropy'],
        'n_estimators': sp_randint(50, 200),
        'bootstrap': [True, False]
    }
    gbc_params = {
        'learning_rate': sp_rand(1e-6, 1e-1),
        'n_estimators': sp_randint(50, 200),
        'loss': ['deviance', 'exponential']
    }

    data = {}
    xs, ys = balanced_subsample(x, y)
    lst = [LR(verbose=1), RF(verbose=1), SVM(verbose=True), GBC(verbose=1)]
    names = ['LR', 'RF', 'SVM', 'GB']
    params = [lr_params, rf_params, svm_params, gbc_params]
    for idx in range(len(lst)):
        n_iter_search = 60
        start = time.time()
        rsearch = random_search(estimator=lst[idx],
                                param_distributions=params[idx],
                                n_iter=n_iter_search,
                                scoring='roc_auc',
                                fit_params=None,
                                n_jobs=1,
                                iid=True,
                                refit=True,
                                cv=5,
                                verbose=0,
                                random_state=8)
        rsearch.fit(xs, ys)
        data[names[idx]] = rsearch.cv_results_
        print(names[idx] + " results complete.")
        print("RandomizedSearchCV took %.2f seconds for %d candidate"
              " parameter settings." % ((time.time() - start), n_iter_search))
    return data
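The fit_params and iid arguments were removed from scikit-learn's search estimators in newer releases (fit parameters are now passed to fit directly), so on current versions the call would be trimmed; a sketch assuming random_search is an alias for RandomizedSearchCV:

from sklearn.model_selection import RandomizedSearchCV

rsearch = RandomizedSearchCV(estimator=lst[idx],
                             param_distributions=params[idx],
                             n_iter=n_iter_search,
                             scoring='roc_auc',
                             n_jobs=1,
                             refit=True,
                             cv=5,
                             random_state=8)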