Example #1
def main():
    X = make_X(200)
    target = make_target(X)

    real_labels(X, target)

    clf_list = [
        # nn.MLPClassifier(hidden_layer_sizes=(2,), random_state=0),
        # nn.MLPClassifier(hidden_layer_sizes=(3,), random_state=0),
        # nn.MLPClassifier(hidden_layer_sizes=(4,), random_state=0),
        # nn.MLPClassifier(hidden_layer_sizes=(10,), random_state=0),
        lgbm.LGBMClassifier(n_estimators=200, random_state=0),
        xgb.XGBClassifier(n_estimators=200, max_depth=5, random_state=0),

        # nn.MLPClassifier(hidden_layer_sizes=(200,)),
        # nn.MLPClassifier(hidden_layer_sizes=(300,)),
        # nn.MLPClassifier(hidden_layer_sizes=(200, 100)),
        # xgb.XGBClassifier(n_estimators=30, max_depth=3),
        # xgb.XGBClassifier(n_estimators=5, max_depth=3),
        # ensemble.AdaBoostClassifier(n_estimators=30, random_state=0)
    ]
    for clf in clf_list:
        prd_labels(X, target, clf)

    plt.show()
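The snippet above relies on module aliases defined outside the excerpt; a plausible import header that matches the names used (an assumption, since the original file's imports are not shown):

import matplotlib.pyplot as plt
import lightgbm as lgbm
import xgboost as xgb
from sklearn import neural_network as nn  # only needed for the commented-out MLPClassifier variants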
Example #2
def model_xgb(train, test, label):

    # small XGBoost model over the selected feature columns
    xgb = sklearn.XGBClassifier(nthread=4, n_estimators=10)

    xgb.fit(train[label], train['hotel_cluster'])

    # class-membership probabilities for the test rows
    prediction = xgb.predict_proba(test[label])

    df = pd.DataFrame(prediction).transpose().tail(test[label].shape[0])

    # DataFrame.as_matrix() was removed from pandas; to_numpy() is the replacement
    return util.best_proba(df.to_numpy()), xgb
Example #3
def xgb_cl(x_train, x_test, y_train, y_test, max_depth):
    xg_cl = xgb.XGBClassifier(objective='binary:logistic',
                              n_estimators=10,
                              seed=123,
                              max_depth=max_depth)
    xg_cl.fit(x_train, y_train)

    # Compute the accuracy of the predictions
    preds = xg_cl.predict(x_test)
    accuracy = float(np.sum(preds == y_test)) / y_test.shape[0]
    print("xgb_cl Accuracy: %.2f%%" % (accuracy * 100.0))
    return accuracy * 100.0
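The hand-rolled accuracy above matches scikit-learn's metric helper; an equivalent one-liner (assuming scikit-learn is installed alongside xgboost):

from sklearn.metrics import accuracy_score
accuracy = accuracy_score(y_test, preds)  # same value as np.sum(preds == y_test) / y_test.shape[0]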
Example #4
def _xgboost_gridsearch_model(
    task,
    numeric_features,
    categoric_features,
    learning_rate,
    use_dask,
    n_iter,
    scoring,
):
    param_space = {
        'clf__max_depth': randint(2, 11),
        'clf__min_child_weight': randint(1, 11),
        'clf__subsample': uniform(0.5, 0.5),
        'clf__colsample_bytree': uniform(0.5, 0.5),
        'clf__colsample_bylevel': uniform(0.5, 0.5),
        'clf__gamma': uniform(0, 1),
        'clf__reg_alpha': uniform(0, 1),
        'clf__reg_lambda': uniform(0, 10),
        'clf__base_score': uniform(0.1, 0.9),
        'clf__scale_pos_weight': uniform(0.1, 9.9),
    }

    model = (xgbsk.XGBClassifier(learning_rate=learning_rate)
             if task == 'classification' else xgbsk.XGBRegressor(
                 learning_rate=learning_rate))

    pipe = Pipeline([
        (
            'preprocessing',
            simple_proc_for_tree_algoritms(numeric_features,
                                           categoric_features),
        ),
        ('clf', model),
    ])

    if use_dask:
        from dask_ml.model_selection import RandomizedSearchCV

        return RandomizedSearchCV(pipe,
                                  param_space,
                                  n_iter=n_iter,
                                  scoring=scoring,
                                  cv=5)
    else:
        from sklearn.model_selection import RandomizedSearchCV

        return RandomizedSearchCV(pipe,
                                  param_space,
                                  n_iter=n_iter,
                                  scoring=scoring,
                                  cv=5)
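The search space uses scipy.stats frozen distributions, so uniform(loc, scale) draws floats from [loc, loc + scale] and randint(a, b) draws integers from [a, b - 1]. A minimal usage sketch, with hypothetical argument values and the assumed imports (the original module header is not shown):

from scipy.stats import randint, uniform

search = _xgboost_gridsearch_model(
    task='classification',
    numeric_features=['age', 'income'],    # hypothetical column names
    categoric_features=['country'],
    learning_rate=0.1,
    use_dask=False,
    n_iter=20,
    scoring='roc_auc',
)
# search.fit(X, y), then inspect search.best_params_ / search.best_estimator_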
Example #5
def train(train_data, train_args):
    train_data_new = []
    labels = []
    # split data into labels and non-labels
    for row in train_data[1:]:
        temp = []
        for x in row[15:]:
            try:
                temp.append(float(x))
            except (ValueError, TypeError):
                # non-numeric fields fall back to a sentinel value
                temp.append(-999.0)
        # create label based on relevant columns
        possible_labels = [
            row[0], row[6], row[7], row[8], row[9], row[10], row[11], row[12],
            row[13], row[14]
        ]
        labels.append(1 if any(int(label) > 0
                               for label in possible_labels) else 0)

        train_data_new.append(numpy.asarray(temp))

    train_data = numpy.asarray(train_data_new)

    # shuffle features and labels together so the train/test split stays aligned
    labels = numpy.array(labels)
    perm = numpy.random.permutation(len(train_data))
    train_data = train_data[perm]
    labels = labels[perm]

    # split data into train set and test set
    train_split = int(len(train_data) * 0.8)
    X_train, X_test = train_data[:train_split], train_data[train_split:]
    Y_train, Y_test = labels[:train_split], labels[train_split:]

    # create XGBClassifier model
    m = sklearn.XGBClassifier(max_depth=train_args[0], learning_rate=train_args[1],
                              silent=train_args[2], objective=train_args[3])
    # train model
    m.fit(X=X_train, y=Y_train)

    # predict labels of test set
    preds = m.predict(X_test)

    # calculate error rate of model
    error = sum(1 for i in range(len(preds)) if int(
        float(preds[i]) > 0.5) != int(Y_test[i])) / float(len(preds))

    final_model = Model(m, error, X_train)
    return final_model
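The manual 80/20 split above can also be done with scikit-learn's helper, which shuffles features and labels together in a single call (a sketch, assuming scikit-learn is available):

from sklearn.model_selection import train_test_split
X_train, X_test, Y_train, Y_test = train_test_split(train_data, labels, test_size=0.2, random_state=0)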
Example #6
    def run(self):
        self.output().makedirs()
        X = abhishek_feats.AbhishekFeatures().load('train', self.fold)
        y = xval_dataset.BaseDataset().load('train', self.fold).squeeze()
        cls = xgbsk.XGBClassifier(max_depth=self.max_depth.get(),
                                  learning_rate=self.eta.get(),
                                  n_estimators=self.n_est.get())
        X_tr, X_va, y_tr, y_va = model_selection.train_test_split(
            X, y, test_size=0.05)
        cls.fit(X_tr,
                y_tr,
                sample_weight=core.weight_from(y_tr),
                eval_set=[(X_va, y_va)],
                early_stopping_rounds=10)

        validX = abhishek_feats.AbhishekFeatures().load('valid', self.fold)
        y = xval_dataset.BaseDataset().load('valid', self.fold).squeeze()
        y_pred = cls.predict_proba(validX)[:, 1]
        score = core.score_data(y, y_pred)
        scorestr = "{:s} = {:f}".format(repr(self), score)
        print(colors.green | colors.bold | scorestr)

        valid_fn = 'cache/abhishek/xgb/maxdepth_{:d}_eta_{:f}_nest_{:d}/{:d}/valid.npy'.format(
            self.max_depth.get(), self.eta.get(), self.n_est.get(), self.fold)

        np.save(valid_fn, y_pred)

        # despite the variable name, this loads the held-out test split
        trainX = abhishek_feats.AbhishekFeatures().load('test', None)
        pred = cls.predict_proba(trainX)[:, 1]

        test_fn = 'cache/abhishek/xgb/maxdepth_{:d}_eta_{:f}_nest_{:d}/{:d}/test.npy'.format(
            self.max_depth.get(), self.eta.get(), self.n_est.get(), self.fold)
        np.save(test_fn, pred)

        with self.output().open('w') as f:
            cols = abhishek_feats.AbhishekFeatures().load('valid',
                                                          self.fold,
                                                          as_df=True).columns
            v = pandas.Series(cls.feature_importances_,
                              index=cols).sort_values()
            v.to_csv(f)
            f.write("\n\n")
            f.write(scorestr)
            f.write("\n")
        return score
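Because fit() receives eval_set together with early_stopping_rounds=10, boosting stops once the validation metric has not improved for 10 rounds; a sketch of how the stopping point could be inspected afterwards (attribute names from the xgboost scikit-learn wrapper, assumed to be available in the installed version):

print(cls.best_iteration, cls.best_score)  # boosting round and metric value at the early stop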
Example #7
def train_save(pred_period=20, is_high=True, is_clf=False):

    data = gen_dataset(is_high=is_high, is_clf=is_clf, pred_period=pred_period)

    if is_clf:
        _, y_train = data["train"]
        # negative/positive ratio; note the constructors below currently
        # hard-code scale_pos_weight=0.1 instead of using this value
        scale_pos_weight = sum(y_train == 0) / sum(y_train == 1)

    if not is_clf:
        models = [
            lgbm.LGBMRegressor(n_estimators=300,
                               num_leaves=100,
                               max_depth=8,
                               random_state=0),
            xgb.XGBRegressor(n_estimators=300, max_depth=5, random_state=0)
        ]
    else:
        models = [
            lgbm.LGBMClassifier(n_estimators=300,
                                scale_pos_weight=0.1,
                                num_leaves=100,
                                max_depth=8,
                                random_state=0),
            xgb.XGBClassifier(
                n_estimators=300,
                scale_pos_weight=0.1,
                max_depth=5,
                random_state=0,
            )
        ]
    y_pred_list = train(data, models, is_clf=is_clf)

    # save model
    for model in models:
        save_model(model, pred_period, is_high)

    return y_pred_list
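scale_pos_weight is conventionally the negative/positive ratio computed at the top of the function; passing that value instead of the hard-coded 0.1 would look like this (a sketch of an alternative, not the original code):

xgb.XGBClassifier(n_estimators=300,
                  scale_pos_weight=scale_pos_weight,
                  max_depth=5,
                  random_state=0)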
Example #8
## XGBClassifier performs the best

from model.tree import visualize
from sklearn import tree, linear_model
from sklearn import svm
# from sklearn import neural_network
from sklearn import ensemble
import xgboost
from xgboost import sklearn
# classifier = tree.DecisionTreeClassifier(criterion="entropy", max_depth=7)
# classifier = linear_model.LogisticRegression(C=1.0, solver="lbfgs", multi_class="multinomial")
# classifier = svm.SVC(C=0.1, kernel='rbf')
# classifier = neural_network.MLPClassifier()
# classifier = ensemble.GradientBoostingClassifier()
# classifier = ensemble.RandomForestClassifier(criterion="entropy", max_depth=15)
classifier = sklearn.XGBClassifier(base_score=0.5, learning_rate=0.05, gamma=1.5,
                                   max_depth=7, colsample_bytree=1, subsample=0.2,
                                   n_estimators=25, seed=0, objective="multi:softprob")

# params = {'max_depth': 6, 'colsample_bytree': 1, 'n_estimators': 25, 'objective': 'multi:softprob', 'num_class': 12}
# dtrain = xgboost.DMatrix(exclude_test_input, exclude_test_target)
# classifier = xgboost.train(params=params, dtrain=dtrain, num_boost_round=1)
# xgboost.plot_importance(classifier)

classifier.fit(exclude_test_input, exclude_test_target)

# visualize
# visualize(classifier, "tree")

print "The model has been created."

# Model Assessment
# Logistic Regression
lr_pipe = Pipeline([('scaler', StandardScaler()),
                     ('classifier', LogisticRegression())
                     ])

# List of Classifiers
classifiers = [('knn_classifier', KNeighborsClassifier(n_jobs = -1)),
               ('log_classifier', LogisticRegression(max_iter = 1000)),
               
               # Tree-based methods
               ('dt_classifier', DecisionTreeClassifier()), 
               ('rf_classifier', RandomForestClassifier()),
               ('ab_classifier', AdaBoostClassifier()),
               ('gb_classifier', GradientBoostingClassifier()),
               ('xg_classifier', xgb.XGBClassifier()),
               ('lgbm_classifier', LGBMClassifier()),
               ('cat_classifier', CatBoostClassifier()),
               
               # Support Vector Machines
               ('sv_classifier', LinearSVC()),
               ('svc_classifier', SVC(cache_size = 1000, max_iter = 5000))

               ]


for i, c in enumerate(classifiers):
    skpipes[i].append(c) 

# %%
#############################################################################
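The model-assessment fragment above assumes the standard scikit-learn and boosting-library imports; a plausible header for it (an assumption, since the original file's imports are not included in the excerpt):

from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LogisticRegression
from sklearn.neighbors import KNeighborsClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier, AdaBoostClassifier, GradientBoostingClassifier
from sklearn.svm import LinearSVC, SVC
import xgboost as xgb
from lightgbm import LGBMClassifier
from catboost import CatBoostClassifier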