Example #1
    def test_sparse(self):

        params = {
            'exec_path': path_to_exec,
            'num_iterations': 1000,
            'verbose': False,
            'min_data_in_leaf': 1,
            'learning_rate': 0.1,
            'num_leaves': 5
        }

        clfs = [
            [sps.csr_matrix(X), Y, 'classification',
             GBMClassifier(**params)],
            [sps.csr_matrix(Xreg), Yreg, 'regression',
             GBMRegressor(**params)],
        ]

        for x, y, name, clf in clfs:
            clf.fit(x, y)

            if name == 'classification':
                score = metrics.accuracy_score(y, clf.predict(x))
                assert score > 0.9
            else:
                score = metrics.mean_squared_error(y, clf.predict(x))
                assert score < 1.
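The point of this test is that pylightgbm's scikit-learn-style estimators accept scipy.sparse input directly. As a minimal standalone sketch (the names here are illustrative, not from the test suite), converting a dense array to the CSR format used above:

import numpy as np
import scipy.sparse as sps

X_dense = np.random.rand(100, 20)
X_sparse = sps.csr_matrix(X_dense)  # same data, compressed sparse row layout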
Example #2
    def test_early_stopping(self):

        cv_params = dict(test_size=test_size, random_state=seed)
        xtr, xte, ytr, yte = model_selection.train_test_split(
            X, Y, **cv_params)
        xtr_reg, xte_reg, ytr_reg, yte_reg = model_selection.train_test_split(
            Xreg, Yreg, **cv_params)

        params = dict(exec_path=path_to_exec,
                      num_iterations=10000,
                      min_data_in_leaf=3,
                      learning_rate=0.01,
                      num_leaves=2,
                      early_stopping_round=2)
        clfs = [
            [
                xtr_reg, ytr_reg, xte_reg, yte_reg, 'regression',
                GBMRegressor(boosting_type='gbdt', **params)
            ],
            [
                xtr_reg, ytr_reg, xte_reg, yte_reg, 'regression',
                GBMRegressor(boosting_type='dart', **params)
            ],
            [
                xtr, ytr, xte, yte, 'classification',
                GBMClassifier(boosting_type='gbdt', **params)
            ],
            [
                xtr, ytr, xte, yte, 'classification',
                GBMClassifier(boosting_type='dart', **params)
            ],
        ]

        for xtr, ytr, xte, yte, name, clf in clfs:
            clf.fit(xtr, ytr, test_data=[(xte, yte)])

            if name == 'regression':
                score = metrics.mean_squared_error(yte, clf.predict(xte))
                assert (score < 1.
                        and clf.best_round < clf.param['num_iterations'])
            else:
                score = metrics.accuracy_score(yte, clf.predict(xte))
                assert (score > 0.7
                        and clf.best_round < clf.param['num_iterations'])
Example #3
    def fit(self,
            X_train,
            y_train,
            X_eval=None,
            y_eval=None,
            seed=42,
            feature_names=None,
            eval_func=None,
            **kwa):
        params = self.params.copy()
        params['bagging_seed'] = seed
        params['feature_fraction_seed'] = seed + 3

        self.model = GBMClassifier(**params)

        if X_eval is None:
            self.model.fit(X_train, y_train)
        else:
            self.model.fit(X_train, y_train, test_data=[(X_eval, y_eval)])
Example #4
class LightGBM(BaseAlgo):

    default_params = {'exec_path': 'lightgbm', 'num_threads': 4}

    def __init__(self, params):
        self.params = self.default_params.copy()

        for k in params:
            self.params[k] = params[k]

    def fit(self,
            X_train,
            y_train,
            X_eval=None,
            y_eval=None,
            seed=42,
            feature_names=None,
            eval_func=None,
            **kwa):
        params = self.params.copy()
        params['bagging_seed'] = seed
        params['feature_fraction_seed'] = seed + 3

        self.model = GBMClassifier(**params)

        if X_eval is None:
            self.model.fit(X_train, y_train)
        else:
            self.model.fit(X_train, y_train, test_data=[(X_eval, y_eval)])

    def predict(self, X):
        return self.model.predict(X)

    def predict_proba(self, X):
        # delegates to predict() on the wrapped model
        return self.model.predict(X)
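A hypothetical usage sketch for this wrapper; BaseAlgo, the executable path, and the data names are assumptions from the surrounding project:

model = LightGBM({'exec_path': '/path/to/lightgbm', 'num_leaves': 31})
model.fit(X_train, y_train, X_eval=X_valid, y_eval=y_valid, seed=42)
preds = model.predict(X_valid)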
Example #5
    def test_pickle(self):

        params = {'exec_path': path_to_exec, 'verbose': False}

        clfs = [
            [X, Y, GBMClassifier(**params)],
            [Xreg, Yreg, GBMRegressor(**params)],
        ]

        for x, y, clf in clfs:
            clf.fit(x, y)
            pickle.dump(clf, open("clf_gbm.pkl", "wb"))
            clf2 = pickle.load(open("clf_gbm.pkl", "rb"))
            assert np.allclose(clf.predict(x), clf2.predict(x))
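The same pickle round-trip written with context managers, so the file handles are closed deterministically (a small standalone sketch, not part of the original test):

import pickle

with open("clf_gbm.pkl", "wb") as f:
    pickle.dump(clf, f)
with open("clf_gbm.pkl", "rb") as f:
    clf2 = pickle.load(f)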
Example #6
    def test_simple_fit(self):

        params = dict(exec_path=path_to_exec,
                      num_iterations=100,
                      min_data_in_leaf=1,
                      learning_rate=0.1,
                      num_leaves=5,
                      max_depth=10)
        clfs = [
            [
                Xreg, Yreg, 'regression',
                GBMRegressor(boosting_type='gbdt', **params)
            ],
            [
                Xreg, Yreg, 'regression',
                GBMRegressor(boosting_type='dart', **params)
            ],
            [
                X, Y, 'classification',
                GBMClassifier(boosting_type='gbdt', **params)
            ],
            [
                X, Y, 'classification',
                GBMClassifier(boosting_type='dart', **params)
            ],
        ]

        for x, y, name, clf in clfs:
            clf.fit(x, y, init_scores=np.zeros(x.shape[0]))

            if name == 'regression':
                score = metrics.mean_squared_error(y, clf.predict(x))
                assert score < 1.
            else:
                score = metrics.accuracy_score(y, clf.predict(x))
                assert score > 0.9
Example #7
    def test_multiclass(self):

        clf = GBMClassifier(exec_path=path_to_exec,
                            min_data_in_leaf=1,
                            learning_rate=0.1,
                            num_leaves=5,
                            num_class=n_classes,
                            metric='multi_logloss',
                            application='multiclass',
                            num_iterations=100)
        # fit twice to exercise both code paths: without and with validation data
        clf.fit(Xmulti, Ymulti.argmax(-1))
        clf.fit(Xmulti,
                Ymulti.argmax(-1),
                test_data=[(Xmulti, Ymulti.argmax(-1))])
        score = metrics.accuracy_score(Ymulti.argmax(-1), clf.predict(Xmulti))
        assert score > 0.8
Example #8
# Parameters
seed = 1337
path_to_exec = "~/Documents/apps/LightGBM/lightgbm"
np.random.seed(seed)  # for reproducibility

X, Y = datasets.make_classification(n_samples=1000,
                                    n_features=100,
                                    n_classes=2,
                                    random_state=seed)

# 'exec_path' is the path to lightgbm executable
gbm = GBMClassifier(exec_path=path_to_exec,
                    num_iterations=1000,
                    learning_rate=0.05,
                    min_data_in_leaf=1,
                    num_leaves=5,
                    metric='binary_logloss',
                    verbose=True)

param_grid = {
    'learning_rate': [0.1, 0.04],
    'min_data_in_leaf': [1, 10],
    'bagging_fraction': [0.5, 0.9]
}

scorer = metrics.make_scorer(metrics.accuracy_score, greater_is_better=True)
clf = model_selection.GridSearchCV(gbm, param_grid, scoring=scorer, cv=2)

clf.fit(X, Y)
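Once the search finishes, the standard scikit-learn GridSearchCV attributes are available; a short sketch:

print("Best score: ", clf.best_score_)
print("Best params: ", clf.best_params_)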
Example #9
seed = 1337
nfolds = 5
path_to_exec = "~/Documents/apps/LightGBM/lightgbm"

np.random.seed(seed)  # for reproducibility

X, Y = datasets.make_classification(n_samples=1000,
                                    n_features=100,
                                    n_classes=2,
                                    random_state=seed)

# 'exec_path' is the path to lightgbm executable
gbm = GBMClassifier(exec_path=path_to_exec,
                    num_iterations=100,
                    learning_rate=0.075,
                    min_data_in_leaf=1,
                    bagging_freq=10,
                    metric='binary_error',
                    early_stopping_round=10)

param_grid = {
    'learning_rate': [0.1, 0.04],
    'min_data_in_leaf': [1, 10],
    'bagging_fraction': [0.5, 0.9]
}

scorer = metrics.make_scorer(metrics.accuracy_score, greater_is_better=True)
clf = model_selection.GridSearchCV(gbm, param_grid, scoring=scorer, cv=2)

clf.fit(X, Y)
Example #10
# -*- coding: utf-8 -*-
"""
@author: Ardalan MEHRANI <*****@*****.**>
@brief:
"""
import pickle
import numpy as np
from sklearn import datasets, metrics, model_selection
from pylightgbm.models import GBMClassifier

# Parameters
path_to_exec = "~/Documents/apps/LightGBM/lightgbm"

X, Y = datasets.make_classification(n_samples=1000,
                                    n_features=100,
                                    random_state=1337)

# 'exec_path' is the path to lightgbm executable
clf = GBMClassifier(exec_path=path_to_exec, verbose=False)

clf.fit(X, Y)

y_pred = clf.predict(X)

print("Accuracy: ", metrics.accuracy_score(Y, y_pred))

# The sklearn API models are picklable
print("Pickling sklearn API models")
pickle.dump(clf, open("clf_gbm.pkl", "wb"))
clf2 = pickle.load(open("clf_gbm.pkl", "rb"))
print(np.allclose(clf.predict(X), clf2.predict(X)))
Example #11
# Parameters
seed = 1337
nfolds = 5
path_to_exec = "~/Documents/apps/LightGBM/lightgbm"
np.random.seed(seed)  # for reproducibility

X, Y = datasets.make_classification(n_samples=1000,
                                    n_features=500,
                                    random_state=seed)

skf = model_selection.StratifiedKFold(n_splits=nfolds, shuffle=True,
                                      random_state=seed)

clf = GBMClassifier(exec_path=path_to_exec,
                    num_iterations=1000,
                    min_data_in_leaf=1,
                    num_leaves=10,
                    metric='binary_error',
                    learning_rate=0.1,
                    early_stopping_round=10,
                    verbose=False)

best_rounds = []
scores = []
for i, (train_idx, valid_idx) in enumerate(skf.split(X, Y)):
    x_train = X[train_idx, :]
    y_train = Y[train_idx]

    x_valid = X[valid_idx, :]
    y_valid = Y[valid_idx]

    clf.fit(x_train, y_train, test_data=[(x_valid, y_valid)])
    best_rounds.append(clf.best_round)
    scores.append(metrics.accuracy_score(y_valid, clf.predict(x_valid)))
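Averaging across folds then gives a cross-validated estimate of both model quality and the stopping point (a short sketch on top of the loop above):

print("Mean best round: ", np.mean(best_rounds))
print("Mean accuracy: ", np.mean(scores))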
Example #12
# for step  in [6]:
result_preb = pd.DataFrame({'tel': valid['tel']})
for step in [2]:
    print('---------------------', step)

    # train_X,test_X,train_Y,test_Y=train_test_split(index_data,index_lable ,  test_size=0.25, random_state=step)
    from imblearn.combine import SMOTEENN, SMOTETomek
    texec = u"E:\\code\\Debug\\lightgbm.exe"
    import subprocess
    # subprocess.Popen(texec)
    # X, Y = datasets.make_classification(n_samples=200, n_features=10)
    x_train, x_test, y_train, y_test = model_selection.train_test_split(
        index_data, index_lable, test_size=0.3)

    gbm = GBMClassifier(exec_path=texec,
                        metric='binary_error,auc',
                        early_stopping_round=10,
                        bagging_freq=10)

    param_grid = {'learning_rate': [0.1, 0.04], 'bagging_fraction': [0.5, 0.9]}

    scorer = metrics.make_scorer(metrics.accuracy_score,
                                 greater_is_better=True)
    clf = model_selection.GridSearchCV(gbm, param_grid, scoring=scorer, cv=2)

    clf.fit(x_train, y_train)

    print("Best score: ", clf.best_score_)
    print("Best params: ", clf.best_params_)
    # clf = GBMClassifier(exec_path=texec, min_data_in_leaf=10 ,
    #                     metric='auc',
    #                     # feature_fraction=0.9,
Example #13
"""
@author: Ardalan MEHRANI <*****@*****.**>
@brief:
"""
import numpy as np
from sklearn import datasets, metrics, model_selection
from pylightgbm.models import GBMClassifier


# Parameters
seed = 1337
n_classes = 10
path_to_exec = "~/Documents/apps/LightGBM/lightgbm"
np.random.seed(seed)  # for reproducibility

X, Y = datasets.load_digits(return_X_y=True, n_class=n_classes)
x_train, x_test, y_train, y_test = model_selection.train_test_split(X, Y, test_size=0.2, random_state=seed)

# 'exec_path' is the path to lightgbm executable
clf = GBMClassifier(exec_path=path_to_exec, num_class=n_classes, metric='multi_logloss',
                    application='multiclass', num_iterations=1000,
                    min_data_in_leaf=1, num_leaves=5, early_stopping_round=20)

clf.fit(x_train, y_train, test_data=[(x_test, y_test)])

y_prob = clf.predict_proba(x_test)
y_pred = y_prob.argmax(-1)

print("Log loss: ", metrics.log_loss(y_test, y_prob))
print("Accuracy: ", metrics.accuracy_score(y_test, y_pred))
print("Best round: ", clf.best_round)
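To see where the multiclass model confuses digits, a confusion matrix is a natural follow-up (an addition, not part of the original snippet):

from sklearn.metrics import confusion_matrix
print(confusion_matrix(y_test, y_pred))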
Example #14
def get_model_obj(modelType, n_clusters=None, **kwargs):
    if modelType == 'knn':
        from sklearn.neighbors import KNeighborsClassifier
        # 6 seems to give the best trade-off between accuracy and precision
        knn = KNeighborsClassifier(n_neighbors=6, **kwargs)
        return knn
    elif modelType == 'gaussianNB':
        from sklearn.naive_bayes import GaussianNB
        gnb = GaussianNB(**kwargs)
        return gnb

    elif modelType == 'multinomialNB':
        from sklearn.naive_bayes import MultinomialNB
        # TODO: figure out how to configure binomial distribution
        mnb = MultinomialNB(**kwargs)
        return mnb

    elif modelType == 'bernoulliNB':
        from sklearn.naive_bayes import BernoulliNB
        bnb = BernoulliNB(**kwargs)
        return bnb

    elif modelType == 'randomForest':
        from sklearn.ensemble import RandomForestClassifier
        rfc = RandomForestClassifier(random_state=234, **kwargs)
        return rfc

    elif modelType == 'svm':
        from sklearn.svm import SVC
        svc = SVC(random_state=0, probability=True, **kwargs)
        return svc

    elif modelType == 'LinearRegression':
        #assert column, "Column name required for building a linear model"
        #assert dataframe[column].shape == target.shape
        from sklearn import linear_model
        l_reg = linear_model.LinearRegression(**kwargs)
        return l_reg

    elif modelType == 'RidgeRegression':
        from sklearn.linear_model import Ridge
        if not kwargs:
            kwargs = {'alpha': 0.5}
        ridge_reg = Ridge(**kwargs)
        return ridge_reg

    elif modelType == 'RidgeRegressionCV':
        from sklearn import linear_model
        if not kwargs:
            kwargs = {'alphas': [0.1, 1.0, 10.0]}
        ridge_cv_reg = linear_model.RidgeCV(**kwargs)
        return ridge_cv_reg

    elif modelType == 'LassoRegression':
        from sklearn import linear_model
        if not kwargs:
            kwargs = {'alpha': 0.1}
        lasso_reg = linear_model.Lasso(**kwargs)
        return lasso_reg

    elif modelType == 'ElasticNetRegression':
        from sklearn.metrics import r2_score
        from sklearn import linear_model
        if not kwargs:
            kwargs = {'alpha': 0.1, 'l1_ratio': 0.7}
        enet_reg = linear_model.ElasticNet(**kwargs)
        return enet_reg

    elif modelType == 'LogisticRegression':
        from sklearn.linear_model import LogisticRegression
        log_reg = LogisticRegression(random_state=123, **kwargs)
        return log_reg

    elif modelType == 'RANSACRegression':
        from sklearn.linear_model import LinearRegression, RANSACRegressor
        ransac_model = RANSACRegressor(LinearRegression())
        return ransac_model

    elif modelType == 'kde':
        from sklearn.neighbors import KernelDensity
        kde = KernelDensity(kernel='gaussian', bandwidth=0.2, **kwargs)
        return kde

    elif modelType == 'AR':
        import statsmodels.api as sm
        # fit an AR model and forecast
        # NOTE: assumes a time series 'dataframe' in the enclosing scope
        ar_fitted = sm.tsa.AR(dataframe).fit(maxlag=9,
                                             method='mle',
                                             disp=-1,
                                             **kwargs)
        #ts_forecast = ar_fitted.predict(start='2008', end='2050')
        return ar_fitted

    elif modelType == 'SARIMAX':
        import statsmodels.api as sm
        # NOTE: assumes a 'df' DataFrame with a 'riders' column in the enclosing scope
        mod = sm.tsa.statespace.SARIMAX(df.riders,
                                        trend='n',
                                        order=(0, 1, 0),
                                        seasonal_order=(1, 1, 1, 12),
                                        **kwargs)
        return mod

    elif modelType == 'sgd':
        # Online classifiers http://scikit-learn.org/stable/auto_examples/linear_model/plot_sgd_comparison.html
        from sklearn.linear_model import SGDClassifier
        sgd = SGDClassifier(**kwargs)
        return sgd

    elif modelType == 'perceptron':
        from sklearn.linear_model import Perceptron
        perceptron = Perceptron(**kwargs)
        return perceptron

    elif modelType == 'xgboost':
        import xgboost as xgb
        xgbm = xgb.XGBClassifier(**kwargs)
        return xgbm

    elif modelType == 'baseNN':
        from keras.models import Sequential
        from keras.layers import Dense
        # create model; layer and compile configs are read from kwargs
        input_params = kwargs.get('inputParams')
        output_params = kwargs.get('outputParams')
        assert input_params, "inputParams required"
        assert output_params, "outputParams required"
        model = Sequential()
        model.add(Dense(**input_params))
        model.add(Dense(**output_params))
        if kwargs.get('compileParams'):
            # Compile model, e.g. loss='categorical_crossentropy',
            # optimizer='adam', metrics=['accuracy']
            model.compile(**kwargs['compileParams'])
        return model

    elif modelType == 'lightGBMRegression':
        from pylightgbm.models import GBMRegressor
        lgbm_lreg = GBMRegressor(num_iterations=100,
                                 early_stopping_round=10,
                                 num_leaves=10,
                                 min_data_in_leaf=10)
        return lgbm_lreg

    elif modelType == 'lightGBMBinaryClass':
        from pylightgbm.models import GBMClassifier
        lgbm_bc = GBMClassifier(metric='binary_error', min_data_in_leaf=1)
        return lgbm_bc

    # Clustering models
    elif modelType == 'KMeans':
        assert n_clusters, "Number of clusters argument mandatory"
        from sklearn.cluster import KMeans
        cluster_callable = KMeans
        # seed of 10 for reproducibility.
        clusterer = cluster_callable(n_clusters=n_clusters, random_state=10)
        return clusterer

    elif modelType == 'dbscan':
        import logging
        from sklearn.cluster import DBSCAN
        if not n_clusters:
            logging.warning(
                "Number of clusters irrelevant for cluster type : %s" %
                (modelType))
        cluster_callable = DBSCAN
        clusterer = cluster_callable(eps=0.5)
        return clusterer

    elif modelType == 'affinity_prop':
        import logging
        from sklearn.cluster import AffinityPropagation
        if not n_clusters:
            logging.warning(
                "Number of clusters irrelevant for cluster type : %s" %
                (modelType))
        clusterer = AffinityPropagation(damping=.9, preference=-200)
        return clusterer

    elif modelType == 'spectral':
        assert n_clusters, "Number of clusters argument mandatory"
        from sklearn.cluster import SpectralClustering
        clusterer = SpectralClustering(n_clusters=n_clusters,
                                       eigen_solver='arpack',
                                       affinity="nearest_neighbors")
        return clusterer

    elif modelType == 'birch':
        import logging
        from sklearn.cluster import Birch
        if not n_clusters:
            logging.warning(
                "Number of clusters irrelevant for cluster type : %s" %
                (modelType))
        clusterer = Birch(n_clusters=2)
        return clusterer

    elif modelType == 'agglomerativeCluster':
        from sklearn.cluster import AgglomerativeClustering
        from sklearn.neighbors import kneighbors_graph
        # NOTE: assumes 'dataframe' holds the feature matrix in the enclosing scope
        # connectivity matrix for structured Ward
        connectivity = kneighbors_graph(dataframe,
                                        n_neighbors=10,
                                        include_self=False)
        # make connectivity symmetric
        connectivity = 0.5 * (connectivity + connectivity.T)
        clusterer = AgglomerativeClustering(n_clusters=n_clusters,
                                            linkage='ward',
                                            connectivity=connectivity)
        return clusterer

    elif modelType == 'meanShift':
        from sklearn import cluster
        # NOTE: assumes 'dataframe' holds the feature matrix in the enclosing scope
        # estimate bandwidth for mean shift
        bandwidth = cluster.estimate_bandwidth(dataframe, quantile=0.3)
        clusterer = cluster.MeanShift(bandwidth=bandwidth, bin_seeding=True)
        return clusterer

    elif modelType == 'gmm':
        from sklearn import mixture
        gmm = mixture.GaussianMixture(n_components=5, covariance_type='full')
        return gmm

    elif modelType == 'dgmm':
        from sklearn import mixture
        dgmm = mixture.BayesianGaussianMixture(n_components=5,
                                               covariance_type='full')
        return dgmm

    else:
        raise ValueError('Unknown model type: see utils.py for available models')
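A hypothetical call sketch for this factory; the data names are assumptions:

rfc = get_model_obj('randomForest', n_estimators=100)
rfc.fit(X_train, y_train)

km = get_model_obj('KMeans', n_clusters=3)
labels = km.fit_predict(X_train)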
Example #15
def get_classifiers(names):

    classifiers = []
    for name in names:
        if name == 'LogisticRegression':
            clf = LogisticRegression(penalty='l1',
                                     C=0.007,
                                     random_state=CONFIG['RANDOM_SEED'])
        elif name == 'XGBClassifier':
            clf = XGBClassifier(
                base_score=0.5,
                colsample_bylevel=1,
                colsample_bytree=0.9,
                gamma=0.7,
                learning_rate=0.1,
                max_delta_step=0,
                max_depth=6,
                min_child_weight=9.0,
                missing=None,
                n_estimators=1500,
                nthread=-1,
                objective='binary:logistic',
                reg_alpha=0,
                reg_lambda=1,
                # scale_pos_weight=1,
                seed=CONFIG['RANDOM_SEED'],
                silent=True,
                subsample=0.9)
        elif name == 'ExtraTreesClassifier':
            clf = ExtraTreesClassifier(n_estimators=50,
                                       max_depth=None,
                                       min_samples_split=10,
                                       min_samples_leaf=5,
                                       max_features='auto',
                                       n_jobs=-1,
                                       random_state=CONFIG['RANDOM_SEED'])
        elif name == 'GBMClassifier':
            clf = GBMClassifier(exec_path="~/LightGBM/lightgbm",
                                config="",
                                application='binary',
                                num_iterations=10,
                                learning_rate=0.1,
                                num_leaves=127,
                                tree_learner="serial",
                                num_threads=-1,
                                min_data_in_leaf=100,
                                metric='binary_logloss',
                                is_training_metric=False,
                                feature_fraction=1.,
                                feature_fraction_seed=2,
                                bagging_fraction=1.,
                                bagging_freq=0,
                                bagging_seed=3,
                                metric_freq=1,
                                early_stopping_round=0,
                                max_bin=255,
                                is_unbalance=False,
                                num_class=1,
                                boosting_type='gbdt',
                                min_sum_hessian_in_leaf=10,
                                drop_rate=0.01,
                                drop_seed=4,
                                max_depth=-1,
                                lambda_l1=0.,
                                lambda_l2=0.,
                                min_gain_to_split=0.,
                                verbose=False,
                                model=None)
        else:
            raise ValueError('Unknown classifier name.')

        classifiers.append(clf)

    return classifiers
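A usage sketch; X_train, y_train, and CONFIG are assumptions from the surrounding project:

for clf in get_classifiers(['LogisticRegression', 'ExtraTreesClassifier']):
    clf.fit(X_train, y_train)
    print(clf.score(X_train, y_train))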
Example #16
# Parameters
seed = 1337
path_to_exec = "~/Documents/apps/LightGBM/lightgbm"
np.random.seed(seed)  # for reproducibility

X, Y = datasets.make_classification(n_samples=1000, n_features=100, n_classes=2, random_state=seed)
x_train, x_test, y_train, y_test = model_selection.train_test_split(X, Y, test_size=0.2, random_state=seed)

params = {'exec_path': path_to_exec,
          'num_iterations': 1000, 'learning_rate': 0.01,
          'min_data_in_leaf': 1, 'num_leaves': 5,
          'metric': 'binary_error', 'verbose': False,
          'early_stopping_round': 20}

clfs = [
    ['gbdt', GBMClassifier(boosting_type='gbdt', **params)],
    ['dart', GBMClassifier(boosting_type='dart', drop_rate=0.02, drop_seed=4, **params)],
]

for boosting_type, clf in clfs:

    clf.fit(x_train, y_train, test_data=[(x_test, y_test)])
    y_prob = clf.predict_proba(x_test)
    y_pred = y_prob.argmax(-1)

    print("booster {} loss: {}, accuracy: {}, best round: {}".format(
        boosting_type,
        metrics.log_loss(y_test, y_prob),
        metrics.accuracy_score(y_test, y_pred),
        clf.best_round
    ))
Example #17
bst1 = xgb.train(params, dtrain, params['n'])
# ------------------------------------------------------------------
params = {
    'exec_path': path_to_exec,
    'num_iterations': 108,
    'learning_rate': 0.079,
    'num_leaves': 13,
    'metric': 'binary_error',
    'min_sum_hessian_in_leaf': 1,
    'bagging_fraction': 0.642,
    'bagging_freq': 1,
    'verbose': 0
}

bst2 = GBMClassifier(boosting_type='gbdt', **params)
bst2.fit(X_train, y_train)
# ------------------------------------------------------------------
params_est = {
    'n_estimators': 300,
    'loss': 'exponential',
    'learning_rate': 0.08,
    'subsample': 0.6910000000000001,
    'min_samples_leaf': 340,
    'max_features': 53,
    'random_state': 1
}
bst3 = GradientBoostingClassifier(**params_est)
bst3.fit(X_train, y_train)
# ------------------------------------------------------------------
from keras.callbacks import Callback as keras_clb
Example #18
# Parameters
seed = 1337
nfolds = 5
test_size = 0.2
path_to_exec = "~/Documents/apps/LightGBM/lightgbm"
np.random.seed(seed)  # for reproducibility

X, Y = datasets.make_classification(n_samples=1000,
                                    n_features=100,
                                    random_state=seed)
x_train, x_test, y_train, y_test = model_selection.train_test_split(
    X, Y, test_size=test_size, random_state=seed)

# 'exec_path' is the path to lightgbm executable
clf = GBMClassifier(exec_path=path_to_exec,
                    num_iterations=1000,
                    learning_rate=0.01,
                    min_data_in_leaf=1,
                    num_leaves=5,
                    metric='binary_error',
                    early_stopping_round=20)

clf.fit(x_train, y_train, test_data=[(x_test, y_test)])

y_prob = clf.predict_proba(x_test)
y_pred = y_prob.argmax(-1)

print("Log loss: ", metrics.log_loss(y_test, y_prob))
print("Accuracy: ", metrics.accuracy_score(y_test, y_pred))
print("Best round: ", clf.best_round)
Example #19
"""
import numpy as np
from sklearn import datasets, metrics, model_selection
from pylightgbm.models import GBMClassifier

# params
seed = 1337
np.random.seed(seed)  # for reproducibility

X, Y = datasets.make_classification(n_samples=1000,
                                    n_features=100,
                                    random_state=seed)
x_train, x_test, y_train, y_test = model_selection.train_test_split(
    X, Y, test_size=0.2, random_state=seed)

# 'exec_path' is the path to lightgbm executable
clf = GBMClassifier(exec_path="~/Documents/apps/LightGBM/lightgbm",
                    num_iterations=100,
                    learning_rate=0.1,
                    min_data_in_leaf=1,
                    metric='binary_error',
                    early_stopping_round=10)

clf.fit(x_train, y_train, test_data=[(x_test, y_test)])

y_prob = clf.predict_proba(x_test)
y_pred = y_prob.argmax(-1)

print("Log loss: ", metrics.log_loss(y_test, y_prob))
print("Accuracy: ", metrics.accuracy_score(y_test, y_pred))
Example #20
                                    random_state=seed)
x_train, x_test, y_train, y_test = model_selection.train_test_split(
    X, Y, test_size=test_size, random_state=seed)

params = {
    'exec_path': path_to_exec,
    'num_iterations': 1000,
    'learning_rate': 0.01,
    'early_stopping_round': 20,
    'min_data_in_leaf': 1,
    'num_leaves': 5,
    'verbose': False
}

clf_binary = GBMClassifier(application='binary',
                           metric='binary_error',
                           **params)
clf_multiclass = GBMClassifier(application='multiclass',
                               num_class=n_classes,
                               metric='multi_error',
                               **params)

for clf in [clf_binary, clf_multiclass]:

    clf.fit(x_train, y_train, test_data=[(x_test, y_test)])

    y_prob = clf.predict_proba(x_test)
    y_pred = y_prob.argmax(-1)

    print("Log loss: ", metrics.log_loss(y_test, y_prob))
    print("Accuracy: ", metrics.accuracy_score(y_test, y_pred))
Example #21
    def test_grid_search(self):

        param_grid = {
            'learning_rate': [0.01, 0.1, 1],
            'num_leaves': [2, 5, 50],
            'min_data_in_leaf': [1, 10, 100],
            'bagging_fraction': [0.1, 1]
        }

        params = {
            'exec_path': path_to_exec,
            'num_threads': 2,
            'num_iterations': 100,
            'learning_rate': 0.1,
            'min_data_in_leaf': 1,
            'num_leaves': 10,
            'bagging_freq': 2,
            'verbose': False
        }

        clfs = [
            [
                Xreg, Yreg, 'regression',
                GBMRegressor(boosting_type='gbdt', metric='l2', **params)
            ],
            [
                Xreg, Yreg, 'regression',
                GBMRegressor(boosting_type='dart', metric='l2', **params)
            ],
            [
                X, Y, 'classification',
                GBMClassifier(boosting_type='gbdt',
                              metric='binary_logloss',
                              **params)
            ],
            [
                X, Y, 'classification',
                GBMClassifier(boosting_type='dart',
                              metric='binary_logloss',
                              **params)
            ],
        ]

        for x, y, name, clf in clfs:

            if name == 'regression':
                scorer = metrics.make_scorer(metrics.mean_squared_error,
                                             greater_is_better=False)
                grid = model_selection.GridSearchCV(clf,
                                                    param_grid,
                                                    scoring=scorer,
                                                    cv=2,
                                                    refit=True)
                grid.fit(x, y)

                score = metrics.mean_squared_error(y, grid.predict(x))
                print(score)
                assert score < 2000
            else:
                scorer = metrics.make_scorer(metrics.accuracy_score,
                                             greater_is_better=True)
                grid = model_selection.GridSearchCV(clf,
                                                    param_grid,
                                                    scoring=scorer,
                                                    cv=2,
                                                    refit=True)
                grid.fit(x, y)

                score = metrics.accuracy_score(y, grid.predict(x))
                print(score)
                assert score > .9
Example #22
# Parameters
seed = 1337

np.random.seed(seed)  # for reproducibility

X, Y = datasets.make_classification(n_samples=1000,
                                    n_features=100,
                                    n_classes=2,
                                    random_state=seed)

# 'exec_path' is the path to lightgbm executable
gbm = GBMClassifier(exec_path="~/Documents/apps/LightGBM/lightgbm",
                    num_iterations=100,
                    learning_rate=0.075,
                    min_data_in_leaf=1,
                    bagging_freq=10,
                    metric='binary_error',
                    early_stopping_round=10)

param_grid = {
    'learning_rate': [0.1, 0.04],
    'min_data_in_leaf': [1, 10],
    'bagging_fraction': [0.5, 0.9]
}

scorer = metrics.make_scorer(metrics.accuracy_score, greater_is_better=True)
clf = model_selection.GridSearchCV(gbm, param_grid, scoring=scorer, cv=2)

clf.fit(X, Y)
Example #23
X_train, X_test, y_train, y_test = train_test_split(x_data,
                                                    y_data,
                                                    test_size=0.10,
                                                    random_state=_seed)

cl = GBMClassifier(
    exec_path=exec_path,
    boosting_type='gbdt',  # gbdt | dart | goss
    learning_rate=LEARNING_RATE,
    num_leaves=64,
    min_data_in_leaf=1,
    min_sum_hessian_in_leaf=1e-4,
    num_iterations=5000,
    num_threads=4,
    early_stopping_round=EARLY_STOPPING,
    drop_rate=0.0001,
    max_depth=6,
    lambda_l1=0.,
    lambda_l2=0.,
    max_bin=63,
    feature_fraction=1.0,
    #bagging_fraction=0.5,
    #bagging_freq=3,
    verbose=True)
cl.fit(X_train, y_train, test_data=[(X_test, y_test)])
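A minimal evaluation sketch after training (an addition; assumes sklearn's metrics and uses the best_round attribute shown in earlier examples):

from sklearn import metrics
y_pred = cl.predict(X_test)
print("Accuracy: ", metrics.accuracy_score(y_test, y_pred))
print("Best round: ", cl.best_round)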

#</editor-fold>

#<editor-fold desc="Submission generation">