Code Example #1
    def evaluate_lightgbm(params):
        print('new iteration', datetime.now().strftime('%H:%M'))

        model = GBMRegressor(num_threads=6,
                             num_iterations=5000,
                             verbose=False,
                             early_stopping_round=25,
                             bagging_seed=2016,
                             metric='l1',
                             learning_rate=0.01,
                             max_depth=int(params['max_depth']),
                             num_leaves=int(params['num_leaves']),
                             feature_fraction=params['feature_fraction'],
                             bagging_fraction=params['bagging_fraction'],
                             min_data_in_leaf=int(params['min_data_in_leaf']),
                             lambda_l1=params['lambda_l1'],
                             lambda_l2=params['lambda_l2'])

        model.fit(X_train.values,
                  target_transform(y_train.values),
                  test_data=[(X_val.values, target_transform(y_val.values))])
        best_iter = model.best_round
        y_pred = target_inverse_transform(model.predict(X_val))
        mae = mean_absolute_error(y_val, y_pred)

        return {'loss': mae, 'status': STATUS_OK, 'best_round': best_iter}
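The function above is a hyperopt-style objective: it returns a dict with 'loss' and STATUS_OK, so it can be handed straight to fmin. A minimal driver sketch, assuming hyperopt is available; the search-space bounds below are illustrative, not values from the original project:

from hyperopt import Trials, fmin, hp, tpe

space = {
    'max_depth': hp.quniform('max_depth', 4, 14, 1),
    'num_leaves': hp.quniform('num_leaves', 31, 512, 1),
    'feature_fraction': hp.uniform('feature_fraction', 0.3, 1.0),
    'bagging_fraction': hp.uniform('bagging_fraction', 0.5, 1.0),
    'min_data_in_leaf': hp.quniform('min_data_in_leaf', 1, 500, 1),
    'lambda_l1': hp.uniform('lambda_l1', 0.0, 10.0),
    'lambda_l2': hp.uniform('lambda_l2', 0.0, 10.0),
}

trials = Trials()  # records every evaluation, including 'best_round'
best = fmin(evaluate_lightgbm, space, algo=tpe.suggest,
            max_evals=50, trials=trials)
print(best)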
Code Example #2
class LightGBM(BaseAlgo):

    default_params = {'exec_path': 'lightgbm', 'num_threads': 4}

    def __init__(self, params):
        self.params = self.default_params.copy()

        for k in params:
            self.params[k] = params[k]

    def fit(self,
            X_train,
            y_train,
            X_eval=None,
            y_eval=None,
            seed=42,
            feature_names=None,
            eval_func=None,
            **kwa):
        params = self.params.copy()
        params['bagging_seed'] = seed
        params['feature_fraction_seed'] = seed + 3

        self.model = GBMRegressor(**params)

        if X_eval is None:
            self.model.fit(X_train, y_train)
        else:
            self.model.fit(X_train, y_train, test_data=[(X_eval, y_eval)])

    def predict(self, X):
        return self.model.predict(X)
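Hypothetical usage of this wrapper (BaseAlgo and the data splits are assumed to come from the surrounding project):

model = LightGBM({'exec_path': 'lightgbm', 'num_iterations': 2000,
                  'learning_rate': 0.05, 'metric': 'l1'})
model.fit(X_train, y_train, X_eval=X_val, y_eval=y_val, seed=2016)
preds = model.predict(X_val)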
Code Example #3
def lgbt_evaluate(num_leaves, min_data_in_leaf, feature_fraction,
                  bagging_fraction):
    lgbt = GBMRegressor(
        exec_path=os.path.expanduser('~/packages/LightGBM/lightgbm'
                                     ),  # Change this to your LightGBM path
        config='',
        application='regression',
        num_iterations=5000,
        learning_rate=0.01,
        num_leaves=int(round(num_leaves)),
        # tree_learner='serial',
        num_threads=4,
        min_data_in_leaf=int(round(min_data_in_leaf)),
        metric='l1',
        feature_fraction=max(feature_fraction, 0),
        feature_fraction_seed=2016,
        bagging_fraction=max(bagging_fraction, 0),
        bagging_freq=100,
        bagging_seed=2016,
        early_stopping_round=25,
        # metric_freq=1,
        verbose=False)

    # Pass the KFold object itself as cv; calling get_n_splits() here would
    # return a plain integer and silently drop shuffle and random_state.
    kf = KFold(n_splits=n_folds, shuffle=True, random_state=RANDOM_STATE)
    cv = cross_val_score(lgbt,
                         X_train,
                         y_train,
                         cv=kf,
                         scoring=make_scorer(evalerror))
    return -cv.mean()
    cv = cross_val_score(lgbt,
                         X_train,
                         y_train,
                         cv=kf,
                         scoring=make_scorer(evalerror))
    return -cv.mean()
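lgbt_evaluate returns the negated CV error, so it can be maximized directly. A possible driver using the bayes_opt package (an assumption; the optimizer setup is not part of the excerpt, and the bounds are illustrative):

from bayes_opt import BayesianOptimization

optimizer = BayesianOptimization(
    f=lgbt_evaluate,
    pbounds={'num_leaves': (31, 500),
             'min_data_in_leaf': (1, 300),
             'feature_fraction': (0.1, 1.0),
             'bagging_fraction': (0.5, 1.0)},
    random_state=RANDOM_STATE)
optimizer.maximize(init_points=5, n_iter=25)
print(optimizer.max)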
Code Example #4
    def test_sparse(self):

        params = {
            'exec_path': path_to_exec,
            'num_iterations': 1000,
            'verbose': False,
            'min_data_in_leaf': 1,
            'learning_rate': 0.1,
            'num_leaves': 5
        }

        clfs = [
            [sps.csr_matrix(X), Y, 'classification',
             GBMClassifier(**params)],
            [sps.csr_matrix(Xreg), Yreg, 'regression',
             GBMRegressor(**params)],
        ]

        for x, y, name, clf in clfs:
            clf.fit(x, y)

            if name == 'classification':
                score = metrics.accuracy_score(y, clf.predict(x))
                assert score > 0.9
            else:
                score = metrics.mean_squared_error(y, clf.predict(x))
                assert score < 1.
Code Example #5
def run():
    # Load data set
    X_train, Y_train, X_test, submission_file_content = load_data()
    Y_train = np.log(Y_train + 200)

    # Cross validation
    cross_validation_iterator = ShuffleSplit(n_splits=CROSS_VALIDATION_NUM, test_size=0.1, random_state=0)
    for cross_validation_index, (train_index, valid_index) in enumerate(cross_validation_iterator.split(X_train), start=1):
        print("Working on {}/{} ...".format(cross_validation_index, CROSS_VALIDATION_NUM))

        submission_file_path = os.path.join(SUBMISSION_FOLDER_PATH, "submission_{}.csv".format(cross_validation_index))

        if os.path.isfile(submission_file_path):
            continue

        model = GBMRegressor(
            learning_rate=0.01,
            num_iterations=NUM_ITERATIONS,
            num_leaves=200,
            min_data_in_leaf=10,
            feature_fraction=0.3,
            feature_fraction_seed=cross_validation_index,
            bagging_fraction=0.8,
            bagging_freq=10,
            bagging_seed=cross_validation_index,
            metric="l1",
            metric_freq=10,
            early_stopping_round=EARLY_STOPPING_ROUND,
            num_threads=-1)

        model.fit(X_train[train_index], Y_train[train_index], test_data=[(X_train[valid_index], Y_train[valid_index])])

        # Perform the testing procedure
        Y_test = model.predict(X_test)

        # Save submission to disk
        if not os.path.isdir(SUBMISSION_FOLDER_PATH):
            os.makedirs(SUBMISSION_FOLDER_PATH)
        submission_file_content[LABEL_COLUMN_NAME] = np.exp(Y_test) - 200
        submission_file_content.to_csv(submission_file_path, index=False)

    # Perform ensembling
    ensemble_predictions()

    print("All done!")
Code Example #6
    def test_early_stopping(self):

        cv_params = dict(test_size=test_size, random_state=seed)
        xtr, xte, ytr, yte = model_selection.train_test_split(
            X, Y, **cv_params)
        xtr_reg, xte_reg, ytr_reg, yte_reg = model_selection.train_test_split(
            Xreg, Yreg, **cv_params)

        params = dict(exec_path=path_to_exec,
                      num_iterations=10000,
                      min_data_in_leaf=3,
                      learning_rate=0.01,
                      num_leaves=2,
                      early_stopping_round=2)
        clfs = [
            [
                xtr_reg, ytr_reg, xte_reg, yte_reg, 'regression',
                GBMRegressor(boosting_type='gbdt', **params)
            ],
            [
                xtr_reg, ytr_reg, xte_reg, yte_reg, 'regression',
                GBMRegressor(boosting_type='dart', **params)
            ],
            [
                xtr, ytr, xte, yte, 'classification',
                GBMClassifier(boosting_type='gbdt', **params)
            ],
            [
                xtr, ytr, xte, yte, 'classification',
                GBMClassifier(boosting_type='dart', **params)
            ],
        ]

        for xtr, ytr, xte, yte, name, clf in clfs:
            clf.fit(xtr, ytr, test_data=[(xte, yte)])

            if name == 'regression':
                score = metrics.mean_squared_error(yte, clf.predict(xte))
                assert (score < 1.
                        and clf.best_round < clf.param['num_iterations'])
            else:
                score = metrics.accuracy_score(yte, clf.predict(xte))
                assert (score > 0.7
                        and clf.best_round < clf.param['num_iterations'])
Code Example #7
    def test_pickle(self):

        params = {'exec_path': path_to_exec, 'verbose': False}

        clfs = [
            [X, Y, GBMClassifier(**params)],
            [Xreg, Yreg, GBMRegressor(**params)],
        ]

        for x, y, clf in clfs:
            clf.fit(x, y)
            pickle.dump(clf, open("clf_gbm.pkl", "wb"))
            clf2 = pickle.load(open("clf_gbm.pkl", "rb"))
            assert np.allclose(clf.predict(x), clf2.predict(x))
Code Example #8
    def test_simple_fit(self):

        params = dict(exec_path=path_to_exec,
                      num_iterations=100,
                      min_data_in_leaf=1,
                      learning_rate=0.1,
                      num_leaves=5,
                      max_depth=10)
        clfs = [
            [
                Xreg, Yreg, 'regression',
                GBMRegressor(boosting_type='gbdt', **params)
            ],
            [
                Xreg, Yreg, 'regression',
                GBMRegressor(boosting_type='dart', **params)
            ],
            [
                X, Y, 'classification',
                GBMClassifier(boosting_type='gbdt', **params)
            ],
            [
                X, Y, 'classification',
                GBMClassifier(boosting_type='dart', **params)
            ],
        ]

        for x, y, name, clf in clfs:
            clf.fit(x, y, init_scores=np.zeros(x.shape[0]))

            if name == 'regression':
                score = metrics.mean_squared_error(y, clf.predict(x))
                assert score < 1.
            else:
                score = metrics.accuracy_score(y, clf.predict(x))
                assert score > 0.9
Code Example #9
    def evaluate_lightgbm(params):

        print('new iteration', datetime.now().strftime('%H:%M'))

        model = GBMRegressor(
            num_threads=8,
            num_iterations=5000,
            verbose=False,
            early_stopping_round=25,
            bagging_seed=2016,
            metric='l1',
            learning_rate=0.1,
            max_depth=12,
            num_leaves=int(params['num_leaves']),
            # num_leaves=127,
            # feature_fraction=params['feature_fraction'],
            # bagging_fraction=params['bagging_fraction'],
            feature_fraction=0.7,
            bagging_fraction=0.7,
            min_data_in_leaf=int(params['min_data_in_leaf']),
            max_bin=int(params['max_bin']),
            # lambda_l1=params['lambda_l1'],
            # lambda_l2=params['lambda_l2']
        )

        for train, val in cv.split(X):  # split() yields (train_indices, test_indices)
            X_train = X.iloc[train].values
            y_train = y.iloc[train].values
            X_val = X.iloc[val].values
            y_val = y.iloc[val].values

            model.fit(X_train,
                      target_transform(y_train),
                      test_data=[(X_val, target_transform(y_val))])
            best_iter = model.best_round
            y_pred = target_inverse_transform(model.predict(X_val))
            y_pred_train = target_inverse_transform(model.predict(X_train))
            mae = mean_absolute_error(y_val, y_pred)
            mae_train = mean_absolute_error(y_train, y_pred_train)
            break

        # best_iter /= float(n_folds)
        # mae /= n_folds
        # mae_train /= n_folds

        run_time = datetime.now() - start_time

        return {
            'loss': mae,
            'mae_train': mae_train,
            'status': STATUS_OK,
            'best_round': best_iter
        }
Code Example #10
def evaluate_lightgbm(params):
    def target_transform(y, mu=200):
        return np.log(y + mu)

    def target_inverse_transform(y_tr, mu=200):
        return np.exp(y_tr) - mu

    print('new iteration', datetime.now().strftime('%H:%M'))

    # Read and preprocess data

    df = pd.read_csv('/home/ledovsky/allstate/run_res/feat_train.csv')
    X = df.drop(['loss', 'id'], 1)
    y = df.loss

    X_train, X_val, y_train, y_val = train_test_split(X,
                                                      y,
                                                      test_size=0.2,
                                                      random_state=2016)

    model = GBMRegressor(num_threads=7,
                         num_iterations=5000,
                         verbose=False,
                         early_stopping_round=25,
                         bagging_seed=2016,
                         metric='l1',
                         learning_rate=0.1,
                         max_depth=int(params['max_depth']),
                         num_leaves=int(params['num_leaves']),
                         feature_fraction=params['feature_fraction'],
                         bagging_fraction=params['bagging_fraction'],
                         min_data_in_leaf=int(params['min_data_in_leaf']),
                         lambda_l1=params['lambda_l1'],
                         lambda_l2=params['lambda_l2'])

    model.fit(X_train.values,
              target_transform(y_train.values),
              test_data=[(X_val.values, target_transform(y_val.values))])
    best_iter = model.best_round
    y_pred = target_inverse_transform(model.predict(X_val))
    y_pred_train = target_inverse_transform(model.predict(X_train))
    mae = mean_absolute_error(y_val, y_pred)
    mae_train = mean_absolute_error(y_train, y_pred_train)

    return {
        'loss': mae,
        'mae_train': mae_train,
        'status': STATUS_OK,
        'best_round': best_iter
    }
Code Example #11
def get_oof():
    pred_oob = np.zeros(X_train.shape[0])
    pred_test = np.zeros(X_test.shape[0])

    for i, (train_index, test_index) in enumerate(kf.split(X_train)):
        print "Fold = ", i
        x_tr = X_train[train_index]
        y_tr = y_train[train_index]

        x_te = X_train[test_index]
        y_te = y_train[test_index]

        pred = np.zeros(x_te.shape[0])

        for j in range(nbags):
            x_tr, y_tr = shuffle(x_tr, y_tr, random_state=RANDOM_STATE + i + j)
            lgbt_params = {
                'exec_path':
                os.path.expanduser('~/packages/LightGBM/lightgbm'
                                   ),  # Change this to your LightGBM path
                'config': '',
                'application': 'regression',
                'num_iterations': 3000,
                'learning_rate': 0.01,
                'num_leaves': 213,
                'num_threads': 8,
                'min_data_in_leaf': 4,
                'metric': 'l1',
                'feature_fraction': 0.2933,
                'feature_fraction_seed': 2016 + i + j,
                'bagging_fraction': 0.9804,
                'bagging_freq': 100,
                'bagging_seed': 2016 + i + j,
                'early_stopping_round': 25,
                # metric_freq=1,
                'verbose': False
            }
            clf = GBMRegressor(**lgbt_params)
            clf.fit(x_tr, y_tr)

            pred += np.exp(clf.predict(x_te))
            pred_test += np.exp(clf.predict(X_test))

        pred /= nbags
        pred_oob[test_index] = pred
        score = mean_absolute_error(np.exp(y_te), pred)
        print('Fold ', i, '- MAE:', score)

    # pred_test accumulated nbags predictions on every fold; average so the
    # returned test prediction is on the same scale as pred_oob (assuming kf
    # is a scikit-learn KFold; the original excerpt returned the raw sum).
    pred_test /= nbags * kf.get_n_splits(X_train)

    return pred_oob, pred_test
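Typical stacking usage of get_oof (the names are assumptions; y_train here holds log-transformed losses, matching the np.exp calls above):

oof_train_lgbm, oof_test_lgbm = get_oof()
print('Full OOF MAE:', mean_absolute_error(np.exp(y_train), oof_train_lgbm))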
Code Example #12
def lgm_evaluate(num_leaves, min_data_in_leaf, feature_fraction,
                 bagging_fraction, bagging_freq):
    lgm = GBMRegressor(exec_path=exec_path,
                       application='regression',
                       num_iterations=10000,
                       tree_learner='serial',
                       early_stopping_round=50,
                       learning_rate=0.01,
                       num_leaves=round(num_leaves),
                       min_data_in_leaf=round(min_data_in_leaf),
                       feature_fraction=max(feature_fraction, 0),
                       bagging_fraction=max(bagging_fraction, 0),
                       bagging_freq=round(bagging_freq),
                       metric='l2',
                       bagging_seed=RANDOM_STATE,
                       metric_freq=1,
                       verbose=False)

    # Pass the KFold object itself as cv; calling get_n_splits() here would
    # return a plain integer and silently drop shuffle and random_state.
    kf = KFold(n_splits=n_folds, shuffle=True, random_state=RANDOM_STATE)

    return -cross_val_score(
        lgm, X_train, y_train, cv=kf, scoring=make_scorer(evalerror)).mean()
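evalerror is not defined in either excerpt that uses it. Given that these Allstate-style scripts train on log-shifted targets, a plausible definition is MAE on the original scale (the shift of 200 is an assumption borrowed from the other examples here; numpy and sklearn's mean_absolute_error are assumed in scope):

def evalerror(y_true, y_pred, shift=200):
    # undo log(y + shift) before scoring
    return mean_absolute_error(np.exp(y_true) - shift, np.exp(y_pred) - shift)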
Code Example #13
# Imports and the parameter values below are assumptions; the original
# snippet relied on definitions made elsewhere in the script.
import numpy as np
from pylightgbm.models import GBMRegressor
from sklearn import datasets, model_selection
from xgboost import XGBRegressor

seed = 1337
test_size = 0.2

# 'path_to_exec' is the path to lightgbm executable (lightgbm.exe on Windows)
path_to_exec = "~/Documents/apps/LightGBM/lightgbm"
# for reproducibility
np.random.seed(seed)

boston = datasets.load_boston(return_X_y=False)  # avoid shadowing the datasets module
X = boston['data']
Y = boston['target']
feature_names = boston['feature_names']

clf_xgb = XGBRegressor(max_depth=3, n_estimators=1000)
clf_gbm = GBMRegressor(exec_path=path_to_exec,
                       num_iterations=1000,
                       learning_rate=0.01,
                       num_leaves=255,
                       min_data_in_leaf=1,
                       early_stopping_round=20,
                       verbose=False)

x_train, x_test, y_train, y_test = model_selection.train_test_split(
    X, Y, test_size=test_size, random_state=seed)

# Training the two models
clf_gbm.fit(x_train, y_train, test_data=[(x_test, y_test)])
clf_xgb.fit(x_train,
            y_train,
            eval_set=[(x_test, y_test)],
            eval_metric='rmse',
            early_stopping_rounds=20,
            verbose=False)
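With both models trained against the same validation fold, a quick side-by-side check of held-out error (a small illustrative addition, using sklearn.metrics):

from sklearn import metrics

print("LightGBM MSE:", metrics.mean_squared_error(y_test, clf_gbm.predict(x_test)))
print("XGBoost MSE:", metrics.mean_squared_error(y_test, clf_xgb.predict(x_test)))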
Code Example #14
"""
@brief:
"""
import numpy as np
from sklearn import datasets, metrics, model_selection
from pylightgbm.models import GBMRegressor

# Parameters
seed = 1337
path_to_exec = "~/Documents/apps/LightGBM/lightgbm"

np.random.seed(seed)  # for reproducibility
X, y = datasets.load_diabetes(return_X_y=True)
x_train, x_test, y_train, y_test = model_selection.train_test_split(
    X, y, test_size=0.2, random_state=seed)

# 'exec_path' is the path to lightgbm executable
clf = GBMRegressor(exec_path=path_to_exec,
                   num_iterations=1000,
                   learning_rate=0.01,
                   num_leaves=10,
                   is_training_metric=True,
                   min_data_in_leaf=10,
                   is_unbalance=True,
                   early_stopping_round=10,
                   verbose=True)

clf.fit(x_train, y_train, test_data=[(x_test, y_test)])
y_pred = clf.predict(x_test)
print("Mean Square Error: ", metrics.mean_squared_error(y_test, y_pred))
print("Best round: ", clf.best_round)
Code Example #15
def get_model_obj(modelType, n_clusters=None, **kwargs):
    if modelType == 'knn':
        from sklearn.neighbors import KNeighborsClassifier
        # 6 seems to give the best trade-off between accuracy and precision
        knn = KNeighborsClassifier(n_neighbors=6, **kwargs)
        return knn
    elif modelType == 'gaussianNB':
        from sklearn.naive_bayes import GaussianNB
        gnb = GaussianNB(**kwargs)
        return gnb

    elif modelType == 'multinomialNB':
        from sklearn.naive_bayes import MultinomialNB
        # TODO: figure out how to configure binomial distribution
        mnb = MultinomialNB(**kwargs)
        return mnb

    elif modelType == 'bernoulliNB':
        from sklearn.naive_bayes import BernoulliNB
        bnb = BernoulliNB(**kwargs)
        return bnb

    elif modelType == 'randomForest':
        from sklearn.ensemble import RandomForestClassifier
        rfc = RandomForestClassifier(random_state=234, **kwargs)
        return rfc

    elif modelType == 'svm':
        from sklearn.svm import SVC
        svc = SVC(random_state=0, probability=True, **kwargs)
        return svc

    elif modelType == 'LinearRegression':
        #assert column, "Column name required for building a linear model"
        #assert dataframe[column].shape == target.shape
        from sklearn import linear_model
        l_reg = linear_model.LinearRegression(**kwargs)
        return l_reg

    elif modelType == 'RidgeRegression':
        from sklearn.linear_model import Ridge
        if not kwargs:
            kwargs = {'alpha': 0.5}
        ridge_reg = Ridge(**kwargs)
        return ridge_reg

    elif modelType == 'RidgeRegressionCV':
        from sklearn import linear_model
        if not kwargs:
            kwargs = {'alphas': [0.1, 1.0, 10.0]}
        ridge_cv_reg = linear_model.RidgeCV(**kwargs)
        return ridge_cv_reg

    elif modelType == 'LassoRegression':
        from sklearn import linear_model
        if not kwargs:
            kwargs = {'alpha': 0.1}
        lasso_reg = linear_model.Lasso(**kwargs)
        return lasso_reg

    elif modelType == 'ElasticNetRegression':
        from sklearn.metrics import r2_score
        from sklearn import linear_model
        if not kwargs:
            kwargs = {'alpha': 0.1, 'l1_ratio': 0.7}
        enet_reg = linear_model.ElasticNet(**kwargs)
        return enet_reg

    elif modelType == 'LogisticRegression':
        from sklearn.linear_model import LogisticRegression
        log_reg = LogisticRegression(random_state=123, **kwargs)
        return log_reg

    elif modelType == 'RANSACRegression':
        from sklearn.linear_model import LinearRegression, RANSACRegressor
        ransac_model = RANSACRegressor(LinearRegression())
        return ransac_model

    elif modelType == 'kde':
        from sklearn.neighbors import KernelDensity  # sklearn.neighbors.kde is a removed import path
        kde = KernelDensity(kernel='gaussian', bandwidth=0.2, **kwargs)
        return kde

    elif modelType == 'AR':
        import statsmodels.api as sm
        # fit an AR model and forecast
        ar_fitted = sm.tsa.AR(dataframe).fit(maxlag=9,
                                             method='mle',
                                             disp=-1,
                                             **kwargs)
        #ts_forecast = ar_fitted.predict(start='2008', end='2050')
        return ar_fitted

    elif modelType == 'SARIMAX':
        import statsmodels.api as sm
        mod = sm.tsa.statespace.SARIMAX(df.riders,
                                        trend='n',
                                        order=(0, 1, 0),
                                        seasonal_order=(1, 1, 1, 12),
                                        **kwargs)
        return mod

    elif modelType == 'sgd':
        # Online classifiers http://scikit-learn.org/stable/auto_examples/linear_model/plot_sgd_comparison.html
        from sklearn.linear_model import SGDClassifier
        sgd = SGDClassifier(**kwargs)
        return sgd

    elif modelType == 'perceptron':
        from sklearn.linear_model import Perceptron
        perceptron = Perceptron(**kwargs)
        return perceptron

    elif modelType == 'xgboost':
        import xgboost as xgb
        xgbm = xgb.XGBClassifier(**kwargs)
        return xgbm

    elif modelType == 'baseNN':
        from keras.models import Sequential
        from keras.layers import Dense
        # create model
        model = Sequential()
        # layer and compile configuration are expected in kwargs
        assert kwargs.get('inputParams', None)
        assert kwargs.get('outputParams', None)
        model.add(Dense(**kwargs['inputParams']))
        model.add(Dense(**kwargs['outputParams']))
        if kwargs.get('compileParams'):
            # Compile model, e.g. loss='categorical_crossentropy',
            # optimizer='adam', metrics=['accuracy']
            model.compile(**kwargs['compileParams'])
        return model

    elif modelType == 'lightGBMRegression':
        from pylightgbm.models import GBMRegressor
        lgbm_lreg = GBMRegressor(num_iterations=100,
                                 early_stopping_round=10,
                                 num_leaves=10,
                                 min_data_in_leaf=10)
        return lgbm_lreg

    elif modelType == 'lightGBMBinaryClass':
        from pylightgbm.models import GBMClassifier
        lgbm_bc = GBMClassifier(metric='binary_error', min_data_in_leaf=1)
        return lgbm_bc

    # Clustering models
    elif modelType == 'KMeans':
        from sklearn.cluster import KMeans
        assert n_clusters, "Number of clusters argument mandatory"
        cluster_callable = KMeans
        # seed of 10 for reproducibility.
        clusterer = cluster_callable(n_clusters=n_clusters, random_state=10)
        return clusterer

    elif modelType == 'dbscan':
        from sklearn.cluster import DBSCAN
        if not n_clusters:
            logging.warning(
                "Number of clusters irrelevant for cluster type : %s" %
                (modelType))
        cluster_callable = DBSCAN
        clusterer = cluster_callable(eps=0.5)
        return clusterer

    elif modelType == 'affinity_prop':
        from sklearn.cluster import AffinityPropagation
        if not n_clusters:
            logging.warning(
                "Number of clusters irrelevant for cluster type : %s" %
                (modelType))
        clusterer = AffinityPropagation(damping=.9, preference=-200)
        return clusterer

    elif modelType == 'spectral':
        from sklearn.cluster import SpectralClustering
        assert n_clusters, "Number of clusters argument mandatory"
        clusterer = SpectralClustering(n_clusters=n_clusters,
                                       eigen_solver='arpack',
                                       affinity="nearest_neighbors")
        return clusterer

    elif modelType == 'birch':
        from sklearn.cluster import Birch
        if not n_clusters:
            logging.warning(
                "Number of clusters irrelevant for cluster type : %s" %
                (modelType))
        clusterer = Birch(n_clusters=2)
        return clusterer

    elif modelType == 'agglomerativeCluster':
        from sklearn.cluster import AgglomerativeClustering
        from sklearn.neighbors import kneighbors_graph
        # connectivity matrix for structured Ward
        connectivity = kneighbors_graph(dataframe,
                                        n_neighbors=10,
                                        include_self=False)
        # make connectivity symmetric
        connectivity = 0.5 * (connectivity + connectivity.T)
        clusterer = AgglomerativeClustering(n_clusters=n_clusters,
                                            linkage='ward',
                                            connectivity=connectivity)
        return clusterer

    elif modelType == 'meanShift':
        from sklearn import cluster
        # estimate bandwidth for mean shift
        bandwidth = cluster.estimate_bandwidth(dataframe, quantile=0.3)
        clusterer = cluster.MeanShift(bandwidth=bandwidth, bin_seeding=True)
        return clusterer

    elif modelType == 'gmm':
        from sklearn import mixture
        gmm = mixture.GaussianMixture(n_components=5, covariance_type='full')
        return gmm

    elif modelType == 'dgmm':
        from sklearn import mixture
        dgmm = mixture.BayesianGaussianMixture(n_components=5,
                                               covariance_type='full')
        return dgmm

    else:
        # raising a bare string is a TypeError in Python; raise an exception
        raise ValueError('Unknown model type: see utils.py for available')
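Hypothetical calls into the factory above (extra keyword arguments are forwarded to the underlying constructors):

rf = get_model_obj('randomForest', n_estimators=200)
lgbm_reg = get_model_obj('lightGBMRegression')
km = get_model_obj('KMeans', n_clusters=8)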
Code Example #16
File: regression.py  Project: zhouyonglong/pyLightGBM
# -*- coding: utf-8 -*-
"""
@author: Ardalan MEHRANI <*****@*****.**>
@brief:
"""
import numpy as np
from sklearn import datasets, metrics, model_selection
from pylightgbm.models import GBMRegressor

# Parameters
seed = 1337

np.random.seed(seed)  # for reproducibility
X, y = datasets.load_diabetes(return_X_y=True)
x_train, x_test, y_train, y_test = model_selection.train_test_split(
    X, y, test_size=0.2, random_state=seed)

# 'exec_path' is the path to lightgbm executable
clf = GBMRegressor(exec_path="~/Documents/apps/LightGBM/lightgbm",
                   num_iterations=10000,
                   learning_rate=0.01,
                   num_leaves=10,
                   min_data_in_leaf=10,
                   early_stopping_round=10)

clf.fit(x_train, y_train, test_data=[(x_test, y_test)])
y_pred = clf.predict(x_test)

print("Mean Square Error: ", metrics.mean_squared_error(y_test, y_pred))
Code Example #17
    X_train = X.iloc[train].values
    y_train = y.iloc[train].values
    X_val = X.iloc[val].values
    y_val = y.iloc[val].values

    model = GBMRegressor(
        num_threads=8,
        num_iterations=5000,
        verbose=False,
        early_stopping_round=25,
        bagging_seed=2016,
        metric='l1',
        learning_rate=0.05,
        max_depth=12,
        num_leaves=450,
        # num_leaves=127,
        # feature_fraction=params['feature_fraction'],
        # bagging_fraction=params['bagging_fraction'],
        feature_fraction=0.7,
        bagging_fraction=0.7,
        min_data_in_leaf=450,
        max_bin=256,
        # lambda_l1=params['lambda_l1'],
        # lambda_l2=params['lambda_l2']
    )

    model.fit(X_train,
              target_transform(y_train),
              test_data=[(X_val, target_transform(y_val))])

    y_oob[val] = target_inverse_transform(model.predict(X_val))
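target_transform and target_inverse_transform are not defined in this fragment; they presumably match the log-shift helpers defined in Code Example #10 above:

def target_transform(y, mu=200):
    return np.log(y + mu)

def target_inverse_transform(y_tr, mu=200):
    return np.exp(y_tr) - mu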
Code Example #18
    def test_grid_search(self):

        param_grid = {
            'learning_rate': [0.01, 0.1, 1],
            'num_leaves': [2, 5, 50],
            'min_data_in_leaf': [1, 10, 100],
            'bagging_fraction': [0.1, 1]
        }

        params = {
            'exec_path': path_to_exec,
            'num_threads': 2,
            'num_iterations': 100,
            'learning_rate': 0.1,
            'min_data_in_leaf': 1,
            'num_leaves': 10,
            'bagging_freq': 2,
            'verbose': False
        }

        clfs = [
            [
                Xreg, Yreg, 'regression',
                GBMRegressor(boosting_type='gbdt', metric='l2', **params)
            ],
            [
                Xreg, Yreg, 'regression',
                GBMRegressor(boosting_type='dart', metric='l2', **params)
            ],
            [
                X, Y, 'classification',
                GBMClassifier(boosting_type='gbdt',
                              metric='binary_logloss',
                              **params)
            ],
            [
                X, Y, 'classification',
                GBMClassifier(boosting_type='dart',
                              metric='binary_logloss',
                              **params)
            ],
        ]

        for x, y, name, clf in clfs:

            if name == 'regression':
                scorer = metrics.make_scorer(metrics.mean_squared_error,
                                             greater_is_better=False)
                grid = model_selection.GridSearchCV(clf,
                                                    param_grid,
                                                    scoring=scorer,
                                                    cv=2,
                                                    refit=True)
                grid.fit(x, y)

                score = metrics.mean_squared_error(y, grid.predict(x))
                print(score)
                assert score < 2000
            else:
                scorer = metrics.make_scorer(metrics.accuracy_score,
                                             greater_is_better=True)
                grid = model_selection.GridSearchCV(clf,
                                                    param_grid,
                                                    scoring=scorer,
                                                    cv=2,
                                                    refit=True)
                grid.fit(x, y)

                score = metrics.accuracy_score(y, grid.predict(x))
                print(score)
                assert score > .9
Code Example #19
import pandas as pd
from pylightgbm.models import GBMRegressor
# sklearn.cross_validation was removed; use model_selection instead
from sklearn.model_selection import train_test_split

data_loc = "C:/Users/rsoni106/Documents/Work/Methodology Work/Kaggle/Completed/Allstate/prepared_data/new_data"

train_data = pd.read_csv(data_loc+"/train_data_py.csv")

event = pd.read_csv(data_loc+"/event_py.csv")

test_data = pd.read_csv(data_loc+"/test_data_py.csv")

vars_model = [x for x in train_data.columns if x not in ("id", "logloss")]  # list of feature names

# exec_path must point to your LightGBM executable; the value was missing in the original
gbm = GBMRegressor(num_iterations=int(2558/0.9), learning_rate=0.01, num_leaves=200, min_data_in_leaf=8,
                   feature_fraction=0.3, bagging_fraction=0.8, bagging_freq=100, verbose=True, application='regression',
                   metric='l2', num_threads=2, exec_path="path/to/lightgbm")

gbm.fit(train_data,event)




from heapq import nlargest
from operator import itemgetter

temp = 0

for k, v in val.items():
    if temp == 10:
        break
    print(k, v)
    temp += 1  # without this increment the break above never fires
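The heapq/itemgetter imports above are never used; the intended top-10-by-value loop was presumably something like this (assuming val maps keys to numeric scores):

for k, v in nlargest(10, val.items(), key=itemgetter(1)):
    print(k, v)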
Code Example #20
    y_tr = y[inTr]
    x_val = xtrain[inTe]
    y_val = y[inTe]
    pred = np.zeros(x_val.shape[0])

    for j in range(nbags):
        print('Bag: ' + str(j))
        rand_seed = random.randint(1, 5000)
        gbmr = GBMRegressor(
            exec_path=path_to_exec,  # Change this to your LightGBM path
            config='',
            application='regression',
            num_iterations=int(2558 / 0.9),
            learning_rate=0.01,
            num_leaves=200,
            num_threads=4,
            min_data_in_leaf=8,
            metric='l1',
            feature_fraction=0.3,
            feature_fraction_seed=rand_seed,
            bagging_fraction=0.8,
            bagging_freq=100,
            bagging_seed=rand_seed,
            verbose=False)
        # Train
        gbmr.fit(x_tr, y_tr, test_data=[(x_val, y_val)])

        # Apply to validation and test data
        print 'Bag: ' + str(j) + " Predicting..."
        pred += np.exp((gbmr.predict(x_val))) - shift
        pred_test += np.exp((gbmr.predict(xtest))) - shift
Code Example #21
import numpy as np
from pylightgbm.models import GBMRegressor
from sklearn import datasets, metrics, model_selection

# Parameters
seed = 1337
path_to_exec = "~/Documents/apps/LightGBM/lightgbm"
np.random.seed(seed)  # for reproducibility

X, Y = datasets.load_diabetes(return_X_y=True)

# 'exec_path' is the path to lightgbm executable
gbm = GBMRegressor(exec_path=path_to_exec,
                   num_iterations=100,
                   learning_rate=0.1,
                   min_data_in_leaf=1,
                   bagging_freq=10,
                   metric='l2',  # regression metric; the original's 'binary_error' is for classification
                   early_stopping_round=10,
                   verbose=False)

param_grid = {
    'learning_rate': [0.1, 0.04],
    'min_data_in_leaf': [1, 10],
    'bagging_fraction': [0.5, 0.9]
}

scorer = metrics.make_scorer(metrics.mean_squared_error, greater_is_better=False)
clf = model_selection.GridSearchCV(gbm, param_grid, scoring=scorer, cv=2)

clf.fit(X, Y)
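After fitting, the standard scikit-learn attributes expose the search results:

print("Best params:", clf.best_params_)
print("Best CV score (negated MSE):", clf.best_score_)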
Code Example #22
import numpy as np
from pylightgbm.models import GBMRegressor
from sklearn import datasets, metrics, model_selection

# Parameters
seed = 1337

np.random.seed(seed)  # for reproducibility

X, Y = datasets.load_diabetes(return_X_y=True)

# 'exec_path' is the path to lightgbm executable
gbm = GBMRegressor(exec_path="~/Documents/apps/LightGBM/lightgbm",
                   num_iterations=100,
                   learning_rate=0.1,
                   min_data_in_leaf=1,
                   bagging_freq=10,
                   metric='l2',  # regression metric; the original's 'binary_error' is for classification
                   early_stopping_round=10)

param_grid = {
    'learning_rate': [0.1, 0.04],
    'min_data_in_leaf': [1, 10],
    'bagging_fraction': [0.5, 0.9]
}

scorer = metrics.make_scorer(metrics.mean_squared_error,
                             greater_is_better=False)
clf = model_selection.GridSearchCV(gbm, param_grid, scoring=scorer, cv=2)

clf.fit(X, Y)
Code Example #23
    validate_features.fillna(0, inplace=True)
    predict_features.fillna(0, inplace=True)

    create_feature_map(train_features.columns.tolist(), '{0}_lgbm_{1}{2}'.format(model_path, exec_time, model_fmap_file))

    print('LightGBM Training')
    seed = 13
    gbmr = GBMRegressor(
        exec_path='/usr/local/lib/python2.7/site-packages/pylightgbm/LightGBM/lightgbm',
        config='',
        application='regression',
        num_iterations=500,
        learning_rate=0.1,
        tree_learner='serial',
        min_data_in_leaf=10,
        metric='auc',
        feature_fraction=0.7,
        feature_fraction_seed=seed,
        bagging_fraction=1,
        bagging_freq=10,
        bagging_seed=seed,
        metric_freq=1,
        early_stopping_round=50
    )
    # json.dump needs a text-mode handle ('wb+' raises TypeError in Python 3)
    json.dump(gbmr.param, open('{0}_lgbm_{1}{2}'.format(model_path, exec_time, model_params), 'w'))
    gbmr.fit(validate_features.values, validate_labels.values[:, 0], test_data=[(train_features.values, train_labels.values[:, 0])])

    importance = dict(gbmr.feature_importance(train_features.columns.tolist()))
    importance = sorted(importance.items(), key=operator.itemgetter(1))
    df = pd.DataFrame(gbmr.feature_importance(train_features.columns.tolist()), columns=['feature', 'importance'])
    df['importance'] = df['importance'] / df['importance'].sum()
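With the importance column normalized, the top features can be printed directly (a small illustrative addition using standard pandas):

print(df.sort_values('importance', ascending=False).head(10))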
Code Example #24
    create_feature_map(
        train_features.columns.tolist(),
        '{0}_lgbm_{1}{2}'.format(model_path, exec_time, model_fmap_file))

    print('LightGBM Training')
    seed = 13
    gbmr = GBMRegressor(
        exec_path=
        '/usr/local/lib/python2.7/site-packages/pylightgbm/LightGBM/lightgbm',
        config='',
        application='regression',
        num_iterations=500,
        learning_rate=0.1,
        tree_learner='serial',
        min_data_in_leaf=10,
        metric='auc',
        feature_fraction=0.7,
        feature_fraction_seed=seed,
        bagging_fraction=1,
        bagging_freq=10,
        bagging_seed=seed,
        metric_freq=1,
        early_stopping_round=50)
    # json.dump needs a text-mode handle ('wb+' raises TypeError in Python 3)
    json.dump(
        gbmr.param,
        open('{0}_lgbm_{1}{2}'.format(model_path, exec_time, model_params),
             'w'))
    gbmr.fit(validate_features.values,
             validate_labels.values[:, 0],
             test_data=[(train_features.values, train_labels.values[:, 0])])
Code Example #25
oof_train = np.zeros(tr_rows)

seed = 42

nbest=10000
gbmr = GBMRegressor(
    exec_path='your_LightGBM_exec_path',
    config='',
    application='regression',
    num_iterations=nbest,
    learning_rate=0.002, #0.03, 0.002
    num_leaves=200,  #180
    tree_learner='serial',
    num_threads=48,
    min_data_in_leaf=130, #125
    metric='l1',
    feature_fraction=0.27, #0.75,0.3
    feature_fraction_seed=seed,
    bagging_fraction=0.9, #0.9
    bagging_freq=5, #5
    bagging_seed=seed,
    metric_freq=50,
    verbose=0,
    #min_hessian= 5,
    max_bin=850, #850
    early_stopping_round=50 #40
)

best=[]
score=[]

# model_selection.KFold takes n_splits; the old cross_validation.KFold(n,
# n_folds=...) signature used here no longer exists
kf = KFold(n_splits=kfolds, shuffle=True, random_state=123)
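The excerpt ends before the fold loop; a sketch of how it would typically continue with these variables (X_train/y_train stand in for whatever arrays the script actually uses, and mean_absolute_error is assumed imported):

for tr_idx, val_idx in kf.split(X_train):
    gbmr.fit(X_train[tr_idx], y_train[tr_idx],
             test_data=[(X_train[val_idx], y_train[val_idx])])
    oof_train[val_idx] = gbmr.predict(X_train[val_idx])
    best.append(gbmr.best_round)
    score.append(mean_absolute_error(y_train[val_idx], oof_train[val_idx]))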