def evaluate_lightgbm(params):
        print('new iteration', datetime.now().strftime('%H:%M'))

        model = GBMRegressor(num_threads=8,
                             num_iterations=5000,
                             verbose=False,
                             early_stopping_round=25,
                             bagging_seed=2016,
                             metric='l1',
                             learning_rate=0.05,
                             max_depth=int(params['max_depth']),
                             num_leaves=int(params['num_leaves']),
                             feature_fraction=params['feature_fraction'],
                             bagging_fraction=params['bagging_fraction'],
                             min_data_in_leaf=int(params['min_data_in_leaf']),
                             lambda_l1=params['lambda_l1'],
                             lambda_l2=params['lambda_l2'])

        model.fit(X_train.values,
                  target_transform(y_train.values),
                  test_data=[(X_val.values, target_transform(y_val.values))])
        best_iter = model.best_round
        y_pred = target_inverse_transform(model.predict(X_val))
        y_pred_train = target_inverse_transform(model.predict(X_train))
        mae = mean_absolute_error(y_val, y_pred)
        mae_train = mean_absolute_error(y_train, y_pred_train)

        return {
            'loss': mae,
            'mae_train': mae_train,
            'status': STATUS_OK,
            'best_round': best_iter
        }
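
The dict returned above ('loss', 'status', plus extra keys) follows the contract of hyperopt's fmin, which these snippets appear to use (STATUS_OK is a hyperopt constant). A minimal sketch of the wiring; the search-space bounds below are illustrative assumptions, not the author's:

from hyperopt import fmin, hp, tpe, Trials

# Illustrative search space; hp.quniform returns floats, hence the int() casts
# inside evaluate_lightgbm.
space = {
    'max_depth': hp.quniform('max_depth', 4, 13, 1),
    'num_leaves': hp.quniform('num_leaves', 31, 255, 1),
    'feature_fraction': hp.uniform('feature_fraction', 0.3, 1.0),
    'bagging_fraction': hp.uniform('bagging_fraction', 0.5, 1.0),
    'min_data_in_leaf': hp.quniform('min_data_in_leaf', 1, 500, 1),
    'lambda_l1': hp.uniform('lambda_l1', 0.0, 5.0),
    'lambda_l2': hp.uniform('lambda_l2', 0.0, 5.0),
}

trials = Trials()
best = fmin(fn=evaluate_lightgbm, space=space, algo=tpe.suggest,
            max_evals=50, trials=trials)
print(best)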
Example 2
    def evaluate_lightgbm(params):

        print('new iteration', datetime.now().strftime('%H:%M'))

        model = GBMRegressor(
            num_threads=8,
            num_iterations=5000,
            verbose=False,
            early_stopping_round=25,
            bagging_seed=2016,
            metric='l1',
            learning_rate=0.1,
            max_depth=12,
            num_leaves=int(params['num_leaves']),
            # num_leaves=127,
            # feature_fraction=params['feature_fraction'],
            # bagging_fraction=params['bagging_fraction'],
            feature_fraction=0.7,
            bagging_fraction=0.7,
            min_data_in_leaf=int(params['min_data_in_leaf']),
            max_bin=int(params['max_bin']),
            # lambda_l1=params['lambda_l1'],
            # lambda_l2=params['lambda_l2']
        )

        for train, val in cv.split(X):  # KFold yields (train_indices, val_indices)
            X_train = X.iloc[train].values
            y_train = y.iloc[train].values
            X_val = X.iloc[val].values
            y_val = y.iloc[val].values

            model.fit(X_train,
                      target_transform(y_train),
                      test_data=[(X_val, target_transform(y_val))])
            best_iter = model.best_round
            y_pred = target_inverse_transform(model.predict(X_val))
            y_pred_train = target_inverse_transform(model.predict(X_train))
            mae = mean_absolute_error(y_val, y_pred)
            mae_train = mean_absolute_error(y_train, y_pred_train)
            break  # only the first fold is evaluated

        # best_iter /= float(n_folds)
        # mae /= n_folds
        # mae_train /= n_folds

        run_time = datetime.now() - start_time  # start_time is defined outside this snippet

        return {
            'loss': mae,
            'mae_train': mae_train,
            'status': STATUS_OK,
            'best_round': best_iter
        }
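
The commented-out lines above hint at a variant that averages across all folds instead of breaking after the first. A hypothetical sketch of that variant, assuming cv = KFold(n_splits=n_folds) as in the snippet:

# Hypothetical full-CV variant of the loop above.
mae = mae_train = best_iter = 0.0
for train, val in cv.split(X):
    model.fit(X.iloc[train].values,
              target_transform(y.iloc[train].values),
              test_data=[(X.iloc[val].values, target_transform(y.iloc[val].values))])
    best_iter += model.best_round
    y_pred = target_inverse_transform(model.predict(X.iloc[val].values))
    y_pred_train = target_inverse_transform(model.predict(X.iloc[train].values))
    mae += mean_absolute_error(y.iloc[val].values, y_pred)
    mae_train += mean_absolute_error(y.iloc[train].values, y_pred_train)
mae /= n_folds
mae_train /= n_folds
best_iter /= float(n_folds)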
def evaluate_lightgbm(params):
    def target_transform(y, mu=200):
        return np.log(y + mu)

    def target_inverse_transform(y_tr, mu=200):
        return np.exp(y_tr) - mu

    print('new iteration', datetime.now().strftime('%H:%M'))

    # Read and preprocess data

    df = pd.read_csv('/home/ledovsky/allstate/run_res/feat_train.csv')
    X = df.drop(['loss', 'id'], axis=1)
    y = df.loss

    X_train, X_val, y_train, y_val = train_test_split(X,
                                                      y,
                                                      test_size=0.2,
                                                      random_state=2016)

    model = GBMRegressor(num_threads=7,
                         num_iterations=5000,
                         verbose=False,
                         early_stopping_round=25,
                         bagging_seed=2016,
                         metric='l1',
                         learning_rate=0.1,
                         max_depth=int(params['max_depth']),
                         num_leaves=int(params['num_leaves']),
                         feature_fraction=params['feature_fraction'],
                         bagging_fraction=params['bagging_fraction'],
                         min_data_in_leaf=int(params['min_data_in_leaf']),
                         lambda_l1=params['lambda_l1'],
                         lambda_l2=params['lambda_l2'])

    model.fit(X_train.values,
              target_transform(y_train.values),
              test_data=[(X_val.values, target_transform(y_val.values))])
    best_iter = model.best_round
    y_pred = target_inverse_transform(model.predict(X_val))
    y_pred_train = target_inverse_transform(model.predict(X_train))
    mae = mean_absolute_error(y_val, y_pred)
    mae_train = mean_absolute_error(y_train, y_pred_train)

    return {
        'loss': mae,
        'mae_train': mae_train,
        'status': STATUS_OK,
        'best_round': best_iter
    }
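
The log-shift transform log(y + mu) compresses the long right tail of the loss target so the l1 objective behaves better, and target_inverse_transform undoes it exactly. A quick illustrative round-trip check:

import numpy as np

y_check = np.array([1.0, 10.0, 1000.0, 50000.0])
assert np.allclose(target_inverse_transform(target_transform(y_check)), y_check)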
def get_oof():
    pred_oob = np.zeros(X_train.shape[0])
    pred_test = np.zeros(X_test.shape[0])

    for i, (train_index, test_index) in enumerate(kf.split(X_train)):
        print "Fold = ", i
        x_tr = X_train[train_index]
        y_tr = y_train[train_index]

        x_te = X_train[test_index]
        y_te = y_train[test_index]

        pred = np.zeros(x_te.shape[0])

        for j in range(nbags):
            x_tr, y_tr = shuffle(x_tr, y_tr, random_state=RANDOM_STATE + i + j)
            lgbt_params = {
                # Change this to your LightGBM executable path
                'exec_path': os.path.expanduser('~/packages/LightGBM/lightgbm'),
                'config': '',
                'application': 'regression',
                'num_iterations': 3000,
                'learning_rate': 0.01,
                'num_leaves': 213,
                'num_threads': 8,
                'min_data_in_leaf': 4,
                'metric': 'l1',
                'feature_fraction': 0.2933,
                'feature_fraction_seed': 2016 + i + j,
                'bagging_fraction': 0.9804,
                'bagging_freq': 100,
                'bagging_seed': 2016 + i + j,
                'early_stopping_round': 25,
                # 'metric_freq': 1,
                'verbose': False
            }
            clf = GBMRegressor(**lgbt_params)
            clf.fit(x_tr, y_tr)

            pred += np.exp(clf.predict(x_te))
            pred_test += np.exp(clf.predict(X_test))

        pred /= nbags
        pred_oob[test_index] = pred
        score = mean_absolute_error(np.exp(y_te), pred)
        print('Fold ', i, '- MAE:', score)

    return pred_oob, pred_test
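
Note that get_oof accumulates nbags predictions per fold into pred_test without dividing, so the caller is expected to average afterwards (a later snippet does exactly this with pred_test /= nfolds * nbags). A usage sketch, assuming kf and nbags are the splitter and bag count defined alongside the function:

pred_oob, pred_test = get_oof()
pred_test /= kf.get_n_splits() * nbags  # average the accumulated test predictions
print('OOF MAE:', mean_absolute_error(np.exp(y_train), pred_oob))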
class LightGBM(BaseAlgo):

    default_params = {'exec_path': 'lightgbm', 'num_threads': 4}

    def __init__(self, params):
        self.params = self.default_params.copy()

        for k in params:
            self.params[k] = params[k]

    def fit(self,
            X_train,
            y_train,
            X_eval=None,
            y_eval=None,
            seed=42,
            feature_names=None,
            eval_func=None,
            **kwa):
        params = self.params.copy()
        params['bagging_seed'] = seed
        params['feature_fraction_seed'] = seed + 3

        self.model = GBMRegressor(**params)

        if X_eval is None:
            self.model.fit(X_train, y_train)
        else:
            self.model.fit(X_train, y_train, test_data=[(X_eval, y_eval)])

    def predict(self, X):
        return self.model.predict(X)
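
The wrapper merges user params over default_params and injects the seeds at fit time. A minimal usage sketch; the exec_path below is an assumed local LightGBM binary location, not something the class mandates:

import os

params = {
    'exec_path': os.path.expanduser('~/packages/LightGBM/lightgbm'),  # assumed binary path
    'application': 'regression',
    'num_iterations': 1000,
    'learning_rate': 0.05,
    'metric': 'l1',
    'early_stopping_round': 25,
}
model = LightGBM(params)
model.fit(X_train, y_train, X_eval=X_val, y_eval=y_val, seed=2016)
y_pred = model.predict(X_val)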
def run():
    # Load data set
    X_train, Y_train, X_test, submission_file_content = load_data()
    Y_train = np.log(Y_train + 200)

    # Cross validation
    cross_validation_iterator = ShuffleSplit(n_splits=CROSS_VALIDATION_NUM, test_size=0.1, random_state=0)
    for cross_validation_index, (train_index, valid_index) in enumerate(cross_validation_iterator.split(X_train), start=1):
        print("Working on {}/{} ...".format(cross_validation_index, CROSS_VALIDATION_NUM))

        submission_file_path = os.path.join(SUBMISSION_FOLDER_PATH, "submission_{}.csv".format(cross_validation_index))

        if os.path.isfile(submission_file_path):
            continue

        model = GBMRegressor(
            learning_rate=0.01,
            num_iterations=NUM_ITERATIONS,
            num_leaves=200,
            min_data_in_leaf=10,
            feature_fraction=0.3,
            feature_fraction_seed=cross_validation_index,
            bagging_fraction=0.8,
            bagging_freq=10,
            bagging_seed=cross_validation_index,
            metric="l1",
            metric_freq=10,
            early_stopping_round=EARLY_STOPPING_ROUND,
            num_threads=-1)

        model.fit(X_train[train_index], Y_train[train_index], test_data=[(X_train[valid_index], Y_train[valid_index])])

        # Perform the testing procedure
        Y_test = model.predict(X_test)

        # Save submission to disk
        if not os.path.isdir(SUBMISSION_FOLDER_PATH):
            os.makedirs(SUBMISSION_FOLDER_PATH)
        submission_file_content[LABEL_COLUMN_NAME] = np.exp(Y_test) - 200
        submission_file_content.to_csv(submission_file_path, index=False)

    # Perform ensembling
    ensemble_predictions()

    print("All done!")
Example 7
    json.dump(gbmr.param,
              open('{0}_lgbm_{1}{2}'.format(model_path, exec_time, model_params),
                   'wb+'))
    gbmr.fit(validate_features.values,
             validate_labels.values[:, 0],
             test_data=[(train_features.values, train_labels.values[:, 0])])

    importance = dict(gbmr.feature_importance(train_features.columns.tolist()))
    importance = sorted(importance.items(), key=operator.itemgetter(1))
    df = pd.DataFrame(importance, columns=['feature', 'importance'])
    df['importance'] = df['importance'] / df['importance'].sum()
    df.to_csv('{0}_lgbm_{1}{2}'.format(model_path, exec_time,
                                       model_feature_importance_csv),
              index=False)

    val_label = gbmr.predict(validate_features)
    val_frame = pd.Series(val_label, index=validate_features.index)
    val_frame.name = probability_consumed_label
    val_coupons = pd.read_csv(validate_path + 'dataset.csv')
    val_coupons = val_coupons.join(val_frame).join(
        val_frame.map(lambda x: 0. if x < 0.5 else 1.).rename('map')).join(
            pd.read_csv(validate_path + 'labels.csv')['Label'])
    val_coupons.to_csv('{0}_lgbm_{1}{2}'.format(model_path, exec_time,
                                                val_diff_file),
                       index=False)
    print(confusion_matrix(val_coupons['Label'], val_coupons['map']))

    print(gbmr.best_round)
    print('generate submission')
    labels = gbmr.predict(predict_features)
    frame = pd.Series(labels, index=predict_features.index)
Example 8
            num_leaves=200,
            num_threads=4,
            min_data_in_leaf=8,
            metric='l1',
            feature_fraction=0.3,
            feature_fraction_seed=rand_seed,
            bagging_fraction=0.8,
            bagging_freq=100,
            bagging_seed=rand_seed,
            verbose=False)
        # Train
        gbmr.fit(x_tr, y_tr, test_data=[(x_val, y_val)])

        # Apply to validation and test data
        print('Bag: ' + str(j) + ' Predicting...')
        pred += np.exp((gbmr.predict(x_val))) - shift
        pred_test += np.exp((gbmr.predict(xtest))) - shift

    # Save oob results
    pred /= nbags
    pred_oob[inTe] = pred
    score = mean_absolute_error(np.exp(y_val) - shift, pred)
    print('Fold ' + str(i) + ' - MAE: ' + str(score))
    i += 1

# Get mean of pred_test
pred_test /= (nfolds * nbags)

## train predictions
df = pd.DataFrame({'loss': pred_oob})
df.to_csv('LightGBM2_preds_oob.csv', index=False)
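
The script saves only the OOF train predictions; the averaged test predictions in pred_test would typically be written the same way. A one-line sketch (the filename is an assumption):

# Hypothetical counterpart for the test-set predictions (filename is an assumption).
pd.DataFrame({'loss': pred_test}).to_csv('LightGBM2_preds_test.csv', index=False)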
Example 9
"""
@brief:
"""
import numpy as np
from sklearn import datasets, metrics, model_selection
from pylightgbm.models import GBMRegressor

# Parameters
seed = 1337
path_to_exec = "~/Documents/apps/LightGBM/lightgbm"

np.random.seed(seed)  # for reproducibility
X, y = datasets.load_diabetes(return_X_y=True)
x_train, x_test, y_train, y_test = model_selection.train_test_split(
    X, y, test_size=0.2, random_state=seed)

# 'exec_path' is the path to lightgbm executable
clf = GBMRegressor(exec_path=path_to_exec,
                   num_iterations=1000,
                   learning_rate=0.01,
                   num_leaves=10,
                   is_training_metric=True,
                   min_data_in_leaf=10,
                   is_unbalance=True,
                   early_stopping_round=10,
                   verbose=True)

clf.fit(x_train, y_train, test_data=[(x_test, y_test)])
y_pred = clf.predict(x_test)
print("Mean Square Error: ", metrics.mean_squared_error(y_test, y_pred))
print("Best round: ", clf.best_round)
Example 10
    max_bin=850,  # 850
    early_stopping_round=50  # 40
)

best = []
score = []

kf = KFold(n_splits=kfolds, shuffle=True, random_state=123)
for i, (train_index, test_index) in enumerate(kf.split(train)):
    print('Fold {0}'.format(i + 1))
    X_train, X_val = train.iloc[train_index], train.iloc[test_index]
    y_train, y_val = y[train_index],y[test_index]

    gbmr.fit(X_train, y_train, test_data=[(X_val, y_val)])
    best.append(gbmr.best_round)
    oof_train[test_index] = gbmr.predict(X_val)
    scr = mean_absolute_error(np.exp(y_val) - shift, np.exp(oof_train[test_index]) - shift)
    score.append(scr)

    allpredictions['p' + str(i)] = gbmr.predict(test)

    del X_train, X_val, y_train, y_val
    gc.collect()

print("Mean Abs Error:", mean_absolute_error(y_true=(np.exp(y)-shift), y_pred=(np.exp(oof_train)-shift)))

print(allpredictions.head())
print(np.mean(score))
print(np.mean(best))

submission = pd.read_csv('input/sample_submission.csv')
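
The snippet stops right after loading the sample submission. A hedged completion, assuming allpredictions holds per-fold test predictions in the shifted log space used above (the output filename is hypothetical):

# Hypothetical completion: average per-fold predictions (still in log space),
# invert the log-shift transform, and write the submission file.
submission['loss'] = np.exp(allpredictions.mean(axis=1)) - shift
submission.to_csv('submission_lgbm_kfold.csv', index=False)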
Example 11

        # num_leaves=127,
        # feature_fraction=params['feature_fraction'],
        # bagging_fraction=params['bagging_fraction'],
        feature_fraction=0.7,
        bagging_fraction=0.7,
        min_data_in_leaf=450,
        max_bin=256,
        # lambda_l1=params['lambda_l1'],
        # lambda_l2=params['lambda_l2']
    )

    model.fit(X_train,
              target_transform(y_train),
              test_data=[(X_val, target_transform(y_val))])

    y_oob[val] = target_inverse_transform(model.predict(X_val))
    y_pred += target_inverse_transform(model.predict(X_test.values))

    mae = mean_absolute_error(y_val, y_oob[val])
    cv_score += mae
    best_iter += model.best_round

    print('MAE = {}, BEST ITER = {}'.format(mae, model.best_round))

df_oob = df[['id']].copy()
df_oob['loss'] = y_oob
df_oob.to_csv('../run_res/feat_lgbm_bag_oob_1.csv', index=False)

y_pred /= n_folds
submission = df_test[['id']].copy()
submission['loss'] = y_pred
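
The fragment ends before the submission is written; a one-line completion mirroring the df_oob save above (the filename is an assumption):

# Hypothetical final step, mirroring the OOF file above (filename is an assumption).
submission.to_csv('../run_res/submission_lgbm_bag_1.csv', index=False)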
Example 12

        bagging_fraction=1,
        bagging_freq=10,
        bagging_seed=seed,
        metric_freq=1,
        early_stopping_round=50
    )
    json.dump(gbmr.param, open('{0}_lgbm_{1}{2}'.format(model_path, exec_time, model_params), 'wb+'))
    gbmr.fit(validate_features.values, validate_labels.values[:, 0], test_data=[(train_features.values, train_labels.values[:, 0])])

    importance = dict(gbmr.feature_importance(train_features.columns.tolist()))
    importance = sorted(importance.items(), key=operator.itemgetter(1))
    df = pd.DataFrame(importance, columns=['feature', 'importance'])
    df['importance'] = df['importance'] / df['importance'].sum()
    df.to_csv('{0}_lgbm_{1}{2}'.format(model_path, exec_time, model_feature_importance_csv), index=False)

    val_label = gbmr.predict(validate_features)
    val_frame = pd.Series(val_label, index=validate_features.index)
    val_frame.name = probability_consumed_label
    val_coupons = pd.read_csv(validate_path + 'dataset.csv')
    val_coupons = val_coupons.join(val_frame).join(val_frame.map(lambda x: 0. if x < 0.5 else 1.).rename('map')).join(pd.read_csv(validate_path + 'labels.csv')['Label'])
    val_coupons.to_csv('{0}_lgbm_{1}{2}'.format(model_path, exec_time, val_diff_file), index=False)
    print(confusion_matrix(val_coupons['Label'], val_coupons['map']))

    print(gbmr.best_round)
    print('generate submission')
    labels = gbmr.predict(predict_features)
    frame = pd.Series(labels, index=predict_features.index)
    frame.name = probability_consumed_label

    plt.figure()
    frame.hist(figsize=(10, 8))