def evaluate_lightgbm(params): print 'new iteration ', datetime.now().strftime('%H:%M') model = GBMRegressor(num_threads=8, num_iterations=5000, verbose=False, early_stopping_round=25, bagging_seed=2016, metric='l1', learning_rate=0.05, max_depth=int(params['max_depth']), num_leaves=int(params['num_leaves']), feature_fraction=params['feature_fraction'], bagging_fraction=params['bagging_fraction'], min_data_in_leaf=int(params['min_data_in_leaf']), lambda_l1=params['lambda_l1'], lambda_l2=params['lambda_l2']) model.fit(X_train.values, target_transform(y_train.values), test_data=[(X_val.values, target_transform(y_val.values))]) best_iter = model.best_round y_pred = target_inverse_transform(model.predict(X_val)) y_pred_train = target_inverse_transform(model.predict(X_train)) mae = mean_absolute_error(y_val, y_pred) mae_train = mean_absolute_error(y_val, y_pred_train) return { 'loss': mae, 'mae_train': mae_train, 'status': STATUS_OK, 'best_round': best_iter }
def evaluate_lightgbm(params):
    """Hyperopt objective (CV variant): fit a LightGBM regressor on the
    first fold produced by the module-level `cv` splitter and return the
    fold's validation MAE.

    params keys used: 'num_leaves', 'min_data_in_leaf', 'max_bin'.
    Depends on module-level globals: cv, X, y, start_time,
    target_transform, target_inverse_transform, GBMRegressor, STATUS_OK.
    """
    print 'new iteration ', datetime.now().strftime('%H:%M')

    model = GBMRegressor(
        num_threads=8,
        num_iterations=5000,
        verbose=False,
        early_stopping_round=25,
        bagging_seed=2016,
        metric='l1',
        learning_rate=0.1,
        max_depth=12,
        num_leaves=int(params['num_leaves']),
        # num_leaves=127,
        # feature_fraction=params['feature_fraction'],
        # bagging_fraction=params['bagging_fraction'],
        feature_fraction=0.7,
        bagging_fraction=0.7,
        min_data_in_leaf=int(params['min_data_in_leaf']),
        max_bin=int(params['max_bin']),
        # lambda_l1=params['lambda_l1'],
        # lambda_l2=params['lambda_l2']
    )

    # NOTE(review): sklearn splitters yield (train_index, test_index),
    # but this unpacks as (val, train) — i.e. the *first* element is used
    # for validation and the *second* for training. Confirm the semantics
    # of the `cv` object; if it is an sklearn KFold this trains on the
    # small fold and validates on the large one.
    for val, train in cv.split(X):
        X_train = X.iloc[train].values
        y_train = y.iloc[train].values
        X_val = X.iloc[val].values
        y_val = y.iloc[val].values
        model.fit(X_train, target_transform(y_train),
                  test_data=[(X_val, target_transform(y_val))])
        best_iter = model.best_round
        y_pred = target_inverse_transform(model.predict(X_val))
        y_pred_train = target_inverse_transform(model.predict(X_train))
        mae = mean_absolute_error(y_val, y_pred)
        mae_train = mean_absolute_error(y_train, y_pred_train)
        # Only the first fold is evaluated (full-CV averaging below is
        # commented out, presumably to speed up the hyperopt search).
        break
    # best_iter /= float(n_folds)
    # mae /= n_folds
    # mae_train /= n_folds

    # NOTE(review): run_time is computed but never used or returned.
    run_time = datetime.now() - start_time

    return {
        'loss': mae,
        'mae_train': mae_train,
        'status': STATUS_OK,
        'best_round': best_iter
    }
def evaluate_lightgbm(params): def target_transform(y, mu=200): return np.log(y + mu) def target_inverse_transform(y_tr, mu=200): return np.exp(y_tr) - mu print 'new iteration ', datetime.now().strftime('%H:%M') # Read and preprocess data df = pd.read_csv('/home/ledovsky/allstate/run_res/feat_train.csv') X = df.drop(['loss', 'id'], 1) y = df.loss X_train, X_val, y_train, y_val = train_test_split(X, y, test_size=0.2, random_state=2016) model = GBMRegressor(num_threads=7, num_iterations=5000, verbose=False, early_stopping_round=25, bagging_seed=2016, metric='l1', learning_rate=0.1, max_depth=int(params['max_depth']), num_leaves=int(params['num_leaves']), feature_fraction=params['feature_fraction'], bagging_fraction=params['bagging_fraction'], min_data_in_leaf=int(params['min_data_in_leaf']), lambda_l1=params['lambda_l1'], lambda_l2=params['lambda_l2']) model.fit(X_train.values, target_transform(y_train.values), test_data=[(X_val.values, target_transform(y_val.values))]) best_iter = model.best_round y_pred = target_inverse_transform(model.predict(X_val)) y_pred_train = target_inverse_transform(model.predict(X_train)) mae = mean_absolute_error(y_val, y_pred) mae_train = mean_absolute_error(y_train, y_pred_train) return { 'loss': mae, 'mae_train': mae_train, 'status': STATUS_OK, 'best_round': best_iter }
def get_oof():
    """Bagged k-fold out-of-fold predictions with LightGBM.

    For each fold of the module-level `kf` splitter, trains `nbags`
    regressors (each with a different shuffle/seed), averages their
    out-of-fold predictions, and accumulates test-set predictions.

    Returns (pred_oob, pred_test).
    NOTE(review): pred_test is the raw *sum* over nfolds * nbags models
    — presumably the caller divides it afterwards; confirm.
    Depends on module-level globals: X_train, y_train, X_test, kf,
    nbags, RANDOM_STATE, GBMRegressor, shuffle, mean_absolute_error.
    """
    pred_oob = np.zeros(X_train.shape[0])
    pred_test = np.zeros(X_test.shape[0])
    for i, (train_index, test_index) in enumerate(kf.split(X_train)):
        print "Fold = ", i
        x_tr = X_train[train_index]
        y_tr = y_train[train_index]
        x_te = X_train[test_index]
        y_te = y_train[test_index]
        # Running sum of this fold's bagged predictions.
        pred = np.zeros(x_te.shape[0])
        for j in range(nbags):
            # Reshuffle the training rows with a per-(fold, bag) seed so
            # each bag sees a different row order.
            x_tr, y_tr = shuffle(x_tr, y_tr, random_state=RANDOM_STATE + i + j)
            lgbt_params = {
                'exec_path': os.path.expanduser('~/packages/LightGBM/lightgbm'
                    ),  # Change this to your LighGBM path
                'config': '',
                'application': 'regression',
                'num_iterations': 3000,
                'learning_rate': 0.01,
                'num_leaves': 213,
                'num_threads': 8,
                'min_data_in_leaf': 4,
                'metric': 'l1',
                'feature_fraction': 0.2933,
                'feature_fraction_seed': 2016 + i + j,
                'bagging_fraction': 0.9804,
                'bagging_freq': 100,
                'bagging_seed': 2016 + i + j,
                'early_stopping_round': 25,
                # metric_freq=1,
                'verbose': False
            }
            clf = GBMRegressor(**lgbt_params)
            # NOTE(review): fit() is called without test_data, so
            # early_stopping_round likely has nothing to stop on —
            # confirm against pylightgbm's behavior.
            clf.fit(x_tr, y_tr)
            # Targets appear to be log-scaled upstream: predictions are
            # exponentiated back here (see np.exp(y_te) below).
            pred += np.exp(clf.predict(x_te))
            pred_test += np.exp(clf.predict(X_test))
        # Average the bags for this fold's out-of-fold slice.
        pred /= nbags
        pred_oob[test_index] = pred
        score = mean_absolute_error(np.exp(y_te), pred)
        print('Fold ', i, '- MAE:', score)
    return pred_oob, pred_test
class LightGBM(BaseAlgo):
    """Thin BaseAlgo adapter around pylightgbm's GBMRegressor.

    Merges caller-supplied parameters over `default_params`, reseeds the
    bagging/feature-fraction seeds per fit() call, and delegates
    prediction to the underlying model.
    """

    default_params = {'exec_path': 'lightgbm', 'num_threads': 4}

    def __init__(self, params):
        # Start from the class defaults and overlay the caller's values.
        merged = self.default_params.copy()
        merged.update(params)
        self.params = merged

    def fit(self, X_train, y_train, X_eval=None, y_eval=None, seed=42,
            feature_names=None, eval_func=None, **kwa):
        # Derive per-run seeds so repeated fits with different `seed`
        # values produce different baggings.
        run_params = dict(self.params)
        run_params['bagging_seed'] = seed
        run_params['feature_fraction_seed'] = seed + 3
        self.model = GBMRegressor(**run_params)
        if X_eval is None:
            self.model.fit(X_train, y_train)
        else:
            # Evaluation pair enables the library's early stopping.
            self.model.fit(X_train, y_train, test_data=[(X_eval, y_eval)])

    def predict(self, X):
        return self.model.predict(X)
def run():
    """Train one LightGBM model per shuffle-split repetition and write a
    submission CSV for each; finally ensemble all submissions.

    Repetitions whose submission file already exists are skipped, so the
    function can be re-run to resume an interrupted session. Depends on
    module-level globals: load_data, ensemble_predictions, GBMRegressor,
    CROSS_VALIDATION_NUM, NUM_ITERATIONS, EARLY_STOPPING_ROUND,
    SUBMISSION_FOLDER_PATH, LABEL_COLUMN_NAME.
    """
    # Load data set
    X_train, Y_train, X_test, submission_file_content = load_data()
    # Shifted-log transform of the target (inverted before saving).
    Y_train = np.log(Y_train + 200)

    # Cross validation
    splitter = ShuffleSplit(n_splits=CROSS_VALIDATION_NUM, test_size=0.1,
                            random_state=0)
    for fold_idx, (train_index, valid_index) in enumerate(
            splitter.split(X_train), start=1):
        print("Working on {}/{} ...".format(fold_idx, CROSS_VALIDATION_NUM))

        submission_file_path = os.path.join(
            SUBMISSION_FOLDER_PATH, "submission_{}.csv".format(fold_idx))
        if os.path.isfile(submission_file_path):
            # Already produced by a previous run — skip this repetition.
            continue

        model = GBMRegressor(
            learning_rate=0.01,
            num_iterations=NUM_ITERATIONS,
            num_leaves=200,
            min_data_in_leaf=10,
            feature_fraction=0.3,
            feature_fraction_seed=fold_idx,
            bagging_fraction=0.8,
            bagging_freq=10,
            bagging_seed=fold_idx,
            metric="l1",
            metric_freq=10,
            early_stopping_round=EARLY_STOPPING_ROUND,
            num_threads=-1)
        model.fit(X_train[train_index], Y_train[train_index],
                  test_data=[(X_train[valid_index], Y_train[valid_index])])

        # Perform the testing procedure
        Y_test = model.predict(X_test)

        # Save submission to disk
        if not os.path.isdir(SUBMISSION_FOLDER_PATH):
            os.makedirs(SUBMISSION_FOLDER_PATH)
        submission_file_content[LABEL_COLUMN_NAME] = np.exp(Y_test) - 200
        submission_file_content.to_csv(submission_file_path, index=False)

    # Perform ensembling
    ensemble_predictions()

    print("All done!")
open('{0}_lgbm_{1}{2}'.format(model_path, exec_time, model_params), 'wb+')) gbmr.fit(validate_features.values, validate_labels.values[:, 0], test_data=[(train_features.values, train_labels.values[:, 0])]) importance = dict(gbmr.feature_importance(train_features.columns.tolist())) importance = sorted(importance.items(), key=operator.itemgetter(1)) df = pd.DataFrame(gbmr.feature_importance(train_features.columns.tolist()), columns=['feature', 'importance']) df['importance'] = df['importance'] / df['importance'].sum() df.to_csv('{0}_lgbm_{1}{2}'.format(model_path, exec_time, model_feature_importance_csv), index=False) val_label = gbmr.predict(validate_features) val_frame = pd.Series(val_label, index=validate_features.index) val_frame.name = probability_consumed_label val_coupons = pd.read_csv(validate_path + 'dataset.csv') val_coupons = val_coupons.join(val_frame).join( val_frame.map(lambda x: 0. if x < 0.5 else 1.).rename('map')).join( pd.read_csv(validate_path + 'labels.csv')['Label']) val_coupons.to_csv('{0}_lgbm_{1}{2}'.format(model_path, exec_time, val_diff_file), index=False) print confusion_matrix(val_coupons['Label'], val_coupons['map']) print gbmr.best_round print 'generate submission' labels = gbmr.predict(predict_features) frame = pd.Series(labels, index=predict_features.index)
num_leaves=200, num_threads=4, min_data_in_leaf=8, metric='l1', feature_fraction=0.3, feature_fraction_seed=rand_seed, bagging_fraction=0.8, bagging_freq=100, bagging_seed=rand_seed, verbose=False) # Train gbmr.fit(x_tr, y_tr, test_data=[(x_val, y_val)]) # Apply to validation and test data print 'Bag: ' + str(j) + " Predicting..." pred += np.exp((gbmr.predict(x_val))) - shift pred_test += np.exp((gbmr.predict(xtest))) - shift # Save oob results pred /= nbags pred_oob[inTe] = pred score = mean_absolute_error(np.exp(y_val) - shift, pred) print 'Fold ' + str(i) + '- MAE:' + str(score) i += 1 # Get mean of pred_test pred_test /= (nfolds * nbags) ## train predictions df = pd.DataFrame({'loss': pred_oob}) df.to_csv('LightGBM2_preds_oob.csv', index=False)
@brief: """ import numpy as np from sklearn import datasets, metrics, model_selection from pylightgbm.models import GBMRegressor # Parameters seed = 1337 path_to_exec = "~/Documents/apps/LightGBM/lightgbm" np.random.seed(seed) # for reproducibility X, y = datasets.load_diabetes(return_X_y=True) x_train, x_test, y_train, y_test = model_selection.train_test_split( X, y, test_size=0.2, random_state=seed) # 'exec_path' is the path to lightgbm executable clf = GBMRegressor(exec_path=path_to_exec, num_iterations=1000, learning_rate=0.01, num_leaves=10, is_training_metric=True, min_data_in_leaf=10, is_unbalance=True, early_stopping_round=10, verbose=True) clf.fit(x_train, y_train, test_data=[(x_test, y_test)]) y_pred = clf.predict(x_test) print("Mean Square Error: ", metrics.mean_squared_error(y_test, y_pred)) print("Best round: ", clf.best_round)
max_bin=850, #850 early_stopping_round=50 #40 ) best=[] score=[] kf = KFold(tr_rows, n_folds=kfolds, shuffle=True,random_state=123) for i, (train_index, test_index) in enumerate(kf): print('Fold {0}'.format(i + 1)) X_train, X_val = train.iloc[train_index], train.iloc[test_index] y_train, y_val = y[train_index],y[test_index] gbmr.fit(X_train, y_train, test_data=[(X_val, y_val)]) best.append(gbmr.best_round) oof_train[test_index]=gbmr.predict(X_val) scr=mean_absolute_error(np.exp(y_val)-shift,np.exp(oof_train[test_index])-shift) score.append(scr) allpredictions['p'+str(i)] =gbmr.predict(test) del X_train,X_val,y_train,y_val gc.collect() print("Mean Abs Error:", mean_absolute_error(y_true=(np.exp(y)-shift), y_pred=(np.exp(oof_train)-shift))) print(allpredictions.head()) print(np.mean(score)) print(np.mean(best)) submission = pd.read_csv('input/sample_submission.csv')
# num_leaves=127, # feature_fraction=params['feature_fraction'], # bagging_fraction=params['bagging_fraction'], feature_fraction=0.7, bagging_fraction=0.7, min_data_in_leaf=450, max_bin=256, # lambda_l1=params['lambda_l1'], # lambda_l2=params['lambda_l2'] ) model.fit(X_train, target_transform(y_train), test_data=[(X_val, target_transform(y_val))]) y_oob[val] = target_inverse_transform(model.predict(X_val)) y_pred += target_inverse_transform(model.predict(X_test.values)) mae = mean_absolute_error(y_val, y_oob[val]) cv_score += mae best_iter += model.best_round print 'MAE = {}, BEST ITER = {}'.format(mae, model.best_round) df_oob = df[['id']].copy() df_oob['loss'] = y_oob df_oob.to_csv('../run_res/feat_lgbm_bag_oob_1.csv', index=False) y_pred /= n_folds submission = df_test[['id']].copy() submission['loss'] = y_pred
bagging_fraction=1, bagging_freq=10, bagging_seed=seed, metric_freq=1, early_stopping_round=50 ) json.dump(gbmr.param, open('{0}_lgbm_{1}{2}'.format(model_path, exec_time, model_params), 'wb+')) gbmr.fit(validate_features.values, validate_labels.values[:, 0], test_data=[(train_features.values, train_labels.values[:, 0])]) importance = dict(gbmr.feature_importance(train_features.columns.tolist())) importance = sorted(importance.items(), key=operator.itemgetter(1)) df = pd.DataFrame(gbmr.feature_importance(train_features.columns.tolist()), columns=['feature', 'importance']) df['importance'] = df['importance'] / df['importance'].sum() df.to_csv('{0}_lgbm_{1}{2}'.format(model_path, exec_time, model_feature_importance_csv), index=False) val_label = gbmr.predict(validate_features) val_frame = pd.Series(val_label, index=validate_features.index) val_frame.name = probability_consumed_label val_coupons = pd.read_csv(validate_path + 'dataset.csv') val_coupons = val_coupons.join(val_frame).join(val_frame.map(lambda x: 0. if x < 0.5 else 1.).rename('map')).join(pd.read_csv(validate_path + 'labels.csv')['Label']) val_coupons.to_csv('{0}_lgbm_{1}{2}'.format(model_path, exec_time, val_diff_file), index=False) print confusion_matrix(val_coupons['Label'], val_coupons['map']) print gbmr.best_round print 'generate submission' labels = gbmr.predict(predict_features) frame = pd.Series(labels, index=predict_features.index) frame.name = probability_consumed_label plt.figure() frame.hist(figsize=(10, 8))