def evaluate_lightgbm(params):
    print('new iteration', datetime.now().strftime('%H:%M'))
    model = GBMRegressor(
        num_threads=6,
        num_iterations=5000,
        verbose=False,
        early_stopping_round=25,
        bagging_seed=2016,
        metric='l1',
        learning_rate=0.01,
        max_depth=int(params['max_depth']),
        num_leaves=int(params['num_leaves']),
        feature_fraction=params['feature_fraction'],
        bagging_fraction=params['bagging_fraction'],
        min_data_in_leaf=int(params['min_data_in_leaf']),
        lambda_l1=params['lambda_l1'],
        lambda_l2=params['lambda_l2'])
    model.fit(X_train.values, target_transform(y_train.values),
              test_data=[(X_val.values, target_transform(y_val.values))])
    best_iter = model.best_round
    y_pred = target_inverse_transform(model.predict(X_val))
    mae = mean_absolute_error(y_val, y_pred)
    return {'loss': mae, 'status': STATUS_OK, 'best_round': best_iter}
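# The objective above follows the hyperopt convention (it returns a dict with
# 'loss' and STATUS_OK), so it can be driven by hyperopt's fmin. A minimal
# sketch; the search-space bounds below are illustrative assumptions, not the
# values from the original experiment:
from hyperopt import Trials, fmin, hp, tpe

space = {
    'max_depth': hp.quniform('max_depth', 4, 14, 1),            # assumed bounds
    'num_leaves': hp.quniform('num_leaves', 31, 512, 1),        # assumed bounds
    'feature_fraction': hp.uniform('feature_fraction', 0.3, 1.0),
    'bagging_fraction': hp.uniform('bagging_fraction', 0.5, 1.0),
    'min_data_in_leaf': hp.quniform('min_data_in_leaf', 1, 500, 1),
    'lambda_l1': hp.uniform('lambda_l1', 0.0, 10.0),
    'lambda_l2': hp.uniform('lambda_l2', 0.0, 10.0),
}
trials = Trials()
best = fmin(fn=evaluate_lightgbm, space=space, algo=tpe.suggest,
            max_evals=50, trials=trials)
print('best hyperparameters:', best)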
class LightGBM(BaseAlgo):

    default_params = {'exec_path': 'lightgbm', 'num_threads': 4}

    def __init__(self, params):
        self.params = self.default_params.copy()
        self.params.update(params)

    def fit(self, X_train, y_train, X_eval=None, y_eval=None, seed=42,
            feature_names=None, eval_func=None, **kwa):
        params = self.params.copy()
        params['bagging_seed'] = seed
        params['feature_fraction_seed'] = seed + 3
        self.model = GBMRegressor(**params)
        if X_eval is None:
            self.model.fit(X_train, y_train)
        else:
            self.model.fit(X_train, y_train, test_data=[(X_eval, y_eval)])

    def predict(self, X):
        return self.model.predict(X)
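# A minimal usage sketch for the wrapper above. The hyperparameter values and
# the data variables (X_tr, y_tr, X_va, y_va) are placeholders, not values
# from the original pipeline; exec_path defaults to 'lightgbm' on the PATH:
model = LightGBM({'num_iterations': 1000, 'learning_rate': 0.05,
                  'num_leaves': 31, 'early_stopping_round': 25})
model.fit(X_tr, y_tr, X_eval=X_va, y_eval=y_va, seed=2016)
preds = model.predict(X_va)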
def lgbt_evaluate(num_leaves, min_data_in_leaf, feature_fraction, bagging_fraction):
    lgbt = GBMRegressor(
        exec_path=os.path.expanduser('~/packages/LightGBM/lightgbm'),  # Change this to your LightGBM path
        config='',
        application='regression',
        num_iterations=5000,
        learning_rate=0.01,
        num_leaves=int(round(num_leaves)),
        # tree_learner='serial',
        num_threads=4,
        min_data_in_leaf=int(round(min_data_in_leaf)),
        metric='l1',
        feature_fraction=max(feature_fraction, 0),
        feature_fraction_seed=2016,
        bagging_fraction=max(bagging_fraction, 0),
        bagging_freq=100,
        bagging_seed=2016,
        early_stopping_round=25,
        # metric_freq=1,
        verbose=False)
    # Pass the KFold object itself to cross_val_score; the original called
    # get_n_splits(), which reduces the splitter to a plain integer.
    kf = KFold(n_folds, shuffle=True, random_state=RANDOM_STATE)
    cv = cross_val_score(lgbt, X_train, y_train, cv=kf, scoring=make_scorer(evalerror))
    return -cv.mean()
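# Since lgbt_evaluate returns the negative CV error, it can be maximized
# directly. A minimal sketch using the bayes_opt package; the parameter
# bounds are illustrative assumptions, not the original search ranges:
from bayes_opt import BayesianOptimization

bo = BayesianOptimization(lgbt_evaluate, {
    'num_leaves': (31, 500),
    'min_data_in_leaf': (1, 200),
    'feature_fraction': (0.1, 1.0),
    'bagging_fraction': (0.5, 1.0),
})
bo.maximize(init_points=5, n_iter=25)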
def test_sparse(self):
    params = {
        'exec_path': path_to_exec,
        'num_iterations': 1000,
        'verbose': False,
        'min_data_in_leaf': 1,
        'learning_rate': 0.1,
        'num_leaves': 5
    }
    clfs = [
        [sps.csr_matrix(X), Y, 'classification', GBMClassifier(**params)],
        [sps.csr_matrix(Xreg), Yreg, 'regression', GBMRegressor(**params)],
    ]
    for x, y, name, clf in clfs:
        clf.fit(x, y)
        if name == 'classification':
            score = metrics.accuracy_score(y, clf.predict(x))
            assert score > 0.9
        else:
            score = metrics.mean_squared_error(y, clf.predict(x))
            assert score < 1.
def run():
    # Load data set
    X_train, Y_train, X_test, submission_file_content = load_data()
    Y_train = np.log(Y_train + 200)

    # Cross validation
    cross_validation_iterator = ShuffleSplit(n_splits=CROSS_VALIDATION_NUM,
                                             test_size=0.1, random_state=0)
    for cross_validation_index, (train_index, valid_index) in enumerate(
            cross_validation_iterator.split(X_train), start=1):
        print("Working on {}/{} ...".format(cross_validation_index, CROSS_VALIDATION_NUM))

        submission_file_path = os.path.join(
            SUBMISSION_FOLDER_PATH,
            "submission_{}.csv".format(cross_validation_index))
        if os.path.isfile(submission_file_path):
            continue

        model = GBMRegressor(
            learning_rate=0.01,
            num_iterations=NUM_ITERATIONS,
            num_leaves=200,
            min_data_in_leaf=10,
            feature_fraction=0.3,
            feature_fraction_seed=cross_validation_index,
            bagging_fraction=0.8,
            bagging_freq=10,
            bagging_seed=cross_validation_index,
            metric="l1",
            metric_freq=10,
            early_stopping_round=EARLY_STOPPING_ROUND,
            num_threads=-1)
        model.fit(X_train[train_index], Y_train[train_index],
                  test_data=[(X_train[valid_index], Y_train[valid_index])])

        # Perform the testing procedure
        Y_test = model.predict(X_test)

        # Save submission to disk
        if not os.path.isdir(SUBMISSION_FOLDER_PATH):
            os.makedirs(SUBMISSION_FOLDER_PATH)
        submission_file_content[LABEL_COLUMN_NAME] = np.exp(Y_test) - 200
        submission_file_content.to_csv(submission_file_path, index=False)

    # Perform ensembling
    ensemble_predictions()

    print("All done!")
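# ensemble_predictions() is called above but not defined in this snippet. A
# minimal sketch of what it might do, assuming the per-split submission files
# share the same row order and a LABEL_COLUMN_NAME column, and that pandas and
# numpy are imported as pd/np as elsewhere in this code; the output file name
# is a placeholder:
import glob

def ensemble_predictions():
    submission_paths = sorted(glob.glob(
        os.path.join(SUBMISSION_FOLDER_PATH, "submission_*.csv")))
    submissions = [pd.read_csv(path) for path in submission_paths]
    ensemble = submissions[0].copy()
    # Average the predictions across all cross-validation splits
    ensemble[LABEL_COLUMN_NAME] = np.mean(
        [s[LABEL_COLUMN_NAME].values for s in submissions], axis=0)
    ensemble.to_csv(os.path.join(SUBMISSION_FOLDER_PATH, "ensemble.csv"),
                    index=False)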
def test_early_stopping(self):
    cv_params = dict(test_size=test_size, random_state=seed)
    xtr, xte, ytr, yte = model_selection.train_test_split(X, Y, **cv_params)
    # The original split X, Y here as well; the regression fixtures are
    # Xreg, Yreg, as in the other tests.
    xtr_reg, xte_reg, ytr_reg, yte_reg = model_selection.train_test_split(
        Xreg, Yreg, **cv_params)

    params = dict(exec_path=path_to_exec,
                  num_iterations=10000,
                  min_data_in_leaf=3,
                  learning_rate=0.01,
                  num_leaves=2,
                  early_stopping_round=2)
    clfs = [
        [xtr_reg, ytr_reg, xte_reg, yte_reg, 'regression',
         GBMRegressor(boosting_type='gbdt', **params)],
        [xtr_reg, ytr_reg, xte_reg, yte_reg, 'regression',
         GBMRegressor(boosting_type='dart', **params)],
        [xtr, ytr, xte, yte, 'classification',
         GBMClassifier(boosting_type='gbdt', **params)],
        [xtr, ytr, xte, yte, 'classification',
         GBMClassifier(boosting_type='dart', **params)],
    ]

    for xtr, ytr, xte, yte, name, clf in clfs:
        clf.fit(xtr, ytr, test_data=[(xte, yte)])
        if name == 'regression':
            score = metrics.mean_squared_error(yte, clf.predict(xte))
            assert (score < 1. and
                    clf.best_round < clf.param['num_iterations'])
        else:
            score = metrics.accuracy_score(yte, clf.predict(xte))
            assert (score > 0.7 and
                    clf.best_round < clf.param['num_iterations'])
def test_pickle(self):
    params = {'exec_path': path_to_exec, 'verbose': False}
    clfs = [
        [X, Y, GBMClassifier(**params)],
        [Xreg, Yreg, GBMRegressor(**params)],
    ]
    for x, y, clf in clfs:
        # Fit on the loop variables; the original fit on the globals X, Y
        # even for the regression case.
        clf.fit(x, y)
        with open("clf_gbm.pkl", "wb") as f:
            pickle.dump(clf, f)
        with open("clf_gbm.pkl", "rb") as f:
            clf2 = pickle.load(f)
        assert np.allclose(clf.predict(x), clf2.predict(x))
def test_simple_fit(self):
    params = dict(exec_path=path_to_exec,
                  num_iterations=100,
                  min_data_in_leaf=1,
                  learning_rate=0.1,
                  num_leaves=5,
                  max_depth=10)
    clfs = [
        [Xreg, Yreg, 'regression', GBMRegressor(boosting_type='gbdt', **params)],
        [Xreg, Yreg, 'regression', GBMRegressor(boosting_type='dart', **params)],
        [X, Y, 'classification', GBMClassifier(boosting_type='gbdt', **params)],
        [X, Y, 'classification', GBMClassifier(boosting_type='dart', **params)],
    ]
    for x, y, name, clf in clfs:
        clf.fit(x, y, init_scores=np.zeros(x.shape[0]))
        if name == 'regression':
            score = metrics.mean_squared_error(y, clf.predict(x))
            assert score < 1.  # the original dropped the assert keyword
        else:
            # Score against the loop variables, not the globals X, Y
            score = metrics.accuracy_score(y, clf.predict(x))
            assert score > 0.9
def evaluate_lightgbm(params):
    print('new iteration', datetime.now().strftime('%H:%M'))
    start_time = datetime.now()  # start_time was undefined in the original snippet
    model = GBMRegressor(
        num_threads=8,
        num_iterations=5000,
        verbose=False,
        early_stopping_round=25,
        bagging_seed=2016,
        metric='l1',
        learning_rate=0.1,
        max_depth=12,
        num_leaves=int(params['num_leaves']),
        # num_leaves=127,
        # feature_fraction=params['feature_fraction'],
        # bagging_fraction=params['bagging_fraction'],
        feature_fraction=0.7,
        bagging_fraction=0.7,
        min_data_in_leaf=int(params['min_data_in_leaf']),
        max_bin=int(params['max_bin']),
        # lambda_l1=params['lambda_l1'],
        # lambda_l2=params['lambda_l2']
    )
    # KFold.split yields (train_index, test_index); the original unpacked the
    # tuple as (val, train), silently swapping the two folds.
    for train, val in cv.split(X):
        X_train = X.iloc[train].values
        y_train = y.iloc[train].values
        X_val = X.iloc[val].values
        y_val = y.iloc[val].values

        model.fit(X_train, target_transform(y_train),
                  test_data=[(X_val, target_transform(y_val))])
        best_iter = model.best_round
        y_pred = target_inverse_transform(model.predict(X_val))
        y_pred_train = target_inverse_transform(model.predict(X_train))
        mae = mean_absolute_error(y_val, y_pred)
        mae_train = mean_absolute_error(y_train, y_pred_train)
        break  # only the first fold is evaluated
    # best_iter /= float(n_folds)
    # mae /= n_folds
    # mae_train /= n_folds
    run_time = datetime.now() - start_time
    return {
        'loss': mae,
        'mae_train': mae_train,
        'status': STATUS_OK,
        'best_round': best_iter
    }
def evaluate_lightgbm(params):

    def target_transform(y, mu=200):
        return np.log(y + mu)

    def target_inverse_transform(y_tr, mu=200):
        return np.exp(y_tr) - mu

    print('new iteration', datetime.now().strftime('%H:%M'))

    # Read and preprocess data
    df = pd.read_csv('/home/ledovsky/allstate/run_res/feat_train.csv')
    X = df.drop(['loss', 'id'], 1)
    y = df.loss
    X_train, X_val, y_train, y_val = train_test_split(X, y, test_size=0.2,
                                                      random_state=2016)

    model = GBMRegressor(num_threads=7,
                         num_iterations=5000,
                         verbose=False,
                         early_stopping_round=25,
                         bagging_seed=2016,
                         metric='l1',
                         learning_rate=0.1,
                         max_depth=int(params['max_depth']),
                         num_leaves=int(params['num_leaves']),
                         feature_fraction=params['feature_fraction'],
                         bagging_fraction=params['bagging_fraction'],
                         min_data_in_leaf=int(params['min_data_in_leaf']),
                         lambda_l1=params['lambda_l1'],
                         lambda_l2=params['lambda_l2'])
    model.fit(X_train.values, target_transform(y_train.values),
              test_data=[(X_val.values, target_transform(y_val.values))])
    best_iter = model.best_round
    y_pred = target_inverse_transform(model.predict(X_val))
    y_pred_train = target_inverse_transform(model.predict(X_train))
    mae = mean_absolute_error(y_val, y_pred)
    mae_train = mean_absolute_error(y_train, y_pred_train)
    return {
        'loss': mae,
        'mae_train': mae_train,
        'status': STATUS_OK,
        'best_round': best_iter
    }
def get_oof():
    pred_oob = np.zeros(X_train.shape[0])
    pred_test = np.zeros(X_test.shape[0])

    for i, (train_index, test_index) in enumerate(kf.split(X_train)):
        print("Fold = ", i)
        x_tr = X_train[train_index]
        y_tr = y_train[train_index]
        x_te = X_train[test_index]
        y_te = y_train[test_index]
        pred = np.zeros(x_te.shape[0])

        for j in range(nbags):
            x_tr, y_tr = shuffle(x_tr, y_tr, random_state=RANDOM_STATE + i + j)
            lgbt_params = {
                'exec_path': os.path.expanduser('~/packages/LightGBM/lightgbm'),  # Change this to your LightGBM path
                'config': '',
                'application': 'regression',
                'num_iterations': 3000,
                'learning_rate': 0.01,
                'num_leaves': 213,
                'num_threads': 8,
                'min_data_in_leaf': 4,
                'metric': 'l1',
                'feature_fraction': 0.2933,
                'feature_fraction_seed': 2016 + i + j,
                'bagging_fraction': 0.9804,
                'bagging_freq': 100,
                'bagging_seed': 2016 + i + j,
                'early_stopping_round': 25,
                # 'metric_freq': 1,
                'verbose': False
            }
            clf = GBMRegressor(**lgbt_params)
            clf.fit(x_tr, y_tr)
            pred += np.exp(clf.predict(x_te))
            pred_test += np.exp(clf.predict(X_test))

        pred /= nbags
        pred_oob[test_index] = pred
        score = mean_absolute_error(np.exp(y_te), pred)
        print('Fold ', i, '- MAE:', score)

    return pred_oob, pred_test
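# Note that pred_test above accumulates one prediction per bag per fold, so
# the caller has to average it. A minimal usage sketch, assuming kf and nbags
# are defined as get_oof expects:
pred_oob, pred_test = get_oof()
pred_test /= kf.get_n_splits(X_train) * nbags  # average over folds and bags
print('OOF MAE:', mean_absolute_error(np.exp(y_train), pred_oob))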
def lgm_evaluate(num_leaves, min_data_in_leaf, feature_fraction,
                 bagging_fraction, bagging_freq):
    lgm = GBMRegressor(exec_path=exec_path,
                       application='regression',
                       num_iterations=10000,
                       tree_learner='serial',
                       early_stopping_round=50,
                       learning_rate=0.01,
                       num_leaves=int(round(num_leaves)),
                       min_data_in_leaf=int(round(min_data_in_leaf)),
                       feature_fraction=max(feature_fraction, 0),
                       bagging_fraction=max(bagging_fraction, 0),
                       bagging_freq=int(round(bagging_freq)),
                       metric='l2',
                       bagging_seed=RANDOM_STATE,
                       metric_freq=1,
                       verbose=False)
    # Pass the KFold object itself to cross_val_score; calling get_n_splits()
    # here would reduce the splitter to a plain integer.
    kf = KFold(n_folds, shuffle=True, random_state=RANDOM_STATE)
    return -cross_val_score(lgm, X_train, y_train, cv=kf,
                            scoring=make_scorer(evalerror)).mean()
# 'path_to_exec' is the path to lightgbm executable (lightgbm.exe on Windows)
path_to_exec = "~/Documents/apps/LightGBM/lightgbm"

seed = 1337      # assumed value; not defined in the original snippet
test_size = 0.2  # assumed value; not defined in the original snippet
np.random.seed(seed)  # for reproducibility

boston = datasets.load_boston(return_X_y=False)  # renamed to avoid shadowing the datasets module
X = boston['data']
Y = boston['target']
feature_names = boston['feature_names']

clf_xgb = XGBRegressor(max_depth=3, n_estimators=1000)
clf_gbm = GBMRegressor(exec_path=path_to_exec,
                       num_iterations=1000,
                       learning_rate=0.01,
                       num_leaves=255,
                       min_data_in_leaf=1,
                       early_stopping_round=20,
                       verbose=False)

x_train, x_test, y_train, y_test = model_selection.train_test_split(
    X, Y, test_size=test_size, random_state=seed)

# Training the two models
clf_gbm.fit(x_train, y_train, test_data=[(x_test, y_test)])
clf_xgb.fit(x_train, y_train, eval_set=[(x_test, y_test)],
            eval_metric='rmse', early_stopping_rounds=20, verbose=False)
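# A short follow-up sketch comparing the two fitted models on the held-out
# split, using only the variables defined above and sklearn's metrics module:
pred_gbm = clf_gbm.predict(x_test)
pred_xgb = clf_xgb.predict(x_test)
print("LightGBM MSE:", metrics.mean_squared_error(y_test, pred_gbm))
print("XGBoost  MSE:", metrics.mean_squared_error(y_test, pred_xgb))
print("LightGBM best round:", clf_gbm.best_round)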
"""
@brief:
"""
import numpy as np
from sklearn import datasets, metrics, model_selection
from pylightgbm.models import GBMRegressor

# Parameters
seed = 1337
path_to_exec = "~/Documents/apps/LightGBM/lightgbm"

np.random.seed(seed)  # for reproducibility

X, y = datasets.load_diabetes(return_X_y=True)
x_train, x_test, y_train, y_test = model_selection.train_test_split(
    X, y, test_size=0.2, random_state=seed)

# 'exec_path' is the path to lightgbm executable
clf = GBMRegressor(exec_path=path_to_exec,
                   num_iterations=1000,
                   learning_rate=0.01,
                   num_leaves=10,
                   is_training_metric=True,
                   min_data_in_leaf=10,
                   is_unbalance=True,
                   early_stopping_round=10,
                   verbose=True)

clf.fit(x_train, y_train, test_data=[(x_test, y_test)])
y_pred = clf.predict(x_test)
print("Mean Square Error: ", metrics.mean_squared_error(y_test, y_pred))
print("Best round: ", clf.best_round)
def get_model_obj(modelType, n_clusters=None, **kwargs):
    if modelType == 'knn':
        from sklearn.neighbors import KNeighborsClassifier
        # 6 seems to give the best trade-off between accuracy and precision
        knn = KNeighborsClassifier(n_neighbors=6, **kwargs)
        return knn
    elif modelType == 'gaussianNB':
        from sklearn.naive_bayes import GaussianNB
        gnb = GaussianNB(**kwargs)
        return gnb
    elif modelType == 'multinomialNB':
        from sklearn.naive_bayes import MultinomialNB
        # TODO: figure out how to configure binomial distribution
        mnb = MultinomialNB(**kwargs)
        return mnb
    elif modelType == 'bernoulliNB':
        from sklearn.naive_bayes import BernoulliNB
        bnb = BernoulliNB(**kwargs)
        return bnb
    elif modelType == 'randomForest':
        from sklearn.ensemble import RandomForestClassifier
        rfc = RandomForestClassifier(random_state=234, **kwargs)
        return rfc
    elif modelType == 'svm':
        from sklearn.svm import SVC
        svc = SVC(random_state=0, probability=True, **kwargs)
        return svc
    elif modelType == 'LinearRegression':
        # assert column, "Column name required for building a linear model"
        # assert dataframe[column].shape == target.shape
        from sklearn import linear_model
        l_reg = linear_model.LinearRegression(**kwargs)
        return l_reg
    elif modelType == 'RidgeRegression':
        from sklearn.linear_model import Ridge
        if not kwargs:
            kwargs = {'alpha': 0.5}
        ridge_reg = Ridge(**kwargs)
        return ridge_reg
    elif modelType == 'RidgeRegressionCV':
        from sklearn import linear_model
        if not kwargs:
            kwargs = {'alphas': [0.1, 1.0, 10.0]}
        ridge_cv_reg = linear_model.RidgeCV(**kwargs)
        return ridge_cv_reg
    elif modelType == 'LassoRegression':
        from sklearn import linear_model
        if not kwargs:
            kwargs = {'alpha': 0.1}
        lasso_reg = linear_model.Lasso(**kwargs)
        return lasso_reg
    elif modelType == 'ElasticNetRegression':
        from sklearn import linear_model
        if not kwargs:
            kwargs = {'alpha': 0.1, 'l1_ratio': 0.7}
        enet_reg = linear_model.ElasticNet(**kwargs)
        return enet_reg
    elif modelType == 'LogisticRegression':
        from sklearn.linear_model import LogisticRegression
        log_reg = LogisticRegression(random_state=123, **kwargs)
        return log_reg
    elif modelType == 'RANSACRegression':
        from sklearn.linear_model import LinearRegression, RANSACRegressor
        ransac_model = RANSACRegressor(LinearRegression())
        return ransac_model
    elif modelType == 'kde':
        from sklearn.neighbors.kde import KernelDensity
        kde = KernelDensity(kernel='gaussian', bandwidth=0.2, **kwargs)
        return kde
    elif modelType == 'AR':
        import statsmodels.api as sm
        # fit an AR model and forecast
        ar_fitted = sm.tsa.AR(dataframe).fit(maxlag=9, method='mle',
                                             disp=-1, **kwargs)
        # ts_forecast = ar_fitted.predict(start='2008', end='2050')
        return ar_fitted
    elif modelType == 'SARIMAX':
        mod = sm.tsa.statespace.SARIMAX(df.riders,
                                        trend='n',
                                        order=(0, 1, 0),
                                        seasonal_order=(1, 1, 1, 12),
                                        **kwargs)
        return mod
    elif modelType == 'sgd':
        # Online classifiers: http://scikit-learn.org/stable/auto_examples/linear_model/plot_sgd_comparison.html
        from sklearn.linear_model import SGDClassifier
        sgd = SGDClassifier(**kwargs)
        return sgd
    elif modelType == 'perceptron':
        from sklearn.linear_model import Perceptron
        perceptron = Perceptron(**kwargs)
        return perceptron
    elif modelType == 'xgboost':
        import xgboost as xgb
        xgbm = xgb.XGBClassifier(**kwargs)
        return xgbm
    elif modelType == 'baseNN':
        from keras.models import Sequential
        from keras.layers import Dense
        # create model
        model = Sequential()
        assert args.get('inputParams', None)
        assert args.get('outputParams', None)
        model.add(Dense(inputParams))
        model.add(Dense(outputParams))
        if args.get('compileParams'):
            # Compile model
            model.compile(compileParams)  # loss='categorical_crossentropy', optimizer='adam', metrics=['accuracy']
        return model
    elif modelType == 'lightGBMRegression':
        from pylightgbm.models import GBMRegressor
        lgbm_lreg = GBMRegressor(num_iterations=100,
                                 early_stopping_round=10,
                                 num_leaves=10,
                                 min_data_in_leaf=10)
        return lgbm_lreg
    elif modelType == 'lightGBMBinaryClass':
        from pylightgbm.models import GBMClassifier
        lgbm_bc = GBMClassifier(metric='binary_error', min_data_in_leaf=1)
        return lgbm_bc
    # Clustering models
    elif modelType == 'KMeans':
        assert n_clusters, "Number of clusters argument mandatory"
        cluster_callable = KMeans
        # seed of 10 for reproducibility.
        clusterer = cluster_callable(n_clusters=n_clusters, random_state=10)
        return clusterer
    elif modelType == 'dbscan':
        if not n_clusters:
            logging.warn("Number of clusters irrelevant for cluster type : %s" % (modelType))
        cluster_callable = DBSCAN
        clusterer = cluster_callable(eps=0.5)
        return clusterer
    elif modelType == 'affinity_prop':
        if not n_clusters:
            logging.warn("Number of clusters irrelevant for cluster type : %s" % (modelType))
        clusterer = AffinityPropagation(damping=.9, preference=-200)
        return clusterer
    elif modelType == 'spectral':
        assert n_clusters, "Number of clusters argument mandatory"
        clusterer = SpectralClustering(n_clusters=n_clusters,
                                       eigen_solver='arpack',
                                       affinity="nearest_neighbors")
        return clusterer
    elif modelType == 'birch':
        if not n_clusters:
            logging.warn("Number of clusters irrelevant for cluster type : %s" % (modelType))
        clusterer = Birch(n_clusters=2)
        return clusterer
    elif modelType == 'agglomerativeCluster':
        # connectivity matrix for structured Ward
        connectivity = kneighbors_graph(dataframe, n_neighbors=10, include_self=False)
        # make connectivity symmetric
        connectivity = 0.5 * (connectivity + connectivity.T)
        clusterer = AgglomerativeClustering(n_clusters=cluster,
                                            linkage='ward',
                                            connectivity=connectivity)
        return clusterer
    elif modelType == 'meanShift':
        # estimate bandwidth for mean shift
        bandwidth = cluster.estimate_bandwidth(dataframe, quantile=0.3)
        clusterer = cluster.MeanShift(bandwidth=bandwidth, bin_seeding=True)
        return clusterer
    elif modelType == 'gmm':
        from sklearn import mixture
        gmm = mixture.GaussianMixture(n_components=5, covariance_type='full')
        return gmm
    elif modelType == 'dgmm':
        from sklearn import mixture
        dgmm = mixture.BayesianGaussianMixture(n_components=5, covariance_type='full')
        return dgmm
    else:
        # The original raised a bare string, which is not a valid exception
        raise ValueError('Unknown model type: see utils.py for available')
# -*- coding: utf-8 -*-
"""
@author: Ardalan MEHRANI <*****@*****.**>
@brief:
"""
import numpy as np
from sklearn import datasets, metrics, model_selection
from pylightgbm.models import GBMRegressor

# Parameters
seed = 1337
np.random.seed(seed)  # for reproducibility

X, y = datasets.load_diabetes(return_X_y=True)
x_train, x_test, y_train, y_test = model_selection.train_test_split(
    X, y, test_size=0.2, random_state=seed)

# 'exec_path' is the path to lightgbm executable
clf = GBMRegressor(exec_path="~/Documents/apps/LightGBM/lightgbm",
                   num_iterations=10000,
                   learning_rate=0.01,
                   num_leaves=10,
                   min_data_in_leaf=10,
                   early_stopping_round=10)

clf.fit(x_train, y_train, test_data=[(x_test, y_test)])
y_pred = clf.predict(x_test)
print("Mean Square Error: ", metrics.mean_squared_error(y_test, y_pred))
X_train = X.iloc[train].values
y_train = y.iloc[train].values
X_val = X.iloc[val].values
y_val = y.iloc[val].values

model = GBMRegressor(
    num_threads=8,
    num_iterations=5000,
    verbose=False,
    early_stopping_round=25,
    bagging_seed=2016,
    metric='l1',
    learning_rate=0.05,
    max_depth=12,
    num_leaves=450,
    # num_leaves=127,
    # feature_fraction=params['feature_fraction'],
    # bagging_fraction=params['bagging_fraction'],
    feature_fraction=0.7,
    bagging_fraction=0.7,
    min_data_in_leaf=450,
    max_bin=256,
    # lambda_l1=params['lambda_l1'],
    # lambda_l2=params['lambda_l2']
)

model.fit(X_train, target_transform(y_train),
          test_data=[(X_val, target_transform(y_val))])
y_oob[val] = target_inverse_transform(model.predict(X_val))
def test_grid_search(self):
    param_grid = {
        'learning_rate': [0.01, 0.1, 1],
        'num_leaves': [2, 5, 50],
        'min_data_in_leaf': [1, 10, 100],
        'bagging_fraction': [0.1, 1]
    }
    params = {
        'exec_path': path_to_exec,
        'num_threads': 2,
        'num_iterations': 100,
        'learning_rate': 0.1,
        'min_data_in_leaf': 1,
        'num_leaves': 10,
        'bagging_freq': 2,
        'verbose': False
    }
    clfs = [
        [Xreg, Yreg, 'regression',
         GBMRegressor(boosting_type='gbdt', metric='l2', **params)],
        [Xreg, Yreg, 'regression',
         GBMRegressor(boosting_type='dart', metric='l2', **params)],
        [X, Y, 'classification',
         GBMClassifier(boosting_type='gbdt', metric='binary_logloss', **params)],
        [X, Y, 'classification',
         GBMClassifier(boosting_type='dart', metric='binary_logloss', **params)],
    ]

    for x, y, name, clf in clfs:
        if name == 'regression':
            scorer = metrics.make_scorer(metrics.mean_squared_error,
                                         greater_is_better=False)
            grid = model_selection.GridSearchCV(clf, param_grid,
                                                scoring=scorer, cv=2, refit=True)
            grid.fit(x, y)
            score = metrics.mean_squared_error(y, grid.predict(x))
            print(score)
            assert score < 2000
        else:
            scorer = metrics.make_scorer(metrics.accuracy_score,
                                         greater_is_better=True)
            grid = model_selection.GridSearchCV(clf, param_grid,
                                                scoring=scorer, cv=2, refit=True)
            grid.fit(x, y)
            score = metrics.accuracy_score(y, grid.predict(x))
            print(score)
            assert score > .9
import pandas as pd
from pylightgbm.models import GBMRegressor
from sklearn.cross_validation import train_test_split

data_loc = "C:/Users/rsoni106/Documents/Work/Methodology Work/Kaggle/Completed/Allstate/prepared_data/new_data"

train_data = pd.read_csv(data_loc + "/train_data_py.csv")
event = pd.read_csv(data_loc + "/event_py.csv")
test_data = pd.read_csv(data_loc + "/test_data_py.csv")

vars_model = [x for x in train_data.columns if x not in ("id", "logloss")]  # this creates a tuple

gbm = GBMRegressor(num_iterations=int(2558 / 0.9),
                   learning_rate=0.01,
                   num_leaves=200,
                   min_data_in_leaf=8,
                   feature_fraction=0.3,
                   bagging_fraction=0.8,
                   bagging_freq=100,
                   verbose=True,
                   application='regression',
                   metric='l2',
                   num_threads=2,
                   exec_path='')  # exec_path was left blank in the original; set it to your LightGBM executable

gbm.fit(train_data, event)

from heapq import nlargest
from operator import itemgetter

temp = 0
for k, v in val.items():
    if temp == 10:
        break
    print(k, v)
    temp += 1  # the original never incremented the counter, so the loop never stopped at 10
x_tr = xtrain[inTr]  # restored; the original snippet started at y_tr but uses x_tr below
y_tr = y[inTr]
x_val = xtrain[inTe]
y_val = y[inTe]
pred = np.zeros(x_val.shape[0])

for j in range(nbags):
    print('Bag: ' + str(j))
    rand_seed = random.randint(1, 5000)
    gbmr = GBMRegressor(
        exec_path=path_to_exec,  # Change this to your LightGBM path
        config='',
        application='regression',
        num_iterations=int(2558 / 0.9),
        learning_rate=0.01,
        num_leaves=200,
        num_threads=4,
        min_data_in_leaf=8,
        metric='l1',
        feature_fraction=0.3,
        feature_fraction_seed=rand_seed,
        bagging_fraction=0.8,
        bagging_freq=100,
        bagging_seed=rand_seed,
        verbose=False)

    # Train
    gbmr.fit(x_tr, y_tr, test_data=[(x_val, y_val)])

    # Apply to validation and test data
    print('Bag: ' + str(j) + " Predicting...")
    pred += np.exp(gbmr.predict(x_val)) - shift
    pred_test += np.exp(gbmr.predict(xtest)) - shift
import numpy as np
from pylightgbm.models import GBMRegressor
from sklearn import datasets, metrics, model_selection

# Parameters
seed = 1337
path_to_exec = "~/Documents/apps/LightGBM/lightgbm"

np.random.seed(seed)  # for reproducibility

X, Y = datasets.load_diabetes(return_X_y=True)

# 'exec_path' is the path to lightgbm executable
gbm = GBMRegressor(exec_path=path_to_exec,
                   num_iterations=100,
                   learning_rate=0.1,
                   min_data_in_leaf=1,
                   bagging_freq=10,
                   metric='binary_error',
                   early_stopping_round=10,
                   verbose=False)

param_grid = {
    'learning_rate': [0.1, 0.04],
    'min_data_in_leaf': [1, 10],
    'bagging_fraction': [0.5, 0.9]
}
scorer = metrics.make_scorer(metrics.mean_squared_error, greater_is_better=False)
clf = model_selection.GridSearchCV(gbm, param_grid, scoring=scorer, cv=2)
clf.fit(X, Y)
import numpy as np
from pylightgbm.models import GBMRegressor
from sklearn import datasets, metrics, model_selection

# Parameters
seed = 1337
np.random.seed(seed)  # for reproducibility

X, Y = datasets.load_diabetes(return_X_y=True)

# 'exec_path' is the path to lightgbm executable
gbm = GBMRegressor(exec_path="~/Documents/apps/LightGBM/lightgbm",
                   num_iterations=100,
                   learning_rate=0.1,
                   min_data_in_leaf=1,
                   bagging_freq=10,
                   metric='binary_error',
                   early_stopping_round=10)

param_grid = {
    'learning_rate': [0.1, 0.04],
    'min_data_in_leaf': [1, 10],
    'bagging_fraction': [0.5, 0.9]
}
scorer = metrics.make_scorer(metrics.mean_squared_error, greater_is_better=False)
clf = model_selection.GridSearchCV(gbm, param_grid, scoring=scorer, cv=2)
clf.fit(X, Y)
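# After fitting, GridSearchCV exposes the winning configuration; a short
# follow-up using only standard scikit-learn attributes:
print("Best parameters:", clf.best_params_)
print("Best CV score:", clf.best_score_)  # negated MSE, since greater_is_better=False
best_gbm = clf.best_estimator_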
validate_features.fillna(0, inplace=True)
predict_features.fillna(0, inplace=True)

create_feature_map(train_features.columns.tolist(),
                   '{0}_lgbm_{1}{2}'.format(model_path, exec_time, model_fmap_file))

print('LightGBM Training')
seed = 13
gbmr = GBMRegressor(
    exec_path='/usr/local/lib/python2.7/site-packages/pylightgbm/LightGBM/lightgbm',
    config='',
    application='regression',
    num_iterations=500,
    learning_rate=0.1,
    tree_learner='serial',
    min_data_in_leaf=10,
    metric='auc',
    feature_fraction=0.7,
    feature_fraction_seed=seed,
    bagging_fraction=1,
    bagging_freq=10,
    bagging_seed=seed,
    metric_freq=1,
    early_stopping_round=50
)

# json.dump writes text, so open the file in text mode rather than 'wb+'
json.dump(gbmr.param,
          open('{0}_lgbm_{1}{2}'.format(model_path, exec_time, model_params), 'w'))

gbmr.fit(validate_features.values, validate_labels.values[:, 0],
         test_data=[(train_features.values, train_labels.values[:, 0])])

importance = dict(gbmr.feature_importance(train_features.columns.tolist()))
importance = sorted(importance.items(), key=operator.itemgetter(1))
df = pd.DataFrame(gbmr.feature_importance(train_features.columns.tolist()),
                  columns=['feature', 'importance'])
df['importance'] = df['importance'] / df['importance'].sum()
oof_train = np.zeros(tr_rows,)
seed = 42
nbest = 10000
gbmr = GBMRegressor(
    exec_path='your_LightGBM_exec_path',
    config='',
    application='regression',
    num_iterations=nbest,
    learning_rate=0.002,    # 0.03, 0.002
    num_leaves=200,         # 180
    tree_learner='serial',
    num_threads=48,
    min_data_in_leaf=130,   # 125
    metric='l1',
    feature_fraction=0.27,  # 0.75, 0.3
    feature_fraction_seed=seed,
    bagging_fraction=0.9,   # 0.9
    bagging_freq=5,         # 5
    bagging_seed=seed,
    metric_freq=50,
    verbose=0,
    # min_hessian=5,
    max_bin=850,            # 850
    early_stopping_round=50  # 40
)

best = []
score = []
kf = KFold(tr_rows, n_folds=kfolds, shuffle=True, random_state=123)
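# A minimal sketch of the fold loop this setup appears to feed, assuming the
# old sklearn.cross_validation.KFold API (iterating the object yields index
# pairs), that xtrain/ytrain hold the training matrix and target, and that
# mean_absolute_error is imported from sklearn.metrics:
for i, (inTr, inTe) in enumerate(kf):
    gbmr.fit(xtrain[inTr], ytrain[inTr],
             test_data=[(xtrain[inTe], ytrain[inTe])])
    oof_train[inTe] = gbmr.predict(xtrain[inTe])
    best.append(gbmr.best_round)
    score.append(mean_absolute_error(ytrain[inTe], oof_train[inTe]))
    print('Fold', i, 'best round:', best[-1], 'MAE:', score[-1])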