'x3E', 'x4', 'x5', 'x6' ] df = pd.read_csv('train.csv', index_col=0) df = df.fillna(0) # replace NaN entries df_test = pd.read_csv('test.csv', index_col=0) df_test = df_test.fillna(0) # replace NaN entries weights = [] for index, row in df.iterrows(): weights.append(float(row['Weight'])) X = df[features] Y = df['y'] model = LGBMRegressor() #n_estimators=1000, learning_rate=0.01) param_grid = { 'learning_rate': [0.07, 0.1], 'n_estimators': [10000], 'boosting_type': ['gbdt'], 'min_data_in_leaf': [40], 'num_leaves': [80], 'max_depth': [-1], 'num_iterations': [110] } gbm = GridSearchCV(model, param_grid) gbm.fit(X, Y, sample_weight=weights) print('Best parameters found are:', gbm.best_params_) print('Best score:', gbm.best_score_) print('Feature importances:', list(gbm.best_estimator_.feature_importances_))
# Remaining LightGBM settings; the trailing comment on a key is the name the
# original author associated with it.
lgb_params.update({
    'sub_feature': 0.35,
    'bagging_fraction': 0.85,  # sub_row
    'bagging_freq': 40,
    'num_leaves': 512,  # num_leaf
    'min_data': 500,  # min_data_in_leaf
    'min_hessian': 0.05,  # min_sum_hessian_in_leaf
    'verbose': 0,
    'feature_fraction_seed': 2,
    'bagging_seed': 3,
})

# --- candidate regressors, one per model family ---

# XGBoost
xgb_model = XGBRegressor(**xgb_params)

# LightGBM
lgb_model = LGBMRegressor(**lgb_params)

# Random forest
rf_model = RandomForestRegressor(**rf_params)

# Extra trees (library defaults)
et_model = ExtraTreesRegressor()

# SVR is disabled: too slow on sets larger than 10000 rows
#svr_model = SVR(kernel='rbf', C=1.0, epsilon=0.05)

# Decision tree (library defaults)
dt_model = DecisionTreeRegressor()

# AdaBoost model
print(test.shape)  # (10000, 71)

# The first 71 columns of `train` are the inputs; the remaining columns are
# the regression targets.
x = train[:, :71]
y = train[:, 71:]
print(x.shape)  # (10000, 71)
print(y.shape)  # (10000, 4)

# Hold out 20% of the rows for scoring.
x_train, x_test, y_train, y_test = train_test_split(
    x,
    y,
    test_size=0.2,
    random_state=66,
)

# MultiOutputRegressor wraps the single LGBMRegressor given here.
model = MultiOutputRegressor(
    LGBMRegressor(
        n_estimators=1000,
        learning_rate=0.05,
        max_depth=-1,
        colsample_bytree=0.8,
    )
)
model.fit(x_train, y_train)

score = model.score(x_test, y_test)
print("R2:", score)

# thresholds = np.sort(model.feature_importances_)  # sort feature importances in ascending order
# print(thresholds)
# models=[]
# res = np.array([])
# for thresh in thresholds:
#     selection = SelectFromModel(model, threshold=thresh, prefit=True)
def process(msg):
    """Fit a LightGBM regressor on the DataFrame carried in ``msg.body``.

    Categorical training columns, and the label if it is categorical, are
    replaced *in place* by their int32 category codes before fitting.

    Args:
        msg: Message whose ``body`` is a pandas DataFrame containing the
            columns configured in ``api.config.train_cols`` and
            ``api.config.label_col``.

    Returns:
        A tuple ``(log_text, message)``: the captured log output and an
        ``api.Message`` whose body is the fitted model and whose attributes
        describe the DataFrame that was used.

    Raises:
        TypeError: If ``msg.body`` is not a pandas DataFrame.
        ValueError: If the DataFrame is empty or no label column is configured.
    """
    att_dict = dict()
    att_dict['config'] = dict()
    att_dict['operator'] = 'lightgbm'

    # The equality test is kept on purpose: debug_mode comes from external
    # configuration and may not be a strict bool.
    loglevel = 'DEBUG' if api.config.debug_mode == True else 'INFO'  # noqa: E712
    logger, log_stream = slog.set_logging(att_dict['operator'], loglevel=loglevel)
    logger.info("Process started")
    time_monitor = tp.progress()

    df = msg.body
    if not isinstance(df, pd.DataFrame):
        raise TypeError('Message body does not contain a pandas DataFrame')
    # Validate before training so an empty input gives this clear error rather
    # than whatever the estimator raises.
    if df.empty:
        raise ValueError('DataFrame is empty')

    ###### start of doing calculation

    model = LGBMRegressor(
        n_estimators=200,
        learning_rate=0.03,
        num_leaves=32,
        colsample_bytree=0.9497036,
        subsample=0.8715623,
        max_depth=8,
        reg_alpha=0.04,
        reg_lambda=0.073,
        min_split_gain=0.0222415,
        min_child_weight=40,
    )

    att_dict['config']['train columns'] = api.config.train_cols
    train_cols = tfp.read_list(api.config.train_cols, df.columns)

    att_dict['config']['label'] = api.config.label_col
    label = tfp.read_value(api.config.label_col)
    if not label:
        raise ValueError('Label is mandatory')

    # Replace categorical feature columns with their integer codes.
    for c in df[train_cols].select_dtypes(include='category').columns:
        unique_num = len(df[c].unique())
        # .sum() counts the NaN entries; .count() would report the row count.
        nan_num = df[c].isna().sum()
        logger.debug('Cast to category - {}: unique {}, nan: {} of {}'.format(
            c, unique_num, nan_num, df.shape[0]))
        df[c] = df[c].cat.codes
        df[c] = df[c].astype('int32')

    # pd.api.types.is_categorical no longer exists in pandas 2.x; test the
    # dtype directly. Re-casting an already categorical column is a no-op, so
    # only the conversion to codes remains.
    if isinstance(df[label].dtype, pd.api.types.CategoricalDtype):
        logger.debug('Cast label to <category>')
        df[label] = df[label].cat.codes
        df[label] = df[label].astype('int32')

    print(df.select_dtypes(include='category').head(10))
    logger.debug('Train with {} features'.format(len(train_cols)))
    print(train_cols)
    model.fit(df[train_cols], df[label], eval_metric='auc')

    ###### end of doing calculation

    ##############################################
    #  final infos to attributes and info message
    ##############################################
    logger.info('End of Process: {}'.format(time_monitor.elapsed_time()))

    # bytes / 1024**2 -> megabytes
    att_dict['memory'] = df.memory_usage(deep=True).sum() / 1024**2
    att_dict['columns'] = str(list(df.columns))
    att_dict['shape'] = df.shape
    att_dict['id'] = str(id(df))

    logger.debug('Columns: {}'.format(str(df.columns)))
    logger.debug('Shape (#rows - #columns): {} - {}'.format(
        df.shape[0], df.shape[1]))
    # The value is computed in MB above; the old message mislabelled it as kB.
    logger.debug('Memory: {} MB'.format(att_dict['memory']))

    return log_stream.getvalue(), api.Message(attributes=att_dict, body=model)
def crossValidation(XTraining, kfold, modelName, model='default', verbose=False):
    """Time-series cross-validation over consecutive six-week validation windows.

    Fold ``k`` validates on the window ending ``(k - 1) * 6`` weeks before the
    latest date in ``XTraining`` and trains on every row dated before that
    window. Folds run from the oldest window to the newest.

    Args:
        XTraining: DataFrame with a ``'Date'`` column, a ``'Sales'`` target
            column and the feature columns. ``np.expm1`` is applied to both
            targets and predictions before scoring.
        kfold: Number of validation windows.
        modelName: Label reported in the result; also selects the estimator
            when ``model`` is ``'default'``.
        model: ``'default'`` to use the built-in estimator for ``modelName``,
            otherwise an estimator exposing ``fit``/``predict``.
        verbose: Print the fold number while iterating.

    Returns:
        One-row DataFrame with ``'Model Name'`` and the ``mean +/- std`` of
        MAE, MAPE and RMSE across folds.

    Raises:
        KeyError: If ``model`` is ``'default'`` and ``modelName`` is unknown.
    """
    # Factories instead of instances: only the requested estimator is built,
    # and it is built once rather than all five on every fold.
    modelMap = {
        'Linear Regression': lambda: LinearRegression(),
        'Lasso': lambda: Lasso(alpha=0.01),
        'Random Forest Regressor': lambda: RandomForestRegressor(
            n_estimators=100, n_jobs=-1, random_state=42),
        'XGBoost Regressor': lambda: xgb.XGBRegressor(
            objective='reg:squarederror',
            n_estimators=500,
            eta=0.01,
            max_depth=10,
            subsample=0.7,
            colsample_bytree=0.9),
        'Lightgbm Regressor': lambda: LGBMRegressor(
            num_leaves=10,
            min_data_in_leaf=50,
            n_jobs=-1,
            random_state=42,
            n_estimators=500),
    }
    if isinstance(model, str) and model == 'default':
        model = modelMap[modelName]()

    maeList = []
    mapeList = []
    rmseList = []
    lastDate = XTraining['Date'].max()

    for k in reversed(range(1, kfold + 1)):
        if verbose:
            print(f'\nKFold Number: {k}')

        # Start and end date of this fold's validation window
        startDateValid = lastDate - datetime.timedelta(days=k * 6 * 7)
        endDateValid = lastDate - datetime.timedelta(days=(k - 1) * 6 * 7)

        # NOTE(review): both ends are inclusive, so a boundary date is part of
        # two adjacent validation windows - confirm this is intended.
        training = XTraining[XTraining['Date'] < startDateValid]
        validation = XTraining[(XTraining['Date'] >= startDateValid)
                               & (XTraining['Date'] <= endDateValid)]

        XKFoldTraining = training.drop(['Date', 'Sales'], axis=1)
        yKFoldTraining = training['Sales']
        XKFoldValidation = validation.drop(['Date', 'Sales'], axis=1)
        yKFoldValidation = validation['Sales']

        # The same estimator object is refitted on every fold.
        model.fit(XKFoldTraining, yKFoldTraining)
        yhat = model.predict(XKFoldValidation)

        # Score under the real model name (previously always labelled
        # 'Linear Regression').
        modelResult = mlError(modelName, np.expm1(yKFoldValidation), np.expm1(yhat))

        maeList.append(modelResult['MAE'].tolist())
        mapeList.append(modelResult['MAPE'].tolist())
        rmseList.append(modelResult['RMSE'].tolist())

    def _meanStd(values):
        # "<mean> +/- <std>", both rounded to two decimals
        return (np.round(np.mean(values), 2).astype(str) + ' +/- '
                + np.round(np.std(values), 2).astype(str))

    dictResult = {
        'Model Name': [modelName],
        'MAE CV': [_meanStd(maeList)],
        'MAPE CV': [_meanStd(mapeList)],
        'RMSE CV': [_meanStd(rmseList)],
    }
    return pd.DataFrame(dictResult)
# print(new_examDf[new_examDf.isnull() == True].count())  # missing-value check: an output of 0 means the column has no missing values

# Split into training and test data
# X_train, X_test, y_train, y_test = train_test_split(data, target, test_size=0.2)
# X_train,X_test,y_train,y_test = train_test_split(features,targets,test_size=0.25)
# Xiaomeng
X_train, X_test, y_train, y_test = train_test_split(
    new_examDf.iloc[:, :34],  # columns 0..33: features
    new_examDf.iloc[:, 34],   # column 34: target
    train_size=0.75,
    random_state=0,
)

# Train the model; the held-out split is used as the evaluation set for
# early stopping.
gbm = LGBMRegressor(
    objective='regression',
    num_leaves=31,
    learning_rate=0.1,
    n_estimators=40,
)
gbm.fit(
    X_train,
    y_train,
    eval_set=[(X_test, y_test)],
    eval_metric='l1',
    early_stopping_rounds=5,
)

# Save the model
joblib.dump(gbm, 'loan_model.pkl')

# Load the model back
gbm = joblib.load('loan_model.pkl')

# Predict using the best iteration recorded during training
y_pred = gbm.predict(X_test, num_iteration=gbm.best_iteration_)
def get_model_from_name(model_name, training_params=None, is_hp_search=False):
    """Return an unfitted estimator for ``model_name`` with its parameters set.

    Stock parameters for the model are looked up first, reduced for
    hyper-parameter searches, then overridden by ``training_params``.

    Args:
        model_name: Key identifying the estimator (e.g. ``'LGBMRegressor'``,
            ``'DeepLearningClassifier'``).
        training_params: Optional dict of parameters that overrides the stock
            values for this model.
        is_hp_search: When true, use cheaper settings for DeepLearning*
            (50 epochs) and LGBM* (500 estimators) models.

    Returns:
        The estimator instance after ``set_params`` has been applied.

    Raises:
        KeyError: If ``model_name`` is unknown or its backing library is not
            installed.
    """
    global keras_imported

    # For Keras
    epochs = 1000

    all_model_params = {
        'LogisticRegression': {},
        'RandomForestClassifier': {'n_jobs': -2, 'n_estimators': 30},
        'ExtraTreesClassifier': {'n_jobs': -1},
        'AdaBoostClassifier': {},
        'SGDClassifier': {'n_jobs': -1},
        'Perceptron': {'n_jobs': -1},
        'LinearSVC': {'dual': False},
        'LinearRegression': {'n_jobs': -2},
        'RandomForestRegressor': {'n_jobs': -2, 'n_estimators': 30},
        'LinearSVR': {'dual': False, 'loss': 'squared_epsilon_insensitive'},
        'ExtraTreesRegressor': {'n_jobs': -1},
        'MiniBatchKMeans': {'n_clusters': 8},
        'GradientBoostingRegressor': {'presort': False, 'learning_rate': 0.1, 'warm_start': True},
        'GradientBoostingClassifier': {'presort': False, 'learning_rate': 0.1, 'warm_start': True},
        'SGDRegressor': {'shuffle': False},
        'PassiveAggressiveRegressor': {'shuffle': False},
        'AdaBoostRegressor': {},
        'LGBMRegressor': {'n_estimators': 2000, 'learning_rate': 0.15, 'num_leaves': 8,
                          'lambda_l2': 0.001, 'histogram_pool_size': 16384},
        'LGBMClassifier': {'n_estimators': 2000, 'learning_rate': 0.15, 'num_leaves': 8,
                           'lambda_l2': 0.001, 'histogram_pool_size': 16384},
        'DeepLearningRegressor': {'epochs': epochs, 'batch_size': 50, 'verbose': 2},
        'DeepLearningClassifier': {'epochs': epochs, 'batch_size': 50, 'verbose': 2},
        'CatBoostRegressor': {},
        'CatBoostClassifier': {},
    }

    is_deep_learning = model_name.startswith('DeepLearning')
    model_params = all_model_params.get(model_name, {})

    if is_hp_search:
        # Cheaper settings while searching hyper-parameters.
        if is_deep_learning:
            model_params['epochs'] = 50
        if model_name.startswith('LGBM'):
            model_params['n_estimators'] = 500

    if training_params is not None:
        print('Now using the model training_params that you passed in:')
        print(training_params)
        # Overwrite our stock params with what the user passes in (i.e., if the
        # user wants 10,000 trees, we will let them do it)
        model_params.update(training_params)
        print('After overwriting our defaults with your values, here are the final params that will be used to initialize the model:')
        print(model_params)

    model_map = {
        # Classifiers
        'LogisticRegression': LogisticRegression(),
        'RandomForestClassifier': RandomForestClassifier(),
        'RidgeClassifier': RidgeClassifier(),
        'GradientBoostingClassifier': GradientBoostingClassifier(),
        'ExtraTreesClassifier': ExtraTreesClassifier(),
        'AdaBoostClassifier': AdaBoostClassifier(),
        'LinearSVC': LinearSVC(),

        # Regressors
        'LinearRegression': LinearRegression(),
        'RandomForestRegressor': RandomForestRegressor(),
        'Ridge': Ridge(),
        'LinearSVR': LinearSVR(),
        'ExtraTreesRegressor': ExtraTreesRegressor(),
        'AdaBoostRegressor': AdaBoostRegressor(),
        'RANSACRegressor': RANSACRegressor(),
        'GradientBoostingRegressor': GradientBoostingRegressor(),
        'Lasso': Lasso(),
        'ElasticNet': ElasticNet(),
        'LassoLars': LassoLars(),
        'OrthogonalMatchingPursuit': OrthogonalMatchingPursuit(),
        'BayesianRidge': BayesianRidge(),
        'ARDRegression': ARDRegression(),

        # Clustering
        'MiniBatchKMeans': MiniBatchKMeans(),
    }

    # Fall back to the argument-free constructors when max_iter/tol are
    # rejected with a TypeError.
    try:
        model_map['SGDClassifier'] = SGDClassifier(max_iter=1000, tol=0.001)
        model_map['Perceptron'] = Perceptron(max_iter=1000, tol=0.001)
        model_map['PassiveAggressiveClassifier'] = PassiveAggressiveClassifier(max_iter=1000, tol=0.001)
        model_map['SGDRegressor'] = SGDRegressor(max_iter=1000, tol=0.001)
        model_map['PassiveAggressiveRegressor'] = PassiveAggressiveRegressor(max_iter=1000, tol=0.001)
    except TypeError:
        model_map['SGDClassifier'] = SGDClassifier()
        model_map['Perceptron'] = Perceptron()
        model_map['PassiveAggressiveClassifier'] = PassiveAggressiveClassifier()
        model_map['SGDRegressor'] = SGDRegressor()
        model_map['PassiveAggressiveRegressor'] = PassiveAggressiveRegressor()

    if xgb_installed:
        model_map['XGBClassifier'] = XGBClassifier()
        model_map['XGBRegressor'] = XGBRegressor()

    if lgb_installed:
        model_map['LGBMRegressor'] = LGBMRegressor()
        model_map['LGBMClassifier'] = LGBMClassifier()

    if catboost_installed:
        model_map['CatBoostRegressor'] = CatBoostRegressor()
        model_map['CatBoostClassifier'] = CatBoostClassifier()

    if is_deep_learning:
        if not keras_imported:
            # Suppress some level of logs if TF is installed (but allow it to
            # not be installed, and use Theano instead)
            try:
                os.environ['TF_CPP_MIN_VLOG_LEVEL'] = '3'
                os.environ['TF_CPP_MIN_LOG_LEVEL'] = '3'
                from tensorflow import logging
                logging.set_verbosity(logging.INFO)
            except Exception:
                # Best effort only: TensorFlow may be missing or may not expose
                # this logging API. Unlike a bare except, this still lets
                # KeyboardInterrupt/SystemExit propagate.
                pass

            global maxnorm
            global Dense, Dropout
            global LeakyReLU, PReLU, ThresholdedReLU, ELU
            global Sequential
            global keras_load_model
            global regularizers, optimizers
            global Activation
            global KerasRegressor, KerasClassifier

            # Keras is imported lazily, and published as module globals, the
            # first time a DeepLearning* model is requested.
            from keras.constraints import maxnorm
            from keras.layers import Activation, Dense, Dropout
            from keras.layers.advanced_activations import LeakyReLU, PReLU, ThresholdedReLU, ELU
            from keras.models import Sequential
            from keras.models import load_model as keras_load_model
            from keras import regularizers, optimizers
            from keras.wrappers.scikit_learn import KerasRegressor, KerasClassifier
            keras_imported = True

        model_map['DeepLearningClassifier'] = KerasClassifier(build_fn=make_deep_learning_classifier)
        model_map['DeepLearningRegressor'] = KerasRegressor(build_fn=make_deep_learning_model)

    try:
        model_without_params = model_map[model_name]
    except KeyError:
        print('It appears you are trying to use a library that is not available when we try to import it, or using a value for model_names that we do not recognize')
        raise

    # Force single-process execution when the test-suite flag is set.
    if os.environ.get('is_test_suite', False) == 'True':
        if 'n_jobs' in model_params:
            model_params['n_jobs'] = 1

    model_with_params = model_without_params.set_params(**model_params)

    return model_with_params
# train_ds = lgb.Dataset(x_train, label = y_train)
# test_ds = lgb.Dataset(x_test, label = y_test)
# params = {'learning_rate' : 0.01, 'max_depth': 13 , 'boosting' : 'gbdt',
#           'objective':'regression', 'metric':'mse', 'is_training_metric':True,
#           'num_leaves':144, 'feature_fraction':0.9, 'bagging_fraction':0.7,
#           'bagging_freq':5, 'seed':2020}
# model = lgb.train(params, train_ds, 1000, test_ds,verbose_eval=100, early_stopping_rounds=100)

# NOTE(review): `parameter` is defined but never passed to the model below, so
# the regressor trains with library defaults - confirm whether that is intended.
parameter = {
    'num_iterations': 1000,
    'learning_rate': 0.05,
    'early_stopping_round': 20,
}

model = LGBMRegressor()

# Train and validation sets are both monitored; training stops early after
# 100 rounds without improvement.
model.fit(x_train,
          y_train,
          verbose=True,
          eval_metric="error",
          eval_set=[(x_train, y_train), (x_test, y_test)],
          early_stopping_rounds=100)
# rmse, mae, logloss, error, auc

y_pred = model.predict(x_test)

# r2_score takes (y_true, y_pred); the metric is not symmetric, so the swapped
# order used before reported a different number.
r2 = r2_score(y_test, y_pred)
print(f"r2: {r2}")
def cv(self, nfolds=5, submission=True):
    """Cross-validate an LGBMRegressor on log1p targets and collect predictions.

    One model is trained per fold on ``log1p`` of the target. Out-of-fold
    predictions, per-fold test predictions and feature importances are
    collected; progress is written to ``self.logfile``; the fitted models are
    appended to ``self.regressors``. The ``fullVisitorId`` column, if present,
    is dropped in place from ``self.x_train`` and ``self.x_test``.

    Args:
        nfolds: Number of folds requested from ``data_prepare.get_folds``.
        submission: When true, folds are built from the ``totals.pageviews``
            column with the index reset, and the averaged test predictions
            are written to ``../output/submission/<name>.csv``.

    Returns:
        Tuple ``(feature_importance_df, full_rmse, oof_preds, preds)`` where
        ``preds`` is the mean test prediction over folds (NumPy array).
    """
    self.regressors.clear()
    self.feature_importance_df = pd.DataFrame()

    if not submission:
        folds = data_prepare.get_folds(df=self.x_train, n_splits=nfolds)
    else:
        folds = data_prepare.get_folds(
            df=self.x_train[['totals.pageviews']].reset_index(),
            n_splits=nfolds)

    if 'fullVisitorId' in self.x_train.columns:
        self.x_train.drop('fullVisitorId', axis=1, inplace=True)
    if 'fullVisitorId' in self.x_test.columns:
        self.x_test.drop('fullVisitorId', axis=1, inplace=True)

    oof_preds = np.zeros(self.x_train.shape[0])
    preds_test = np.empty((nfolds, self.x_test.shape[0]))

    self.logfile.write('param: {}\n'.format(self.param))
    self.logfile.write('fold: {}\n'.format(nfolds))
    self.logfile.write('data shape: {}\n'.format(self.x_train.shape))
    self.logfile.write('features: {}\n'.format(
        self.x_train.columns.tolist()))
    if self.comment is not None:
        self.logfile.write('comment: {}\n'.format(self.comment))
    self.logfile.write('output: ../output/{}.csv\n'.format(self.name))
    self.logfile.flush()

    for n_fold, (train_idx, valid_idx) in enumerate(folds):
        fstart = time.time()
        train_x, train_y = self.x_train.iloc[train_idx], self.y_train.iloc[train_idx]
        valid_x, valid_y = self.x_train.iloc[valid_idx], self.y_train.iloc[valid_idx]

        # lgbRegressor parameters found by Bayesian optimization
        clf = LGBMRegressor(**self.param)
        clf.fit(train_x,
                np.log1p(train_y),
                eval_set=[(valid_x, np.log1p(valid_y))],
                eval_metric='rmse',
                verbose=100,
                early_stopping_rounds=200)

        oof_preds[valid_idx] = clf.predict(
            valid_x, num_iteration=clf.best_iteration_)
        preds_test[n_fold, :] = clf.predict(
            self.x_test, num_iteration=clf.best_iteration_)

        # Negative predictions are clipped to zero.
        oof_preds[oof_preds < 0] = 0
        preds_test[preds_test < 0] = 0

        fold_importance_df = pd.DataFrame()
        fold_importance_df["feature"] = self.x_train.columns.tolist()
        fold_importance_df["importance"] = clf.feature_importances_
        fold_importance_df["fold"] = n_fold + 1
        self.feature_importance_df = pd.concat(
            [self.feature_importance_df, fold_importance_df], axis=0)

        strlog = '[{}][{:.1f} sec] Fold {} RMSE : {:.6f}'.format(
            str(datetime.now()),
            time.time() - fstart, n_fold + 1,
            mean_squared_error(np.log1p(valid_y),
                               oof_preds[valid_idx])**.5)
        print(strlog)
        self.logfile.write(strlog + '\n')
        self.logfile.flush()

        self.regressors.append(clf)
        del clf, train_x, train_y, valid_x, valid_y
        gc.collect()

    full_rmse = mean_squared_error(np.log1p(self.y_train), oof_preds)**.5
    strlog = 'Full RMSE score {:.6f}'.format(full_rmse)
    print(strlog)
    self.logfile.write(strlog + '\n')

    preds = preds_test.mean(axis=0)

    if submission:
        # `preds` is a NumPy array and has no to_csv(); wrap it in a Series
        # keyed by the test index so it can be written with that index.
        submission_series = pd.Series(preds,
                                      index=self.x_test.index,
                                      name='PredictedLogRevenue')
        submission_series.to_csv(
            '../output/submission/{}.csv'.format(self.name),
            index=True,
            header=True)

    # Log the 100 features with the highest mean importance across folds.
    cols = self.feature_importance_df[[
        "feature", "importance"
    ]].groupby("feature").mean().sort_values(
        by="importance", ascending=False)[:100].index
    self.logfile.write('top features:\n')
    for c in cols:
        self.logfile.write('{}\n'.format(c))
    self.logfile.flush()

    self.display_importances(self.feature_importance_df, self.name)

    # for stack
    np.save('../output/feats/{}_trn_prd_feats'.format(self.name), oof_preds)
    np.save('../output/feats/{}_tes_prd_feats'.format(self.name), preds)

    return self.feature_importance_df, full_rmse, oof_preds, preds
def _best_params_by_explained_variance(build, combos, X_train, X_test, y_train,
                                       y_test, skip_failures=False):
    """Fit ``build(*combo)`` for every combo and return ``(best_score, best_combo)``.

    Each candidate is scored with ``explained_variance_score`` on the test
    split. On ties the earliest combo wins. With ``skip_failures`` a combo
    whose fit or predict raises is skipped instead of aborting the search.

    Raises:
        ValueError: If no combo could be scored.
    """
    best_score = None
    best_combo = None
    for combo in combos:
        try:
            estimator = build(*combo)
            estimator.fit(X_train, y_train)
            score = explained_variance_score(y_test, estimator.predict(X_test))
        except Exception:
            if not skip_failures:
                raise
            continue
        if best_score is None or score > best_score:
            best_score, best_combo = score, list(combo)
    if best_combo is None:
        raise ValueError("No parameter combination could be fitted and scored.")
    return best_score, best_combo


def find_regression_model(X_train, X_test, y_train, y_test, ensembling=True):
    """Grid-search several regressors and report the best by explained variance.

    With ``ensembling`` the candidates are XGBoost and LightGBM; otherwise
    they are Lasso, Ridge, ElasticNet, LinearRegression and SVR. Every
    candidate is fitted on the training split and scored on the test split.

    Args:
        X_train, X_test: Feature matrices for fitting and scoring.
        y_train, y_test: Matching target vectors.
        ensembling: Select the boosted-tree candidates instead of the
            linear/SVR candidates.

    Returns:
        A fresh, *unfitted* estimator configured with the best parameters
        found (earlier versions only printed it and returned ``None``).
    """
    from itertools import product  # local import keeps this block self-contained

    data = (X_train, X_test, y_train, y_test)
    models = []
    overall_accuracies = []

    if ensembling:
        print("Ensembling is enabled.")

        # --- XGBoost ---
        learning_rate_list = [0.1, 0.01, 0.001]
        gamma_list = [0, 1, 5]
        colsample_bytree_list = [0.3, 0.5, 0.8, 1]

        XGB_max_scores, best = _best_params_by_explained_variance(
            lambda lr, g, cb: XGBRegressor(learning_rate=lr,
                                           gamma=g,
                                           colsample_bytree=cb,
                                           objective="reg:squarederror"),
            product(learning_rate_list, gamma_list, colsample_bytree_list),
            *data)
        print(
            "XGBoost regressor explained variance score is:", XGB_max_scores,
            "with the following values\nfor learning rate, g, and number of columns used by each tree:",
            best)
        lr_best, g_best, cb_best = best
        models.append(
            XGBRegressor(learning_rate=lr_best,
                         gamma=g_best,
                         colsample_bytree=cb_best,
                         objective="reg:squarederror"))
        overall_accuracies.append(XGB_max_scores)

        # --- LightGBM ---
        learning_rate_list = [0.1, 0.01, 0.001]
        colsample_bytree_list = [0.3, 0.5, 0.8, 1]

        LGBM_max_scores, best = _best_params_by_explained_variance(
            lambda lr, cb: LGBMRegressor(learning_rate=lr, colsample_bytree=cb),
            product(learning_rate_list, colsample_bytree_list),
            *data)
        # The message used to say "objective", but the second searched value
        # is colsample_bytree.
        print("LightGBM regressor explained variance score is:",
              LGBM_max_scores,
              "with the following values\nfor learning rate and colsample_bytree:",
              best)
        lr_best, cb_best = best
        models.append(
            LGBMRegressor(learning_rate=lr_best, colsample_bytree=cb_best))
        overall_accuracies.append(LGBM_max_scores)
    else:
        print("Ensembling is disabled.")
        alpha_list = [100, 50, 25, 10, 5, 1, 0.75, 0.5, 0.25, 0.1, 1e-5]
        # NOTE(review): 10000 appears twice in this list - confirm whether the
        # last entry was meant to be a different value.
        max_iter_list = [100000, 50000, 10000, 5000, 10000]

        # --- Lasso ---
        lasso_max_scores, best = _best_params_by_explained_variance(
            lambda a, i: Lasso(alpha=a, max_iter=i),
            product(alpha_list, max_iter_list),
            *data)
        print("Lasso model explained variance score is:", lasso_max_scores,
              "with the values for alpha and max iterations: ", best)
        a_best, i_best = best
        models.append(Lasso(alpha=a_best, max_iter=i_best))
        overall_accuracies.append(lasso_max_scores)

        # --- Ridge ---
        ridge_max_scores, best = _best_params_by_explained_variance(
            lambda a, i: Ridge(alpha=a, max_iter=i),
            product(alpha_list, max_iter_list),
            *data)
        print("Ridge model explained variance score is:", ridge_max_scores,
              "with the values for alpha and max iterations: ", best)
        a_best, i_best = best
        models.append(Ridge(alpha=a_best, max_iter=i_best))
        overall_accuracies.append(ridge_max_scores)

        # --- Elastic net ---
        # This search starts from a clean slate; previously the ridge scores
        # were still in the list and could be reported as the elastic-net best.
        # Failing combinations are skipped, as before.
        elastic_max_scores, best = _best_params_by_explained_variance(
            lambda a, i: ElasticNet(alpha=a, max_iter=i, l1_ratio=0.5),
            product(alpha_list, max_iter_list),
            *data,
            skip_failures=True)
        a_best, i_best = best
        models.append(ElasticNet(alpha=a_best, max_iter=i_best, l1_ratio=0.5))
        overall_accuracies.append(elastic_max_scores)
        # The fixed l1 ratio is appended so the values match the message.
        print("Elastic net regression explained variance score is:",
              elastic_max_scores,
              "with the values\nfor alpha, max iterations, and l1 ratio: ",
              best + [0.5])

        # --- Linear regression ---
        linear = LinearRegression()
        linear.fit(X_train, y_train)
        y_pred = linear.predict(X_test)
        linear_score = explained_variance_score(y_test, y_pred)
        print("Linear regression explained variance score is:", linear_score,
              "using\n", linear)
        models.append(LinearRegression())
        overall_accuracies.append(linear_score)

        # --- SVR ---
        gamma_list = [1e-3, 1e-5, 1e-7, 1e-9]
        c_list = [1, 10, 100, 1000]
        kernel_list = ["linear", "rbf", "poly"]

        SVR_max_scores, best = _best_params_by_explained_variance(
            lambda c, g, k: SVR(gamma=g, C=c, kernel=k),
            product(c_list, gamma_list, kernel_list),
            *data)
        c_best, g_best, k_best = best
        print("Support vector regressor explained variance score is:",
              SVR_max_scores, "with the following values\nfor g, c, and k:",
              [g_best, c_best, k_best])
        models.append(SVR(gamma=g_best, C=c_best, kernel=k_best))
        overall_accuracies.append(SVR_max_scores)

    top_accuracy = max(overall_accuracies)
    best_model = models[overall_accuracies.index(top_accuracy)]
    print(
        "\nThe best model found for the regression problem on the dataset is:\n",
        best_model)
    return best_model
def _init_train_model(train_model):
    """Return the unfitted model configured for the ``train_model`` key.

    Raises:
        ValueError: If ``train_model`` is not 'lr', 'bayes-lgbm' or 'bayes-xgb'.
    """
    if train_model == 'lr':
        # `normalize` is no longer accepted by recent scikit-learn releases;
        # False was its default, so dropping it leaves the fit unchanged.
        return LinearRegression(fit_intercept=True, n_jobs=-1)

    if train_model == 'bayes-lgbm':
        estimator = LGBMRegressor(
            random_state=1,
            n_jobs=12,
            verbose=-1,
            num_iterations=1000,
            boosting_type=None,
            learning_rate=0.03,
            subsample=0.7,
            boosting='dart',
        )
        lgbm_pbounds = {
            # 'boosting':['gbdt','dart'],
            # 'learning_rate': (0.01, 1.),
            # 'n_estimators': (2, 2000),
            'max_depth': (3, 12),
            # 'min_child_weight': (0., 100.),
            # 'min_data_in_leaf' : (1, 40),
            # large num_leaves helps improve accuracy but might lead to over-fitting
            'num_leaves': (2, 2000),
            # 'boosting_type' : ['gbdt', 'dart'], # for better accuracy -> try dart
            'objective': ['rmse', 'mae', 'tweedie'],
            # large max_bin helps improve accuracy but might slow down training progress
            'max_bin': (128, 10000),
            # 'colsample_bytree' : (0.3,1),
            # 'subsample' : (0.3, 1.),
            # 'reg_alpha' : (0., 300.),
            # 'reg_lambda' : (0., 300.),
        }
        return tycho.BayesRegressor(estimator=estimator, pbounds=lgbm_pbounds)

    if train_model == 'bayes-xgb':
        estimator = XGBRegressor(
            random_state=1,
            nthread=12,
            tree_method='gpu_hist',
            single_precision_histogram=True,
            validate_parameters=True,  # was misspelled 'validate_paramters'
        )
        xgb_pbounds = {
            'booster': ['dart', 'gbtree', 'gblinear'],
            'max_depth': (3, 11),
            # 'learning_rate': (0.1, 0.5),
            'subsample': (0.1, 1.),
            # 'sampling_metod':['uniform','gradient_based'],
            'colsample_bytree': (0.1, 1.),
            # 'colsample_bylevel': (0.1, 1.),
            # large max_bin helps improve accuracy but might slow down training progress
            'max_bin': (2, 10000),
            # 'grow_policy':['depthwise','lossguide'],
            # 'min_child_weight': (0., 100),
            'reg_alpha': (0., 250.),
            'reg_lambda': (0., 250.),
            'gamma': (0., 10.),
            # 'objective': ['reg:tweedie'],
        }
        return tycho.BayesRegressor(estimator=estimator, pbounds=xgb_pbounds)

    raise ValueError(f'Unsupported config.TRAIN_MODEL: {train_model!r}')


def train(save_pickles=True):
    """Fit one model per target column and return train/test prediction frames.

    Reads the ``etl_L3`` table, sanitizes and splits it, fits the
    preprocessing pipeline on the training features, then trains the model
    selected by ``config.TRAIN_MODEL`` for every column in
    ``config.ML_Y_COLS``, logging MAE/MAPE on both splits.

    Args:
        save_pickles: When true, pickle the preprocessing pipeline and each
            fitted model under ``models/<config.TRAIN_MODEL>/``.

    Returns:
        Tuple ``(train_out_df, test_out_df)``: identifying columns, the true
        targets and one ``pred_<y_col>`` column per trained target.

    Raises:
        ValueError: If ``config.TRAIN_MODEL`` names an unsupported model.
    """
    # --- establish SQL Connection ---
    SQL = tycho.PostgreSQLCon()
    SQL.make_con()

    # --- Read in ETL Pickle ---
    merged = SQL.sql_to_pandas('etl_L3')

    # --- Sanitize ---
    ColumnSanitize = tycho.ColumnSanitizer()
    clean = ColumnSanitize.sanitize(merged)

    # --- Create average lookup tables ---
    avg_table = tycho.calc_average_y_vals_per_MW(clean)

    # --- Split ---
    Splitter = tycho.FourWaySplit()
    X_train_df, X_test_df, y_train_all, y_test_all = Splitter.split(clean)

    # ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
    # ~~~~~~~~~~~~~~ Pipeline ~~~~~~~~~~~~~~~~~~
    # ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
    pandas_pipe = Pipeline(steps=[
        ('capacity', tycho.CapacityFeatures()),
        ('date', tycho.DateFeatures()),
        ('avg_values', tycho.ApplyAvgY(avg_table)),
        ('dropnull', tycho.DropNullColumns()),
        ('onehot', tycho.OneHotEncodeWithThresh()),
    ])

    numpy_pipe = Pipeline(steps=[
        ('imputer', SimpleImputer()),
        ('scaler', tycho.LowMemoryMinMaxScaler()),
    ])

    preprocess_pipe = Pipeline(steps=[
        ('pd', pandas_pipe),
        ('np', numpy_pipe),
    ])

    # --- Fit/transform ---
    X_train = preprocess_pipe.fit_transform(X_train_df)
    X_test = preprocess_pipe.transform(X_test_df)

    # --- Create complete dfs for output ---
    id_cols = [
        'datetime_utc', 'plant_id_wri', 'estimated_generation_gwh',
        'primary_fuel'
    ]
    train_out_df = pd.concat([X_train_df[id_cols], y_train_all], axis='columns')
    test_out_df = pd.concat([X_test_df[id_cols], y_test_all], axis='columns')

    # --- output preprocessing pipe ---
    if save_pickles:
        out_path = os.path.join('models', config.TRAIN_MODEL)
        # Create the target directory so a first run does not fail on open().
        os.makedirs(out_path, exist_ok=True)
        with open(os.path.join(out_path, 'pipe.pkl'), 'wb') as handle:
            pickle.dump(preprocess_pipe, handle)

    # ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
    # ~~~~~~~~~~~~ Train Model ~~~~~~~~~~~~~~~~~
    # ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
    for y_col in config.ML_Y_COLS:

        log.info('\n')
        log.info(f'....beginning fit for {y_col} using {config.TRAIN_MODEL}')

        # --- Subset y ---
        y_train = np.array(y_train_all[y_col])
        y_test = np.array(y_test_all[y_col])

        # --- Initialize Model ---
        # Unknown model keys now raise ValueError instead of surfacing later
        # as a NameError on `model`.
        model = _init_train_model(config.TRAIN_MODEL)

        # --- Fit ---
        model.fit(X_train, y_train)

        # --- Evaluate on train ---
        y_train_pred = model.predict(X_train)
        log.info(
            f'........best train MAE for {y_col}: {mae(y_train, y_train_pred)}'
        )
        log.info(
            f'........best train mape for {y_col}: {mape(y_train, y_train_pred)}'
        )
        log.info(
            f'........average value for {y_col} is {y_train.mean()}, MAE as a percent is {mae(y_train, y_train_pred) / y_train.mean()}'
        )

        # --- Predict on test ---
        y_pred = model.predict(X_test)
        log.info(f'........best test mae for {y_col}: {mae(y_test, y_pred)}')
        log.info(f'........best test mape for {y_col}: {mape(y_test, y_pred)}')
        log.info(
            f'........average value for {y_col} is {y_test.mean()}, MAE as a percent is {mae(y_test, y_pred) / y_test.mean()}'
        )

        if save_pickles:
            model_path = os.path.join(
                out_path, f'model_{y_col}_{config.TRAIN_MODEL}.pkl')
            if config.TRAIN_MODEL == 'tpot':
                # NOTE: no 'tpot' model is initialised in this function, so
                # this branch only becomes reachable once one is added.
                # --- Output model pipeline ---
                model.export(
                    os.path.join(out_path, f'tpot_best_pipe_{y_col}.py'))
                best = model.fitted_pipeline_
                with open(model_path, 'wb') as handle:
                    pickle.dump(best, handle)
            else:
                # --- Output model ---
                with open(model_path, 'wb') as handle:
                    pickle.dump(model, handle)

        # --- save predictions to out dfs ---
        train_out_df[f'pred_{y_col}'] = y_train_pred
        test_out_df[f'pred_{y_col}'] = y_pred

    return train_out_df, test_out_df
def create_model(hyperparams):
    """Build an unfitted ``LGBMRegressor`` from a mapping of keyword arguments.

    Args:
        hyperparams: Mapping unpacked as keyword arguments into the
            ``LGBMRegressor`` constructor.

    Returns:
        The newly constructed regressor.
    """
    # lightgbm is imported only when a model is actually requested.
    import lightgbm

    regressor_cls = lightgbm.LGBMRegressor
    return regressor_cls(**hyperparams)
gc.collect()

drop_cols = ["codmes"]
test_preds = []
train_preds = []
y_train["target"] = y_train["margen"].astype("float32")

# One model per distinct `codmes` value: rows with that value are held out for
# validation and every other row is used for training.
for mes in X_train.codmes.unique():
    print("*" * 10, mes, "*" * 10)

    holdout_rows = X_train.codmes == mes

    # Training part: targets are looked up by index before the fold column
    # is removed from the features.
    Xt = X_train[~holdout_rows]
    yt = y_train.loc[Xt.index, "target"]
    Xt = Xt.drop(drop_cols, axis=1)

    # Validation part: Xv keeps the fold column; a dropped copy feeds the model.
    Xv = X_train[holdout_rows]
    yv = y_train.loc[Xv.index, "target"]
    Xv_features = Xv.drop(drop_cols, axis=1)

    learner = LGBMRegressor(n_estimators=5000)
    learner.fit(
        Xt,
        yt,
        early_stopping_rounds=50,
        eval_metric="mae",
        eval_set=[(Xt, yt), (Xv_features, yv)],
        verbose=50,
    )
    gc.collect()

    # Test predictions, one named series per fold.
    fold_test_pred = learner.predict(X_test.drop(drop_cols, axis=1))
    test_preds.append(
        pd.Series(fold_test_pred, index=X_test.index, name="fold_" + str(mes)))

    # Predictions for the held-out rows of this fold.
    fold_holdout_pred = learner.predict(Xv.drop(drop_cols, axis=1))
    train_preds.append(
        pd.Series(fold_holdout_pred, index=Xv.index, name="probs"))
#############################################
from lightgbm import LGBMRegressor

lgbm = LGBMRegressor()

# LightGBM grid with a single value per parameter
# (n_estimators, max_depth, num_leaves, sub_sample, colsample_bytree, ...)
lgbm_param_grid = dict(
    n_estimators=[50],
    learning_rate=[0.1],
    colsample_bytree=[0.6],
    max_depth=[-1],
    num_leaves=[31],
    reg_alpha=[1.5],
    reg_lambda=[0],
    min_split_gain=[0],
    subsample=[0.2],
    subsample_freq=[0],
)
"""Module that contains the model configuration used in the training pipeline.""" from sklearn.pipeline import Pipeline from sklearn.preprocessing import OneHotEncoder, StandardScaler from sklego.preprocessing import ColumnSelector from sklearn.compose import ColumnTransformer from lightgbm import LGBMRegressor from src.config import config RUN_NAME = "lightgbm" #Prepare pipeline numeric_transformer = Pipeline(steps=[('scaler', StandardScaler())]) categorical_transformer = OneHotEncoder(handle_unknown='ignore') column_selector = ColumnSelector(config.FEATURES) preprocessor = ColumnTransformer(transformers=[('num', numeric_transformer, config.NUMERIC_FEATURES), ('cat', categorical_transformer, config.CATEGORICAL_FEATURES)]) #Create model xgb_model = LGBMRegressor() model = Pipeline([('column_selector', column_selector), ("preprocessor", preprocessor), ("regressor", xgb_model)])
def test_lightgbm_regressor2(self):
    """Dump a two-tree, depth-1 LightGBM regressor under the suffix "2"."""
    regressor_kwargs = {
        "n_estimators": 2,
        "max_depth": 1,
        "min_child_samples": 1,
    }
    regressor = LGBMRegressor(**regressor_kwargs)
    dump_single_regression(regressor, suffix="2")
# XGB is tuned using the result obtained from xsede2.py "xgb": XGBRegressor(n_estimators=1000, learning_rate=0.01, min_child_weight=3, max_depth=6, gamma=0, subsample=0.8, colsample_bytree=0.85, random_state=0), "lgb": LGBMRegressor(objective='regression', num_leaves=5, learning_rate=0.05, n_estimators=720, max_bin=55, bagging_fraction=0.8, bagging_freq=5, feature_fraction=0.2319, feature_fraction_seed=9, bagging_seed=9, min_data_in_leaf=6, min_sum_hessian_in_leaf=11, random_state=0), "rf": RandomForestRegressor(n_estimators=1000, bootstrap=True, max_features='sqrt', max_depth=6, min_samples_split=3, min_samples_leaf=1, random_state=0), "knn": KNeighborsRegressor(n_neighbors = 10), "ada": AdaBoostRegressor(n_estimators=1000,
catb_params, cv=5, n_jobs=-1, verbose=2).fit(X_train, y_train) catb_cv_model.best_params_ = { 'depth': 3, 'iterations': 500, 'learning_rate': 0.1 } # Final Model catb_tuned = CatBoostRegressor(**catb_cv_model.best_params_).fit( X_train, y_train) np.sqrt(mean_squared_error(y_test, y_pred)) # LightGBM: Model & Tahmin lgb_model = LGBMRegressor().fit(X_train, y_train) y_pred = lgb_model.predict(X_test) np.sqrt(mean_squared_error(y_test, y_pred)) # Model Tuning lgb_model = LGBMRegressor() lgbm_params = { "learning_rate": [0.01, 0.001, 0.1, 0.5, 1], "n_estimators": [200, 500, 1000, 5000], "max_depth": [6, 8, 10, 15, 20], "colsample_bytree": [1, 0.8, 0.5, 0.4] } lgbm_cv_model = GridSearchCV(lgb_model, lgbm_params, cv=10, n_jobs=-1, verbose=2).fit(X_train, y_train)
보통 learning_rate와 n_estimators는 같이 움직인다. scikit-learn 패키지가 아니므로 GPU버전으로 설치한다면 GPU 사용도 가능하다. """ ## LightGBM """ 주요 특징 scikit-learn 패키지가 아니다. 성능이 우수하다 속도도 매우 빠르다. """ from lightgbm import LGBMRegressor, LGBMClassifier lgbm = LGBMRegressor(random_state=42, learning_rate=0.01, n_estimators=2000, colsample_bytree=0.8, subsample=0.8, max_depth=7) lgbm.fit(x_train, y_train) lgbm_pred = lgbm.predict(x_test) mse_eval('LGBM Ensemble', lgbm_pred, y_test) """ # 주요 Hyperparameter - random_state : 랜덤 시드 고정 값. 고정해두고 튜닝할 것 - n_jobs : CPU 사용 갯수 - learning_rate : 학습율. 너무 큰 학습율은 성능을 떨어뜨리고, 너무 작은 학습율은 학습이 느리다. 적절한 값을 찾아야 함. default = 0.1 - n_estimators : 부스팅 스테이지 수, default값은 100개 - max_depth : 트리의 깊이. 과대적합 방지용. default = 3 - colsample_bytree : 샘플 사용 비율(max_features와 비슷한 개념). 과대적합 방지용. default=1.0 보통 learning_rate와 n_estimators는 같이 움직인다.
# XGBoost regressor: explicit hyper-parameters, collected in one mapping.
_xgb_kwargs = dict(
    booster='gbtree',
    colsample_bylevel=1,
    colsample_bynode=1,
    colsample_bytree=0.6,
    gamma=0,
    importance_type='gain',
    learning_rate=0.01,
    max_delta_step=0,
    max_depth=4,
    min_child_weight=1.5,
    n_estimators=2400,
    n_jobs=1,
    nthread=None,
    objective='reg:linear',
    reg_alpha=0.6,
    reg_lambda=0.6,
    scale_pos_weight=1,
    silent=None,
    subsample=0.8,
    verbosity=1,
)
xgb = XGBRegressor(**_xgb_kwargs)

# LightGBM regressor: many very small trees (4 leaves) with a low learning rate.
_lgbm_kwargs = dict(
    objective='regression',
    num_leaves=4,
    learning_rate=0.01,
    n_estimators=12000,
    max_bin=200,
    bagging_fraction=0.75,
    bagging_freq=5,
    bagging_seed=7,
    feature_fraction=0.4,
)
lgbm = LGBMRegressor(**_lgbm_kwargs)

# Fitting
xgb.fit(x_train, y_train)
lgbm.fit(x_train, y_train, eval_metric='rmse')

# `predict1` holds the XGBoost predictions, `predict` the LightGBM ones.
predict1 = xgb.predict(x_test)
predict = lgbm.predict(x_test)
# In[174]: cv_rmse(svr_fit).mean() """ from lightgbm import LGBMRegressor lgbm_model = LGBMRegressor(objective='regression', num_leaves=31, learning_rate=0.1, n_estimators=200, max_bin=100, bagging_fraction=0.8, bagging_freq=5, feature_fraction=0.8, feature_fraction_seed=9, bagging_seed=9, min_data_in_leaf=20, min_sum_hessian_in_leaf=11) lgbm_fit = lgbm_model.fit(X_train, y_train) # In[198]: cv_rmse(lgbm_fit).mean() from mlxtend.regressor import StackingCVRegressor from sklearn.pipeline import make_pipeline #setup models
gbr = GradientBoostingRegressor(n_estimators=3000, learning_rate=0.05, max_depth=4, max_features='sqrt', min_samples_leaf=15, min_samples_split=10, loss='huber', random_state=42) lightgbm = LGBMRegressor( objective='regression', num_leaves=4, learning_rate=0.01, n_estimators=5000, max_bin=200, bagging_fraction=0.75, bagging_freq=5, bagging_seed=7, feature_fraction=0.2, feature_fraction_seed=7, verbose=-1, ) xgboost = XGBRegressor(learning_rate=0.01, n_estimators=3460, max_depth=3, min_child_weight=0, gamma=0, subsample=0.7, colsample_bytree=0.7, objective='reg:linear',
'colsample_bytree': [0.6, 0.7, 0.8], 'n_estimators': [500, 1000], 'random_state': [0] }, 'lightGBM': { 'min_data_in_leaf': [100, 300, 500, 1000, 1500], 'num_leaves': [15, 30, 40, 50, 60], 'max_depth': [15, 30, 45], 'random_state': [0] } } model_map = { 'random_forest': RandomForestRegressor(), 'xgboost': XGBRegressor(), 'lightGBM': LGBMRegressor() } opt = docopt(__doc__) # Label-encode the categorical features and split the data def preprocess(full_train, full_test): X_train = full_train.drop(['price'], axis=1) y_train = full_train['price'] X_test = full_test.drop(['price'], axis=1) y_test = full_test['price'] for feature in categorical_features: le = LabelEncoder() le.fit(X_train[feature])
# Hold out 25% of the rows (fixed seed for reproducibility).
X_train, X_test, y_train, y_test = train_test_split(
    reg_features,
    reg_target,
    test_size=0.25,
    random_state=42,
)

reg_features = data_reg_process(reg_features)

# NOTE(review): the LightGBM docs list `bagging_fraction` as an alias of
# `subsample` and `feature_fraction` as an alias of `colsample_bytree`; both
# spellings are passed with different values, so which one takes effect
# should be confirmed.
_reg_kwargs = dict(
    num_leaves=40,
    max_depth=7,
    n_estimators=10000,
    min_child_weight=10,
    subsample=0.7,
    colsample_bytree=0.7,
    reg_alpha=0,
    learning_rate=0.1,
    reg_lambda=0.5,
    bagging_fraction=0.8,
    bagging_freq=5,
    feature_fraction=0.2319,
    feature_fraction_seed=9,
    bagging_seed=9,
)
reg = LGBMRegressor(**_reg_kwargs)

# Actual regression modelling.
# The evaluation set is the training data itself, so early stopping here
# monitors the training RMSE and not a held-out score.
reg.fit(
    reg_features,
    reg_target,
    eval_set=[(reg_features, reg_target)],
    eval_metric='rmse',
    early_stopping_rounds=100,
)
#!/usr/bin/env python
"""Fit a weighted LightGBM regressor on train.csv and write test predictions.

Inputs : ``train.csv`` and ``test.csv`` in the working directory (first column
         is used as the index).
Output : ``GradientBoostedRegressor4c.csv`` with a single ``y`` column.
"""
import pandas as pd
import numpy as np
from sklearn.metrics import mean_squared_log_error
from lightgbm import LGBMRegressor

# Columns used as model inputs.
features = [
    'Market', 'Day', 'Stock',
    'x0', 'x1', 'x2',
    'x3A', 'x3B', 'x3C', 'x3D', 'x3E',
    'x4', 'x5', 'x6',
]

df = pd.read_csv('train.csv', index_col=0)
df = df.fillna(0)  # replace NaN entries
df_test = pd.read_csv('test.csv', index_col=0)
df_test = df_test.fillna(0)  # replace NaN entries

# Per-row sample weights. A vectorised column conversion yields the same list
# of floats as iterating over the frame row by row, without the per-row cost.
weights = df['Weight'].astype(float).tolist()

X = df[features]
Y = df['y']

# NOTE(review): the LightGBM docs list `n_estimators` as an alias of
# `num_iterations`; both are passed with different values (10000 vs 110), so
# the effective number of boosting rounds should be confirmed.
model = LGBMRegressor(
    n_estimators=10000,
    learning_rate=0.01,
    min_data_in_leaf=40,
    num_leaves=80,
    num_iterations=110,
)
model.fit(X, Y, sample_weight=weights)

# Predictions are written with a fresh 0..n-1 index labelled 'Index'.
yp = pd.Series(model.predict(df_test[features])).rename('y')
yp.index.name = 'Index'
print(yp.head())
yp.to_csv('GradientBoostedRegressor4c.csv', header=True)
wlist = {'train': d_train, 'eval': d_val} model = lgbm.train(params=params, train_set=d_train, valid_sets=d_val, evals_result=wlist) models.append(model) return models final_y_test_pred = [] final_y_pred = [] # 모델 컬럼별 4번 for i in range(4): model = LGBMRegressor(**params) model.fit(x_train, y_train[:, i], eval_set=[(x_train, y_train[:, i]), (x_test, y_test[:, i])], verbose=True) y_test_pred = model.predict(x_test) score = model.score(x_test, y_test[:, i]) mae = MAE(y_test[:, i], y_test_pred) print("r2 : ", score) print("mae :", mae) thresholds = np.sort(model.feature_importances_)[[ i for i in range(0, len(model.feature_importances_), 30) ]] print("model.feature_importances_ : ", model.feature_importances_)
dtest = xgb.DMatrix(test) predictions = final_gb.predict(dtest) pd.DataFrame(predictions, columns=['Fees']).to_csv('prediction_DOC_XGBCV_14.csv') ############################################################################ lightgbm = LGBMRegressor(objective='regression', num_leaves=450, learning_rate=0.1, n_estimators=1200, max_bin=30, bagging_fraction=0.8, bagging_freq=9, feature_fraction=0.129, feature_fraction_seed=9, bagging_seed=9, min_data_in_leaf=3, min_sum_hessian_in_leaf=6, random_state=10) xgb = XGBRegressor(learning_rate=0.1, n_estimators=1500, max_depth=30, min_child_weight=12, gamma=0, reg_alpha=2e-5, subsample=0.8, colsample_bytree=0.8,
# Target is the airfare column; every other column is a feature.
y = data['Airfare(NZ$)']
X = data.drop(columns=['Airfare(NZ$)'])

X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.33,
    random_state=42,
)

# Create cross-validation: 5 folds repeated 5 times with a fixed seed.
cv = RepeatedKFold(n_splits=5, n_repeats=5, random_state=24)


# Define a scoring system
def mse(y, y_pred):
    """Return the mean squared error between `y` and `y_pred`."""
    error = mean_squared_error(y, y_pred)
    return error


# Define models
_lgbm_settings = {
    'learning_rate': 0.1,
    'max_bin': 150,
    'boosting_type': 'goss',
}
lgbm = LGBMRegressor(**_lgbm_settings)
svr = SVR(kernel='rbf')
rf = RandomForestRegressor(n_jobs=-1, oob_score=True)

# Grid-search candidates for the LightGBM model.
param_grid_lgbm = dict(
    num_leaves=[80],
    max_depth=[7, 10],
    n_estimators=[200],
    min_data_in_leaf=[100, 300],
)
def test_lightgbm_regressor(self):
    """Dump a three-tree LightGBM regressor using the default suffix."""
    regressor_kwargs = {"n_estimators": 3, "min_child_samples": 1}
    regressor = LGBMRegressor(**regressor_kwargs)
    dump_single_regression(regressor)
def run(dt,
        rttFlag=True,
        method="xgboost",
        dir="../../data/throughputRelation/data/1"):
    """Train one throughput regressor, save it, and score it on the test set.

    The module-level ``HIT_FLAG`` selects the prediction direction: when it is
    truthy the target column is ``hThroughput`` (model saved under ``m2h``),
    otherwise the target is ``mThroughput`` (model saved under ``h2m``).

    Args:
        dt: Name of the sub-directory (below the model directory) in which the
            fitted model is stored.
        rttFlag: Keep the ``rtt`` column as a feature when true; drop it when
            false. Also selects the ``rtt`` / ``nortt`` output sub-directory.
        method: ``"lasso"``, ``"ridge"``, ``"lgbm"`` or ``"LinearRegression"``;
            any other value selects an ``XGBRegressor``.
        dir: Data root containing ``train/sample.csv`` and ``test/sample.csv``
            (space-delimited). The fitted model is written below this root.

    Returns:
        ``(method, MSE_loss, test_time, train_time)`` where the times are
        wall-clock seconds spent in ``predict`` and ``fit`` respectively and
        the loss is computed on throughput divided by 1024 * 1024.
    """
    # Column roles depend on the prediction direction.
    if HIT_FLAG:
        target_col, dropped_time_col = 'hThroughput', 'hTime'
        kept_time_col, feature_tp_col = 'mTime', 'mThroughput'
        direction = "m2h"
    else:
        target_col, dropped_time_col = 'mThroughput', 'mTime'
        kept_time_col, feature_tp_col = 'hTime', 'hThroughput'
        direction = "h2m"

    train = pd.read_csv(dir + "/train/sample.csv", delimiter=' ')
    test = pd.read_csv(dir + "/test/sample.csv", delimiter=' ')

    # Ground truth is taken from the freshly loaded test frame, before any
    # column is dropped or transformed (replaces a second read of the file).
    y_test = test[target_col] / 1024 / 1024

    # Remove the columns that are not used as features.
    unused_cols = [dropped_time_col]
    if not rttFlag:
        unused_cols.append('rtt')
    unused_cols += ['maxMT', 'reqCount', 'avgMT', 'totalMT']
    train.drop(unused_cols, axis=1, inplace=True)
    test.drop(unused_cols, axis=1, inplace=True)

    # Box-Cox (1 + x) transform of the feature columns treated as skewed.
    # NOTE(review): the list is hard-coded; presumably it came from an offline
    # skewness check (|skew| > 0.5) - confirm before changing the feature set.
    skew_column = [kept_time_col, 'size']
    if rttFlag:
        skew_column.append('rtt')
    skew_column.append(feature_tp_col)

    lam = 0.1
    for col in skew_column:
        train[col] = boxcox1p(train[col], lam)
        test[col] = boxcox1p(test[col], lam)

    # The model is fitted on the log of the target; predictions are mapped
    # back with exp() below.
    train[target_col] = np.log(train[target_col])
    x = train.drop(target_col, axis=1)
    y = train[target_col]
    x_test = test.drop(target_col, axis=1)
    print(x.columns)

    if method == "lasso":
        # max_iter must be an integer; a float such as 1e7 is rejected by
        # scikit-learn's parameter validation.
        model = Lasso(max_iter=10000000, alpha=0.0001, random_state=1)
    elif method == "ridge":
        model = Ridge(alpha=14.5)
    elif method == "lgbm":
        model = LGBMRegressor(
            objective='regression',
            max_depth=6,
            num_leaves=4,
            learning_rate=0.05,
            n_estimators=5000,
            max_bin=200,
            bagging_fraction=0.75,
            bagging_freq=5,
            bagging_seed=7,
            feature_fraction=0.2,
            feature_fraction_seed=7,
            verbose=-1,
        )
    elif method == "LinearRegression":
        model = LinearRegression()
    else:
        model = XGBRegressor(max_depth=6,
                             learning_rate=0.05,
                             n_estimators=5000,
                             silent=False,
                             objective='reg:gamma')

    startTime = time.time()
    model.fit(x, y)
    endTime = time.time()
    trainTime = endTime - startTime

    # Persist the fitted model as <dir>/<direction>/model/<dt>/<rtt|nortt>/<method>.m
    modelDir = dir + "/" + direction + "/model/" + dt
    modelDir += "/rtt" if rttFlag else "/nortt"
    # exist_ok avoids the check-then-create race of a separate exists() test.
    os.makedirs(modelDir, exist_ok=True)
    joblib.dump(model, modelDir + "/" + method + ".m")

    startTime = time.time()
    y_pred = model.predict(x_test)
    endTime = time.time()

    # The global figure size is still configured for callers that plot, but no
    # figure is created here: the scatter plot below is disabled, and opening
    # an unused figure on every call leaks it.
    plt.rcParams['figure.figsize'] = (4.0, 4.0)

    y_pred = np.exp(y_pred) / 1024 / 1024

    # Optional diagnostic plot (disabled):
    # fig, ax = plt.subplots()
    # ax.scatter(y_test, y_pred, s=2, alpha=0.1)
    # ax.plot([y_pred.min(), y_pred.max()], [y_pred.min(), y_pred.max()], 'k--', lw=4)
    # ax.set_xlabel('Real throughput/Mbps')
    # ax.set_ylabel('Predicted throughput/Mbps')
    # plt.title(method)
    # plt.savefig("../plot/"+method+".pdf", bbox_inches = 'tight')
    # plt.show()

    MSE_loss = np.average(np.square(y_pred - y_test))
    print(method, "MSE_loss =", MSE_loss, "testTime=", endTime - startTime,
          "trainTime=", trainTime)
    return method, MSE_loss, endTime - startTime, trainTime
ica2_results_test = ica.transform(test)

# Append decomposition components to datasets: component k (1-based in the
# column name) is column k-1 of each result matrix.
for comp_idx in range(n_comp):
    label = str(comp_idx + 1)
    train['pca_' + label] = pca2_results_train[:, comp_idx]
    test['pca_' + label] = pca2_results_test[:, comp_idx]
    train['ica_' + label] = ica2_results_train[:, comp_idx]
    test['ica_' + label] = ica2_results_test[:, comp_idx]

# Plain arrays: target vector and feature matrix (everything except 'y').
y = np.array(train["y"])
X = np.array(train.drop('y', axis=1))

# LightGBM Regressor
_model_kwargs = dict(
    boosting_type='gbdt',
    num_leaves=10,
    max_depth=4,
    learning_rate=0.005,
    n_estimators=675,
    max_bin=25,
    subsample_for_bin=50000,
    min_split_gain=0,
    min_child_weight=5,
    min_child_samples=10,
    subsample=0.995,
    subsample_freq=1,
    colsample_bytree=1,
    reg_alpha=0,
    reg_lambda=0,
    seed=0,
    nthread=-1,
    silent=True,
)
model = LGBMRegressor(**_model_kwargs)

# Fit to training data
model.fit(X, y)

# Generate predictions; they are stored as a new 'y' column on `test`.
test['y'] = model.predict(test)

# output=test[['ID', 'y']]
#
# #Save predictions to 'output.csv'
# output.to_csv('output.csv', index=False)