def main(args):
    """Entry point: build the CNN feature extractor and the LightGBM
    classifier, bind them to NSML, and (in train mode) fit the GBM on
    pre-extracted features loaded from .npy files.

    Args:
        args: parsed CLI namespace; reads args.pause and args.mode.

    Side effects: reads TrainX.npy / TrainY.npy from the working directory
    and checkpoints the session via nsml.save('last').
    """
    # `params` and `use_nsml` are module-level globals.
    cnn_model = build_cnn_model(backbone=MobileNetV2, use_imagenet=None)
    gbm_model = LGBMClassifier(
        boosting_type='gbdt',
        objective='binary',
        n_jobs=3,  # Updated from 'nthread'
        silent=False,
        max_depth=params['max_depth'],
        max_bin=params['max_bin'],
        subsample_for_bin=params['subsample_for_bin'],
        subsample=params['subsample'],
        subsample_freq=params['subsample_freq'],
        min_split_gain=params['min_split_gain'],
        min_child_weight=params['min_child_weight'],
        min_child_samples=params['min_child_samples'],
        scale_pos_weight=params['scale_pos_weight'])

    if use_nsml:
        bind_nsml(cnn_model, gbm_model)
        if args.pause:
            nsml.paused(scope=locals())

    if args.mode == 'train':
        TotalX = np.load('TrainX.npy')
        TotalY = np.load('TrainY.npy')
        print('TotalX.shape', TotalX.shape, 'TotalY.shape', TotalY.shape)
        # 5% held out for the early-stopping evaluation set.
        X_train, X_test, Y_train, Y_test = train_test_split(
            TotalX, TotalY, test_size=0.05, random_state=777)
        print('X_train.shape', X_train.shape, 'X_test.shape', X_test.shape,
              'Y_train.shape', Y_train.shape, 'Y_test.shape', Y_test.shape)
        # BUG FIX: the model used to be fitted twice in a row -- a plain
        # fit(X_train, Y_train) whose result was immediately discarded by
        # this second fit, doubling training time for no benefit. Only the
        # early-stopping fit is kept. (Unused `eval_set`/`start_time`
        # locals removed as well.)
        gbm_model.fit(X_train, Y_train,
                      eval_set=[(X_test, Y_test)],
                      eval_metric='binary_error',
                      early_stopping_rounds=50)
        nsml.save('last')
def do_generate_metrics_lgbm_optimazed_model(X_train, y_train, X_test, y_test, grid):
    """Fit an LGBMClassifier with the best params found by `grid`, print a
    summary, and return the model plus its metrics.

    Args:
        X_train, y_train: training data.
        X_test, y_test: held-out data used for metric computation.
        grid: a fitted search object exposing best_params_ and best_score_.

    Returns:
        (model, metrics): fitted classifier and calculate_metrics() output.
    """
    model = LGBMClassifier(random_state=0)
    model.set_params(**grid.best_params_)
    model.fit(X_train, y_train)
    metrics = calculate_metrics(model, X_test, y_test)
    # BUG FIX: the original printed the bound method object `model.score`
    # instead of the score value; call it on the held-out data, matching the
    # logging variant of this function elsewhere in the file.
    print(model.get_params(), " ", model.score(X_test, y_test))
    print(grid.best_params_, " ", grid.best_score_)
    return model, metrics
def do_generate_metrics_lgbm_optimazed_model(X_train, y_train, X_test, y_test, grid):
    """Fit an LGBMClassifier with the grid's best params, write the results
    to the log file, and return the model plus its metrics.

    Args:
        X_train, y_train: training data.
        X_test, y_test: held-out data used for scoring and metrics.
        grid: a fitted search object exposing best_params_ and best_score_.

    Returns:
        (model, metrics): fitted classifier and calculate_metrics() output.
    """
    file_operations.write_logs(FILENAME, "LGBM metrics calculation\n")

    classifier = LGBMClassifier(random_state=0)
    classifier.set_params(**grid.best_params_)
    classifier.fit(X_train, y_train)

    results = calculate_metrics(classifier, X_test, y_test)

    # Log the refit model's parameters and held-out score.
    model_msg = ("Generated model params and results\n params:"
                 + str(classifier.get_params())
                 + "\nscore " + str(classifier.score(X_test, y_test)))
    file_operations.write_logs(FILENAME, model_msg)

    # Log what the search itself reported as best.
    grid_msg = ("Search grid best params and results\n params:"
                + str(grid.best_params_)
                + "\nscore " + str(grid.best_score_))
    file_operations.write_logs(FILENAME, grid_msg)

    return classifier, results
def train_lgb(model=False):
    """Build the LGBM classifier from grid-searched params and train it.

    Args:
        model: when truthy, return the unfitted configured classifier
            instead of training it.

    Returns:
        The configured classifier (if model is truthy) or train(clf)'s
        result. Appends a one-line parameter summary to the global `log`.
    """
    global log
    params = grid_search_lgb(True)
    clf = LGBMClassifier().set_params(**params)
    if model:
        return clf

    params = clf.get_params()
    # Assemble the log line from a (key, printf-format) table instead of
    # eight separate += statements; the resulting text is identical.
    fields = (
        ('learning_rate', '%.3f'),
        ('n_estimators', '%d'),
        ('num_leaves', '%d'),
        ('min_split_gain', '%.1f'),
        ('min_child_weight', '%.4f'),
        ('min_child_samples', '%d'),
        ('subsample', '%.1f'),
        ('colsample_bytree', '%.1f'),
    )
    pieces = ['lgb']
    for key, fmt in fields:
        pieces.append(', ' + key + ': ' + (fmt % params[key]))
    log += ''.join(pieces) + '\n\n'
    return train(clf)
# Earlier feature-index experiments, kept for reference:
# full_index = np.array([95,94,82,59,0])
# data_index = np.array([44,179,112,59,82,58,84])
# data_index = np.array([0])
data_index = np.array([0, 59, 94, 95, 84, 161, 44, 179, 82, 112, 58])
# classes = ['WWW', 'MAIL', 'FTP-CONTROL', 'FTP-PASV', 'ATTACK', 'P2P', 'DATABASE', 'FTP-DATA', 'MULTIMEDIA', 'SERVICES',
#            'INTERACTIVE', 'GAMES']
classes = ['WWW', 'MAIL', 'FTP-CONTROL', 'FTP-PASV', 'ATTACK', 'P2P', 'DATABASE',
           'FTP-DATA', 'MULTIMEDIA', 'SERVICES', 'INTERACTIVE']

# file used to train, who generates x_train,x_test,y_train,y_test
# I also resampled the file `entry12`
file = os.path.join(data_dir, filename)
test_file = os.path.join(data_dir, test_filename)

if __name__ == '__main__':
    # Sweep n_estimators and record held-out accuracy for each setting.
    # NOTE(review): the range below sweeps 50..145 step 5, but the output
    # filename says "estimators_100_150_5" -- confirm which sweep is intended.
    acc = []
    x_train, _, y_train, _ = get_data(file)
    _, x_test, _, y_test = get_data(test_file)
    np_dir = os.path.join(data_dir, 'estimators_100_150_5.txt')
    for i in range(50, 150, 5):
        clf = LGBMClassifier(n_estimators=i)
        clf.fit(x_train, y_train)
        print(clf.get_params())
        accuracy = clf.score(x_test, y_test)
        acc.append(accuracy)
    acc = np.array(acc)
    print(acc)
    np.savetxt(np_dir, acc)
    # Removed dead code: `import matplotlib.lines as lines` followed by a
    # bare `lines.lineStyles` attribute access -- it had no effect.
from lightgbm import LGBMClassifier
from pprint import pprint
import time

# Hyper-parameters for the correlation-selected feature set.
_corr_params = dict(
    max_depth=500,
    learning_rate=0.01,
    num_leaves=1000,
    min_data_in_leaf=200,
    n_estimators=2000,
    objective='binary',
    metric='binary_logloss',
    random_state=42,
)
classifier_lgbm_corr = LGBMClassifier(**_corr_params)

# Parameters currently used by the model
print('Parameters Currently In Use:\n')
pprint(classifier_lgbm_corr.get_params())

# Fit and predict, timing the training run (message kept in Portuguese,
# as emitted by the original script).
start = time.time()
classifier_lgbm_corr.fit(X_corr_train, Y_corr_train)
end = time.time()
print("Tempo de Execução: {} sec".format(end - start))

Y_pred_lgbm_corr = classifier_lgbm_corr.predict(X_corr_test)
# Metrics analysis follows below.
eval_metric='auc', # base_score = proportion_2j, n_jobs=cpu_n_jobs, random_state=42, silent=True) clf_org_lgb = LGBMClassifier(n_estimators=1000, learning_rate=0.1, objective='binary', n_jobs=cpu_n_jobs, random_state=42, silent=True) xgb_params = clf_org_xgb.get_xgb_params() lgb_params = clf_org_lgb.get_params() lgb_params.pop('n_estimators') lgb_params.pop('silent') xgb_cv_early_stopping = CV_EarlyStoppingTrigger( stopping_rounds=early_stopping_rounds, maximize_score=True, method='xgb') lgb_cv_early_stopping = CV_EarlyStoppingTrigger( stopping_rounds=early_stopping_rounds, maximize_score=True, method='lgb') from sklearn.model_selection import RandomizedSearchCV from sklearn.metrics import roc_auc_score # from sklearn.model_selection import StratifiedKFold import scipy.stats as sp_stats # from scipy.stats import norm as sp_normal
from lightgbm import LGBMClassifier
from pprint import pprint
import time
import matplotlib.pyplot as plt

# 28th test -- hyper-parameters chosen by Random Search.
_random_search_params = dict(
    max_depth=1880,
    learning_rate=0.1,
    num_leaves=100,
    n_estimators=4500,
    min_data_in_leaf=140,
    n_jobs=4,
)
classifier_lgbm = LGBMClassifier(**_random_search_params)

# Parameters currently used by the model
print('Parameters Currently In Use:\n')
pprint(classifier_lgbm.get_params())

# Fit and predict, reporting wall-clock training time in minutes
# (message kept in Portuguese, as emitted by the original script).
start = time.time()
classifier_lgbm.fit(X, Y)
end = time.time()
print("Tempo de Execução: {:.2f} min".format((end - start)/60))
# Observed on a previous run: 11.01 min

# Learning Curve - X, Y
def experiment(train=None, test=None, seed=None):
    """Run one LightGBM time-series cross-validation experiment and write a
    submission file.

    Args:
        train: optional path to a pre-built training feather file; when
            falsy, features are assembled from preprocessed/features/*.csv.
        test: same for the test set.
        seed: random_state forwarded to LGBMClassifier; also used in the
            submission file name.

    Side effects: creates cv/LightGBM/<timestamp>/cv.log and writes
    sub/<timestamp>_<seed>.csv.

    NOTE(review): this source was recovered from a flattened paste; the
    nesting of statements under the `with` blocks was reconstructed --
    verify against the original file.
    """
    cv_name = now()
    cv_log_path = f'cv/LightGBM/{cv_name}/'
    Path(cv_log_path).mkdir(parents=True, exist_ok=True)
    log_fname = cv_log_path + 'cv.log'
    cv_logger = Logger('CV_log', log_fname)
    cv_logger.info("Experiment Start")

    with cv_logger.interval_timer('load data'):
        if train:
            train_df = load_feather(train)
            # train_df = train_df.sample(100000)
        else:
            fs = Path('preprocessed/features').glob('train_*.csv')
            # fs = ['preprocessed/features/train_nextClick.csv',
            #       'preprocessed/features/train_ip_app_nextClick.csv']
            train_df = load_data(config.TRAIN_PATH, fs, cv_logger,
                                 dump='preprocessed/train.ftr')
        # offset = pd.to_datetime('2017-11-07 16:00:00')
        # train_df = train_df[train_df.click_time >= offset]
        gc.collect()
        if test:
            test_df = load_feather(test)
        else:
            fs = Path('preprocessed/features').glob('test_*.csv')
            # fs = ['preprocessed/features/test_nextClick.csv',
            #       'preprocessed/features/test_ip_app_nextClick.csv']
            test_df = load_data(config.TEST_PATH, fs, cv_logger,
                                dump='preprocessed/test.ftr')
        gc.collect()

    train_df = train_df.reset_index(drop=True)
    test_df = test_df.reset_index(drop=True)
    cv_logger.info(config.SEP_TIME)

    with cv_logger.interval_timer('split'):
        # Time-ordered folds split at config.SEP_TIME.
        split_gen = enumerate(timeseries_cv(train_df, config.SEP_TIME))

    # dump configuration
    aucs = []
    # add ip_day_hour_nunique, app_device_channel_nextClick,
    # ip_os_device_nextClick,
    train_cols = [
        'app', 'app_channel_ce', 'channel', 'device', 'hour', 'ip_app_ce',
        'ip_app_channel_hour_mean', 'ip_app_channel_nextClick',
        'ip_app_device_os_channel_nextClick', 'ip_app_device_os_nextClick',
        'ip_app_nextClick', 'ip_app_nunique', 'ip_app_os_ce',
        'ip_app_os_nunique', 'ip_channel_nunique', 'ip_day_hour_ce',
        'ip_day_nunique', 'ip_day_hour_nunique', 'ip_device_nunique',
        'ip_device_os_app_cumcount', 'ip_nextClick', 'ip_os_device_nextClick',
        'app_device_channel_nextClick', 'ip_os_cumcount',
        'ip_os_device_app_nunique', 'os'
    ]
    # encode_list = config.ENCODE_LIST
    # Target-encoding threshold; unused while the encode block is commented out.
    threshold = config.TE_THR
    # Hours kept for the validation slice.
    valid_time = [4, 5, 6, 9, 10, 11, 13, 14, 15]
    # public_time = [5, 6, 9, 10, 11, 13, 14, 15]
    train_df = proc_bf_cv(train_df)
    gc.collect()

    for num, (train_idx, valid_idx) in split_gen:
        cv_logger.kiritori()
        cv_logger.info(f"fold {num} start")
        with cv_logger.interval_timer('train test split'):
            cvtrain_df = train_df.loc[train_idx]
            valid_df = train_df.loc[valid_idx]
            # Restrict validation to the selected hours.
            valid_df2 = valid_df[valid_df.hour.isin(valid_time)]
        cv_logger.info(f'train size {cvtrain_df.shape}')
        cv_logger.info(f'valid size {valid_df2.shape}')
        # valid_df3 = valid_df[valid_df.hour == 4]
        # valid_df4 = valid_df[valid_df.hour.isin(public_time)]
        # with cv_logger.interval_timer('target encode'):
        #     cvtrain_df, valid_df, tes = custom_encode(cvtrain_df,
        #                                               valid_df,
        #                                               encode_list,
        #                                               threshold,
        #                                               cv_logger)
        # cvtrain_df = proc_bf_cv(cvtrain_df)
        # valid_df = proc_bf_cv(valid_df)
        # train_cols += [c for c in cvtrain_df.columns if '_te' in c]
        cv_logger.info("LGBM Baseline validation")
        eval_names = ['valid_lb']
        train_X, train_y = cvtrain_df[train_cols], cvtrain_df.is_attributed
        eval_set = []
        with cv_logger.interval_timer('valid make'):
            for df in [valid_df2]:
                X, y = df[train_cols], df.is_attributed
                eval_set.append((X, y))
        cv_logger.info(f'train size {train_X.shape}')
        cv_logger.info(f'valid size {eval_set[0][0].shape}')
        cv_logger.info(list(train_X.columns))
        gc.collect()
        lgbm = LGBMClassifier(n_estimators=1000,
                              learning_rate=0.1,
                              num_leaves=31,
                              max_depth=-1,
                              min_child_samples=20,
                              min_child_weight=5,
                              max_bin=255,
                              scale_pos_weight=200,
                              colsample_bytree=0.3,
                              subsample=0.6,
                              subsample_freq=0,
                              random_state=seed,
                              n_jobs=24)
        cv_logger.info(lgbm.get_params())
        lgbm.fit(train_X, train_y,
                 eval_metric="auc",
                 eval_set=eval_set,
                 eval_names=eval_names,
                 early_stopping_rounds=30,
                 verbose=10)
        auc = lgbm.best_score_
        aucs.append(auc)
        cv_logger.info(f"naive LGBM AUC : {auc}")
        cv_logger.info(pformat(lgbm.evals_result_))
        cv_logger.info("feature importance")
        fi = dict(zip(train_X.columns, lgbm.feature_importances_))
        cv_logger.info(pformat(fi))
        cv_logger.info(f"fold {num} end")

    # Free the training frame before predicting on the test set.
    del train_df
    cv_logger.double_kiritori()
    cv_logger.info("Cross Validation Done")
    cv_logger.info("Naive LGBM")
    cv_logger.info(f"AUC {auc}")
    cv_logger.info("Predict")
    # with cv_logger.interval_timer('all target encode'):
    #     for te in tes:
    #         test_df = te.transform(test_df)
    test_df = proc_bf_cv(test_df)
    test_X = test_df[train_cols]
    # Predicts with the last fold's model at its best iteration.
    pred = lgbm.predict_proba(test_X, num_iteration=lgbm.best_iteration_)
    test_df['is_attributed'] = pred[:, 1]
    test_df['click_id'] = test_df.click_id.astype('uint32')
    sub = test_sup_merge(test_df)
    sub[['click_id', 'is_attributed']].to_csv(f'sub/{cv_name}_{seed}.csv',
                                              index=False)
    cv_logger.info("Experiment Done")
def fit_model(X_train, y_train, X_val, y_val, train_cols, bayes, model_type="lgb"):
    """Train either a LightGBM classifier or a small Keras MLP and log
    run metadata to Neptune.

    Args:
        X_train, y_train: training features / targets.
        X_val, y_val: validation features / targets.
        train_cols: the feature columns of X_* actually fed to the model.
        bayes: for model_type "lgb", non-zero triggers a Bayesian
            hyper-parameter search; 0 trains once with fixed params.
        model_type: "lgb" or "keras".

    Returns:
        [model, history] -- for LGBM, history is the fitted model itself;
        for Keras, history is the History object returned by fit().

    NOTE(review): `metric`, `neptune`, and `bayesian_opt` come from the
    enclosing module; this source was recovered from a flattened paste,
    so the exact nesting of some statements was reconstructed -- verify.
    """
    if model_type == "lgb":
        from lightgbm import LGBMRegressor, LGBMClassifier
        import lightgbm as lgb
        if bayes != 0:
            # Fixed fit-time settings passed alongside the searched space.
            params = {
                'early_stopping_rounds': 10,  # early stopping
                'eval_set': [(X_val[train_cols], y_val),
                             (X_train[train_cols], y_train)]
            }
            # Search ranges handed to the Bayesian optimizer.
            fit_params = {
                'n_estimators': (100, 500),  # number of trees.
                # 'num_leaves': (30, 50),
                'learning_rate': (0.01, 0.05),
                'num_boost_round': (3000, 3500),
                'subsample': (0.7, 0.75),  # part of the dataset used for training on each round
                'reg_alpha': (0, 0.1),  # reg alpha
                'reg_lambda': (0, 0.1),  # reg lambda
                'early_stopping_rounds': (10, 11),  # early stopping
                # 'min_data_in_leaf' : (1000, 1001),  # important to prevent overfitting
            }
            estimator = LGBMClassifier(**params, eval_metric=metric)
            model = bayesian_opt(estimator, X_train[train_cols], y_train,
                                 X_val, y_val, fit_params, params, bayes)
            model = model.best_estimator_
        else:
            # Fixed hyper-parameters (no search).
            params = {
                'objective': "multiclass",
                'boosting_type': "gbdt",  # boosting algorithm: gbdt, dart, goss
                # 'n_estimators': 100,  # number of trees.
                # 'num_leaves' : 64,  # number of leaves, to control the
                #     complexity of the tree. Either set this parameter or max_depth
                'learning_rate': 0.01,  # learning rate
                'num_boost_round': 100,  # max number of iterations
                'subsample': 0.7,  # part of the dataset used for training on each round
                'reg_alpha': 0.1,  # reg alpha
                'reg_lambda': 0.1,  # reg lambda
                'verbose_eval': 20,  # verbose
                'early_stopping_rounds': 10,  # early stopping
                # 'min_data_in_leaf' : 1000,  # important to prevent overfitting
                'n_jobs': -1
            }
            model = LGBMClassifier(**params, eval_metric=metric)
            model = model.fit(X=X_train[train_cols],
                              y=y_train,
                              eval_set=[(X_val[train_cols], y_val),
                                        (X_train[train_cols], y_train)])
        # Log scalar and string hyper-parameters to Neptune.
        for k, v in model.get_params(False).items():
            if (type(v) == int) | (type(v) == float):
                neptune.log_text(k, str(v))
            elif (type(v) == str):
                neptune.log_text(k, v)
        history = model
    elif model_type == "keras":
        import tensorflow as tf
        from tensorflow.keras.constraints import max_norm
        import tensorflow.keras.backend as K
        # Small fully-connected net: input -> 10 -> BN -> 10 -> 51 -> softmax.
        inp = tf.keras.layers.Input(shape=(len(train_cols), ))
        x = tf.keras.layers.Dense(10, activation='relu')(inp)
        x = tf.keras.layers.BatchNormalization()(x)
        # x = tf.keras.layers.Dropout(0.4)(x)
        x = tf.keras.layers.Dense(10, activation='relu')(x)
        # x = tf.keras.layers.BatchNormalization()(x)
        # x = tf.keras.layers.Dropout(0.4)(x)
        # x = tf.keras.layers.Dense(24, activation = 'relu')(x)
        # x = tf.keras.layers.BatchNormalization()(x)
        # x = tf.keras.layers.Dropout(0.4)(x)
        x = tf.keras.layers.Dense(51, activation='relu')(x)
        # x = tf.keras.layers.BatchNormalization()(x)
        # x = tf.keras.layers.Dropout(0.4)(x)
        # Output width follows the number of distinct classes in y_train.
        out = tf.keras.layers.Dense(len(np.unique(y_train)),
                                    activation='softmax')(x)
        model = tf.keras.models.Model(inputs=inp, outputs=out)
        early_stopping = tf.keras.callbacks.EarlyStopping(
            monitor='val_loss', mode='auto', patience=10,
            restore_best_weights=True)
        reduce_lr = tf.keras.callbacks.ReduceLROnPlateau(
            monitor='val_loss', mode='auto', factor=0.5, patience=5)
        adam = tf.keras.optimizers.Adam(learning_rate=0.001)
        neptune.log_text("init_lr", str(K.eval(adam.lr)))
        model.compile(optimizer=adam, loss=metric, metrics=['accuracy'])
        history = model.fit(X_train[train_cols],
                            pd.get_dummies(y_train),
                            validation_data=(X_val[train_cols],
                                             pd.get_dummies(y_val)),
                            epochs=100,
                            batch_size=4,
                            callbacks=[early_stopping, reduce_lr],
                            verbose=1)
        neptune.log_text("epochs", str(history.params["epochs"]))
        neptune.log_text("steps", str(history.params["steps"]))
        neptune.log_text("early_stopping_rounds",
                         str(K.eval(early_stopping.patience)))
        # Extract bare layer class names from their repr, joined for tracking.
        layers = [
            str(type(l)).split()[1].split(">")[0].split(".")[-1][:-1]
            for l in model.layers
        ]
        neptune.log_text("Layers", "_".join(layers))
    gc.collect()
    return [model, history]
class LGBMModel(Model):
    """LightGBM-backed model wrapper: featurizes Datapoints, trains or loads
    a cached LGBMClassifier, and computes binary-classification metrics."""

    def __init__(self, config: Optional[Dict] = None):
        # NOTE(review): config is annotated Optional but is dereferenced
        # unconditionally below -- passing None will raise. Confirm callers.
        self.config = config
        model_cache_path = os.path.join(config["model_output_path"],
                                        "model.pkl")
        self.featurizer = LGBMFeaturizer(
            os.path.join(config['featurizer_output_path'], 'featurizer.pkl'),
            config)
        # Evaluation mode requires a previously trained, cached model file.
        if "evaluate" in config and config["evaluate"] and not os.path.exists(
                model_cache_path):
            raise ValueError(
                "Non Existant Model output path in Evaluation Mode!")
        if model_cache_path and os.path.exists(model_cache_path):
            logger.info("Loading Model from Cache")
            with open(model_cache_path, "rb") as f:
                self.model = pickle.load(f)
        else:
            logger.info("Initializing Model from scratch....")
            self.model = LGBMClassifier(**self.config['params'])

    def train(self,
              train_datapoints: List[Datapoint],
              val_datapoints: List[Datapoint],
              cache_featurizer: Optional[bool] = False) -> None:
        """Fit the featurizer on the training data, optionally cache it,
        then fit the underlying LGBMClassifier.

        NOTE(review): val_datapoints is accepted but not used here.
        """
        self.featurizer.fit(train_datapoints)
        # caching(if True) we don't have go through more featurizing steps
        # in features/LGBMFeaturizer
        if cache_featurizer:
            feature_names = self.featurizer.get_all_feature_names()
            with open(
                    os.path.join(self.config['featurizer_output_path'],
                                 "feature_names.pkl"), "wb") as f:
                pickle.dump(feature_names, f)
            self.featurizer.save(
                os.path.join(self.config["featurizer_output_path"],
                             "featurizer.pkl"))
        logger.info("Featurizing From Scratch")
        train_features = self.featurizer.featurizer(
            train_datapoints)  # transform
        targets = [datapoint.target for datapoint in train_datapoints]
        self.model.fit(train_features, targets)

    def compute_metrics(self, eval_datapoints: List[Datapoint]) -> Dict:
        """Return accuracy / F1 / AUC and confusion-matrix cells on eval data.

        NOTE(review): AUC is computed from hard argmax labels rather than
        probabilities -- confirm this is intended.
        """
        expected_labels = [datapoint.target for datapoint in eval_datapoints]
        predicted_proba = self.predict(eval_datapoints)
        predicted_labels = np.argmax(predicted_proba, axis=1)
        accuracy = accuracy_score(expected_labels, predicted_labels)
        f1 = f1_score(expected_labels, predicted_labels)
        auc = roc_auc_score(expected_labels, predicted_labels)
        confusion_matrix_ = confusion_matrix(expected_labels, predicted_labels)
        tn, fp, fn, tp = confusion_matrix_.ravel()
        return {
            "Accuracy": accuracy,
            "f1": f1,
            "AUC": auc,
            "True Negative": tn,
            "False Positive": fp,
            "False Negative": fn,
            "True Positive": tp
        }

    def predict(self, datapoints: List[Datapoint]) -> np.array:
        """Featurize datapoints and return class-probability predictions."""
        features = self.featurizer.featurizer(datapoints)
        return self.model.predict_proba(features)

    def get_params(self) -> Dict:
        """Expose the underlying LGBMClassifier's parameter dict."""
        return self.model.get_params()

    def save(self, model_cache_path: str) -> None:
        """Pickle the underlying classifier to model_cache_path."""
        logger.info("Saving Model To Disk")
        with open(model_cache_path, "wb") as f:
            pickle.dump(self.model, f)
# learning_rate = 0.1, # num_leaves = 2000, # min_data_in_leaf = 200, # n_estimators = 2000 ) classifier_lgbm_smtk = LGBMClassifier( max_depth = 1880, learning_rate = 0.1, num_leaves = 100, n_estimators = 4500, min_data_in_leaf = 140 ) #Parâmetros Utilizados pelo Modelo from pprint import pprint print('Parameters Currently In Use:\n') pprint(classifier_lgbm_smtk.get_params()) #Fit e Predição import time start = time.time() classifier_lgbm_smtk.fit(X_train_smtk, Y_train_smtk) end = time.time() print("Tempo de Execução: {:.2f} min".format((end - start)/60)) Tempo de Execução: 34.12 min Y_pred_lgbm_smtk = classifier_lgbm_smtk.predict(X_test)
# saving hyperparameters and model
# BUG FIX: pickle.dump(obj, open(...)) / pickle.load(open(...)) leaked file
# handles; every open is now wrapped in a `with` block.
cur_dir = os.getcwd()
os.chdir('outputs/hyperparameters/')
with open("hyperparameters.pkl", 'wb') as f:
    pickle.dump(params, f)  # hyperparameters
with open("lightgbm_model.pkl", 'wb') as f:
    pickle.dump(lgbm_cv_model, f)  # model
os.chdir(cur_dir)
print("Best hyperparameters", params)

# loading and prediction with model
# del lgbm_cv_model
cur_dir = os.getcwd()
os.chdir('/Users/mvahit/Documents/GitHub/home_credit/outputs/hyperparameters/')
with open('lightgbm_model.pkl', 'rb') as f:
    model = pickle.load(f)
os.chdir(cur_dir)
model.predict(X_train.head())

# loading hyperparameters
del model
del params
cur_dir = os.getcwd()
os.chdir('/Users/mvahit/Documents/GitHub/home_credit/outputs/hyperparameters/')
with open('hyperparameters.pkl', 'rb') as f:
    params = pickle.load(f)
# BUG FIX: the original never restored the working directory after this
# final chdir, leaving the process in the hyperparameters folder.
os.chdir(cur_dir)
final_lgbm = LGBMClassifier(**params).fit(X_train, y_train)
final_lgbm.get_params()
final_lgbm.predict(X_train.head())
# F-value-selected feature set experiment.
_fvalue_params = dict(
    max_depth=500,
    learning_rate=0.01,
    num_leaves=1000,
    min_data_in_leaf=200,
    n_estimators=2000,
    objective='binary',
    metric='binary_logloss',
    random_state=42,
)
classifier_lgbm_fvalue = LGBMClassifier(**_fvalue_params)

# Parameters currently used by the model
from pprint import pprint
print('Parameters Currently In Use:\n')
pprint(classifier_lgbm_fvalue.get_params())

# Block 03: fit and predict, timing the training run (message kept in
# Portuguese, as emitted by the original script).
import time
start = time.time()
classifier_lgbm_fvalue.fit(X_train_fvalue, Y_train)
end = time.time()
print("Tempo de Execução: {} sec".format(end - start))

Y_pred_lgbm_fvalue = classifier_lgbm_fvalue.predict(X_test_fvalue)
# Block 04: metrics analysis follows below.
from lightgbm import LGBMClassifier
from pprint import pprint
import time

# Chi-squared-selected feature set experiment.
_chi2_params = dict(
    max_depth=500,
    learning_rate=0.01,
    num_leaves=1000,
    min_data_in_leaf=200,
    n_estimators=2000,
    objective='binary',
    metric='binary_logloss',
    random_state=42,
)
classifier_lgbm_chi2 = LGBMClassifier(**_chi2_params)

# Parameters currently used by the model
print('Parameters Currently In Use:\n')
pprint(classifier_lgbm_chi2.get_params())

# Block 03: fit and predict, timing the training run (message kept in
# Portuguese, as emitted by the original script).
start = time.time()
classifier_lgbm_chi2.fit(X_train_chi2, Y_train)
end = time.time()
print("Tempo de Execução: {} sec".format(end - start))

Y_pred_lgbm_chi2 = classifier_lgbm_chi2.predict(X_test_chi2)
# Block 04: metrics analysis follows below.