def main(): # ========================================= # === Settings # ========================================= # Get logger logger = get_logger(__name__) logger.info('Settings') # Get argument parser = argparse.ArgumentParser() parser.add_argument('--config', default='./configs/model_0.json') parser.add_argument("--debug", action="store_true") args = parser.parse_args() logger.info(f'config: {args.config}') logger.info(f'debug: {args.debug}') # Get config config = json.load(open(args.config)) config.update({'args': {'config': args.config, 'debug': args.debug}}) if config["model"]["name"] == "lightgbm": config["model"]["model_params"]["nthread"] = cpu_count() # Create a directory for model output model_no = pathlib.Path(args.config).stem model_output_dir = (pathlib.Path(config['dataset']['output_directory']) / model_no) if not model_output_dir.exists(): model_output_dir.mkdir() logger.info(f'model_output_dir: {str(model_output_dir)}') logger.debug(f'model_output_dir exists: {model_output_dir.exists()}') config.update({'model_output_dir': str(model_output_dir)}) # ========================================= # === Loading data # ========================================= logger.info('Loading data') # Get train and test input_dir = pathlib.Path(config['dataset']['input_directory']) train = pd.read_csv(input_dir / 'train.csv') test = pd.read_csv(input_dir / 'test.csv') # Get target values target_column = config['data_type']['target'] y_train = train[target_column].values # ========================================= # === Loading features # ========================================= logger.info('Loading features') # Get features x_train, x_test = load_features(config) feature_name = x_test.columns logger.debug(f'number of features: {len(feature_name)}') # ========================================= # === Adversarial Validation # ========================================= logger.info("adversarial validation") train_adv = x_train test_adv = x_test train_adv['target'] = 0 test_adv['target'] = 1 train_test_adv = pd.concat([train_adv, test_adv], axis=0, sort=False).reset_index(drop=True) target = train_test_adv['target'].values train_set, val_set = train_test_split(train_test_adv, test_size=0.33, random_state=71, shuffle=True) x_train_adv = train_set[feature_name] y_train_adv = train_set['target'] x_val_adv = val_set[feature_name] y_val_adv = val_set['target'] logger.debug(f'the number of train set: {len(x_train_adv)}') logger.debug(f'the number of valid set: {len(x_val_adv)}') train_lgb = lgb.Dataset(x_train_adv, label=y_train_adv) val_lgb = lgb.Dataset(x_val_adv, label=y_val_adv) lgb_model_params = config["adversarial_validation"]["lgb_model_params"] lgb_train_params = config["adversarial_validation"]["lgb_train_params"] clf = lgb.train(lgb_model_params, train_lgb, valid_sets=[train_lgb, val_lgb], valid_names=['train', 'valid'], **lgb_train_params) feature_imp = pd.DataFrame(sorted( zip(clf.feature_importance(importance_type='gain'), feature_name)), columns=['value', 'feature']) plt.figure(figsize=(20, 10)) sns.barplot(x='value', y='feature', data=feature_imp.sort_values(by='value', ascending=False).head(20)) plt.title('LightGBM Features') plt.tight_layout() plt.savefig(model_output_dir / "feature_importance_adv.png") config.update({ 'adversarial_validation_result': { 'score': clf.best_score, 'feature_importances': feature_imp.set_index("feature").sort_values( by="value", ascending=False).head(20).to_dict()["value"] } }) # ========================================= # === Train model and predict # ========================================= logger.info('Train model and predict') # Get features x_train, x_test = load_features(config) feature_name = x_test.columns logger.debug(f'number of features: {len(feature_name)}') # Get folds folds_ids = Fold( n_splits=config['cv']['n_splits'], shuffle=config['cv']['shuffle'], random_state=config['cv']['random_state']).get_stratifiedkfold( x_train, y_train) # Train and predict model_name = config['model']['name'] model_cls = model_map[model_name] params = config['model'] runner = Runner(model_cls, params, model_output_dir, f'Train_{model_cls.__name__}') oof_preds, evals_result = runner.train_cv(x_train, y_train, folds_ids) config.update(evals_result) test_preds = runner.predict_cv(x_test) # ========================================= # === Make submission file # ========================================= sub = create_submission(test, test_preds, target_column) sub.to_csv(model_output_dir / 'submission.csv', index=False, header=True) # ========================================= # === Save files # ========================================= save_path = model_output_dir / 'output.json' json_dump(config, save_path) pd.DataFrame(oof_preds, columns=["target"]).to_csv(model_output_dir / 'oof.csv', index=False, header=True)
def main(): # ========================================= # === Settings # ========================================= # Get logger logger = get_logger(__name__) logger.info('Settings') # Get argument parser = argparse.ArgumentParser() parser.add_argument('--config', default='./configs/model_1dcnn_0.json') parser.add_argument("--debug", action="store_true") args = parser.parse_args() logger.info(f'config: {args.config}') logger.info(f'debug: {args.debug}') # Get config config = json.load(open(args.config)) config.update({'args': {'config': args.config, 'debug': args.debug}}) if config["model"]["name"] == "lightgbm": config["model"]["model_params"]["nthread"] = cpu_count() # Create a directory for model output model_no = pathlib.Path(args.config).stem model_output_dir = (pathlib.Path(config['dataset']['output_directory']) / model_no) if not model_output_dir.exists(): model_output_dir.mkdir() logger.info(f'model_output_dir: {str(model_output_dir)}') logger.debug(f'model_output_dir exists: {model_output_dir.exists()}') config.update({'model_output_dir': str(model_output_dir)}) # ========================================= # === Loading data # ========================================= logger.info('Loading data') # Get train and test input_dir = pathlib.Path(config['dataset']['input_directory']) train = pd.read_csv(input_dir / 'train.csv') test = pd.read_csv(input_dir / 'test.csv') spectrum = pd.read_csv(input_dir / 'spectrum_stack.csv') spectrum_fitting = pd.read_csv(input_dir / 'spectrum_fitting_stack.csv') wv_cols = [f"wavelength_{i}" for i in range(512)] wv_fit_cols = [f"fitting_wavelength_{i}" for i in range(512)] train_spectrum = pd.merge(train, spectrum, on="spectrum_filename", how="left") test_spectrum = pd.merge(test, spectrum, on="spectrum_filename", how="left") train_spectrum = pd.merge(train_spectrum, spectrum_fitting, on="spectrum_filename", how="left") test_spectrum = pd.merge(test_spectrum, spectrum_fitting, on="spectrum_filename", how="left") train_std = np.std(train_spectrum[wv_cols].values, axis=1, keepdims=True) test_std = np.std(test_spectrum[wv_cols].values, axis=1, keepdims=True) train_spectrum[wv_cols] = train_spectrum[wv_cols].values / train_std test_spectrum[wv_cols] = test_spectrum[wv_cols].values / test_std spectrum_cols = wv_cols + wv_fit_cols train_spectrum = train_spectrum[spectrum_cols] test_spectrum = test_spectrum[spectrum_cols] # Get target values target_column = config['data_type']['target'] y_train = train[target_column].values # ========================================= # === Loading features # ========================================= logger.info('Loading features') # Get features x_train, x_test = load_features(config) feature_name = x_test.columns logger.debug(f'number of features: {len(feature_name)}') # ========================================= # === features preprocess # ========================================= x_total = x_train.append(x_test).reset_index(drop=True) remove_features = [c for c in x_total.columns if c.find("layout_x") != -1] remove_features += [c for c in x_total.columns if c.find("layout_y") != -1] x_total.drop(columns=remove_features, inplace=True) x_total = pd.get_dummies( x_total, columns=["LabelEncoding_exc_wl", "LabelEncoding_layout_a"]) x_total.fillna(0, inplace=True) from sklearn.preprocessing import StandardScaler numeric_features = [ c for c in x_total.columns if c.find("LabelEncoding_") == -1 ] sc = StandardScaler() x_total[numeric_features] = sc.fit_transform(x_total[numeric_features]) x_train = x_total.iloc[:len(train)] x_test = x_total.iloc[len(train):].reset_index(drop=True) x_train = pd.concat([x_train, train_spectrum], axis=1) x_test = pd.concat([x_test, test_spectrum], axis=1) logger.debug(f'number of features with spec in train: {x_train.shape}') logger.debug(f'number of features with spec in test: {x_test.shape}') # ========================================= # === Train model and predict # ========================================= logger.info('Train model and predict') # Get folds folds_ids = Fold( n_splits=config['cv']['n_splits'], shuffle=config['cv']['shuffle'], random_state=config['cv']['random_state']).get_stratifiedkfold( x_train, y_train) # Train and predict model_name = config['model']['name'] model_cls = model_map[model_name] params = config['model'] runner = Runner(model_cls, params, model_output_dir, f'Train_{model_cls.__name__}') oof_preds, evals_result = runner.train_cv(x_train, y_train, folds_ids) config.update(evals_result) test_preds = runner.predict_cv(x_test) # ========================================= # === Make submission file # ========================================= sub = create_submission(test, test_preds, target_column) sub.to_csv(model_output_dir / 'submission.csv', index=False, header=True) # ========================================= # === Save files # ========================================= save_path = model_output_dir / 'output.json' json_dump(config, save_path) pd.DataFrame(oof_preds, columns=["target"]).to_csv(model_output_dir / 'oof.csv', index=False, header=True)