def main():
    """Compute dimension metadata for both encodings and persist each to joblib.

    The pipeline (read tables -> derive dims -> dump) is identical for the
    power-transformed and the one-hot data; only the source directory and the
    output path differ, so run it once per pair.
    """
    jobs = [
        ('../data/03_powertransform', '../data/07_dims/dims03.joblib'),
        ('../data/05_onehot', '../data/07_dims/dims05.joblib'),
    ]
    for directory, out_path in jobs:
        all_data = read_all(directory=directory)
        dims = get_dims(all_data)
        dump(dims, out_path)
def main():
    """Train the R2N-CNN model with 5-fold CV and write a submission CSV.

    The ``--onehot`` flag selects between one-hot and label/power-transformed
    feature directories; model name and submission filename follow suit.
    """
    args = parse_args()
    seed_everything(args.seed)
    # Pick the feature/sequence directories matching the encoding scheme.
    if args.onehot:
        all_data = read_all(directory='../data/05_onehot')
        sequences = read_sequences(directory='../data/06_onehot_seq')
    else:
        all_data = read_all(directory='../data/03_powertransform')
        sequences = read_sequences(directory='../data/04_sequence')
    dims = get_dims(all_data)
    loader_maker = LoaderMaker(all_data, sequences, args, onehot=args.onehot)
    # CV
    name = '15_cnn-onehot' if args.onehot else '15_cnn-label'
    # Stratify on TARGET so each fold keeps the class balance.
    skf = StratifiedKFold(n_splits=5)
    folds = skf.split(all_data['application_train']['SK_ID_CURR'],
                      all_data['application_train']['TARGET'])
    best_models = []
    for train_index, val_index in folds:
        train_dataloader = loader_maker.make(train_index)
        val_dataloader = loader_maker.make(val_index)
        model = LightningModel(R2NCNN(dims, args.n_hidden, args.n_main),
                               nn.BCEWithLogitsLoss(),
                               train_dataloader, val_dataloader, args)
        trainer = HomeCreditTrainer(name, args.n_epochs, args.patience)
        trainer.fit(model)
        # Reload this fold's best checkpoint rather than the last epoch.
        best_model = load_model(model, name, trainer.logger.version)
        best_models.append(best_model)
    # Predict
    test_dataloader = loader_maker.make(index=None, train=False)
    # `predict` is given all fold models; presumably it ensembles them — TODO confirm.
    df_submission = predict(best_models, test_dataloader)
    filename = '../submission/15_r2n-cnn-onehot.csv' if args.onehot else '../submission/15_r2n-cnn-label.csv'
    df_submission.to_csv(filename, index=False)
def main():
    """Train LightGBM on featuretools features augmented with LSTM encodings.

    Splits the saved feature matrix into train/test by TARGET presence, merges
    in the pretrained DIM-LSTM and VAE-LSTM per-table encodings, then delegates
    training/prediction to ``run_lgb``.
    """
    args = parse_args()
    feature_matrix = joblib.load('../data/02_featuretools/feature_matrix.joblib')
    # Rows with a TARGET are training data; rows without it form the test set.
    app_train = feature_matrix.dropna(subset=['TARGET'])
    app_test = feature_matrix[feature_matrix['TARGET'].isnull()].drop('TARGET', axis=1)
    # Same merge procedure for both encoder outputs — loop instead of repeating it.
    for directory in ('../data/21_dimlstm', '../data/22_vaelstm'):
        features = read_all(directory)
        for feature in features.values():
            app_train = app_train.merge(feature)
            app_test = app_test.merge(feature)
    run_lgb(args, app_train, app_test, '36_ftdimvae')
def main():
    """Train LightGBM on label-encoded applications plus LSTM encodings.

    Loads the label-encoded train/test application tables, merges in the
    pretrained DIM-LSTM and VAE-LSTM per-table encodings, then delegates
    training/prediction to ``run_lgb``.
    """
    args = parse_args()
    app_train = joblib.load(
        '../data/01_labelencoding/application_train.joblib')
    app_test = joblib.load('../data/01_labelencoding/application_test.joblib')
    # Same merge procedure for both encoder outputs — loop instead of repeating it.
    for directory in ('../data/21_dimlstm', '../data/22_vaelstm'):
        features = read_all(directory)
        for feature in features.values():
            app_train = app_train.merge(feature)
            app_test = app_test.merge(feature)
    run_lgb(args, app_train, app_test, '35_dimvae')
def main():
    """Pretrain per-table encoders per fold, fine-tune an R2N, and submit.

    Runs 5-fold CV on the one-hot features: each fold pretrains sequence
    encoders on its own split before fine-tuning the full model.
    """
    args = parse_args()
    seed_everything(args.seed)
    app_train = joblib.load('../data/05_onehot/application_train.joblib')
    app_test = joblib.load('../data/05_onehot/application_test.joblib')
    sequences = read_all('../data/06_onehot_seq/')
    dims = joblib.load('../data/07_dims/dims05.joblib')
    # Split dims: the application tables feed the main network; what remains
    # in `dims` describes the sequence tables used for pretraining.
    app_dims = {}
    app_dims['application_train'] = dims.pop('application_train')
    app_dims['application_test'] = dims.pop('application_test')
    app_data = {'application_train': app_train, 'application_test': app_test}
    loader_maker = LoaderMaker(app_data, sequences, args, onehot=True)
    skf = StratifiedKFold(n_splits=5)
    folds = skf.split(app_train['SK_ID_CURR'], app_train['TARGET'])
    best_models = []
    for train_index, val_index in folds:
        # Pretrain encoders on this fold's split to avoid leaking val rows.
        encoders = pretrain(app_train, app_test, sequences, dims,
                            train_index, val_index, args)
        train_dataloader = loader_maker.make(train_index)
        val_dataloader = loader_maker.make(val_index)
        model = LightningModel(
            PretrainedR2N(app_dims, args.n_hidden, args.n_main, encoders),
            nn.BCEWithLogitsLoss(), train_dataloader, val_dataloader, args)
        name = '82_vaelstm_fine'
        trainer = HomeCreditTrainer(name, args.n_epochs, args.patience)
        trainer.fit(model)
        # Reload this fold's best checkpoint rather than the last epoch.
        best_model = load_model(model, name, trainer.logger.version)
        best_models.append(best_model)
    # Predict
    test_dataloader = loader_maker.make(index=None, train=False)
    df_submission = predict(best_models, test_dataloader)
    # `name` retains its value from the loop above.
    df_submission.to_csv(f'../submission/{name}.csv', index=False)
def main():
    """Fine-tune using pretrained VAE-LSTM encoders on the one-hot features."""
    args = parse_args()
    app_train = joblib.load('../data/05_onehot/application_train.joblib')
    app_test = joblib.load('../data/05_onehot/application_test.joblib')
    sequences = read_all('../data/06_onehot_seq')
    dims = joblib.load('../data/07_dims/dims05.joblib')
    # Pull out the application dims; what remains describes the sequence tables.
    app_dims = {key: dims.pop(key)
                for key in ('application_train', 'application_test')}
    # Restore each sequence table's encoder from its saved VAE-LSTM checkpoint.
    encoders = {}
    for table, diminfo in dims.items():
        module = VAELSTMModule(diminfo, args.n_hidden, None, None, args)
        module = load_model(module, table, logdir='../logs/22_vaelstm')
        encoders[table] = module.model.encoder
    run_fine_tuning(args, app_dims, app_train, app_test, sequences,
                    encoders, '42_vaelstm', onehot=True)
def main():
    """Build the featuretools feature matrix over all raw tables and save it."""
    formatter = '%(asctime)s %(message)s'
    logging.basicConfig(filename='../logs/02_featuretools.log',
                        level=logging.INFO, format=formatter)
    datas = read_all()
    app_train = datas['application_train']
    app_test = datas['application_test']
    bureau = datas['bureau']
    bureau_balance = datas['bureau_balance']
    cash = datas['POS_CASH_balance']
    previous = datas['previous_application']
    installments = datas['installments_payments']
    credit = datas['credit_card_balance']
    # Stack train and test so features are generated consistently for both;
    # test rows are distinguishable afterwards by their NaN TARGET.
    app_test["TARGET"] = np.nan
    app = app_train.append(app_test, ignore_index=True, sort=False)
    # Entity set with id applications
    entity_set = ft.EntitySet(id='HomeCredit')
    # Entities with a unique index
    entity_set = entity_set.entity_from_dataframe(entity_id='app',
                                                  dataframe=app,
                                                  index='SK_ID_CURR')
    entity_set = entity_set.entity_from_dataframe(entity_id='bureau',
                                                  dataframe=bureau,
                                                  index='SK_ID_BUREAU')
    entity_set = entity_set.entity_from_dataframe(entity_id='previous',
                                                  dataframe=previous,
                                                  index='SK_ID_PREV')
    # Entities that do not have a unique index: let featuretools create one.
    entity_set = entity_set.entity_from_dataframe(
        entity_id='bureau_balance',
        dataframe=bureau_balance,
        make_index=True,
        index='bureaubalance_index'
    )
    entity_set = entity_set.entity_from_dataframe(
        entity_id='cash',
        dataframe=cash,
        make_index=True,
        index='cash_index'
    )
    entity_set = entity_set.entity_from_dataframe(
        entity_id='installments',
        dataframe=installments,
        make_index=True,
        index='installments_index'
    )
    entity_set = entity_set.entity_from_dataframe(
        entity_id='credit',
        dataframe=credit,
        make_index=True,
        index='credit_index'
    )
    # Add in the defined relationships (parent column first, child second):
    # app 1-n bureau 1-n bureau_balance, app 1-n previous 1-n {cash,
    # installments, credit}.
    entity_set = entity_set.add_relationships([
        ft.Relationship(entity_set['app']['SK_ID_CURR'],
                        entity_set['bureau']['SK_ID_CURR']),
        ft.Relationship(entity_set['bureau']['SK_ID_BUREAU'],
                        entity_set['bureau_balance']['SK_ID_BUREAU']),
        ft.Relationship(entity_set['app']['SK_ID_CURR'],
                        entity_set['previous']['SK_ID_CURR']),
        ft.Relationship(entity_set['previous']['SK_ID_PREV'],
                        entity_set['cash']['SK_ID_PREV']),
        ft.Relationship(entity_set['previous']['SK_ID_PREV'],
                        entity_set['installments']['SK_ID_PREV']),
        ft.Relationship(entity_set['previous']['SK_ID_PREV'],
                        entity_set['credit']['SK_ID_PREV'])
    ])
    agg_primitives = ['sum', 'count', 'min', 'max', 'mean', 'mode']
    # Deep feature synthesis: aggregations up to depth 2, targeting `app`.
    feature_matrix, _ = ft.dfs(
        entityset=entity_set,
        target_entity='app',
        agg_primitives=agg_primitives,
        max_depth=2,
        features_only=False,
        verbose=True
    )
    feature_matrix = feature_matrix.reset_index()
    dump(feature_matrix, '../data/02_featuretools/feature_matrix.joblib')
def main():
    """One-hot encode the power-transformed tables and save them to joblib."""
    tables = read_all(directory='../data/03_powertransform')
    train = tables.pop('application_train')
    test = tables.pop('application_test')
    # Encode train and test together so both get identical dummy columns,
    # then split them back apart by TARGET presence.
    combined = pd.get_dummies(train.append(test, sort=False))
    train = combined.dropna(subset=['TARGET'])
    test = combined[combined['TARGET'].isnull()].drop('TARGET', axis=1)
    dump(train, '../data/05_onehot/application_train.joblib')
    dump(test, '../data/05_onehot/application_test.joblib')
    # The remaining (non-application) tables are encoded in parallel.
    with Pool(6) as pool:
        pool.map(process, list(tables.items()))
def main():
    """Per fold: pretrain a DIM-LSTM encoder, train LightGBM on the encodings,
    log AUC to MLflow, and average fold predictions into a submission CSV.
    """
    args = parse_args()
    seed_everything(args.seed)
    app_train = joblib.load('../data/03_powertransform/application_train.joblib')
    app_test = joblib.load('../data/03_powertransform/application_test.joblib')
    sequences = read_all('../data/04_sequence/')
    dims = joblib.load('../data/07_dims/dims03.joblib')
    # NOTE(review): app_dims is built here but never used below — confirm
    # whether it was meant to be passed somewhere or can be removed.
    app_dims = {}
    app_dims['application_train'] = dims.pop('application_train')
    app_dims['application_test'] = dims.pop('application_test')
    mlflow.set_tracking_uri('../logs/mlruns')
    mlflow.set_experiment('HomeCredit')
    run_name = '91_dimlstm'
    # NOTE(review): the full CLI namespace is reused as the lgb.train parameter
    # dict — confirm the parsed arguments are valid LightGBM parameters.
    params = vars(args)
    df_submission = app_test[['SK_ID_CURR']].copy()
    skf = StratifiedKFold(n_splits=5)
    folds = skf.split(app_train['SK_ID_CURR'], app_train['TARGET'])
    for i, (train_index, val_index) in enumerate(folds):
        # Train Encoder
        encoders = pretrain(app_train, sequences, dims, train_index, val_index, args)
        # Train LightGBM Model on the encoder's fixed-length representations.
        app_encoding_train = predict(app_train, encoders, sequences, args)
        x = app_encoding_train.drop(['SK_ID_CURR', 'TARGET'], axis=1)
        y = app_encoding_train['TARGET']
        x_train, y_train = x.iloc[train_index], y.iloc[train_index]
        x_valid, y_valid = x.iloc[val_index], y.iloc[val_index]
        train_set = lgb.Dataset(x_train, y_train)
        valid_set = lgb.Dataset(x_valid, y_valid)
        model = lgb.train(params, train_set, valid_sets=[valid_set])
        y_pred = model.predict(x_valid)
        auc = roc_auc_score(y_valid, y_pred)
        with mlflow.start_run(run_name=run_name):
            mlflow.log_params(params)
            mlflow.log_metric('auc', auc)
        # Predict on the test set with this fold's encoder + model.
        app_encoding_test = predict(app_test, encoders, sequences, args)
        x_test = app_encoding_test.drop('SK_ID_CURR', axis=1)
        y_pred = model.predict(x_test)
        df_submission[f'pred_{i}'] = y_pred
    # Average the per-fold prediction columns into the final TARGET.
    df_submission = df_submission.set_index('SK_ID_CURR').mean(axis=1).reset_index()
    df_submission.columns = ['SK_ID_CURR', 'TARGET']
    df_submission.to_csv(f'../submission/{run_name}.csv', index=False)
def main():
    """Fine-tune using pretrained DIM-LSTM encoders on power-transformed data."""
    args = parse_args()
    app_train = joblib.load('../data/03_powertransform/application_train.joblib')
    app_test = joblib.load('../data/03_powertransform/application_test.joblib')
    sequences = read_all('../data/04_sequence')
    dims = joblib.load('../data/07_dims/dims03.joblib')
    # Pull out the application dims; what remains describes the sequence tables.
    app_dims = {key: dims.pop(key)
                for key in ('application_train', 'application_test')}
    # Restore each sequence table's encoder from its saved DIM-LSTM checkpoint.
    encoders = {}
    for table, diminfo in dims.items():
        module = DIMLSTMModule(diminfo, args.n_hidden, None, None, args)
        module = load_model(module, table, logdir='../logs/21_dimlstm')
        encoders[table] = module.encoder
    run_fine_tuning(args, app_dims, app_train, app_test, sequences,
                    encoders, '41_dimlstm')
def main():
    """Build per-table sequence data from the one-hot tables in parallel."""
    tables = read_all('../data/05_onehot')
    train = tables.pop('application_train')
    test = tables.pop('application_test')
    # Every worker receives the full list of SK_ID_CURR values (train + test).
    ids = pd.concat([train[['SK_ID_CURR']], test[['SK_ID_CURR']]])
    # bureau_balance carries no SK_ID_CURR of its own; attach it via bureau.
    tables['bureau_balance'] = pd.merge(
        tables['bureau'][['SK_ID_CURR', 'SK_ID_BUREAU']],
        tables['bureau_balance'], on='SK_ID_BUREAU')
    with Pool(6) as pool:
        pool.starmap(process,
                     zip([ids] * len(tables), list(tables.items())))
            # NOTE(review): tail of a function whose start is outside this
            # view (appears to collect duplicate columns into drop_cols);
            # nesting depth reconstructed — confirm against the full file.
            drop_cols.add(c2)
        except TypeError:
            # Incomparable column dtypes: skip this pair.
            continue
    df.drop(drop_cols, axis=1, inplace=True)
    return df


def process(item):
    """Power-transform, fill, and de-duplicate one (name, table) pair, then save it."""
    name, df = item
    df = power_transform(df)
    df = fillna(df)
    df = drop_same_columns(df)
    dump(df, f'../data/03_powertransform/{name}.joblib')


if __name__ == "__main__":
    datas = read_all()
    app_train = datas.pop('application_train')
    app_test = datas.pop('application_test')
    # Transform train and test together so both get consistent treatment,
    # then split them back apart by TARGET presence.
    app = app_train.append(app_test, sort=False)
    app = power_transform(app)
    app = fillna(app)
    app = drop_same_columns(app)
    app_train = app.dropna(subset=['TARGET'])
    app_test = app[app['TARGET'].isnull()].drop('TARGET', axis=1)
    dump(app_train, '../data/03_powertransform/application_train.joblib')
    dump(app_test, '../data/03_powertransform/application_test.joblib')
    # Remaining tables are processed in parallel.
    with Pool(6) as pool:
        pool.map(process, list(datas.items()))
def main():
    """Pretrain a VAE-LSTM per sequence table and save each table's encodings."""
    args = parse_args()
    seed_everything(args.seed)
    app_train = joblib.load('../data/05_onehot/application_train.joblib')
    app_test = joblib.load('../data/05_onehot/application_test.joblib')
    sequences = read_all('../data/06_onehot_seq/')
    dims = joblib.load('../data/07_dims/dims05.joblib')
    # Only the sequence tables are pretrained; drop the application entries.
    dims.pop('application_train')
    dims.pop('application_test')
    for name, diminfo in dims.items():
        sequence = sequences[name]
        train_loader = torch.utils.data.DataLoader(
            OneHotSequenceDataset(app_train, sequence),
            batch_size=args.batch_size, shuffle=True, num_workers=6,
            worker_init_fn=worker_init_fn)
        test_loader = torch.utils.data.DataLoader(
            OneHotSequenceDataset(app_test, sequence),
            batch_size=args.batch_size, shuffle=False, num_workers=6,
            worker_init_fn=worker_init_fn)
        model = VAELSTMModule(diminfo, args.n_hidden,
                              train_loader, test_loader, args)
        logdir = '../logs/22_vaelstm'
        path = pathlib.Path(logdir) / name
        if not path.exists():
            path.mkdir(parents=True)
        logger = TensorBoardLogger(logdir, name=name)
        early_stopping = EarlyStopping(patience=args.patience,
                                       monitor='val_loss', mode='min')
        # Checkpoint path mirrors the TensorBoard version directory layout.
        filepath = pathlib.Path(
            logdir) / name / f'version_{logger.version}' / 'checkpoints'
        model_checkpoint = ModelCheckpoint(str(filepath),
                                           monitor='val_loss', mode='min')
        # NOTE(review): default_save_path / early_stop_callback /
        # row_log_interval are old pytorch-lightning argument names — this
        # pins the code to an older PL release.
        trainer = pl.Trainer(default_save_path=logdir,
                             gpus=-1,
                             max_epochs=args.n_epochs,
                             early_stop_callback=early_stopping,
                             logger=logger,
                             row_log_interval=100,
                             checkpoint_callback=model_checkpoint)
        trainer.fit(model)
        # Reload the best checkpoint for encoding, not the last epoch.
        best_model = load_model(model, name, trainer.logger.version,
                                logdir=logdir)
        # Re-create the train loader without shuffling so encoded rows stay
        # aligned with the original row order.
        train_loader_no_shuffle = torch.utils.data.DataLoader(
            OneHotSequenceDataset(app_train, sequence),
            batch_size=args.batch_size, shuffle=False, num_workers=6,
            worker_init_fn=worker_init_fn)
        df_train = predict(name, best_model, train_loader_no_shuffle)
        df_test = predict(name, best_model, test_loader)
        df_encoding = pd.concat([df_train, df_test])
        dump(df_encoding, f'../data/22_vaelstm/{name}.joblib')