Example #1
0
def main():
    """Fine-tune the VAE-LSTM model with 5-fold CV and write a submission.

    Loads one-hot-encoded application tables and sequence data, pretrains
    per-sequence encoders on each fold, fine-tunes the combined model, and
    predicts on the test set with the per-fold best checkpoints.
    """
    args = parse_args()
    seed_everything(args.seed)
    app_train = joblib.load('../data/05_onehot/application_train.joblib')
    app_test = joblib.load('../data/05_onehot/application_test.joblib')
    sequences = read_all('../data/06_onehot_seq/')
    dims = joblib.load('../data/07_dims/dims05.joblib')
    # Split the application-table dims out of `dims`; the remaining entries
    # describe the sequence tables consumed by pretrain().
    app_dims = {}
    app_dims['application_train'] = dims.pop('application_train')
    app_dims['application_test'] = dims.pop('application_test')

    app_data = {'application_train': app_train, 'application_test': app_test}
    loader_maker = LoaderMaker(app_data, sequences, args, onehot=True)

    # Hoisted out of the loop: `name` is loop-invariant and is also used
    # after the loop for the submission filename (previously it was only
    # bound inside the loop body — a NameError risk for an empty split).
    name = '82_vaelstm_fine'

    skf = StratifiedKFold(n_splits=5)
    folds = skf.split(app_train['SK_ID_CURR'], app_train['TARGET'])
    best_models = []
    for train_index, val_index in folds:
        # Pretrain encoders for each sequence table on this fold, then
        # fine-tune the combined model end-to-end.
        encoders = pretrain(app_train, app_test, sequences, dims, train_index,
                            val_index, args)
        train_dataloader = loader_maker.make(train_index)
        val_dataloader = loader_maker.make(val_index)
        model = LightningModel(
            PretrainedR2N(app_dims, args.n_hidden, args.n_main, encoders),
            nn.BCEWithLogitsLoss(), train_dataloader, val_dataloader, args)
        trainer = HomeCreditTrainer(name, args.n_epochs, args.patience)
        trainer.fit(model)
        best_model = load_model(model, name, trainer.logger.version)
        best_models.append(best_model)

    # Predict on the test set using every fold's best model.
    test_dataloader = loader_maker.make(index=None, train=False)
    df_submission = predict(best_models, test_dataloader)
    df_submission.to_csv(f'../submission/{name}.csv', index=False)
Example #2
0
def main():
    """Run 5-fold CV for the R2N-CNN model and write a submission file."""
    args = parse_args()
    seed_everything(args.seed)

    # Pick the preprocessed dataset variant matching the encoding flag.
    if args.onehot:
        all_data = read_all(directory='../data/05_onehot')
        sequences = read_sequences(directory='../data/06_onehot_seq')
    else:
        all_data = read_all(directory='../data/03_powertransform')
        sequences = read_sequences(directory='../data/04_sequence')
    dims = get_dims(all_data)
    loader_maker = LoaderMaker(all_data, sequences, args, onehot=args.onehot)

    # CV
    name = '15_cnn-onehot' if args.onehot else '15_cnn-label'
    app_train = all_data['application_train']
    splitter = StratifiedKFold(n_splits=5)
    best_models = []
    for train_index, val_index in splitter.split(app_train['SK_ID_CURR'],
                                                 app_train['TARGET']):
        train_dataloader = loader_maker.make(train_index)
        val_dataloader = loader_maker.make(val_index)
        network = R2NCNN(dims, args.n_hidden, args.n_main)
        model = LightningModel(network, nn.BCEWithLogitsLoss(),
                               train_dataloader, val_dataloader, args)
        trainer = HomeCreditTrainer(name, args.n_epochs, args.patience)
        trainer.fit(model)
        best_models.append(load_model(model, name, trainer.logger.version))

    # Predict
    test_dataloader = loader_maker.make(index=None, train=False)
    df_submission = predict(best_models, test_dataloader)
    if args.onehot:
        filename = '../submission/15_r2n-cnn-onehot.csv'
    else:
        filename = '../submission/15_r2n-cnn-label.csv'
    df_submission.to_csv(filename, index=False)
Example #3
0
def main():
    """Fine-tune using VAE-LSTM encoders pretrained under ../logs/22_vaelstm."""
    args = parse_args()
    app_train = joblib.load('../data/05_onehot/application_train.joblib')
    app_test = joblib.load('../data/05_onehot/application_test.joblib')
    sequences = read_all('../data/06_onehot_seq')
    dims = joblib.load('../data/07_dims/dims05.joblib')
    # Separate the application-table dims; the rest describe sequence tables.
    app_dims = {
        'application_train': dims.pop('application_train'),
        'application_test': dims.pop('application_test'),
    }

    # Restore each pretrained sequence model from its checkpoint and keep
    # only its encoder.
    encoders = {}
    for name, diminfo in dims.items():
        module = VAELSTMModule(diminfo, args.n_hidden, None, None, args)
        module = load_model(module, name, logdir='../logs/22_vaelstm')
        encoders[name] = module.model.encoder

    run_fine_tuning(args, app_dims, app_train, app_test, sequences, encoders,
                    '42_vaelstm', onehot=True)
Example #4
0
def main():
    """Train an MLP baseline with 5-fold CV and write a submission file.

    With ``--onehot`` the network consumes the one-hot/continuous features
    directly; otherwise categorical columns go through embedding layers
    whose outputs are concatenated with the continuous features.
    """
    args = parse_args()
    seed_everything(args.seed)

    if args.onehot:
        app_train = joblib.load('../data/05_onehot/application_train.joblib')
        app_test = joblib.load('../data/05_onehot/application_test.joblib')
        dims = get_dims({'application_train': app_train})
        _, _, cont_dim = dims['application_train']
        n_input = cont_dim
    else:
        app_train = joblib.load(
            '../data/03_powertransform/application_train.joblib')
        app_test = joblib.load(
            '../data/03_powertransform/application_test.joblib')
        dims = get_dims({'application_train': app_train})
        cat_dims, emb_dims, cont_dim = dims['application_train']
        # Embedding outputs are concatenated with continuous features at
        # the input layer.
        n_input = emb_dims.sum() + cont_dim

    n_hidden = args.n_hidden
    # Loop-invariant run name, hoisted out of the CV loop (it was
    # recomputed every iteration).
    name = '13_mlp-onehot' if args.onehot else '13_mlp-label'

    # CV
    skf = StratifiedKFold(n_splits=5)
    folds = skf.split(app_train['SK_ID_CURR'], app_train['TARGET'])
    best_models = []
    for train_index, val_index in folds:
        train_dataloader = make_dataloader(app_train,
                                           train_index,
                                           args.batch_size,
                                           onehot=args.onehot)
        val_dataloader = make_dataloader(app_train,
                                         val_index,
                                         args.batch_size,
                                         onehot=args.onehot)
        if args.onehot:
            network = MLPOneHot(n_input, n_hidden)
        else:
            network = MLP(cat_dims, emb_dims, n_input, n_hidden)
        model = LightningModel(network, nn.BCEWithLogitsLoss(),
                               train_dataloader, val_dataloader, args)
        trainer = HomeCreditTrainer(name, args.n_epochs, args.patience)
        trainer.fit(model)

        best_model = load_model(model, name, trainer.logger.version)
        best_models.append(best_model)

    # Predict on the test set with every fold's best model.
    test_dataloader = make_dataloader(app_test,
                                      None,
                                      args.batch_size,
                                      train=False,
                                      onehot=args.onehot)
    df_submission = predict(best_models, test_dataloader)
    filename = '../submission/13_mlp-onehot.csv' if args.onehot else '../submission/13_mlp-label.csv'
    df_submission.to_csv(filename, index=False)
Example #5
0
def pretrain(app_train, app_test, sequences, dims, train_index, val_index,
             args):
    """Pretrain one DIM-LSTM per sequence table and return their encoders.

    For each entry in ``dims`` (keyed by sequence-table name), trains a
    ``DIMLSTMModule`` on the fold given by ``train_index``/``val_index``
    with early stopping on ``val_loss_main``, reloads the best checkpoint,
    and collects that model's encoder.

    Returns:
        dict mapping each sequence-table name to its best model's encoder.
    """
    encoders = {}
    for name, diminfo in dims.items():
        cat = sequences[f'{name}_cat']
        cont = sequences[f'{name}_cont']
        # NOTE(review): the training dataset also receives app_test —
        # presumably so unlabeled test-set sequences participate in the
        # unsupervised pretraining; confirm against SequenceDataset.
        train_loader = torch.utils.data.DataLoader(
            SequenceDataset(app_train,
                            cat,
                            cont,
                            index=train_index,
                            app_test=app_test),
            batch_size=args.batch_size,
            shuffle=True,
            num_workers=6,
            worker_init_fn=worker_init_fn)
        val_loader = torch.utils.data.DataLoader(SequenceDataset(
            app_train, cat, cont, index=val_index),
                                                 batch_size=args.batch_size,
                                                 shuffle=False,
                                                 num_workers=6,
                                                 worker_init_fn=worker_init_fn)
        model = DIMLSTMModule(diminfo, args.n_hidden, train_loader, val_loader,
                              args)
        logdir = '../logs/81_dimlstm'
        path = pathlib.Path(logdir) / name
        if not path.exists():
            path.mkdir(parents=True)
        logger = TensorBoardLogger(logdir, name=name)
        early_stopping = EarlyStopping(patience=args.patience,
                                       monitor='val_loss_main',
                                       mode='min')
        # Checkpoints are written under the logger's version directory so
        # load_model() below can locate the best weights by version.
        filepath = pathlib.Path(
            logdir) / name / f'version_{logger.version}' / 'checkpoints'
        model_checkpoint = ModelCheckpoint(str(filepath),
                                           monitor='val_loss_main',
                                           mode='min')
        trainer = pl.Trainer(default_save_path=logdir,
                             gpus=-1,
                             max_epochs=args.n_epochs,
                             early_stop_callback=early_stopping,
                             logger=logger,
                             row_log_interval=100,
                             checkpoint_callback=model_checkpoint)
        trainer.fit(model)

        # Reload the best (not last) checkpoint before taking the encoder.
        best_model = load_model(model,
                                name,
                                trainer.logger.version,
                                logdir=logdir)
        encoders[name] = best_model.encoder
    return encoders
Example #6
0
def main():
    """Fine-tune using DIM-LSTM encoders pretrained under ../logs/21_dimlstm."""
    args = parse_args()
    app_train = joblib.load('../data/03_powertransform/application_train.joblib')
    app_test = joblib.load('../data/03_powertransform/application_test.joblib')
    sequences = read_all('../data/04_sequence')
    dims = joblib.load('../data/07_dims/dims03.joblib')
    # Split off the application-table dims; the remainder are sequence tables.
    app_dims = {
        'application_train': dims.pop('application_train'),
        'application_test': dims.pop('application_test'),
    }

    # Restore each sequence model's best checkpoint and keep its encoder.
    encoders = {}
    for name, diminfo in dims.items():
        module = DIMLSTMModule(diminfo, args.n_hidden, None, None, args)
        module = load_model(module, name, logdir='../logs/21_dimlstm')
        encoders[name] = module.encoder

    run_fine_tuning(args, app_dims, app_train, app_test, sequences, encoders,
                    '41_dimlstm')
Example #7
0
def main():
    """Pretrain a DIM-LSTM per sequence table and dump its encodings.

    For each sequence table, trains a ``DIMLSTMModule`` with early stopping
    on ``val_loss_main``, reloads the best checkpoint, and writes the
    encoded representations of both train and test rows to
    ``../data/21_dimlstm/<name>.joblib``.
    """
    args = parse_args()
    seed_everything(args.seed)
    app_train = joblib.load(
        '../data/03_powertransform/application_train.joblib')
    app_test = joblib.load('../data/03_powertransform/application_test.joblib')
    sequences = read_sequences('../data/04_sequence/')
    dims = joblib.load('../data/07_dims/dims03.joblib')
    # Only the sequence tables are pretrained; drop the application dims.
    dims.pop('application_train')
    dims.pop('application_test')

    for name, diminfo in dims.items():
        cat = sequences[f'{name}_cat']
        cont = sequences[f'{name}_cont']
        train_loader = torch.utils.data.DataLoader(
            SequenceDataset(app_train, cat, cont),
            batch_size=args.batch_size,
            shuffle=True,
            num_workers=6,
            worker_init_fn=worker_init_fn)
        # NOTE(review): test_loader doubles as the validation loader for
        # early stopping AND as the prediction input below — confirm this
        # use of test data for model selection is intended.
        test_loader = torch.utils.data.DataLoader(
            SequenceDataset(app_test, cat, cont),
            batch_size=args.batch_size,
            shuffle=False,
            num_workers=6,
            worker_init_fn=worker_init_fn)
        model = DIMLSTMModule(diminfo, args.n_hidden, train_loader,
                              test_loader, args)
        logdir = '../logs/21_dimlstm'
        path = pathlib.Path(logdir) / name
        if not path.exists():
            path.mkdir(parents=True)
        logger = TensorBoardLogger(logdir, name=name)
        early_stopping = EarlyStopping(patience=args.patience,
                                       monitor='val_loss_main',
                                       mode='min')
        # Checkpoint directory mirrors the logger's version directory so
        # load_model() below can find the best weights.
        filepath = pathlib.Path(
            logdir) / name / f'version_{logger.version}' / 'checkpoints'
        model_checkpoint = ModelCheckpoint(str(filepath),
                                           monitor='val_loss_main',
                                           mode='min')
        trainer = pl.Trainer(default_save_path=logdir,
                             gpus=-1,
                             max_epochs=args.n_epochs,
                             early_stop_callback=early_stopping,
                             logger=logger,
                             row_log_interval=100,
                             checkpoint_callback=model_checkpoint)
        trainer.fit(model)

        # Reload the best (not last) checkpoint before encoding.
        best_model = load_model(model,
                                name,
                                trainer.logger.version,
                                logdir=logdir)
        # Fresh, unshuffled train loader so encodings align with the
        # original row order.
        train_loader_no_shuffle = torch.utils.data.DataLoader(
            SequenceDataset(app_train, cat, cont),
            batch_size=args.batch_size,
            shuffle=False,
            num_workers=6,
            worker_init_fn=worker_init_fn)
        df_train = predict(name, best_model, train_loader_no_shuffle)
        df_test = predict(name, best_model, test_loader)
        df_encoding = pd.concat([df_train, df_test])
        dump(df_encoding, f'../data/21_dimlstm/{name}.joblib')