    def get_loaders(self, db_name, encoders, batch_size, num_workers):
        db_info = get_db_info(db_name)
        max_nodes_per_graph = None
        _ = get_db_container(db_name)
        train_data, val_data, test_data = get_train_val_test_datasets(
            dataset_name=db_name,
            train_test_split='use_full_train',
            encoders=encoders)
        train_loader = get_dataloader(
            dataset=train_data,
            batch_size=batch_size,
            sampler_class_name='SequentialSampler',
            num_workers=num_workers,
            max_nodes_per_graph=max_nodes_per_graph)
        val_loader = get_dataloader(
            dataset=val_data,
            batch_size=batch_size,
            sampler_class_name='SequentialSampler',
            num_workers=num_workers,
            max_nodes_per_graph=max_nodes_per_graph)
        test_loader = get_dataloader(
            dataset=test_data,
            batch_size=batch_size,
            sampler_class_name='SequentialSampler',
            num_workers=num_workers,
            max_nodes_per_graph=max_nodes_per_graph)
        loaders = {
            'train': train_loader,
            'val': val_loader,
            'test': test_loader
        }
        return db_info, loaders
    def test_memorize_minibatch(self):
        for db_name in self.db_names:
            db_info = get_db_info(db_name)
            train_data, val_data, _ = get_train_val_test_datasets(
                dataset_name=db_name,
                train_test_split='use_full_train',
                encoders=dict(CATEGORICAL='CategoricalOrdinalEnc',
                              SCALAR='ScalarRobustScalerEnc',
                              DATETIME='DatetimeScalarEnc',
                              LATLONG='LatLongScalarEnc',
                              TEXT='TextSummaryScalarEnc'),
            )
            train_loader = get_dataloader(
                dataset=train_data,
                batch_size=256,
                sampler_class_name='SequentialSampler',
                num_workers=0,
                max_nodes_per_graph=False)

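            # Sanity check: build a small GCN for this database and try to
            # overfit ('memorize') one fixed minibatch; if the data pipeline
            # and model are wired correctly the loss should collapse to ~0.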
            writer = DummyWriter()
            model = GCN(writer,
                        db_info=db_info,
                        hidden_dim=256,
                        n_init_layers=3,
                        activation_class_name='SELU',
                        activation_class_kwargs={},
                        loss_class_kwargs={},
                        loss_class_name='CrossEntropyLoss',
                        p_dropout=0.0,
                        drop_whole_embeddings=True,
                        n_layers=3,
                        readout_class_name='AvgPooling',
                        readout_kwargs={})
            if torch.cuda.is_available():
                model.cuda()
                model.device = torch.device('cuda:0')
            else:
                model.device = torch.device('cpu')
            model.train()
            optimizer = AdamW(model.parameters(), lr=0.001, weight_decay=0.0)

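            # Step repeatedly on the same batch; the for/else falls through to
            # self.fail() only if the loss never drops below 1e-4.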
            bdgl, features, label = next(iter(train_loader))
            recursive_to((bdgl, features, label), model.device)
            for _ in tqdm(range(200)):
                optimizer.zero_grad()
                output = model(bdgl, features)
                loss = model.loss_fxn(output, label)
                if loss < 1e-4:
                    break
                loss.backward()
                optimizer.step()
            else:
                tqdm.write(f'Loss: {loss}')
                self.fail("Didn't memorize minibatch")
Example 3
    def test_datapoints_for_no_self_loops_and_nonnegative_edge_types(self):
        for db_name in dataset_names:
            for dataset in get_train_val_test_datasets(
                    dataset_name=db_name,
                    train_test_split='use_full_train',
                    encoders=dict(CATEGORICAL='CategoricalOrdinalEnc',
                                  SCALAR='ScalarRobustScalerEnc',
                                  DATETIME='DatetimeScalarEnc',
                                  LATLONG='LatLongScalarEnc',
                                  TEXT='TextSummaryScalarEnc'),
            ):
                for dp_id, (edge_list, node_types, edge_types, features,
                            label) in tqdm(dataset):
                    # Nodes don't have any self loops in the raw data
                    for edge in edge_list:
                        self.assertNotEqual(edge[0], edge[1])

                    # All edge types are nonnegative in the raw data
                    self.assertTrue(all(et >= 0 for et in edge_types))
Example 4
    def test_train_val_and_test_splits_contain_different_datapoints(self):
        for train_test_split in [
                'use_full_train', 'xval0', 'xval1', 'xval2', 'xval3', 'xval4'
        ]:
            for db_name in dataset_names:
                train_data, val_data, test_data = get_train_val_test_datasets(
                    dataset_name=db_name,
                    train_test_split=train_test_split,
                    encoders=dict(CATEGORICAL='CategoricalOrdinalEnc',
                                  SCALAR='ScalarRobustScalerEnc',
                                  DATETIME='DatetimeScalarEnc',
                                  LATLONG='LatLongScalarEnc',
                                  TEXT='TextSummaryScalarEnc'),
                )
                train_ids = set(train_data.datapoint_ids)
                val_ids = set(val_data.datapoint_ids)
                test_ids = set(test_data.datapoint_ids)
                # Check the splits pairwise; an empty three-way intersection
                # alone would miss datapoints shared by just two splits.
                self.assertTrue(train_ids.isdisjoint(val_ids))
                self.assertTrue(train_ids.isdisjoint(test_ids))
                self.assertTrue(val_ids.isdisjoint(test_ids))
Example 5
    def setUp(self):
        self.db_info = get_db_info(self.db_name)
        batch_size = 1
        num_workers = 0
        max_nodes_per_graph = 100000
        _ = get_db_container(self.db_name)
        train_data, val_data, test_data = get_train_val_test_datasets(
            dataset_name=self.db_name,
            train_test_split='use_full_train',
            encoders=dict(CATEGORICAL='CategoricalOrdinalEnc',
                          SCALAR='ScalarRobustScalerEnc',
                          DATETIME='DatetimeScalarEnc',
                          LATLONG='LatLongScalarEnc',
                          TEXT='TextSummaryScalarEnc'),
        )
        train_loader = get_dataloader(
            dataset=train_data,
            batch_size=batch_size,
            sampler_class_name='SequentialSampler',
            num_workers=num_workers,
            max_nodes_per_graph=max_nodes_per_graph)
        val_loader = get_dataloader(
            dataset=val_data,
            batch_size=batch_size,
            sampler_class_name='SequentialSampler',
            num_workers=num_workers,
            max_nodes_per_graph=max_nodes_per_graph)
        test_loader = get_dataloader(
            dataset=test_data,
            batch_size=batch_size,
            sampler_class_name='SequentialSampler',
            num_workers=num_workers,
            max_nodes_per_graph=max_nodes_per_graph)
        self.loaders = {
            'train': train_loader,
            'val': val_loader,
            'test': test_loader
        }
def train_non_deep_model(writer,
                         seed,
                         log_dir,
                         debug_network,
                         dataset_name,
                         train_test_split,
                         encoders,
                         train_fraction_to_use,
                         model_class_name,
                         model_kwargs,
                         num_boost_round,
                         early_stopping_patience,
                         num_workers=-1):
    train_data, val_data, orig_test_data = get_train_val_test_datasets(
        dataset_name=dataset_name,
        train_test_split=train_test_split,
        encoders=encoders,
        train_fraction_to_use=train_fraction_to_use)

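    # LightGBM wants a flat feature matrix: concatenate the categorical and
    # continuous tensors column-wise and pass the categorical columns by name.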
    categorical_feature = [i[0] for i in train_data.cat_feat_origin_cards]
    feature_names = categorical_feature + train_data.cont_feat_origin
    train_data = lgb.Dataset(np.concatenate([
        t.numpy()
        for t in [train_data.cat_data, train_data.cont_data] if t is not None
    ],
                                            axis=1),
                             label=train_data.targets.numpy(),
                             feature_name=feature_names,
                             categorical_feature=categorical_feature)
    raw_val_data = np.concatenate([
        t.numpy()
        for t in [val_data.cat_data, val_data.cont_data] if t is not None
    ],
                                  axis=1)
    val_data = lgb.Dataset(raw_val_data,
                           label=val_data.targets.numpy(),
                           feature_name=feature_names,
                           categorical_feature=categorical_feature,
                           reference=train_data)
    test_labels = orig_test_data.targets.numpy().T
    test_data = np.concatenate([
        t.numpy() for t in [orig_test_data.cat_data, orig_test_data.cont_data]
        if t is not None
    ],
                               axis=1)

    try:
        # Train
        param = {
            'num_leaves': model_kwargs['num_leaves'],
            'min_data_in_leaf': model_kwargs['min_data_in_leaf'],
            'objective': 'binary',
            'n_jobs': num_workers,
            'metric': ['cross_entropy', 'binary_error', 'auc'],
            'first_metric_only': True,
        }
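        # 'cross_entropy' is listed first and first_metric_only is set, so
        # early stopping is driven by the validation loss.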
        bst = lgb.train(param,
                        train_data,
                        valid_sets=[val_data],
                        num_boost_round=num_boost_round,
                        early_stopping_rounds=early_stopping_patience)

        # Plot val metrics
        if bst.best_iteration < num_boost_round:
            Path(os.path.join(writer.log_dir, 'stopped_early.info')).touch()
        else:
            Path(os.path.join(writer.log_dir,
                              'finished_all_epochs.info')).touch()
        bst.save_model(os.path.join(writer.log_dir,
                                    'model_checkpoint_best_val_loss.lgb'),
                       num_iteration=bst.best_iteration)
        val_probs = bst.predict(raw_val_data, num_iteration=bst.best_iteration)
        val_probs = np.vstack([1 - val_probs, val_probs]).T
        plot_validation_info(writer, val_data.label, val_probs)
        writer.add_hparams(
            format_hparam_dict_for_tb(writer.train_kwargs), {
                'hparam/best_auroc':
                bst.best_score['valid_0']['auc'],
                'hparam/best_acc':
                100 * (1 - bst.best_score['valid_0']['binary_error']),
                'hparam/best_loss':
                bst.best_score['valid_0']['cross_entropy'],
                'hparam/best_iter':
                bst.best_iteration
            })

        # Save test metrics
        for checkpoint_id in ['best_loss', 'best_auroc', 'best_acc']:
            test_probs = bst.predict(test_data,
                                     num_iteration=bst.best_iteration)
            test_probs = np.vstack([1 - test_probs, test_probs]).T
            results_dir = os.path.join(writer.log_dir, 'evaluations',
                                       f'model_checkpoint_{checkpoint_id}')
            os.makedirs(results_dir, exist_ok=True)
            results_file = os.path.join(results_dir, 'results.json')
            if train_test_split == 'use_full_train':
                # Write kaggle submission file
                test_probs = test_probs[:, 1]
                test_ids = orig_test_data.datapoint_ids
                predictions = pd.DataFrame({
                    'dp_id': test_ids,
                    'prob': test_probs
                })
                prediction_file = os.path.join(results_dir,
                                               'kaggle_submission.csv')
                write_kaggle_submission_file(dataset_name, predictions,
                                             prediction_file)
            else:
                results = {
                    'test_loss':
                    log_loss(test_labels, test_probs),
                    'test_accuracy':
                    100 *
                    accuracy_score(test_labels, test_probs.argmax(axis=1)),
                    'test_auroc':
                    roc_auc_score(test_labels, test_probs[:, 1])
                }
                with open(results_file, 'w') as f:
                    json.dump(results, f, indent=2)

    except Exception as e:
        Path(os.path.join(writer.log_dir, 'failed.info')).touch()
        writer.add_hparams(
            format_hparam_dict_for_tb(writer.train_kwargs), {
                'hparam/best_auroc': -1,
                'hparam/best_acc': -1,
                'hparam/best_loss': np.inf,
                'hparam/best_iter': -1
            })
        raise e
def train_model(writer,
                seed,
                log_dir,
                debug_network,
                dataset_name,
                train_test_split,
                encoders,
                max_nodes_per_graph,
                train_fraction_to_use,
                sampler_class_name,
                sampler_class_kwargs,
                model_class_name,
                model_kwargs,
                batch_size,
                epochs,
                optimizer_class_name,
                optimizer_kwargs,
                lr_scheduler_class_name,
                lr_scheduler_kwargs,
                early_stopping_patience,
                wd_bias,
                wd_embed,
                wd_bn,
                load_model_weights_from='',
                early_stopping_metric='loss',
                device='cpu',
                num_workers=0,
                find_lr=True):
    train_data, val_data, _ = get_train_val_test_datasets(
        dataset_name=dataset_name,
        train_test_split=train_test_split,
        encoders=encoders,
        train_fraction_to_use=train_fraction_to_use)
    train_loader = get_dataloader(dataset=train_data,
                                  batch_size=batch_size,
                                  sampler_class_name=sampler_class_name,
                                  sampler_class_kwargs=sampler_class_kwargs,
                                  num_workers=num_workers,
                                  max_nodes_per_graph=max_nodes_per_graph)
    print(f'Batches per train epoch: {len(train_loader)}')
    print(f'Total batches: {len(train_loader) * epochs}')
    val_loader = get_dataloader(dataset=val_data,
                                batch_size=batch_size,
                                sampler_class_name='SequentialSampler',
                                num_workers=num_workers,
                                max_nodes_per_graph=max_nodes_per_graph)

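    # init_model is a closure so a throwaway copy can be built for the LR
    # finder below and a fresh one for the real training run; the extra kwargs
    # depend on whether the dataset is tabular or a relational-database graph.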
    def init_model():
        model_class = models.__dict__[model_class_name]
        if isinstance(train_data, TabularDataset):
            assert issubclass(model_class, TabModelBase)
            model_kwargs.update(
                n_cont_features=train_data.n_cont_features,
                cat_feat_origin_cards=train_data.cat_feat_origin_cards)
        elif isinstance(train_data, DatabaseDataset):
            assert issubclass(model_class, GNNModelBase)
            model_kwargs.update(feature_encoders=train_data.feature_encoders)
        else:
            raise ValueError
        model = model_class(writer=writer,
                            dataset_name=dataset_name,
                            **model_kwargs)
        if load_model_weights_from:
            state_dict = torch.load(load_model_weights_from,
                                    map_location=torch.device('cpu'))
            retval = model.load_state_dict(state_dict['model'], strict=False)
            print(f'Missing modules:\n{pprint.pformat(retval.missing_keys)}')
            print(
                f'Unexpected modules:\n{pprint.pformat(retval.unexpected_keys)}'
            )
        model_to_device(model, device)

        # If debugging, add hooks to all modules
        if debug_network:
            register_module_hooks('model', model, writer)

        return model

    # Optionally find good learning rate
    if find_lr:
        print('Finding good learning rate')
        model = init_model()
        optimizer = get_optim_with_correct_wd(optimizer_class_name, model,
                                              optimizer_kwargs, wd_bias,
                                              wd_embed, wd_bn)
        good_lr = get_good_lr(model,
                              optimizer,
                              train_loader,
                              init_value=1e-7,
                              final_value=1.0,
                              beta=0.98)
        optimizer_kwargs.update(lr=good_lr)
        writer.train_kwargs['optimizer_kwargs'].update(lr=good_lr)
        if lr_scheduler_class_name == 'CyclicLR':
            lr_scheduler_kwargs.update(max_lr=good_lr, base_lr=good_lr / 100)
            writer.train_kwargs['lr_scheduler_kwargs'].update(max_lr=good_lr,
                                                              base_lr=good_lr /
                                                              100)
        elif lr_scheduler_class_name == 'OneCycleLR':
            lr_scheduler_kwargs.update(max_lr=good_lr)
            writer.train_kwargs['lr_scheduler_kwargs'].update(max_lr=good_lr)

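    # Re-initialize the model and optimizer with the (possibly LR-tuned)
    # kwargs for the actual training run.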
    model = init_model()
    optimizer = get_optim_with_correct_wd(optimizer_class_name, model,
                                          optimizer_kwargs, wd_bias, wd_embed,
                                          wd_bn)
    scheduler = opt.lr_scheduler.__dict__[lr_scheduler_class_name](
        optimizer, **lr_scheduler_kwargs)

    # Run train loop with early stopping
    best_auroc = -1
    best_acc = -1
    best_loss = np.inf
    best_epoch = -1
    try:
        for epoch in tqdm(range(epochs)):
            print(f'Epoch: {epoch}')
            log_param_values(writer, model)
            if epoch % 20 == 0:
                save_model_checkpoint(writer, epoch, model, optimizer,
                                      scheduler)
            val_auroc, val_acc, val_loss = validate_model(
                writer, val_loader, model, epoch)
            best = False
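            # Each validation metric gets its own 'best_*' checkpoint, but only
            # the early_stopping_metric resets the patience counter below.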
            if val_auroc is not None and val_auroc > best_auroc:
                best_auroc = val_auroc
                save_model_checkpoint(writer,
                                      epoch,
                                      model,
                                      optimizer,
                                      scheduler,
                                      chkpt_name='best_auroc')
                if early_stopping_metric == 'auroc':
                    best = True
            if val_acc is not None and val_acc > best_acc:
                best_acc = val_acc
                save_model_checkpoint(writer,
                                      epoch,
                                      model,
                                      optimizer,
                                      scheduler,
                                      chkpt_name='best_acc')
                if early_stopping_metric == 'acc':
                    best = True
            if val_loss < best_loss:
                best_loss = val_loss
                save_model_checkpoint(writer,
                                      epoch,
                                      model,
                                      optimizer,
                                      scheduler,
                                      chkpt_name='best_loss')
                if early_stopping_metric == 'loss':
                    best = True
            if early_stopping_metric == 'auroc':
                m = val_auroc
            elif early_stopping_metric == 'acc':
                m = val_acc
            elif early_stopping_metric == 'loss':
                m = -1 * val_loss
            if isinstance(scheduler, ReduceLROnPlateau):
                scheduler.step(m)
            if best:
                best_epoch = epoch
            if epoch - best_epoch >= early_stopping_patience:
                Path(os.path.join(writer.log_dir,
                                  'stopped_early.info')).touch()
                break
            train_epoch(writer, train_loader, model, optimizer, scheduler,
                        epoch)
            if hasattr(model, 'prune'):
                model.prune(epoch, m)
        else:
            save_model_checkpoint(writer, epoch, model, optimizer, scheduler)
            validate_model(writer, val_loader, model, epoch)
            Path(os.path.join(writer.log_dir,
                              'finished_all_epochs.info')).touch()
        writer.add_hparams(
            format_hparam_dict_for_tb(writer.train_kwargs), {
                'hparam/best_auroc': best_auroc,
                'hparam/best_acc': best_acc,
                'hparam/best_loss': best_loss,
                'hparam/best_epoch': best_epoch
            })
    except Exception as e:
        Path(os.path.join(writer.log_dir, 'failed.info')).touch()
        writer.add_hparams(
            format_hparam_dict_for_tb(writer.train_kwargs), {
                'hparam/best_auroc': best_auroc,
                'hparam/best_acc': best_acc,
                'hparam/best_loss': best_loss,
                'hparam/best_epoch': best_epoch
            })
        raise e
def start_evaluating(do_evaluate, do_dump_activations, module_acts_to_dump,
                     model_logdir, checkpoint_id, device, num_workers):
    with open(os.path.join(model_logdir, 'train_kwargs.json')) as f:
        train_kwargs = json.load(f)
    ds_name = train_kwargs['dataset_name']
    encoders = train_kwargs['encoders']
    train_data, _, test_data = get_train_val_test_datasets(
        dataset_name=ds_name,
        train_test_split=train_kwargs['train_test_split'],
        encoders=encoders,
        train_fraction_to_use=train_kwargs.get('train_fraction_to_use', 1.0))
    test_loader = get_dataloader(
        dataset=test_data,
        batch_size=train_kwargs['batch_size'],
        sampler_class_name='SequentialSampler',
        num_workers=num_workers,
        max_nodes_per_graph=train_kwargs['max_nodes_per_graph'])
    writer = DummyWriter()
    model_class = models.__dict__[train_kwargs['model_class_name']]
    if isinstance(train_data, TabularDataset):
        assert issubclass(model_class, TabModelBase)
        train_kwargs['model_kwargs'].update(
            n_cont_features=train_data.n_cont_features,
            cat_feat_origin_cards=train_data.cat_feat_origin_cards)
    elif isinstance(train_data, DatabaseDataset):
        assert issubclass(model_class, GNNModelBase)
        train_kwargs['model_kwargs'].update(
            feature_encoders=train_data.feature_encoders)
    else:
        raise ValueError
    model = model_class(writer=writer,
                        dataset_name=train_kwargs['dataset_name'],
                        **train_kwargs['model_kwargs'])
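    # Resolve the checkpoint file: a 'best_*' id is matched against the files
    # in the log dir, anything else is treated as a literal checkpoint name.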
    if 'best' in checkpoint_id:
        checkpoint_path = [
            f for f in os.listdir(model_logdir) if checkpoint_id in f
        ]
        assert len(checkpoint_path) == 1, 'Wrong number of best checkpoints'
        checkpoint_path = os.path.join(model_logdir, checkpoint_path[0])
    else:
        checkpoint_path = os.path.join(model_logdir,
                                       f'model_checkpoint_{checkpoint_id}.pt')
    if torch.cuda.is_available() and 'cuda' in device:
        state_dict = torch.load(checkpoint_path,
                                map_location=torch.device(device))
    else:
        state_dict = torch.load(checkpoint_path,
                                map_location=torch.device('cpu'))
    model.load_state_dict(state_dict['model'])
    model_to_device(model, device)

    results_dir = os.path.join(model_logdir, 'evaluations',
                               f'model_checkpoint_{checkpoint_id}')
    os.makedirs(results_dir, exist_ok=True)

    if do_evaluate:
        evaluate_model(test_loader, train_kwargs, results_dir, model)

    if do_dump_activations:
        acts = dump_activations(ds_name, train_kwargs, train_data, encoders,
                                results_dir, model, module_acts_to_dump,
                                num_workers)
        return acts
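# A minimal usage sketch (illustrative, not from the original code): the log
# directory below is hypothetical, and 'best_loss' follows the checkpoint
# naming used by train_model above.
start_evaluating(do_evaluate=True,
                 do_dump_activations=False,
                 module_acts_to_dump=None,
                 model_logdir='runs/example_gnn_run',
                 checkpoint_id='best_loss',
                 device='cpu',
                 num_workers=0)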
from data.utils import get_db_container

db_names = ('acquirevaluedshopperschallenge', 'homecreditdefaultrisk',
            'kddcup2014')

if __name__ == '__main__':
    while True:
        inp = input('Re-extract dataset info? (y/n): ')
        if inp in ['y', 'n']:
            break

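    # For each database: start its container, build the three splits, and
    # (optionally) re-extract per-datapoint graph statistics.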
    for db_name in db_names:
        print(f'Doing {db_name}')
        _ = get_db_container(db_name)
        train_dataset, val_dataset, test_dataset = get_train_val_test_datasets(
            db_name, 'use_full_train')
        datasets = {
            'train': train_dataset,
            'val': val_dataset,
            'test': test_dataset
        }
        df_graph_info_path = f'./experiments/{db_name}_df_graph_info.pkl'
        df_node_info_path = f'./experiments/{db_name}_df_node_info.pkl'

        if inp == 'y':
            n_nodes = []
            n_edges = []
            n_in_edges = []
            n_out_edges = []
            for split, dataset in datasets.items():
                for dp_id, (edge_list, node_types, edge_types, features,