# Example no. 1
0
def init_logger(cfg, search, type, best_arch=None):
    """Create a ``Logger`` under ``cfg.results_dir`` and register its meters.

    The "time", "loss", "acc1" and "acc5" entries are added with their
    format specs. When ``best_arch`` is truthy, each of its keys is stored
    on the logger through ``logger.set(key, best_arch[key])``.

    NOTE(review): ``search`` is accepted but never read -- the logger is
    always constructed with ``search=True``. Confirm this is intentional.

    Returns:
        The configured ``Logger`` instance.
    """
    # (meter name, format spec) pairs, registered in this order.
    # A "data" meter (":6.3f") is currently disabled.
    meter_specs = (
        ("time", ":6.3f"),
        ("loss", ":.4e"),
        ("acc1", ":6.2f"),
        ("acc5", ":6.2f"),
    )

    logger = Logger(cfg.results_dir, search=True, type=type)
    for meter_name, meter_fmt in meter_specs:
        logger.add(meter_name, meter_fmt)

    if not best_arch:
        return logger

    for key in best_arch:
        logger.set(key, best_arch[key])
    return logger
def train_val_pipeline(MODEL_NAME, dataset, params, net_params, dirs):
    """Train a GNN for graph regression, logging and checkpointing every epoch.

    The routine optionally augments the dataset (self-loops for GCN/GAT,
    positional encodings for GatedGCN), builds the model, an Adam optimizer
    and a ReduceLROnPlateau scheduler, then runs the epoch loop. Each epoch
    it trains, evaluates on the validation and test splits, writes
    TensorBoard scalars and a text-log row, saves a checkpoint and removes
    checkpoints older than the previous epoch. Training stops when the
    epoch budget is exhausted, the learning rate drops below
    ``params['min_lr']``, ``params['max_time']`` hours have elapsed, or the
    user hits Ctrl+C. A final test/train evaluation is then printed, logged
    and written to ``write_file_name + '.txt'``.

    Args:
        MODEL_NAME: Model identifier passed to ``gnn_model``. 'GCN', 'GAT',
            'GatedGCN', 'DiffPool', 'RingGNN' and '3WLGNN' get special
            handling here.
        dataset: Object exposing ``name``, ``train``, ``val``, ``test``,
            ``collate`` and (for dense models) ``collate_dense_gnn``; it is
            mutated in place when self-loops or positional encodings are
            added.
        params: Optimization settings; keys read here are 'seed',
            'init_lr', 'weight_decay', 'lr_reduce_factor',
            'lr_schedule_patience', 'batch_size', 'epochs', 'min_lr' and
            'max_time' (hours).
        net_params: Network settings; keys read here are 'device',
            'total_param', and depending on the model 'self_loop',
            'pos_enc', 'pos_enc_dim' and 'edge_feat'.
        dirs: 5-tuple ``(root_log_dir, root_ckpt_dir, write_file_name,
            write_config_file, mylog)``.

    Returns:
        None. Results are reported through stdout, the log/checkpoint
        directories and the result/config text files.
    """
    t0 = time.time()
    per_epoch_time = []

    DATASET_NAME = dataset.name

    if MODEL_NAME in ['GCN', 'GAT'] and net_params['self_loop']:
        print(
            "[!] Adding graph self-loops for GCN/GAT models (central node trick)."
        )
        dataset._add_self_loops()

    if MODEL_NAME in ['GatedGCN'] and net_params['pos_enc']:
        print("[!] Adding graph positional encoding.")
        dataset._add_positional_encodings(net_params['pos_enc_dim'])
        print('Time PE:', time.time() - t0)

    trainset, valset, testset = dataset.train, dataset.val, dataset.test

    root_log_dir, root_ckpt_dir, write_file_name, write_config_file, mylog = dirs
    device = net_params['device']

    logger = Logger(os.path.join('out', mylog))
    # One label per value in the row appended each epoch (see logger.append
    # below): epoch, train loss, val loss, train/val/test MAE, learning rate.
    logger.set_names([
        'Epoch', 'Train loss', 'Val. loss', 'Train MAE', 'Val MAE',
        'Test MAE', 'LR'
    ])

    # Write the network and optimization hyper-parameters in folder config/
    with open(write_config_file + '.txt', 'w') as f:
        f.write(
            """Dataset: {},\nModel: {}\n\nparams={}\n\nnet_params={}\n\n\nTotal Parameters: {}\n\n"""
            .format(DATASET_NAME, MODEL_NAME, params, net_params,
                    net_params['total_param']))

    log_dir = os.path.join(root_log_dir, "RUN_" + str(0))
    writer = SummaryWriter(log_dir=log_dir)

    # Seed every RNG in use so runs are repeatable.
    seed = params['seed']
    random.seed(seed)
    np.random.seed(seed)
    torch.manual_seed(seed)
    if device.type == 'cuda':
        torch.cuda.manual_seed(seed)

    print("Training Graphs: ", len(trainset))
    print("Validation Graphs: ", len(valset))
    print("Test Graphs: ", len(testset))

    model = gnn_model(MODEL_NAME, net_params)
    model = model.to(device)

    optimizer = optim.Adam(model.parameters(),
                           lr=params['init_lr'],
                           weight_decay=params['weight_decay'])
    scheduler = optim.lr_scheduler.ReduceLROnPlateau(
        optimizer,
        mode='min',
        factor=params['lr_reduce_factor'],
        patience=params['lr_schedule_patience'],
        verbose=True)

    # Dense WL-GNNs use their own train/eval functions and collate function.
    is_dense = MODEL_NAME in ['RingGNN', '3WLGNN']

    if is_dense:
        # import train functions specific for WLGNNs
        from train.train_molecules_graph_regression import train_epoch_dense as train_epoch, evaluate_network_dense as evaluate_network
        from functools import partial  # util function to pass edge_feat to collate function

        # No batch_size is given to the loaders here; the batch size is
        # passed to train_epoch instead.
        loader_kwargs = {
            'collate_fn': partial(dataset.collate_dense_gnn,
                                  edge_feat=net_params['edge_feat']),
        }
    else:
        # import train functions for all other GNNs
        from train.train_molecules_graph_regression import train_epoch_sparse as train_epoch, evaluate_network_sparse as evaluate_network

        loader_kwargs = {
            'batch_size': params['batch_size'],
            # batching exception for Diffpool
            'drop_last': MODEL_NAME == 'DiffPool',
            'collate_fn': dataset.collate,
        }

    train_loader = DataLoader(trainset, shuffle=True, **loader_kwargs)
    val_loader = DataLoader(valset, shuffle=False, **loader_kwargs)
    test_loader = DataLoader(testset, shuffle=False, **loader_kwargs)

    ckpt_dir = os.path.join(root_ckpt_dir, "RUN_")

    # Bound up front so the final evaluation/report below still works when
    # the loop body never runs (zero epochs, or Ctrl+C before epoch 0).
    epoch = 0

    # At any point you can hit Ctrl + C to break out of training early.
    try:
        with tqdm(range(params['epochs'])) as t:
            for epoch in t:

                t.set_description('Epoch %d' % epoch)

                start = time.time()

                if is_dense:
                    # dense models take the batch size as an extra argument
                    epoch_train_loss, epoch_train_mae, optimizer = train_epoch(
                        model, optimizer, device, train_loader, epoch,
                        params['batch_size'])
                else:  # for all other models common train function
                    epoch_train_loss, epoch_train_mae, optimizer = train_epoch(
                        model, optimizer, device, train_loader, epoch)

                epoch_val_loss, epoch_val_mae = evaluate_network(
                    model, device, val_loader, epoch)
                _, epoch_test_mae = evaluate_network(model, device,
                                                     test_loader, epoch)

                current_lr = optimizer.param_groups[0]['lr']

                writer.add_scalar('train/_loss', epoch_train_loss, epoch)
                writer.add_scalar('val/_loss', epoch_val_loss, epoch)
                writer.add_scalar('train/_mae', epoch_train_mae, epoch)
                writer.add_scalar('val/_mae', epoch_val_mae, epoch)
                writer.add_scalar('test/_mae', epoch_test_mae, epoch)
                writer.add_scalar('learning_rate', current_lr, epoch)

                logger.append([
                    epoch, epoch_train_loss, epoch_val_loss, epoch_train_mae,
                    epoch_val_mae, epoch_test_mae, current_lr
                ])

                t.set_postfix(time=time.time() - start,
                              lr=current_lr,
                              train_loss=epoch_train_loss,
                              val_loss=epoch_val_loss,
                              train_MAE=epoch_train_mae,
                              val_MAE=epoch_val_mae,
                              test_MAE=epoch_test_mae)

                per_epoch_time.append(time.time() - start)

                # Saving checkpoint
                os.makedirs(ckpt_dir, exist_ok=True)
                torch.save(model.state_dict(),
                           '{}.pkl'.format(ckpt_dir + "/epoch_" + str(epoch)))

                # Keep only the checkpoints of this epoch and the previous
                # one; the epoch number is parsed from "..._<epoch>.pkl".
                for ckpt_path in glob.glob(ckpt_dir + '/*.pkl'):
                    epoch_nb = int(ckpt_path.split('_')[-1].split('.')[0])
                    if epoch_nb < epoch - 1:
                        os.remove(ckpt_path)

                scheduler.step(epoch_val_loss)

                # Re-read the LR: the scheduler may just have reduced it.
                if optimizer.param_groups[0]['lr'] < params['min_lr']:
                    print("\n!! LR EQUAL TO MIN LR SET.")
                    break

                # Stop training after params['max_time'] hours
                if time.time() - t0 > params['max_time'] * 3600:
                    print('-' * 89)
                    print(
                        "Max_time for training elapsed {:.2f} hours, so stopping"
                        .format(params['max_time']))
                    break

    except KeyboardInterrupt:
        print('-' * 89)
        print('Exiting from training early because of KeyboardInterrupt')

    _, test_mae = evaluate_network(model, device, test_loader, epoch)
    _, train_mae = evaluate_network(model, device, train_loader, epoch)

    # np.mean([]) returns nan and emits a RuntimeWarning; report 0.0 when no
    # epoch ran to completion.
    avg_epoch_time = np.mean(per_epoch_time) if per_epoch_time else 0.0

    print("Test MAE: {:.4f}".format(test_mae))
    print("Train MAE: {:.4f}".format(train_mae))
    print("Convergence Time (Epochs): {:.4f}".format(epoch))
    print("TOTAL TIME TAKEN: {:.4f}s".format(time.time() - t0))
    print("AVG TIME PER EPOCH: {:.4f}s".format(avg_epoch_time))

    logger.add(['test_mae ', test_mae])
    logger.add(['train_mae ', train_mae])
    logger.add(['Converge Epochs ', epoch])

    logger.close()

    writer.close()

    # Write the results in out_dir/results folder
    with open(write_file_name + '.txt', 'w') as f:
        f.write("""Dataset: {},\nModel: {}\n\nparams={}\n\nnet_params={}\n\n{}\n\nTotal Parameters: {}\n\n
    FINAL RESULTS\nTEST MAE: {:.4f}\nTRAIN MAE: {:.4f}\n\n
    Convergence Time (Epochs): {:.4f}\nTotal Time Taken: {:.4f} hrs\nAverage Time Per Epoch: {:.4f} s\n\n\n"""\
          .format(DATASET_NAME, MODEL_NAME, params, net_params, model, net_params['total_param'],
                  test_mae, train_mae, epoch, (time.time()-t0)/3600, avg_epoch_time))