def main():
    args = get_args()
    dataset = get_dataset(args.dataset)
    data = dataset.data
    tb_writer = SummaryWriter()
    tb_writer.iteration = 0

    use_cuda = torch.cuda.is_available() and args.device != 'cpu'
    device = torch.device(f"cuda:{args.device}" if use_cuda else "cpu")
    model = get_model(dataset.data.num_features, dataset.num_classes)
    model = model.to(device)
    optimizer = torch.optim.Adam(model.parameters(), lr=args.lr)

    # Train only on the nodes selected by the training mask.
    train_loader = DataLoader(
        [(x_, y_) for x_, y_, in_train in zip(data.x, data.y, data.train_mask)
         if in_train],
        batch_size=args.batch_size,
        shuffle=True,
        num_workers=0,
        pin_memory=True,
    )

    if args.enable_clearml_logger:
        tags = [
            f'Dataset: {args.dataset}',
            f'Pruning method: {args.pruning_method}',
            f'Architecture: {args.gnn}',
        ]
        pruning_param_name = 'num_minhash_funcs' if args.pruning_method == 'minhash_lsh' else 'random_pruning_prob'
        pruning_param = args.num_minhash_funcs if args.pruning_method == 'minhash_lsh' else args.random_pruning_prob
        tags.append(f'{pruning_param_name}: {pruning_param}')
        clearml_logger = get_clearml_logger(project_name="GNN_pruning",
                                            task_name=get_time_str(),
                                            tags=tags)

    for epoch in range(1, args.epochs + 1):
        loss, acc, f1 = train(epoch, dataset, train_loader, model, device,
                              optimizer, tb_writer)
        print(f'Epoch {epoch:02d}, Loss: {loss:.4f}, Approx. Train: {f1:.4f}')

        train_acc, val_acc, test_acc = test(dataset, model, device)
        print(f'Train ACC: {train_acc:.4f}, Val ACC: {val_acc:.4f}, '
              f'Test ACC: {test_acc:.4f}')

        tb_writer.add_scalars('Accuracy', {
            'Train': train_acc,
            'Validation': val_acc,
            'Test': test_acc
        }, epoch)
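All of these examples call two helpers whose definitions are not shown on this page, `get_time_str` and `get_clearml_logger`. A minimal sketch of what they might look like, assuming the latter simply wraps `clearml.Task.init`:

from datetime import datetime

from clearml import Task


def get_time_str():
    # Timestamp used as the ClearML task name in the examples on this page.
    return datetime.now().strftime('%Y-%m-%d_%H-%M-%S')


def get_clearml_logger(project_name, task_name, tags=None):
    # Task.init registers the run with the ClearML server; TensorBoard scalars,
    # stdout and argparse arguments are then captured automatically.
    return Task.init(project_name=project_name, task_name=task_name, tags=tags)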
Example No. 2
def register_logging_files(args):
    tb_writer = None
    best_results_file = None
    log_file = None
    if args.exps_dir is not None:
        exps_dir = (Path(args.exps_dir) / 'pyg_with_pruning' / args.dataset /
                    args.pruning_method)
        if args.pruning_method == 'random':
            exps_dir = exps_dir / str(args.random_pruning_prob)
        elif args.pruning_method == 'minhash_lsh':
            exps_dir = exps_dir / str(args.num_minhash_funcs)

        exps_dir = exps_dir / get_time_str()
        best_results_file = exps_dir / 'best_results.txt'
        log_file = exps_dir / 'log.log'
        tensorboard_dir = exps_dir / 'tensorboard'
        tensorboard_dir.mkdir(parents=True, exist_ok=True)

        tb_writer = SummaryWriter(log_dir=tensorboard_dir)
        tb_writer.iteration = 0

    register_logger(log_file=log_file, stdout=True)
    log_command()
    log_args_description(args)

    clearml_task = None

    if args.enable_clearml_logger:
        tags = [
            f'Dataset: {args.dataset}',
            f'Pruning method: {args.pruning_method}',
            f'Architecture: {args.gnn}',
        ]
        pruning_param_name = 'num_minhash_funcs' if 'minhash_lsh' in args.pruning_method else 'random_pruning_prob'
        pruning_param = args.num_minhash_funcs if 'minhash_lsh' in args.pruning_method else args.random_pruning_prob
        tags.append(f'{pruning_param_name}: {pruning_param}')

        if pruning_param_name == 'num_minhash_funcs':
            tags.append(f'Sparsity: {args.sparsity}')
            tags.append(f'Complement: {args.complement}')

        clearml_task = get_clearml_logger(
            f"GNN_{args.dataset}_{args.target}_{args.gnn}",
            task_name=get_time_str(),
            tags=tags)

    return tb_writer, best_results_file, log_file, clearml_task
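Every example on this page rebuilds the same tag list by hand; if you adapt this code, that block can be factored into a small helper. A sketch, assuming the same `args` attributes used above (this helper is not part of the original code):

def build_clearml_tags(args, include_dataset=True):
    """Hypothetical helper collecting the ClearML tags used throughout these examples."""
    tags = [f'Pruning method: {args.pruning_method}', f'Architecture: {args.gnn}']
    if include_dataset:
        tags.insert(0, f'Dataset: {args.dataset}')
    if 'minhash_lsh' in args.pruning_method:
        tags.append(f'num_minhash_funcs: {args.num_minhash_funcs}')
        tags.append(f'Sparsity: {args.sparsity}')
        tags.append(f'Complement: {args.complement}')
    else:
        tags.append(f'random_pruning_prob: {args.random_pruning_prob}')
    return tags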
Example No. 3
def main(args):
    if args.generate_only:
        graph_dataset = rgd.generate_graphs_dataset(
            **vars(args), random=np.random.RandomState(0))
        with open(args.dataset_path, 'wb') as fp:
            pickle.dump(graph_dataset, fp, protocol=pickle.HIGHEST_PROTOCOL)

    else:
        with open(args.dataset_path, 'rb') as fp:
            graph_dataset = pickle.load(fp)

    tb_writer = None
    clearml_task = None
    if args.enable_clearml_logger:
        tb_writer = SummaryWriter(log_dir=None)
        tags = [
            f'Pruning method: {args.pruning_method}',
            f'Architecture: {args.gnn}',
            f'dim_edges:{args.dim_edges}',
        ]
        pruning_param_name = 'num_minhash_funcs' if 'minhash_lsh' in args.pruning_method else 'random_pruning_prob'
        pruning_param = args.num_minhash_funcs if 'minhash_lsh' in args.pruning_method else args.random_pruning_prob
        tags.append(f'{pruning_param_name}: {pruning_param}')
        clearml_task = get_clearml_logger(
            project_name="GNN_synthetic_pruning_dimensionality",
            task_name=get_time_str(),
            tags=tags)

    print(f"{time.time() - start_time:.4f} start time")
    graph_dataset, prunning_ratio, best_train, best_test, avg_time_train, avg_time_test = \
        tst_classify_networkx_synthetic_tg(**vars(args), tb_writer=tb_writer, args=args, graph_dataset=graph_dataset)
    print(f"{time.time() - start_time:.4f} end time")

    if clearml_task is not None:
        # Report the final metrics to ClearML as connected configuration values.
        experiment_logs = clearml_task.connect(dict())
        experiment_logs['time/train'] = avg_time_train
        experiment_logs['time/val'] = avg_time_test
        experiment_logs['keep edges'] = prunning_ratio
        experiment_logs['max train accuracy'] = best_train
        experiment_logs['max test accuracy'] = best_test
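For context, the final block above relies on ClearML proxying a connected dict: values assigned after `Task.connect` are reported to the server. A minimal standalone illustration (hypothetical project and task names):

from clearml import Task

task = Task.init(project_name='demo_project', task_name='connect_example')
metrics = task.connect(dict())       # returns a proxy dict tracked by the task
metrics['max test accuracy'] = 0.91  # the assignment is reported to the ClearML server
task.close()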
Example No. 4
def main(args):
    vals = dict()
    csv_file = args.csv_file
    """
    Pruning with LSH
    """
    args.pruning_method = 'minhash_lsh_projection'

    tb_writer = None
    if args.enable_clearml_logger:
        tb_writer = SummaryWriter(log_dir=None)
        tags = [
            f'Pruning method: {args.pruning_method}',
            f'Architecture: {args.gnn}',
            f'dim_edges:{args.dim_edges}',
        ]
        # pruning_method is minhash-based here, so log the number of minhash functions.
        tags.append(f'num_minhash_funcs: {args.num_minhash_funcs}')
        clearml_logger = get_clearml_logger(project_name="GNN_synthetic_pruning_dimensionality",
                                            task_name=get_time_str(),
                                            tags=tags)

    print(f"{time.time() - start_time:.4f} start time")
    graph_dataset, prunning_ratio, best_train, best_test, avg_time_train, avg_time_test = \
        tst_classify_networkx_synthetic_tg(**vars(args), tb_writer=tb_writer, args=args, graph_dataset=None)
    print(f"{time.time() - start_time:.4f} end time")

    vals['keep edges'] = prunning_ratio
    vals['minhash train'] = best_train
    vals['minhash test'] = best_test

    vals['minhash time train'] = avg_time_train
    vals['minhash time test'] = avg_time_test

    """
    Pruning with random
    """
    args.pruning_method = 'random'
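    # Match the random pruning probability to the edge-keep ratio produced by the minhash run above.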
    args.random_pruning_prob = prunning_ratio
    tb_writer = None
    if args.enable_clearml_logger:
        clearml_logger.close()
        tb_writer = SummaryWriter(log_dir=None)
        tags = [
            f'Pruning method: {args.pruning_method}',
            f'Architecture: {args.gnn}',
        ]
        # pruning_method is 'random' here, so log the matched pruning probability.
        tags.append(f'random_pruning_prob: {args.random_pruning_prob}')
        clearml_logger = get_clearml_logger(project_name="GNN_synthetic_pruning",
                                            task_name=get_time_str(),
                                            tags=tags)

    print(f"{time.time() - start_time:.4f} start time")
    graph_dataset, prunning_ratio, best_train, best_test, avg_time_train, avg_time_test = \
        tst_classify_networkx_synthetic_tg(**vars(args),
                                           tb_writer=tb_writer,
                                           args=args,
                                           graph_dataset=graph_dataset)
    print(f"{time.time() - start_time:.4f} end time")
    vals['random train'] = best_train
    vals['random test'] = best_test
    vals['random time train'] = avg_time_train
    vals['random time test'] = avg_time_test
    vals['architecture'] = args.gnn

    # DataFrame.append was removed in pandas 2.0; concatenate a one-row frame instead.
    df = pd.read_csv(csv_file)
    df = pd.concat([df, pd.DataFrame([vals])], ignore_index=True)
    df.to_csv(csv_file, index=False)
Example No. 5
def get_training_args():
    ppi_num_input_features = 50
    ppi_num_classes = 121
    parser = argparse.ArgumentParser()

    # Training related
    parser.add_argument("--num_of_epochs",
                        type=int,
                        help="number of training epochs",
                        default=200)
    parser.add_argument(
        "--patience_period",
        type=int,
        help="number of epochs with no improvement on val before terminating",
        default=100)
    parser.add_argument("--lr",
                        type=float,
                        help="model learning rate",
                        default=5e-3)
    parser.add_argument("--weight_decay",
                        type=float,
                        help="L2 regularization on model weights",
                        default=0)
    parser.add_argument(
        "--should_test",
        action='store_true',
        help='should test the model on the test dataset? (no by default)')
    parser.add_argument(
        "--force_cpu",
        action='store_true',
        help='use CPU if your GPU is too small (no by default)')
    parser.add_argument("--device", type=str, help='')

    # GAT configs
    # parser.add_argument("--num_of_layers", type=int, help='', default=3)
    parser.add_argument(
        '--gnn',
        type=str,
        default='gat',
        help='GNN architecture to use (default: gat)',
    )
    parser.add_argument("--num_of_layers",
                        type=list,
                        help='',
                        default=[4, 4, 6])
    parser.add_argument(
        "--num_features_per_layer",
        type=list,
        help='',
        default=[ppi_num_input_features, 256, 256, ppi_num_classes])
    parser.add_argument("--add_skip_connection",
                        type=bool,
                        help='',
                        default=True)
    parser.add_argument("--bias", type=bool, help='', default=True)
    parser.add_argument("--dropout", type=float, help='', default=0.0)
    parser.add_argument("--layer_type", help='', default=LayerType.IMP3)

    # Dataset related (note: we need the dataset name for metadata and related stuff, and not for picking the dataset)
    parser.add_argument("--dataset_name",
                        choices=['PPI'],
                        help='dataset to use for training',
                        default='PPI')
    parser.add_argument("--batch_size",
                        type=int,
                        help='number of graphs in a batch',
                        default=2)
    parser.add_argument("--ppi_load_test_only",
                        type=bool,
                        default=False,
                        help='')

    # Pruning specific params:
    parser.add_argument('--pruning_method',
                        type=str,
                        default='random',
                        choices=[
                            "minhash_lsh_thresholding",
                            "minhash_lsh_projection", "random"
                        ])
    parser.add_argument('--random_pruning_prob', type=float, default=.5)
    parser.add_argument('--num_minhash_funcs', type=int, default=1)
    parser.add_argument('--sparsity', type=int, default=25)
    parser.add_argument("--complement", action='store_true', help="")
    parser.add_argument("--quantization_step", type=int, default=1, help="")

    # Logging/debugging/checkpoint related (helps a lot with experimentation)
    parser.add_argument("--enable_tensorboard",
                        action='store_true',
                        help="enable tensorboard logging (no by default)")
    parser.add_argument(
        "--console_log_freq",
        type=int,
        help="log to output console (batch) freq (None for no logging)",
        default=1)
    parser.add_argument(
        "--checkpoint_freq",
        type=int,
        help="checkpoint model saving (epoch) freq (None for no logging)",
        default=5)
    parser.add_argument('--enable_clearml_logger',
                        default=False,
                        action='store_true',
                        help="Enable logging to ClearML server")
    args = parser.parse_args()

    # I'm leaving the hyperparam values as reported in the paper, but I experimented a bit and the comments suggest
    # how you can make GAT achieve an even higher micro-F1 or make it smaller
    gat_config = {
        # GNNs, contrary to CNNs, are often shallow (it ultimately depends on the graph properties).
        # PPI has 42% of nodes with all-zero features - that's why 3 layers are useful.
        "num_of_layers": 3,
        # Other values may give even better results than the reported ones.
        "num_heads_per_layer": [4, 4, 6],
        # 64 hidden features would also give ~0.975 micro-F1!
        "num_features_per_layer": [ppi_num_input_features, 256, 256, ppi_num_classes],
        # The skip connection is very important (without it micro-F1 is almost 0).
        "add_skip_connection": True,
        "bias": True,  # bias doesn't matter that much
        "dropout": 0.0,  # dropout hurts the performance (best to keep it at 0)
        # IMP3 is the only implementation that supports the inductive setting.
        "layer_type": LayerType.IMP3,
    }

    for k, v in gat_config.items():
        setattr(args, k, v)

    # Wrapping training configuration into a dictionary
    # training_config = dict()
    # for arg in vars(args):
    #     training_config[arg] = getattr(args, arg)
    tb_writer = None
    clearml_logger = None
    if args.enable_clearml_logger:
        args.enable_tensorboard = True
        tb_writer = SummaryWriter()
        tags = [
            f'Dataset: {args.dataset_name}',
            f'Pruning method: {args.pruning_method}',
            f'Architecture: {args.gnn}',
        ]
        pruning_param_name = 'num_minhash_funcs' if 'minhash_lsh' in args.pruning_method else 'random_pruning_prob'
        pruning_param = args.num_minhash_funcs if 'minhash_lsh' in args.pruning_method else args.random_pruning_prob
        tags.append(f'{pruning_param_name}: {pruning_param}')

        if pruning_param_name == 'num_minhash_funcs':
            tags.append(f'Sparsity: {args.sparsity}')
            tags.append(f'Complement: {args.complement}')

        clearml_logger = get_clearml_logger(project_name=f"GNN_PPI_{args.gnn}",
                                            task_name=get_time_str(),
                                            tags=tags)

    return args, tb_writer, clearml_logger
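A hypothetical entry point consuming `get_training_args()` (the training loop itself is not part of this example). Note that the values in `gat_config` override whatever was passed on the command line:

if __name__ == '__main__':
    config, tb_writer, clearml_logger = get_training_args()
    # config carries the CLI options plus the gat_config overrides;
    # tb_writer and clearml_logger are None unless --enable_clearml_logger was set.
    print(config.num_of_layers)  # 3, taken from gat_config regardless of --num_of_layers
    if tb_writer is not None:
        tb_writer.add_text('config', str(vars(config)))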
Example No. 6
def main():
    args = get_args()
    dataset = get_dataset(args.dataset)
    data = dataset.data
    tb_writer = SummaryWriter()
    tb_writer.iteration = 0

    use_cuda = torch.cuda.is_available() and args.device != 'cpu'
    device = torch.device(f"cuda:{args.device}" if use_cuda else "cpu")
    model = get_model(dataset.data.num_features, dataset.num_classes, args.gnn)
    model = model.to(device)
    optimizer = torch.optim.Adam(model.parameters(), lr=0.01)

    old_edge_count = data.edge_index.shape[1]

    # Pass the whole graph to the pruning mechanism. Consider it as one sample
    pruning_params, prunning_ratio = prune_dataset(
        [data], args, random=np.random.RandomState(0), pruning_params=None)

    edge_count = data.edge_index.shape[1]
    print(f"Old number of edges: {old_edge_count}. New one: {edge_count}. "
          f"Change: {(old_edge_count - edge_count) / old_edge_count * 100:.2f}%")

    train_loader = NeighborSampler(
        data.edge_index,
        node_idx=data.train_mask,
        # sizes=[-1, -1],  # citations
        sizes=[-1, 10],  # reddit
        # sizes=[25, 10],  # default
        batch_size=args.batch_size,
        shuffle=True,
        num_workers=12)
    subgraph_loader = NeighborSampler(data.edge_index,
                                      node_idx=None,
                                      sizes=[-1],
                                      batch_size=1024,
                                      shuffle=False,
                                      num_workers=12)

    clearml_task = None
    if args.enable_clearml_logger:
        tags = [
            f'Dataset: {args.dataset}',
            f'Pruning method: {args.pruning_method}',
            f'Architecture: {args.gnn}',
        ]
        pruning_param_name = 'num_minhash_funcs' if 'minhash_lsh' in args.pruning_method else 'random_pruning_prob'
        pruning_param = args.num_minhash_funcs if 'minhash_lsh' in args.pruning_method else args.random_pruning_prob
        tags.append(f'{pruning_param_name}: {pruning_param}')

        if pruning_param_name == 'num_minhash_funcs':
            tags.append(f'Sparsity: {args.sparsity}')
            tags.append(f'Complement: {args.complement}')

        clearml_task = get_clearml_logger(
            project_name=f"GNN_{args.dataset}_{args.gnn}",
            task_name=get_time_str(),
            tags=tags)

    train_times = []
    val_times = []
    max_train_acc = 0
    max_val_acc = 0
    max_test_acc = 0
    for epoch in range(1, args.epochs + 1):
        loss, acc, f1, avg_time_train = train(epoch, dataset, train_loader,
                                              model, device, optimizer,
                                              tb_writer)
        train_times.append(avg_time_train)
        print(f'Epoch {epoch:02d}, Loss: {loss:.4f}, Approx. Train: {f1:.4f}')

        train_acc, val_acc, test_acc, avg_time_test = test(
            dataset, subgraph_loader, model, device)
        val_times.append(avg_time_test)
        print(f'Train ACC: {train_acc:.4f}, Val ACC: {val_acc:.4f}, '
              f'Test ACC: {test_acc:.4f}')

        tb_writer.add_scalars('Accuracy', {
            'Train': train_acc,
            'Validation': val_acc,
            'Test': test_acc
        }, epoch)

        max_train_acc = max(max_train_acc, train_acc)
        max_val_acc = max(max_val_acc, val_acc)
        max_test_acc = max(max_test_acc, test_acc)

    tb_writer.add_scalar('time/train', np.mean(train_times))
    tb_writer.add_scalar('time/val', np.mean(val_times))
    if clearml_task is not None:
        # Report the summary metrics to ClearML as connected configuration values.
        experiment_logs = clearml_task.connect(dict())
        experiment_logs['time/train'] = np.mean(train_times)
        experiment_logs['time/val'] = np.mean(val_times)
        experiment_logs['keep edges'] = prunning_ratio
        experiment_logs['max train accuracy'] = max_train_acc
        experiment_logs['max val accuracy'] = max_val_acc
        experiment_logs['max test accuracy'] = max_test_acc
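The example ends without flushing or closing the writers; if you adapt it, a small cleanup step at the end of main() makes sure the last scalars are persisted (not part of the original code):

    # Optional cleanup: flush pending TensorBoard events and close the ClearML task.
    tb_writer.flush()
    tb_writer.close()
    if clearml_task is not None:
        clearml_task.close()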