Example 1
def create_data(contour, translation, img_path_0, img_path_1, osvos_model, k):
    """Returns a torch_geometric.data object.
    
    The data object consists of:
    * x: Node feature matrix of shape [num_nodes, num_node_features]. The feature
         of each node is the concatenation of the OSVOS feature vectors of the
         current and the next frame.
    * edge_index: Graph connectivity in COO format of shape [2, num_edges] and type torch.long.
                  Each node is connected to its k nearest neighbours.
    * edge_attr: Edge feature matrix of shape [num_edges, num_edge_features].
                 The feature of each edge is the inverse distance between the two nodes it connects.
    * y (optional): The target of each node is the displacement of the node between the current
                    and the next frame.

    Parameters
    ----------
    contour : ndarray 
        Array of shape (num_contour_points, 2) containing contour points
    translation : ndarray of shape (num_contour_points, 2), or None
        Translations from the current to the next contour points
    img_path_0 : str
        Path to current image
    img_path_1 : str
        Path to next image
    osvos_model : torch.nn.Sequential
        OSVOS model from which feature vectors can be extracted
    k : int
        Number of neighbours used to build the KNN graph of the contour points

    Returns
    -------
    data : torch_geometric.data.Data
        Data object consisting of x, edge_index, edge_attr, and (optionally) y
    """

    contour = torch.from_numpy(contour)

    img_0 = np.moveaxis(imread(img_path_0), 2, 0).astype(np.float64)
    img_0 = np.expand_dims(img_0, axis=0)
    img_0 = torch.from_numpy(img_0)

    img_1 = np.moveaxis(imread(img_path_1), 2, 0).astype(np.float64)
    img_1 = np.expand_dims(img_1, axis=0)
    img_1 = torch.from_numpy(img_1)

    # x: Node feature matrix
    x_1 = get_OSVOS_feature_vectors(contour, img_0, osvos_model)
    x_2 = get_OSVOS_feature_vectors(contour, img_1, osvos_model)
    x = torch.cat((x_1, x_2), 1)

    # edge_index: Graph connectivity in COO format
    edge_index = knn_graph(contour, k)
    edge_index = to_undirected(edge_index)

    # edge_attr: Edge feature matrix
    edge_attr = get_edge_attribute(contour, edge_index)

    # Create data object
    if translation is None:
        data = Data(x=x,
                    edge_index=edge_index,
                    edge_attr=edge_attr,
                    contour=contour)
    else:
        # The target of each node is the displacement of the node between the current and the next frame
        y = torch.from_numpy(translation.astype(np.float64))
        data = Data(x=x,
                    edge_index=edge_index,
                    edge_attr=edge_attr,
                    y=y,
                    contour=contour)

    return data
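
# A minimal, self-contained usage sketch of the same graph-construction pattern
# (not part of the original project): `get_OSVOS_feature_vectors` and the OSVOS
# model are not available here, so the node features are random stand-ins, and
# the inverse-distance edge feature is one plausible reading of the docstring.
import torch
from torch_cluster import knn_graph
from torch_geometric.data import Data
from torch_geometric.utils import to_undirected

contour = torch.rand(50, 2)        # 50 contour points in 2D
x = torch.rand(50, 128)            # stand-in for the concatenated OSVOS features

# KNN graph over the contour points, made undirected.
edge_index = to_undirected(knn_graph(contour, k=4))

# Inverse distance between the endpoints of each edge.
src, dst = edge_index
dist = (contour[src] - contour[dst]).norm(dim=1)
edge_attr = (1.0 / (dist + 1e-8)).unsqueeze(1)

data = Data(x=x, edge_index=edge_index, edge_attr=edge_attr, contour=contour)
print(data)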
Example 2
def tree_decomposition(mol):
    r"""The tree decomposition algorithm of molecules from the
    `"Junction Tree Variational Autoencoder for Molecular Graph Generation"
    <https://arxiv.org/abs/1802.04364>`_ paper.
    Returns the graph connectivity of the junction tree, the assignment
    mapping of each atom to the clique in the junction tree, and the number
    of cliques.

    Args:
        mol (rdkit.Chem.Mol): A :obj:`rdkit` molecule.

    :rtype: (LongTensor, LongTensor, int)
    """

    if Chem is None:
        raise ImportError('Package `rdkit` could not be found.')

    # Cliques = rings and bonds.
    cliques = [list(x) for x in Chem.GetSymmSSSR(mol)]
    for bond in mol.GetBonds():
        if not bond.IsInRing():
            cliques.append([bond.GetBeginAtomIdx(), bond.GetEndAtomIdx()])

    # Generate `atom2clique` mappings.
    atom2clique = [[] for i in range(mol.GetNumAtoms())]
    for c in range(len(cliques)):
        for atom in cliques[c]:
            atom2clique[atom].append(c)

    # Merge rings that share more than 2 atoms as they form bridged compounds.
    for c1 in range(len(cliques)):
        for atom in cliques[c1]:
            for c2 in atom2clique[atom]:
                if c1 >= c2 or len(cliques[c1]) <= 2 or len(cliques[c2]) <= 2:
                    continue
                if len(set(cliques[c1]) & set(cliques[c2])) > 2:
                    cliques[c1] = set(cliques[c1]) | set(cliques[c2])
                    cliques[c2] = []
    cliques = [c for c in cliques if len(c) > 0]

    # Update `atom2clique` mappings.
    atom2clique = [[] for i in range(mol.GetNumAtoms())]
    for c in range(len(cliques)):
        for atom in cliques[c]:
            atom2clique[atom].append(c)

    # Add singleton cliques in case there are more than 2 intersecting
    # cliques. We further compute the "initial" clique graph.
    edges = {}
    for atom in range(mol.GetNumAtoms()):
        cs = atom2clique[atom]
        if len(cs) <= 1:
            continue

        # Bond clusters (cliques of size 2) that the atom lies in.
        bonds = [c for c in cs if len(cliques[c]) == 2]
        # Ring clusters (cliques of size > 4) that the atom lies in.
        rings = [c for c in cs if len(cliques[c]) > 4]

        if len(bonds) > 2 or (len(bonds) == 2 and len(cs) > 2):
            cliques.append([atom])
            c2 = len(cliques) - 1
            for c1 in cs:
                edges[(c1, c2)] = 1

        elif len(rings) > 2:
            cliques.append([atom])
            c2 = len(cliques) - 1
            for c1 in cs:
                edges[(c1, c2)] = 99

        else:
            for i in range(len(cs)):
                for j in range(i + 1, len(cs)):
                    c1, c2 = cs[i], cs[j]
                    count = len(set(cliques[c1]) & set(cliques[c2]))
                    edges[(c1, c2)] = min(count, edges.get((c1, c2), 99))

    if len(edges) > 0:
        edge_index_T, weight = zip(*edges.items())
        row, col = torch.tensor(edge_index_T).t()
        inv_weight = 100 - torch.tensor(weight)
        clique_graph = SparseTensor(row=row,
                                    col=col,
                                    value=inv_weight,
                                    sparse_sizes=(len(cliques), len(cliques)))
        junc_tree = minimum_spanning_tree(clique_graph.to_scipy('csr'))
        row, col, _ = SparseTensor.from_scipy(junc_tree).coo()
        edge_index = torch.stack([row, col], dim=0)
        edge_index = to_undirected(edge_index, num_nodes=len(cliques))
    else:
        edge_index = torch.empty((2, 0), dtype=torch.long)

    rows = [[i] * len(atom2clique[i]) for i in range(mol.GetNumAtoms())]
    row = torch.tensor(list(chain.from_iterable(rows)))
    col = torch.tensor(list(chain.from_iterable(atom2clique)))
    atom2clique = torch.stack([row, col], dim=0)

    return edge_index, atom2clique, len(cliques)
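
# A hedged usage sketch (assumes rdkit is installed and that the imports used by
# tree_decomposition -- torch, torch_sparse, scipy, itertools.chain -- are in scope):
from rdkit import Chem

mol = Chem.MolFromSmiles('c1ccccc1O')  # phenol: one ring plus one C-O bond
edge_index, atom2clique, num_cliques = tree_decomposition(mol)
print(num_cliques)    # 2 cliques: the benzene ring and the C-O bond
print(edge_index)     # junction-tree connectivity, shape [2, num_edges]
print(atom2clique)    # atom-to-clique assignment, shape [2, num_assignments]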
Example 3
def main():
    parser = argparse.ArgumentParser(description='OGBN-Arxiv (Full-Batch)')
    parser.add_argument('--device', type=int, default=0)
    parser.add_argument('--log_steps', type=int, default=1)
    parser.add_argument('--use_sage', action='store_true')
    parser.add_argument('--num_layers', type=int, default=3)
    parser.add_argument('--hidden_channels', type=int, default=256)
    parser.add_argument('--dropout', type=float, default=0.5)
    parser.add_argument('--lr', type=float, default=0.01)
    parser.add_argument('--epochs', type=int, default=500)
    parser.add_argument('--runs', type=int, default=10)
    args = parser.parse_args()
    print(args)

    device = f'cuda:{args.device}' if torch.cuda.is_available() else 'cpu'
    device = torch.device(device)

    dataset = PygNodePropPredDataset(name='ogbn-arxiv')
    split_idx = dataset.get_idx_split()

    data = dataset[0]

    x = data.x.to(device)
    y_true = data.y.to(device)
    train_idx = split_idx['train'].to(device)

    edge_index = data.edge_index.to(device)
    edge_index = to_undirected(edge_index, data.num_nodes)
    adj = SparseTensor(row=edge_index[0], col=edge_index[1])

    if args.use_sage:
        model = SAGE(data.x.size(-1), args.hidden_channels,
                     dataset.num_classes, args.num_layers,
                     args.dropout).to(device)
    else:
        model = GCN(data.x.size(-1), args.hidden_channels, dataset.num_classes,
                    args.num_layers, args.dropout).to(device)

        # Pre-compute GCN normalization.
        adj = adj.set_diag()
        deg = adj.sum(dim=1).to(torch.float)
        deg_inv_sqrt = deg.pow(-0.5)
        deg_inv_sqrt[deg_inv_sqrt == float('inf')] = 0
        adj = deg_inv_sqrt.view(-1, 1) * adj * deg_inv_sqrt.view(1, -1)

    evaluator = Evaluator(name='ogbn-arxiv')
    logger = Logger(args.runs, args)

    for run in range(args.runs):
        model.reset_parameters()
        optimizer = torch.optim.Adam(model.parameters(), lr=args.lr)
        for epoch in range(1, 1 + args.epochs):
            loss = train(model, x, adj, y_true, train_idx, optimizer)
            result = test(model, x, adj, y_true, split_idx, evaluator)
            logger.add_result(run, result)

            if epoch % args.log_steps == 0:
                train_acc, valid_acc, test_acc = result
                print(f'Run: {run + 1:02d}, '
                      f'Epoch: {epoch:02d}, '
                      f'Loss: {loss:.4f}, '
                      f'Train: {100 * train_acc:.2f}%, '
                      f'Valid: {100 * valid_acc:.2f}% '
                      f'Test: {100 * test_acc:.2f}%')

        logger.print_statistics(run)
    logger.print_statistics()
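
# The GCN branch above pre-computes the symmetric normalization
# D^{-1/2} (A + I) D^{-1/2}. A small dense-tensor sketch of that step
# (hedged illustration, assumes only torch):
import torch

A = torch.tensor([[0., 1., 0.],
                  [1., 0., 1.],
                  [0., 1., 0.]])
A_hat = A + torch.eye(3)                       # add self-loops (set_diag above)
deg = A_hat.sum(dim=1)
deg_inv_sqrt = deg.pow(-0.5)
deg_inv_sqrt[deg_inv_sqrt == float('inf')] = 0
A_norm = deg_inv_sqrt.view(-1, 1) * A_hat * deg_inv_sqrt.view(1, -1)
print(A_norm)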
Example 4
def main():

    args = ArgsInit().save_exp()

    if args.use_gpu:
        device = torch.device("cuda:" + str(args.device)) if torch.cuda.is_available() else torch.device("cpu")
    else:
        device = torch.device('cpu')

    dataset = PygNodePropPredDataset(name=args.dataset)
    data = dataset[0]
    split_idx = dataset.get_idx_split()

    evaluator = Evaluator(args.dataset)

    x = data.x.to(device)
    y_true = data.y.to(device)
    train_idx = split_idx['train'].to(device)

    edge_index = data.edge_index.to(device)
    edge_index = to_undirected(edge_index, data.num_nodes)

    if args.self_loop:
        edge_index = add_self_loops(edge_index, num_nodes=data.num_nodes)[0]

    sub_dir = 'SL_{}'.format(args.self_loop)

    args.in_channels = data.x.size(-1)
    args.num_tasks = dataset.num_classes

    logging.info('%s' % args)

    model = DeeperGCN(args).to(device)

    logging.info(model)

    optimizer = torch.optim.Adam(model.parameters(), lr=args.lr)

    results = {'highest_valid': 0,
               'final_train': 0,
               'final_test': 0,
               'highest_train': 0}

    start_time = time.time()

    for epoch in range(1, args.epochs + 1):

        # epoch_loss = train(model, x, edge_index, y_true, train_idx, optimizer)
        epoch_loss = train_flag(model, x, edge_index, y_true, train_idx, optimizer, device, args)

        logging.info('Epoch {}, training loss {:.4f}'.format(epoch, epoch_loss))
        model.print_params(epoch=epoch)

        result = test(model, x, edge_index, y_true, split_idx, evaluator)
        logging.info(result)
        train_accuracy, valid_accuracy, test_accuracy = result

        if train_accuracy > results['highest_train']:
            results['highest_train'] = train_accuracy

        if valid_accuracy > results['highest_valid']:
            results['highest_valid'] = valid_accuracy
            results['final_train'] = train_accuracy
            results['final_test'] = test_accuracy

            save_ckpt(model, optimizer,
                      round(epoch_loss, 4), epoch,
                      args.model_save_path,
                      sub_dir, name_post='valid_best')

    logging.info("%s" % results)

    end_time = time.time()
    total_time = end_time - start_time
    logging.info('Total time: {}'.format(time.strftime('%H:%M:%S', time.gmtime(total_time))))
Example 5
def step1_inductive(task,
                    name,
                    ego_size=128,
                    num_iter=1000,
                    log_steps=10000,
                    num_workers=16,
                    method='acl'):
    dataset = create_dataset(name=f'{task}-{name}')
    data = dataset[0]

    N = data.num_nodes
    edge_index = data.edge_index
    edge_index = to_undirected(edge_index)
    if hasattr(data, "edge_index_train"):
        edge_index_train = data.edge_index_train
        edge_index_train = to_undirected(edge_index_train)
    else:
        edge_index_train = edge_index
    adj = csr_matrix((np.ones(edge_index.shape[1]), edge_index), shape=(N, N))
    adj_train = csr_matrix(
        (np.ones(edge_index_train.shape[1]), edge_index_train), shape=(N, N))

    idx_split = dataset.get_idx_split()
    train_idx = idx_split["train"].cpu().numpy()
    valid_idx = idx_split["valid"].cpu().numpy()
    test_idx = idx_split["test"].cpu().numpy()

    global graphlocal
    global graphlocal_train
    graphlocal = GraphLocal.from_sparse_adjacency(adj)
    graphlocal_train = GraphLocal.from_sparse_adjacency(adj_train)
    print('graphlocal generated')

    with multiprocessing.Pool(num_workers) as pool:
        ego_graphs_train, conds_train = zip(
            *pool.imap(calc_inductive_train, [(i, log_steps, num_iter,
                                               ego_size, method)
                                              for i in train_idx],
                       chunksize=512))

    with multiprocessing.Pool(num_workers) as pool:
        ego_graphs_valid, conds_valid = zip(
            *pool.imap(calc_inductive, [(i, log_steps, num_iter, ego_size,
                                         method) for i in valid_idx],
                       chunksize=512))

    with multiprocessing.Pool(num_workers) as pool:
        ego_graphs_test, conds_test = zip(
            *pool.imap(calc_inductive, [(i, log_steps, num_iter, ego_size,
                                         method) for i in test_idx],
                       chunksize=512))

    ego_graphs = []
    conds = []
    ego_graphs.extend(ego_graphs_train)
    ego_graphs.extend(ego_graphs_valid)
    ego_graphs.extend(ego_graphs_test)
    conds.extend(conds_train)
    conds.extend(conds_valid)
    conds.extend(conds_test)

    if method == 'acl':
        np.save(f"data/{name}-lc-ego-graphs-{ego_size}.npy", ego_graphs)
        np.save(f"data/{name}-lc-conds-{ego_size}.npy", conds)
    else:
        np.save(f"data/{name}-lc-{method}-ego-graphs-{ego_size}.npy",
                ego_graphs)
        np.save(f"data/{name}-lc-{method}-conds-{ego_size}.npy", conds)
Example 6
def test_to_undirected():
    row = torch.tensor([0, 1, 1])
    col = torch.tensor([1, 0, 2])

    edge_index = to_undirected(torch.stack([row, col], dim=0))
    assert edge_index.tolist() == [[0, 1, 1, 2], [1, 0, 2, 1]]
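
# A small hedged add-on test (assumes torch and to_undirected are in scope, as above):
# to_undirected also coalesces duplicates, so an already-symmetric edge pair is
# returned unchanged rather than doubled.
def test_to_undirected_symmetric_input():
    edge_index = torch.tensor([[0, 1], [1, 0]])
    assert to_undirected(edge_index).tolist() == [[0, 1], [1, 0]]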
Example 7
        args.data_appendix += '_mnph{}'.format(args.max_nodes_per_hop)

args.res_dir = os.path.join('results/{}{}'.format(args.dataset,
                                                  args.save_appendix))
print('Results will be saved in ' + args.res_dir)
if not os.path.exists(args.res_dir):
    os.makedirs(args.res_dir)

if "ogbl" in args.dataset:
    dataset = PygLinkPropPredDataset(name=args.dataset)
    data = dataset[0]
    split_edge = dataset.get_edge_split()

    if args.use_valedges_as_input:
        val_edge_index = split_edge['valid']['edge'].t()
        val_edge_index = to_undirected(val_edge_index)
        data.edge_index = torch.cat([data.edge_index, val_edge_index], dim=-1)
        val_edge_weight = torch.ones([val_edge_index.size(1), 1], dtype=int)
        data.edge_weight = torch.cat([data.edge_weight, val_edge_weight], 0)

else:
    if args.dataset_file is None:
        print("Dataset file required.")
        sys.exit()

    s, d, w = [], [], []
    with open(args.dataset_file, 'r') as f:
        for index, line in enumerate(f):
            t1, t2, t3 = line.strip().split(" ")
            s.append(t1)
            d.append(t2)
Example 8
def test_edges(data, cold_mask_node, val_ratio=0.05, test_ratio=0.1):
    r"""Splits the edges of a :obj:`torch_geometric.data.Data` object
    into positive and negative train/val/test edges, and adds attributes of
    `train_pos_edge_index`, `train_neg_adj_mask`, `val_pos_edge_index`,
    `val_neg_edge_index`, `test_pos_edge_index`, and `test_neg_edge_index`
    to :attr:`data`.

    Args:
        data (Data): The data object.
        val_ratio (float, optional): The ratio of positive validation
            edges. (default: :obj:`0.05`)
        test_ratio (float, optional): The ratio of positive test
            edges. (default: :obj:`0.1`)

    :rtype: :class:`torch_geometric.data.Data`
    """

    assert 'batch' not in data  # No batch-mode.
    device = data.x.device
    num_nodes = data.num_nodes
    row, col = data.edge_index
    # data.edge_index = None

    # Return upper triangular portion.
    mask = row < col
    row, col = row[mask], col[mask]

    # Select train nodes

    edge_row, edge_col = row, col
    # print(data.edge_index.size())
    data.cold_mask_node = cold_mask_node

    # Select test edges incident to the cold-start nodes
    for ind, i in enumerate(cold_mask_node):
        index = (row == i).nonzero()
        if (ind == 0):
            indice = index
        else:
            indice = torch.cat((indice, index), 0)

    test_indice = indice.squeeze()
    # print(test_indice.size())
    a_r, a_c = row[test_indice], col[test_indice]
    data.test_pos_edge_index = torch.stack([a_r, a_c], dim=0)
    # print(data.test_pos_edge_index.size())

    # Start with an all-True edge mask and mark the test edges as False.
    edge_mask = torch.ones(edge_row.size(0), dtype=torch.bool,
                           device=data.x.device)
    edge_mask = edge_mask.scatter_(0, test_indice, False)
    edge_row = edge_row[edge_mask]
    edge_col = edge_col[edge_mask]
    # print(edge_row.size())
    data.total_edge_index = torch.stack((edge_row, edge_col), dim=0)
    # print(data.total_edge_index.size())
    # print(all_indice.size())

    # Negative edges.
    neg_adj_mask = torch.ones(num_nodes, num_nodes, dtype=torch.uint8)
    neg_adj_mask = neg_adj_mask.triu(diagonal=1).to(torch.bool)
    neg_adj_mask[row, col] = 0  # mask out existing (positive) edges

    neg_row, neg_col = neg_adj_mask.nonzero().t()

    # test negative

    for ind, i in enumerate(cold_mask_node):
        index = (neg_row == i).nonzero()
        if (ind == 0):
            indice = index
        else:
            indice = torch.cat((indice, index), 0)
    neg_test_indice = indice.squeeze()

    # perm_test = random.sample(range(neg_test_indice.size(0)),
    #                      test_indice.size(0))

    # perm_test = torch.tensor(perm_test).to(torch.long)
    # neg_a_r, neg_a_c = neg_row[perm_val], neg_col[perm_val]
    # data.test_neg_edge_index = torch.stack([neg_a_r, neg_a_c], dim=0).to(device)

    neg_a_r, neg_a_c = neg_row[neg_test_indice], neg_col[neg_test_indice]
    data.test_neg_edge_index = torch.stack([neg_a_r, neg_a_c],
                                           dim=0).to(device)

    data.total_edge_index = to_undirected(data.total_edge_index)

    return data
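
# Hedged mini-sketch of the negative-edge mask idea used above (assumes torch):
# keep only the upper triangle, zero out the positive edges, and the remaining
# nonzero entries are the candidate negative edges.
import torch

num_nodes = 4
row = torch.tensor([0, 1])   # positive (upper-triangular) edges: (0, 1), (1, 2)
col = torch.tensor([1, 2])
neg_adj_mask = torch.ones(num_nodes, num_nodes, dtype=torch.uint8)
neg_adj_mask = neg_adj_mask.triu(diagonal=1).to(torch.bool)
neg_adj_mask[row, col] = False
neg_row, neg_col = neg_adj_mask.nonzero(as_tuple=False).t()
print(torch.stack([neg_row, neg_col]))  # (0, 2), (0, 3), (1, 3), (2, 3)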
Example 9
    else:
        dataset.graph['edge_index'] = to_sparse_tensor(
            dataset.graph['edge_index'], dataset.graph['edge_feat'],
            dataset.graph['num_nodes'])
        dataset.graph['node_feat'] = dataset.graph['edge_index'].mean(dim=1)
        dataset.graph['edge_index'].set_value_(None)
    dataset.graph['edge_feat'] = None

n = dataset.graph['num_nodes']
# infer the number of classes for non one-hot and one-hot labels
c = max(dataset.label.max().item() + 1, dataset.label.shape[1])
d = dataset.graph['node_feat'].shape[1]

# whether or not to symmetrize
if not args.directed and args.dataset != 'ogbn-proteins':
    dataset.graph['edge_index'] = to_undirected(dataset.graph['edge_index'])

dataset.graph['edge_index'], dataset.graph['node_feat'] = \
    dataset.graph['edge_index'].to(device), dataset.graph['node_feat'].to(device)

print(f"num nodes {n} | num classes {c} | num node feats {d}")

### Load method ###
model = parse_method(args, dataset, n, c, d, device)

# using rocauc as the eval function
if args.rocauc or args.dataset in ('yelp-chi', 'twitch-e', 'ogbn-proteins'):
    criterion = nn.BCEWithLogitsLoss()
    eval_func = eval_rocauc
else:
    criterion = nn.NLLLoss()
Example 10
def main():
    parser = argparse.ArgumentParser(description="OGBL-Citation2 (GraphSAINT)")
    parser.add_argument("--device", type=int, default=0)
    parser.add_argument("--log_steps", type=int, default=1)
    parser.add_argument("--num_layers", type=int, default=3)
    parser.add_argument("--hidden_channels", type=int, default=256)
    parser.add_argument("--dropout", type=float, default=0.0)
    parser.add_argument("--batch_size", type=int, default=16 * 1024)
    parser.add_argument("--walk_length", type=int, default=3)
    parser.add_argument("--lr", type=float, default=0.001)
    parser.add_argument("--epochs", type=int, default=200)
    parser.add_argument("--num_steps", type=int, default=100)
    parser.add_argument("--eval_steps", type=int, default=10)
    parser.add_argument("--runs", type=int, default=10)
    args = parser.parse_args()
    print(args)

    device = f"cuda:{args.device}" if torch.cuda.is_available() else "cpu"
    device = torch.device(device)

    dataset = PygLinkPropPredDataset(name="ogbl-citation2")
    split_edge = dataset.get_edge_split()
    data = dataset[0]
    data.edge_index = to_undirected(data.edge_index, data.num_nodes)

    loader = GraphSAINTRandomWalkSampler(
        data,
        batch_size=args.batch_size,
        walk_length=args.walk_length,
        num_steps=args.num_steps,
        sample_coverage=0,
        save_dir=dataset.processed_dir,
    )

    # We randomly pick some training samples that we want to evaluate on:
    torch.manual_seed(12345)
    idx = torch.randperm(split_edge["train"]["source_node"].numel())[:86596]
    split_edge["eval_train"] = {
        "source_node": split_edge["train"]["source_node"][idx],
        "target_node": split_edge["train"]["target_node"][idx],
        "target_node_neg": split_edge["valid"]["target_node_neg"],
    }

    model = GCN(
        data.x.size(-1),
        args.hidden_channels,
        args.hidden_channels,
        args.num_layers,
        args.dropout,
    ).to(device)
    predictor = LinkPredictor(args.hidden_channels, args.hidden_channels, 1,
                              args.num_layers, args.dropout).to(device)

    evaluator = Evaluator(name="ogbl-citation2")
    logger = Logger(args.runs, args)

    run_idx = 0

    while run_idx < args.runs:
        model.reset_parameters()
        predictor.reset_parameters()
        optimizer = torch.optim.Adam(list(model.parameters()) +
                                     list(predictor.parameters()),
                                     lr=args.lr)

        run_success = True
        for epoch in range(1, 1 + args.epochs):
            loss = train(model, predictor, loader, optimizer, device)
            print(
                f"Run: {run_idx + 1:02d}, Epoch: {epoch:02d}, Loss: {loss:.4f}"
            )
            if loss > 2.0:
                run_success = False
                logger.reset(run_idx)
                print("Learning failed. Rerun...")
                break

            if epoch > 49 and epoch % args.eval_steps == 0:
                result = test(
                    model,
                    predictor,
                    data,
                    split_edge,
                    evaluator,
                    batch_size=64 * 1024,
                    device=device,
                )
                logger.add_result(run_idx, result)

                train_mrr, valid_mrr, test_mrr = result
                print(f"Run: {run_idx + 1:02d}, "
                      f"Epoch: {epoch:02d}, "
                      f"Loss: {loss:.4f}, "
                      f"Train: {train_mrr:.4f}, "
                      f"Valid: {valid_mrr:.4f}, "
                      f"Test: {test_mrr:.4f}")

        print("GraphSAINT")
        if run_success:
            logger.print_statistics(run_idx)
            run_idx += 1

    print("GraphSAINT")
    logger.print_statistics()
Example 11
def main(args):

    date_time = datetime.now().strftime('%m-%d-%H:%M:%S')
    log_path = os.path.join(args.log_root, args.log_path, args.save_name,
                            date_time)

    load_func, subset = args.dataset.split('/')[0], args.dataset.split('/')[1]
    if load_func == 'WebKB':
        load_func = WebKB
        dataset = load_func(root=args.data_path, name=subset)
    elif load_func == 'WikipediaNetwork':
        load_func = WikipediaNetwork
        dataset = load_func(root=args.data_path, name=subset)
    elif load_func == 'WikiCS':
        load_func = WikiCS
        dataset = load_func(root=args.data_path)
    elif load_func == 'cora_ml':
        dataset = load_citation_link(
            root='../dataset/data/tmp/cora_ml/cora_ml.npz')
    elif load_func == 'citeseer':
        dataset = load_citation_link(
            root='../dataset/data/tmp/citeseer_npz/citeseer_npz.npz')
        #load telegram/synthetic here
    else:
        dataset = load_syn(args.data_path + args.dataset, None)

    if os.path.isdir(log_path) == False:
        os.makedirs(log_path)

    # load dataset
    if 'dataset' in locals():
        data = dataset[0]
        edge_index = data.edge_index
        #feature = dataset[0].x.data

    size = torch.max(edge_index).item() + 1
    # generate edge index dataset
    #if args.task == 2:
    #    datasets = generate_dataset_2class(edge_index, splits = 10, test_prob = args.drop_prob)
    #else:
    save_file = args.data_path + args.dataset + '/' + subset
    datasets = generate_dataset_3class(edge_index,
                                       size,
                                       save_file,
                                       splits=10,
                                       probs=args.split_prob,
                                       task=args.task,
                                       label_dim=args.num_class_link)

    if args.task != 2:
        results = np.zeros((10, 4))
    else:
        results = np.zeros((10, 4, 5))
    for i in range(10):
        log_str_full = ''
        edges = datasets[i]['graph']
        if args.to_undirected:
            edges = to_undirected(edges)

        ########################################
        # initialize model and load dataset
        ########################################
        #x = torch.ones(size).unsqueeze(-1).to(device)
        x = in_out_degree(edges, size).to(device)
        edges = edges.long().to(device)

        model = GCN_Link(x.size(-1),
                         args.num_class_link,
                         filter_num=args.num_filter,
                         dropout=args.dropout).to(device)
        #model = nn.DataParallel(graphmodel)
        opt = optim.Adam(model.parameters(), lr=args.lr, weight_decay=args.l2)

        y_train = datasets[i]['train']['label']
        y_val = datasets[i]['validate']['label']
        y_test = datasets[i]['test']['label']
        y_train = torch.from_numpy(y_train).long().to(device)
        y_val = torch.from_numpy(y_val).long().to(device)
        y_test = torch.from_numpy(y_test).long().to(device)

        train_index = torch.from_numpy(
            datasets[i]['train']['pairs']).to(device)
        val_index = torch.from_numpy(
            datasets[i]['validate']['pairs']).to(device)
        test_index = torch.from_numpy(datasets[i]['test']['pairs']).to(device)

        #################################
        # Train/Validation/Test
        #################################
        best_test_err = 1000.0
        early_stopping = 0
        for epoch in range(args.epochs):
            start_time = time.time()
            if early_stopping > 500:
                break
            ####################
            # Train
            ####################
            train_loss, train_acc = 0.0, 0.0
            model.train()
            out = model(x, edges, train_index)

            train_loss = F.nll_loss(out, y_train)
            pred_label = out.max(dim=1)[1]
            train_acc = acc(pred_label, y_train)

            opt.zero_grad()
            train_loss.backward()
            opt.step()
            outstrtrain = 'Train loss: %.6f, acc: %.3f' % (
                train_loss.detach().item(), train_acc)

            ####################
            # Validation
            ####################
            train_loss, train_acc = 0.0, 0.0
            model.eval()
            out = model(x, edges, val_index)

            test_loss = F.nll_loss(out, y_val)
            pred_label = out.max(dim=1)[1]
            test_acc = acc(pred_label, y_val)

            outstrval = ' Val loss: %.6f, acc: %.3f' % (
                test_loss.detach().item(), test_acc)
            duration = "--- %.4f seconds ---" % (time.time() - start_time)
            log_str = (
                "%d / %d epoch" %
                (epoch, args.epochs)) + outstrtrain + outstrval + duration
            #print(log_str)
            log_str_full += log_str + '\n'
            ####################
            # Save weights
            ####################
            save_perform = test_loss.detach().item()
            if save_perform <= best_test_err:
                early_stopping = 0
                best_test_err = save_perform
                torch.save(model.state_dict(),
                           log_path + '/model' + str(i) + '.t7')
            else:
                early_stopping += 1

        write_log(vars(args), log_path)
        torch.save(model.state_dict(),
                   log_path + '/model_latest' + str(i) + '.t7')
        if args.task != 2:
            ####################
            # Testing
            ####################
            model.load_state_dict(
                torch.load(log_path + '/model' + str(i) + '.t7'))
            model.eval()
            out = model(x, edges, val_index)[:, :2]
            pred_label = out.max(dim=1)[1]
            val_acc = acc(pred_label, y_val)

            out = model(x, edges, test_index)[:, :2]
            pred_label = out.max(dim=1)[1]
            test_acc = acc(pred_label, y_test)

            model.load_state_dict(
                torch.load(log_path + '/model_latest' + str(i) + '.t7'))
            model.eval()
            out = model(x, edges, val_index)[:, :2]
            pred_label = out.max(dim=1)[1]
            val_acc_latest = acc(pred_label, y_val)

            out = model(x, edges, test_index)[:, :2]
            pred_label = out.max(dim=1)[1]
            test_acc_latest = acc(pred_label, y_test)
            ####################
            # Save testing results
            ####################
            log_str = ('val_acc: {val_acc:.4f}, ' +
                       'test_acc: {test_acc:.4f}, ')
            log_str1 = log_str.format(val_acc=val_acc, test_acc=test_acc)
            log_str_full += log_str1

            log_str = ('val_acc_latest: {val_acc_latest:.4f}, ' +
                       'test_acc_latest: {test_acc_latest:.4f}, ')
            log_str2 = log_str.format(val_acc_latest=val_acc_latest,
                                      test_acc_latest=test_acc_latest)
            log_str_full += log_str2 + '\n'
            print(log_str1 + log_str2)

            results[i] = [val_acc, test_acc, val_acc_latest, test_acc_latest]
        else:
            model.load_state_dict(
                torch.load(log_path + '/model' + str(i) + '.t7'))
            model.eval()
            out_val = model(x, edges, val_index)
            out_test = model(x, edges, test_index)
            [[val_acc_full, val_acc, val_auc, val_f1_micro, val_f1_macro],
             [test_acc_full, test_acc, test_auc, test_f1_micro, test_f1_macro]
             ] = link_prediction_evaluation(out_val, out_test, y_val, y_test)

            model.load_state_dict(
                torch.load(log_path + '/model_latest' + str(i) + '.t7'))
            model.eval()
            out_val = model(x, edges, val_index)
            out_test = model(x, edges, test_index)
            [[
                val_acc_full_latest, val_acc_latest, val_auc_latest,
                val_f1_micro_latest, val_f1_macro_latest
            ],
             [
                 test_acc_full_latest, test_acc_latest, test_auc_latest,
                 test_f1_micro_latest, test_f1_macro_latest
             ]] = link_prediction_evaluation(out_val, out_test, y_val, y_test)
            ####################
            # Save testing results
            ####################
            log_str = (
                'val_acc_full:{val_acc_full:.4f}, val_acc: {val_acc:.4f}, Val_auc: {val_auc:.4f},'
                +
                'val_f1_micro: {val_f1_micro:.4f}, val_f1_macro: {val_f1_macro:.4f}, '
                +
                'test_acc_full:{test_acc_full:.4f}, test_acc: {test_acc:.4f}, '
                +
                'test_f1_micro: {test_f1_micro:.4f}, test_f1_macro: {test_f1_macro:.4f}'
            )
            log_str = log_str.format(val_acc_full=val_acc_full,
                                     val_acc=val_acc,
                                     val_auc=val_auc,
                                     val_f1_micro=val_f1_micro,
                                     val_f1_macro=val_f1_macro,
                                     test_acc_full=test_acc_full,
                                     test_acc=test_acc,
                                     test_f1_micro=test_f1_micro,
                                     test_f1_macro=test_f1_macro)
            log_str_full += log_str + '\n'
            print(log_str)

            log_str = (
                'val_acc_full_latest:{val_acc_full_latest:.4f}, val_acc_latest: {val_acc_latest:.4f}, Val_auc_latest: {val_auc_latest:.4f},'
                +
                'val_f1_micro_latest: {val_f1_micro_latest:.4f}, val_f1_macro_latest: {val_f1_macro_latest:.4f},'
                +
                'test_acc_full_latest:{test_acc_full_latest:.4f}, test_acc_latest: {test_acc_latest:.4f}, '
                +
                'test_f1_micro_latest: {test_f1_micro_latest:.4f}, test_f1_macro_latest: {test_f1_macro_latest:.4f}'
            )
            log_str = log_str.format(val_acc_full_latest=val_acc_full_latest,
                                     val_acc_latest=val_acc_latest,
                                     val_auc_latest=val_auc_latest,
                                     val_f1_micro_latest=val_f1_micro_latest,
                                     val_f1_macro_latest=val_f1_macro_latest,
                                     test_acc_full_latest=test_acc_full_latest,
                                     test_acc_latest=test_acc_latest,
                                     test_f1_micro_latest=test_f1_micro_latest,
                                     test_f1_macro_latest=test_f1_macro_latest)
            log_str_full += log_str + '\n'
            print(log_str)

            results[i] = [
                [val_acc_full, val_acc, val_auc, val_f1_micro, val_f1_macro],
                [
                    test_acc_full, test_acc, test_auc, test_f1_micro,
                    test_f1_macro
                ],
                [
                    val_acc_full_latest, val_acc_latest, val_auc_latest,
                    val_f1_micro_latest, val_f1_macro_latest
                ],
                [
                    test_acc_full_latest, test_acc_latest, test_auc_latest,
                    test_f1_micro_latest, test_f1_macro_latest
                ]
            ]

        with open(log_path + '/log' + str(i) + '.csv', 'w') as file:
            file.write(log_str_full)
            file.write('\n')
        torch.cuda.empty_cache()
    return results
Example 12
def citation_datasets(path="./data",
                      dataset='cora_ml',
                      alpha=0.1,
                      adj_type=None):
    # path = os.path.join(save_path, dataset)
    os.makedirs(path, exist_ok=True)
    dataset_path = os.path.join(path, '{}.npz'.format(dataset))
    g = load_npz_dataset(dataset_path)
    adj, features, labels = g['A'], g['X'], g['z']

    # Set new random splits:
    # * 20 * num_classes labels for training
    # * 500 labels for validation
    # * the rest for testing

    mask = train_test_split(labels,
                            seed=1020,
                            train_examples_per_class=20,
                            val_size=500,
                            test_size=None)

    mask['train'] = torch.from_numpy(mask['train']).bool()
    mask['val'] = torch.from_numpy(mask['val']).bool()
    mask['test'] = torch.from_numpy(mask['test']).bool()

    coo = adj.tocoo()
    values = coo.data
    indices = np.vstack((coo.row, coo.col))
    indices = torch.from_numpy(indices).long()
    features = torch.from_numpy(features.todense()).float()
    labels = torch.from_numpy(labels).long()
    if adj_type == 'un':
        print("Processing to undirected adj")
        indices = to_undirected(indices)
        edge_index, edge_weight = get_undirected_adj(indices,
                                                     features.shape[0],
                                                     features.dtype)
        data = Data(x=features,
                    edge_index=edge_index,
                    edge_weight=edge_weight,
                    y=labels)
    elif adj_type == 'pr':
        print("Processing pagerank adj matrix")
        edge_index, edge_weight = get_pr_directed_adj(alpha, indices,
                                                      features.shape[0],
                                                      features.dtype)
        data = Data(x=features,
                    edge_index=edge_index,
                    edge_weight=edge_weight,
                    y=labels)
    elif adj_type == 'appr':
        print("Processing approximate personalized pagerank adj matrix")
        edge_index, edge_weight = get_appr_directed_adj(
            alpha, indices, features.shape[0], features.dtype)
        data = Data(x=features,
                    edge_index=edge_index,
                    edge_weight=edge_weight,
                    y=labels)
    elif adj_type == 'ib':
        print("Processing first and second-order adj matrix")
        edge_index, edge_weight = get_appr_directed_adj(
            alpha, indices, features.shape[0], features.dtype)
        data = Data(x=features,
                    edge_index=edge_index,
                    edge_weight=edge_weight,
                    y=labels)
        edge_index, edge_weight = get_second_directed_adj(
            indices, features.shape[0], features.dtype)
        data.edge_index2 = edge_index
        data.edge_weight2 = edge_weight
    elif adj_type == 'or':
        print("Processing to original directed adj")
        data = Data(x=features, edge_index=indices, edge_weight=None, y=labels)
    else:
        print("Unsupported adj type.")
        sys.exit()

    data.train_mask = mask['train']
    data.val_mask = mask['val']
    data.test_mask = mask['test']
    return data
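
# Hedged, self-contained sketch of the scipy-COO-to-edge-index conversion used
# above (assumes numpy, scipy, and torch):
import numpy as np
import torch
from scipy.sparse import csr_matrix

adj = csr_matrix(np.array([[0, 1, 0],
                           [1, 0, 1],
                           [0, 1, 0]]))
coo = adj.tocoo()
indices = torch.from_numpy(np.vstack((coo.row, coo.col))).long()
print(indices)  # tensor([[0, 1, 1, 2], [1, 0, 2, 1]])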
Example 13
def main():
    parser = argparse.ArgumentParser(
        description='Link Prediction (Cluster-GCN)')
    parser.add_argument('--device', type=int, default=0)
    parser.add_argument('--dataset', type=str, default='ogbl-citation')
    parser.add_argument('--log_steps', type=int, default=1)
    parser.add_argument('--num_partitions', type=int, default=15000)
    parser.add_argument('--num_workers', type=int, default=4)
    parser.add_argument('--num_layers', type=int, default=3)
    parser.add_argument('--hidden_channels', type=int, default=256)
    parser.add_argument('--dropout', type=float, default=0.0)
    parser.add_argument('--batch_size', type=int, default=256)
    parser.add_argument('--lr', type=float, default=0.001)
    parser.add_argument('--epochs', type=int, default=200)
    parser.add_argument('--eval_steps', type=int, default=10)
    parser.add_argument('--runs', type=int, default=10)
    parser.add_argument('--negs', type=int, default=1)
    parser.add_argument('--gnn_type', type=str, default='gcn')
    args = parser.parse_args()
    print(args)

    device = f'cuda:{args.device}' if torch.cuda.is_available() else 'cpu'
    device = torch.device(device)

    dataset = PygLinkPropPredDataset(name=args.dataset)
    data = dataset[0]
    split_edge = dataset.get_edge_split()  # used by test() below
    data.edge_index = to_undirected(data.edge_index, data.num_nodes)
    print(data.edge_index.shape, data.num_nodes)

    cluster_data = ClusterData(data,
                               num_parts=args.num_partitions,
                               recursive=False,
                               save_dir=dataset.processed_dir)

    loader = ClusterLoader(cluster_data,
                           batch_size=args.batch_size,
                           shuffle=True,
                           num_workers=args.num_workers)

    model = GCN(data.x.size(-1),
                args.hidden_channels,
                args.hidden_channels,
                args.num_layers,
                args.dropout,
                gnn_type=args.gnn_type).to(device)
    predictor = LinkPredictor(args.hidden_channels, args.hidden_channels, 1,
                              args.num_layers, args.dropout).to(device)

    evaluator = Evaluator(name=args.dataset)
    logger = Logger(args.runs, args)

    for run in range(args.runs):
        model.reset_parameters()
        predictor.reset_parameters()
        optimizer = torch.optim.Adam(list(model.parameters()) +
                                     list(predictor.parameters()),
                                     lr=args.lr)
        for epoch in range(1, 1 + args.epochs):
            t0 = time.time()
            loss = train(model, predictor, loader, optimizer, device,
                         args.negs)
            tt = time.time()
            print(tt - t0)

            if epoch % args.eval_steps == 0:
                result = test(model, predictor, data, split_edge, evaluator,
                              64 * 4 * args.batch_size, device)
                logger.add_result(run, result)

                if epoch % args.log_steps == 0:
                    train_mrr, valid_mrr, test_mrr = result
                    print(f'Run: {run + 1:02d}, '
                          f'Epoch: {epoch:02d}, '
                          f'Loss: {loss:.4f}, '
                          f'Train: {train_mrr:.4f}, '
                          f'Valid: {valid_mrr:.4f}, '
                          f'Test: {test_mrr:.4f}')

        logger.print_statistics(run)
    logger.print_statistics()
Example 14
def main():
    parser = argparse.ArgumentParser(description='Prepare data for Giant-XRT')
    parser.add_argument(
        '--raw-text-path',
        type=str,
        required=True,
        help="Path of raw text (.txt file, each raw correspond to a node)")
    parser.add_argument(
        '--vectorizer-config-path',
        type=str,
        required=True,
        help="a path to a json file that specify the tfidf hyper-paramters")
    parser.add_argument('--data-root-dir', type=str, default="./dataset")
    parser.add_argument('--xrt-data-dir', type=str, default="./proc_data_xrt")
    parser.add_argument('--dataset', type=str, default="ogbn-arxiv")
    parser.add_argument('--max-deg', type=int, default=1000)
    args = parser.parse_args()
    print(args)

    # Change args.save_data_dir to args.save_data_dir/args.dataset
    save_data_dir = os.path.join(args.xrt_data_dir, args.dataset)
    dataset = PygNodePropPredDataset(name=args.dataset,
                                     root=args.data_root_dir)
    data = dataset[0]
    edge_index = data.edge_index

    # Make sure edge_index is undirected!!!
    if not is_undirected(edge_index):
        edge_index = to_undirected(edge_index)
    # Keep only nodes whose degree is below args.max_deg
    Degree = degree(edge_index[0])
    Filtered_idx = torch.where(Degree < args.max_deg)[0]
    print('Number of original nodes:{}'.format(data.x.shape[0]))
    print('Number of filtered nodes:{}'.format(len(Filtered_idx)))

    # Construct and save label matrix (adjacency matrix) Y.
    Y_csr_all = smat.csr_matrix(to_scipy_sparse_matrix(edge_index))
    Y_csr_trn = Y_csr_all[Filtered_idx]
    smat_util.save_matrix(f"{save_data_dir}/Y.trn.npz", Y_csr_trn)
    smat_util.save_matrix(f"{save_data_dir}/Y.all.npz", Y_csr_all)
    print("Saved Y.trn.npz and Y.all.npz")

    # Apply the same filtering for raw text
    with open(args.raw_text_path, "r") as fin:
        node_text_list = fin.readlines()
    print("|node_text_list={}".format(len(node_text_list)))
    count = 0
    with open(f"{save_data_dir}/X.trn.txt", "w") as fout:
        for cur_idx, line in enumerate(node_text_list):
            if Filtered_idx[count].item() == cur_idx:
                fout.writelines(line)
                count += 1
    assert count == len(Filtered_idx), "count={}, len(Filtered_idx)={}".format(
        count, len(Filtered_idx))
    print("Saved X.trn.txt")

    # Apply the same filtering for tfidf features
    vectorizer_config = Vectorizer.load_config_from_args(
        args)  # using args.vectorizer_config_path
    preprocessor = Preprocessor.train(node_text_list,
                                      vectorizer_config,
                                      dtype=np.float32)
    preprocessor.save(f"{save_data_dir}/tfidf-model")
    X_tfidf_all = preprocessor.predict(node_text_list)
    X_tfidf_trn = X_tfidf_all[Filtered_idx]
    smat_util.save_matrix(f"{save_data_dir}/X.all.tfidf.npz", X_tfidf_all)
    smat_util.save_matrix(f"{save_data_dir}/X.trn.tfidf.npz", X_tfidf_trn)
    print("Saved X.trn.npz and X.all.npz")
Example 15
def main():
    parser = argparse.ArgumentParser(description='OGBL-Citation2 (NS)')
    parser.add_argument('--device', type=int, default=0)
    parser.add_argument('--log_steps', type=int, default=1)
    parser.add_argument('--num_workers', type=int, default=12)
    parser.add_argument('--num_layers', type=int, default=3)
    parser.add_argument('--hidden_channels', type=int, default=256)
    parser.add_argument('--dropout', type=float, default=0.0)
    parser.add_argument('--batch_size', type=int, default=512)
    parser.add_argument('--lr', type=float, default=0.0005)
    parser.add_argument('--epochs', type=int, default=150)
    parser.add_argument('--eval_steps', type=int, default=10)
    parser.add_argument('--runs', type=int, default=10)
    args = parser.parse_args()
    print(args)

    device = f'cuda:{args.device}' if torch.cuda.is_available() else 'cpu'
    device = torch.device(device)

    dataset = PygLinkPropPredDataset(name='ogbl-citation2')
    split_edge = dataset.get_edge_split()
    data = dataset[0]
    edge_index = to_undirected(data.edge_index, data.num_nodes)
    x = data.x.to(device)

    pos_loader = PositiveLinkNeighborSampler(edge_index,
                                             sizes=[15, 10, 5],
                                             num_nodes=x.size(0),
                                             batch_size=args.batch_size,
                                             shuffle=True,
                                             num_workers=args.num_workers)

    neg_loader = NegativeLinkNeighborSampler(edge_index,
                                             sizes=[15, 10, 5],
                                             num_nodes=x.size(0),
                                             batch_size=args.batch_size,
                                             shuffle=False,
                                             num_workers=args.num_workers)

    subgraph_loader = NeighborSampler(edge_index,
                                      node_idx=None,
                                      sizes=[-1],
                                      batch_size=4096,
                                      shuffle=False,
                                      num_workers=args.num_workers)

    # We randomly pick some training samples that we want to evaluate on:
    torch.manual_seed(12345)
    idx = torch.randperm(split_edge['train']['source_node'].numel())[:86596]
    split_edge['eval_train'] = {
        'source_node': split_edge['train']['source_node'][idx],
        'target_node': split_edge['train']['target_node'][idx],
        'target_node_neg': split_edge['valid']['target_node_neg'],
    }

    model = SAGE(x.size(-1), args.hidden_channels, args.hidden_channels,
                 args.num_layers, args.dropout).to(device)
    predictor = LinkPredictor(args.hidden_channels, args.hidden_channels, 1,
                              args.num_layers, args.dropout).to(device)

    evaluator = Evaluator(name='ogbl-citation2')
    logger = Logger(args.runs, args)

    for run in range(args.runs):
        model.reset_parameters()
        predictor.reset_parameters()
        optimizer = torch.optim.Adam(list(model.parameters()) +
                                     list(predictor.parameters()),
                                     lr=args.lr)
        for epoch in range(1, 1 + args.epochs):
            loss = train(model, predictor, x, pos_loader, neg_loader,
                         optimizer, device)
            print(f'Run: {run + 1:02d}, Epoch: {epoch:02d}, Loss: {loss:.4f}')

            if epoch > 49 and epoch % args.eval_steps == 0:
                result = test(model,
                              predictor,
                              x,
                              subgraph_loader,
                              split_edge,
                              evaluator,
                              batch_size=64 * 1024,
                              device=device)
                logger.add_result(run, result)

                train_mrr, valid_mrr, test_mrr = result
                print(f'Run: {run + 1:02d}, '
                      f'Epoch: {epoch:02d}, '
                      f'Loss: {loss:.4f}, '
                      f'Train: {train_mrr:.4f}, '
                      f'Valid: {valid_mrr:.4f}, '
                      f'Test: {test_mrr:.4f}')

        print('Neighborsampling')
        logger.print_statistics(run)
    print('Neighborsampling')
    logger.print_statistics()
Example 16
    def process(self):
        ids, Ns = [], []
        for r_path, p_path in zip(self.raw_paths, self.processed_paths):
            names = glob.glob(osp.join(r_path, '*.gexf'))
            # Get the graph IDs given by the file name:
            ids.append(sorted([int(i.split(os.sep)[-1][:-5]) for i in names]))

            data_list = []
            # Convert graphs in .gexf format to a NetworkX Graph:
            for i, idx in enumerate(ids[-1]):
                i = i if len(ids) == 1 else i + len(ids[0])
                G = nx.read_gexf(osp.join(r_path, f'{idx}.gexf'))
                mapping = {name: j for j, name in enumerate(G.nodes())}
                G = nx.relabel_nodes(G, mapping)
                Ns.append(G.number_of_nodes())

                edge_index = torch.tensor(list(G.edges)).t().contiguous()
                if edge_index.numel() == 0:
                    edge_index = torch.empty((2, 0), dtype=torch.long)
                edge_index = to_undirected(edge_index, num_nodes=Ns[-1])

                data = Data(edge_index=edge_index, i=i)
                data.num_nodes = Ns[-1]

                # Create a one-hot encoded feature matrix denoting the atom
                # type for the AIDS700nef dataset:
                if self.name == 'AIDS700nef':
                    x = torch.zeros(data.num_nodes, dtype=torch.long)
                    for node, info in G.nodes(data=True):
                        x[int(node)] = self.types.index(info['type'])
                    data.x = F.one_hot(x, num_classes=len(self.types)).to(
                        torch.float)

                if self.pre_filter is not None and not self.pre_filter(data):
                    continue

                if self.pre_transform is not None:
                    data = self.pre_transform(data)

                data_list.append(data)

            torch.save(self.collate(data_list), p_path)

        assoc = {idx: i for i, idx in enumerate(ids[0])}
        assoc.update({idx: i + len(ids[0]) for i, idx in enumerate(ids[1])})

        path = osp.join(self.raw_dir, self.name, 'ged.pickle')
        mat = torch.full((len(assoc), len(assoc)), float('inf'))
        with open(path, 'rb') as f:
            obj = pickle.load(f)
            xs, ys, gs = [], [], []
            for (x, y), g in obj.items():
                xs += [assoc[x]]
                ys += [assoc[y]]
                gs += [g]
            x, y = torch.tensor(xs), torch.tensor(ys)
            g = torch.tensor(gs, dtype=torch.float)
            mat[x, y], mat[y, x] = g, g

        path = osp.join(self.processed_dir, f'{self.name}_ged.pt')
        torch.save(mat, path)

        # Calculate the normalized GEDs:
        N = torch.tensor(Ns, dtype=torch.float)
        norm_mat = mat / (0.5 * (N.view(-1, 1) + N.view(1, -1)))

        path = osp.join(self.processed_dir, f'{self.name}_norm_ged.pt')
        torch.save(norm_mat, path)
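
# Hedged mini-example of the GED normalization above (assumes torch): each raw
# GED is divided by the mean number of nodes of the two graphs it compares.
import torch

mat = torch.tensor([[0., 3.], [3., 0.]])  # raw GED between two toy graphs
N = torch.tensor([4., 6.])                # their numbers of nodes
norm_mat = mat / (0.5 * (N.view(-1, 1) + N.view(1, -1)))
print(norm_mat)  # off-diagonal entries become 3 / 5 = 0.6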
Example 17
File: arxiv.py Project: poows/GCN
def main():
    parser = argparse.ArgumentParser(description='OGBN-Arxiv (Full-Batch)')
    parser.add_argument('--device', type=int, default=1)
    parser.add_argument('--log_steps', type=int, default=10)
    parser.add_argument('--num_layers', type=int, default=16)
    parser.add_argument('--hidden_channels', type=int, default=256)
    parser.add_argument('--dropout', type=float, default=0.1)
    parser.add_argument('--weight_decay',
                        type=float,
                        default=0,
                        help='weight decay (L2 loss on parameters).')
    parser.add_argument('--lr', type=float, default=0.001)
    parser.add_argument('--epochs', type=int, default=1000)
    parser.add_argument('--runs', type=int, default=10)
    parser.add_argument('--patience', type=int, default=200, help='patience')
    parser.add_argument('--alpha', type=float, default=0.5, help='alpha_l')
    parser.add_argument('--norm', default='bn', help='norm layer.')
    args = parser.parse_args()
    print(args)

    device = f'cuda:{args.device}' if torch.cuda.is_available() else 'cpu'
    device = torch.device(device)

    dataset = PygNodePropPredDataset(name='ogbn-arxiv')
    split_idx = dataset.get_idx_split()
    data = dataset[0]
    data = data.to(device)
    train_idx = split_idx['train'].to(device)
    data.edge_index = to_undirected(data.edge_index, data.num_nodes)
    Net = GCNIIdense_model
    evaluator = Evaluator(name='ogbn-arxiv')
    acc_list = []
    for run in range(args.runs):
        model = Net(data.x.size(-1), args.hidden_channels, dataset.num_classes,
                    args.num_layers, args.dropout, args.alpha,
                    args.norm).to(device)
        optimizer = torch.optim.Adam(model.parameters(),
                                     lr=args.lr,
                                     weight_decay=args.weight_decay)
        bad_counter = 0
        best_val = 0
        final_test_acc = 0
        for epoch in range(1, 1 + args.epochs):
            loss = train(model, data, train_idx, optimizer)
            result = test(model, data, data.y, split_idx, evaluator)
            train_acc, valid_acc, test_acc = result
            if epoch % args.log_steps == 0:
                print(f'Run: {run + 1:02d}, '
                      f'Epoch: {epoch:02d}, '
                      f'Loss: {loss:.4f}, '
                      f'Train: {100 * train_acc:.2f}%, '
                      f'Valid: {100 * valid_acc:.2f}% '
                      f'Test: {100 * test_acc:.2f}%')
            if valid_acc > best_val:
                best_val = valid_acc
                final_test_acc = test_acc
                bad_counter = 0
            else:
                bad_counter += 1

            if bad_counter == args.patience:
                break
        acc_list.append(final_test_acc * 100)
        print(run + 1, ':', acc_list[-1])
    acc_list = torch.tensor(acc_list)
    print(f'Avg Test: {acc_list.mean():.2f} ± {acc_list.std():.2f}')
Example 18
def get_small_dataset(dataset_name,
                      normalize_attributes=False,
                      add_self_loops=False,
                      remove_isolated_nodes=False,
                      make_undirected=False,
                      graph_availability=None,
                      seed=0,
                      create_adjacency_lists=True):
    """
    Get the pytorch_geometric.data.Data object associated with the specified dataset name.
    :param dataset_name: str => One of the datasets mentioned below.
    :param normalize_attributes: Whether the attributes for each node should be normalized to sum to 1.
    :param add_self_loops: Add self loops to the input Graph.
    :param remove_isolated_nodes: Remove isolated nodes.
    :param make_undirected: Make the Graph undirected.
    :param graph_availability: Either 'inductive' or 'transductive'. If transductive, all the graph nodes are available
                               during training. Otherwise, only training split nodes are available.
    :param seed: The random seed to use while splitting into train/val/test splits.
    :param create_adjacency_lists: Whether to process and store adjacency lists that can be used for efficient
                                   r-radius neighborhood sampling.
    :return: A pytorch_geometric.data.Data object for that dataset.
    """
    assert dataset_name in {
        'amazon-computers', 'amazon-photo', 'citeseer', 'coauthor-cs',
        'coauthor-physics', 'cora', 'cora-full', 'ppi', 'pubmed', 'reddit'
    }
    assert graph_availability in {'inductive', 'transductive'}

    # Compose transforms that should be applied.
    transforms = []
    if normalize_attributes:
        transforms.append(NormalizeFeatures())
    if remove_isolated_nodes:
        transforms.append(RemoveIsolatedNodes())
    if add_self_loops:
        transforms.append(AddSelfLoops())
    transforms = Compose(transforms) if transforms else None

    # Load the specified dataset and apply transforms.
    root_dir = '/tmp/{dir}'.format(dir=dataset_name)
    processed_dir = os.path.join(root_dir, dataset_name, 'processed')
    # Remove any previously pre-processed data, so pytorch_geometric can pre-process it again.
    if os.path.exists(processed_dir) and os.path.isdir(processed_dir):
        shutil.rmtree(processed_dir)

    data = None

    def split_function(y):
        return _get_train_val_test_masks(y.shape[0], y, 0.2, 0.2, seed)

    if dataset_name in ['citeseer', 'cora', 'pubmed']:
        data = Planetoid(root=root_dir,
                         name=dataset_name,
                         pre_transform=transforms,
                         split='full').data
        if seed != 0:
            data.train_mask, data.val_mask, data.test_mask = split_function(
                data.y.numpy())
        data.graphs = [data]
    elif dataset_name == 'cora-full':
        data = CoraFull(root=root_dir, pre_transform=transforms).data
        data.train_mask, data.val_mask, data.test_mask = split_function(
            data.y.numpy())
        data.graphs = [data]
    elif dataset_name == 'amazon-computers':
        data = Amazon(root=root_dir,
                      name='Computers',
                      pre_transform=transforms).data
        data.train_mask, data.val_mask, data.test_mask = split_function(
            data.y.numpy())
        data.graphs = [data]
    elif dataset_name == 'amazon-photo':
        data = Amazon(root=root_dir, name='Photo',
                      pre_transform=transforms).data
        data.train_mask, data.val_mask, data.test_mask = split_function(
            data.y.numpy())
        data.graphs = [data]
    elif dataset_name == 'coauthor-cs':
        data = Coauthor(root=root_dir, name='CS',
                        pre_transform=transforms).data
        data.train_mask, data.val_mask, data.test_mask = split_function(
            data.y.numpy())
        data.graphs = [data]
    elif dataset_name == 'coauthor-physics':
        data = Coauthor(root=root_dir,
                        name='Physics',
                        pre_transform=transforms).data
        data.train_mask, data.val_mask, data.test_mask = split_function(
            data.y.numpy())
        data.graphs = [data]
    elif dataset_name == 'reddit':
        data = Reddit(root=root_dir, pre_transform=transforms).data
        if seed != 0:
            data.train_mask, data.val_mask, data.test_mask = split_function(
                data.y.numpy())
        data.graphs = [data]
    elif dataset_name == 'ppi':
        data = SimpleNamespace()
        data.graphs = []
        for split in ['train', 'val', 'test']:
            split_data = PPI(root=root_dir,
                             split=split,
                             pre_transform=transforms)
            x_idxs = split_data.slices['x'].numpy()
            edge_idxs = split_data.slices['edge_index'].numpy()
            split_data = split_data.data
            for x_start, x_end, e_start, e_end in zip(x_idxs, x_idxs[1:],
                                                      edge_idxs,
                                                      edge_idxs[1:]):
                graph = Data(split_data.x[x_start:x_end],
                             split_data.edge_index[:, e_start:e_end],
                             y=split_data.y[x_start:x_end])
                graph.num_nodes = int(x_end - x_start)
                graph.split = split
                all_true = torch.ones(graph.num_nodes).bool()
                all_false = torch.zeros(graph.num_nodes).bool()
                graph.train_mask = all_true if split == 'train' else all_false
                graph.val_mask = all_true if split == 'val' else all_false
                graph.test_mask = all_true if split == 'test' else all_false
                data.graphs.append(graph)
        if seed != 0:
            temp_random = random.Random(seed)
            val_graphs = temp_random.sample(range(len(data.graphs)), 2)
            test_candidates = [
                graph_idx for graph_idx in range(len(data.graphs))
                if graph_idx not in val_graphs
            ]
            test_graphs = temp_random.sample(test_candidates, 2)
            for graph_idx, graph in enumerate(data.graphs):
                all_true = torch.ones(graph.num_nodes).bool()
                all_false = torch.zeros(graph.num_nodes).bool()
                graph.split = 'test' if graph_idx in test_graphs else 'val' if graph_idx in val_graphs else 'train'
                graph.train_mask = all_true if graph.split == 'train' else all_false
                graph.val_mask = all_true if graph.split == 'val' else all_false
                graph.test_mask = all_true if graph.split == 'test' else all_false

    if make_undirected:
        for graph in data.graphs:
            graph.edge_index = to_undirected(graph.edge_index, graph.num_nodes)

    LOG.info(f'Downloaded and transformed {len(data.graphs)} graph(s).')

    # Populate adjacency lists for efficient k-neighborhood sampling.
    # Only retain edges coming into a node and reverse the edges for the purpose of adjacency lists.
    LOG.info('Processing adjacency lists and degree information.')

    for graph in data.graphs:
        train_in_degrees = np.zeros(graph.num_nodes, dtype=np.int64)
        val_in_degrees = np.zeros(graph.num_nodes, dtype=np.int64)
        test_in_degrees = np.zeros(graph.num_nodes, dtype=np.int64)
        adjacency_lists = defaultdict(list)
        not_val_test_mask = (~graph.val_mask & ~graph.test_mask).numpy()
        val_mask = graph.val_mask.numpy()
        test_mask = graph.test_mask.numpy()

        if create_adjacency_lists:
            num_edges = graph.edge_index[0].shape[0]
            sources, dests = graph.edge_index[0].numpy(
            ), graph.edge_index[1].numpy()
            for source, dest in tqdm(zip(sources, dests),
                                     total=num_edges,
                                     leave=False):
                if not_val_test_mask[dest] and not_val_test_mask[source]:
                    train_in_degrees[dest] += 1
                    val_in_degrees[dest] += 1
                elif val_mask[dest] and not test_mask[source]:
                    val_in_degrees[dest] += 1
                test_in_degrees[dest] += 1
                adjacency_lists[dest].append(source)

        graph.adjacency_lists = dict(adjacency_lists)
        graph.train_in_degrees = torch.from_numpy(train_in_degrees).long()
        graph.val_in_degrees = torch.from_numpy(val_in_degrees).long()
        graph.test_in_degrees = torch.from_numpy(test_in_degrees).long()
        if graph_availability == 'transductive':
            graph.train_in_degrees = graph.test_in_degrees
            graph.val_in_degrees = graph.test_in_degrees

        graph.graph_availability = graph_availability

        # To accumulate any neighborhood perturbations to the graph.
        graph.perturbed_neighborhoods = defaultdict(set)
        graph.added_nodes = defaultdict(set)
        graph.modified_degrees = {}

        # For small datasets, cache the neighborhoods for all nodes for at least 3 different radii queries.
        graph.use_cache = True
        graph.neighborhood_cache = NeighborhoodCache(graph.num_nodes * 3)

        graph.train_mask_original = graph.train_mask
        graph.val_mask_original = graph.val_mask
        graph.test_mask_original = graph.test_mask

        graph.train_mask = torch.ones(
            graph.num_nodes).bool() & ~graph.val_mask & ~graph.test_mask

    return data
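A minimal usage sketch, assuming this function and its module-level helpers (`_get_train_val_test_masks`, `NeighborhoodCache`, `LOG`, and the torch_geometric imports) are available:

# Downloads Cora to /tmp/cora on first use.
data = get_small_dataset('cora',
                         normalize_attributes=True,
                         make_undirected=True,
                         graph_availability='transductive')
graph = data.graphs[0]
print(graph.num_nodes, int(graph.train_mask.sum()), len(graph.adjacency_lists))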
Esempio n. 19
0
def train_edges(data, mask_node, val_ratio=0.05, test_ratio=0.1):
    device = data.x.device
    num_nodes = data.num_nodes
    row, col = data.total_edge_index

    mask = row < col
    row, col = row[mask], col[mask]

    # Select train nodes
    edge_row, edge_col = row, col

    size = len(mask_node)
    # print(size)
    indice_size = 0
    for i in range(size):
        r_index = (row == mask_node[i]).nonzero()
        c_index = (col == mask_node[i]).nonzero()
        index = torch.unique(torch.cat((r_index, c_index), 0))
        if (i == 0):
            indice = index
        else:
            indice = torch.cat((indice, index), 0)
    # train_indice = indice.squeeze()
    train_indice = torch.unique(indice).squeeze()

    t_r, t_c = row[train_indice], col[train_indice]
    data.train_pos_edge_index = torch.stack([t_r, t_c], dim=0)

    # All-True mask over the remaining edges (training edges are removed below).
    edge_mask = torch.ones(edge_row.size(0), dtype=torch.bool,
                           device=data.x.device)
    edge_mask = edge_mask.scatter_(0, train_indice, False)
    edge_row = edge_row[edge_mask]
    edge_col = edge_col[edge_mask]
    edge_index = torch.stack((edge_row, edge_col), dim=0)

    # Negative edges.
    neg_adj_mask = torch.ones(num_nodes, num_nodes, dtype=torch.uint8)
    neg_adj_mask = neg_adj_mask.triu(diagonal=1).to(torch.bool)
    neg_adj_mask[row, col] = 0  # zero out existing edges (complement of the adjacency)

    neg_row, neg_col = neg_adj_mask.nonzero().t()

    # train negative
    for ind, i in enumerate(mask_node):
        r_index = (neg_row == i).nonzero()
        c_index = (neg_col == i).nonzero()
        index = torch.unique(torch.cat((r_index, c_index), 0))
        if (ind == 0):
            indice = index
        else:
            indice = torch.cat((indice, index), 0)
    neg_train_indice = torch.unique(indice).squeeze()

    if (train_indice.dim() == 0):
        indice_size = 1
    else:
        indice_size = train_indice.size(0)

    perm_train = random.sample(range(neg_train_indice.size(0)), indice_size)

    perm_train = torch.tensor(perm_train).to(torch.long).sort()[0]
    neg_t_r, neg_t_c = neg_row[perm_train], neg_col[perm_train]
    data.train_neg_edge_index = torch.stack([neg_t_r, neg_t_c],
                                            dim=0).to(device)

    data.train_edge_index = to_undirected(edge_index, mask_node, num_nodes)
    return data
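The loops over `mask_node` above collect the positions of all edges incident to the selected nodes via `nonzero`, `cat`, and `unique`. A standalone sketch of that indexing pattern on toy tensors (not from the original source):

row = torch.tensor([0, 0, 1, 2])
col = torch.tensor([1, 2, 3, 3])
mask_node = torch.tensor([0, 3])
# Positions of edges whose source or target is in mask_node.
indice = torch.cat([torch.unique(torch.cat(((row == n).nonzero(),
                                            (col == n).nonzero()), 0))
                    for n in mask_node], 0)
train_indice = torch.unique(indice)
print(train_indice)  # tensor([0, 1, 2, 3]): every toy edge touches node 0 or node 3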
Esempio n. 20
0
def negative_sampling(edge_index,
                      num_nodes=None,
                      num_neg_samples=None,
                      force_undirected=False):
    r"""Samples random negative edges of a graph given by :attr:`edge_index`.

    Args:
        edge_index (LongTensor): The edge indices.
        num_nodes (int, optional): The number of nodes, *i.e.*
            :obj:`max_val + 1` of :attr:`edge_index`. (default: :obj:`None`)
        num_neg_samples (int, optional): The number of negative samples to
            return. If set to :obj:`None`, will try to return a negative edge
            for every positive edge. (default: :obj:`None`)
        force_undirected (bool, optional): If set to :obj:`True`, sampled
            negative edges will be undirected. (default: :obj:`False`)

    :rtype: LongTensor
    """

    num_nodes = maybe_num_nodes(edge_index, num_nodes)
    num_neg_samples = num_neg_samples or edge_index.size(1)

    # Handle '|V|^2 - |E| < |E|' case for G = (V, E).
    num_neg_samples = min(num_neg_samples,
                          num_nodes * num_nodes - edge_index.size(1))

    if force_undirected:
        num_neg_samples = num_neg_samples // 2

        # Upper triangle indices: N + ... + 1 = N (N + 1) / 2
        rng = range((num_nodes * (num_nodes + 1)) // 2)

        # Remove edges in the lower triangle matrix.
        row, col = edge_index
        mask = row <= col
        row, col = row[mask], col[mask]

        # idx = N * i + j - i * (i+1) / 2
        idx = (row * num_nodes + col - row * (row + 1) // 2).to('cpu')
    else:
        rng = range(num_nodes**2)
        # idx = N * i + j
        idx = (edge_index[0] * num_nodes + edge_index[1]).to('cpu')

    perm = torch.tensor(random.sample(rng, num_neg_samples))
    mask = torch.from_numpy(np.isin(perm, idx)).to(torch.bool)
    rest = mask.nonzero().view(-1)
    while rest.numel() > 0:  # pragma: no cover
        tmp = torch.tensor(random.sample(rng, rest.size(0)))
        mask = torch.from_numpy(np.isin(tmp, idx)).to(torch.bool)
        perm[rest] = tmp
        rest = rest[mask.nonzero().view(-1)]

    if force_undirected:
        # (-sqrt((2 * N + 1)^2 - 8 * perm) + 2 * N + 1) / 2
        row = torch.floor((-torch.sqrt((2. * num_nodes + 1.)**2 - 8. * perm) +
                           2 * num_nodes + 1) / 2)
        col = perm - row * (2 * num_nodes - row - 1) // 2
        neg_edge_index = torch.stack([row, col], dim=0).long()
        neg_edge_index = to_undirected(neg_edge_index)
    else:
        row = perm // num_nodes
        col = perm % num_nodes
        neg_edge_index = torch.stack([row, col], dim=0).long()

    return neg_edge_index.to(edge_index.device)
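A minimal usage sketch on a toy 4-node cycle (toy numbers, not from the original source); the requested number of negatives is capped at the number of possible non-edges:

edge_index = torch.tensor([[0, 1, 2, 3],
                           [1, 2, 3, 0]])
neg_edge_index = negative_sampling(edge_index, num_nodes=4, num_neg_samples=4)
print(neg_edge_index.size())  # torch.Size([2, 4])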
Esempio n. 21
0
    def process(self):
        import networkx as nx

        ids, Ns = [], []
        # Iterating over paths for raw and processed data (train + test):
        for r_path, p_path in zip(self.raw_paths, self.processed_paths):
            # Find the paths of all raw graphs:
            names = glob.glob(osp.join(r_path, '*.gexf'))
            # Get sorted graph IDs given filename: 123.gexf -> 123
            ids.append(sorted([int(i.split(os.sep)[-1][:-5]) for i in names]))

            data_list = []
            # Convert graphs in .gexf format to a NetworkX Graph:
            for i, idx in enumerate(ids[-1]):
                i = i if len(ids) == 1 else i + len(ids[0])
                # Reading the raw `*.gexf` graph:
                G = nx.read_gexf(osp.join(r_path, f'{idx}.gexf'))
                # Mapping of nodes in `G` to a contiguous number:
                mapping = {name: j for j, name in enumerate(G.nodes())}
                G = nx.relabel_nodes(G, mapping)
                Ns.append(G.number_of_nodes())

                edge_index = torch.tensor(list(G.edges)).t().contiguous()
                if edge_index.numel() == 0:
                    edge_index = torch.empty((2, 0), dtype=torch.long)
                edge_index = to_undirected(edge_index, num_nodes=Ns[-1])

                data = Data(edge_index=edge_index, i=i)
                data.num_nodes = Ns[-1]

                # Create a one-hot encoded feature matrix denoting the atom
                # type (for the `AIDS700nef` dataset):
                if self.name == 'AIDS700nef':
                    x = torch.zeros(data.num_nodes, dtype=torch.long)
                    for node, info in G.nodes(data=True):
                        x[int(node)] = self.types.index(info['type'])
                    data.x = F.one_hot(x, num_classes=len(self.types)).to(
                        torch.float)

                if self.pre_filter is not None and not self.pre_filter(data):
                    continue

                if self.pre_transform is not None:
                    data = self.pre_transform(data)

                data_list.append(data)

            torch.save(self.collate(data_list), p_path)

        assoc = {idx: i for i, idx in enumerate(ids[0])}
        assoc.update({idx: i + len(ids[0]) for i, idx in enumerate(ids[1])})

        # Extracting ground-truth GEDs from the GED pickle file
        path = osp.join(self.raw_dir, self.name, 'ged.pickle')
        # Initialize GEDs as float('inf'):
        mat = torch.full((len(assoc), len(assoc)), float('inf'))
        with open(path, 'rb') as f:
            obj = pickle.load(f)
            xs, ys, gs = [], [], []
            for (x, y), g in obj.items():
                xs += [assoc[x]]
                ys += [assoc[y]]
                gs += [g]
            # The pickle file does not contain GEDs for test graph pairs, i.e.
            # GEDs for (test_graph, test_graph) pairs are still float('inf'):
            x, y = torch.tensor(xs), torch.tensor(ys)
            ged = torch.tensor(gs, dtype=torch.float)
            mat[x, y], mat[y, x] = ged, ged

        path = osp.join(self.processed_dir, f'{self.name}_ged.pt')
        torch.save(mat, path)

        # Calculate the normalized GEDs:
        N = torch.tensor(Ns, dtype=torch.float)
        norm_mat = mat / (0.5 * (N.view(-1, 1) + N.view(1, -1)))

        path = osp.join(self.processed_dir, f'{self.name}_norm_ged.pt')
        torch.save(norm_mat, path)
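Once `process` has run, the ground-truth and normalized GEDs can be read back from the processed directory; the normalized value for a pair (i, j) is GED(i, j) / (0.5 * (N_i + N_j)). A sketch, assuming `dataset` is an instance of the dataset class this method belongs to:

ged = torch.load(osp.join(dataset.processed_dir, f'{dataset.name}_ged.pt'))
norm_ged = torch.load(osp.join(dataset.processed_dir, f'{dataset.name}_norm_ged.pt'))
print(ged.size(), norm_ged[0, 1])  # unavailable pairs are float('inf')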
def data_downloader(dataset='Cora', data_dir='../data', data_type='static'):
    '''
    Download a graph dataset.

    Parameters:
        dataset (:obj:`str`): Dataset name: 'Cora', 'CiteSeer', 'PubMed', or 'Factset<year>'.
        data_dir (:obj:`str`): Directory in which the data is stored.
        data_type (:obj:`str`): 'static' or 'dynamic' (only used for Factset data).

    Returns:
        data (torch_geometric.data.Data): The graph data.
    '''

    if dataset in ['Cora', 'CiteSeer', 'PubMed']:
        data = Planetoid(data_dir, dataset, transform=T.NormalizeFeatures())[0]

    elif 'Factset' in dataset:
        year = dataset[-4:]
        print(f'processing Factset in year {year}.')
        if data_type == 'dynamic':
            df = pd.read_csv(
                data_dir +
                f'/Factset/node_features_{year}_dynamic_processed.csv'
            ).drop_duplicates(ignore_index=True, subset='code')
        else:
            df = pd.read_csv(data_dir +
                             f'/Factset/node_features_{year}_processed.csv'
                             ).drop_duplicates(ignore_index=True,
                                               subset='code')
        N = len(df)  # number of nodes

        # Map sec_code to node index
        dic = {}
        for row in df.itertuples():
            dic[row[1]] = row[0]

        edge = pd.read_csv(data_dir + f'/Factset/edges_{year}.csv',
                           usecols=[
                               'REL_TYPE', 'SOURCE_COMPANY_TICKER',
                               'TARGET_COMPANY_TICKER'
                           ]).rename(columns={
                               'SOURCE_COMPANY_TICKER': 'source',
                               'TARGET_COMPANY_TICKER': 'target'
                           })
        edge = edge[(edge['REL_TYPE'] == 'CUSTOMER') |
                    (edge['REL_TYPE'] == 'SUPPLIER')]
        edge = edge[['source',
                     'target']].drop_duplicates(ignore_index=True,
                                                subset=['source', 'target'])

        for i in range(edge.shape[0]):
            if i in edge.index:
                source = edge.loc[i, 'source']
                target = edge.loc[i, 'target']
                edge = edge.drop(edge[(edge['source'] == target)
                                      & (edge['target'] == source)].index)

        edge = edge.applymap(lambda x: dic[x] if x in dic.keys() else np.nan)
        edge = edge.dropna(how='any').reset_index(drop=True)

        # Handle missing values
        df = df.iloc[:, 5:]  # exclude sec_code
        # df = df.dropna(thresh=100, axis=1)  # keep columns with at least thresh non-NaN values
        df = df.fillna(0)  # fill remaining NaNs with 0
        df = (df - df.mean()) / df.std()
        df = df.fillna(0)

        # X to tensor
        X = [[] for _ in range(N)]
        for row in df.itertuples():
            X[row[0]] = row[1:]
        X = torch.tensor(X, dtype=torch.float)

        # edge_index to tensor
        edge_index = torch.tensor(edge.to_numpy().T, dtype=torch.long)

        # torch_geometric.data.Data
        data = Data(x=X, edge_index=edge_index)

    print(f'dataset {dataset} has been downloaded.')
    print(f'is undirected: {data.is_undirected()}')
    print(f'contains self loops: {data.contains_self_loops()}')
    print(f'num_nodes: {data.num_nodes}')
    print(f'num_edges: {data.num_edges}\n')

    if not data.is_undirected():
        data.edge_index = to_undirected(data.edge_index)
        print('The graph has been converted to an undirected one.')

    return data
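A short usage sketch for the Planetoid branch (downloads Cora into `data_dir` on first use):

data = data_downloader(dataset='Cora', data_dir='../data')
print(data.num_nodes, data.num_edges, data.is_undirected())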
Esempio n. 23
0
def main():
    parser = argparse.ArgumentParser(description='OGBN-papers100M (SIGN)')
    parser.add_argument('--file_name', type=str, default="test")
    parser.add_argument('--undirected_num_propagations', type=int, default=3)
    parser.add_argument('--directed_num_propagations', type=int, default=3)
    parser.add_argument('--undirected_dropedge_rate', type=float, default=0.4)
    parser.add_argument('--directed_dropedge_rate', type=float, default=0.2)
    parser.add_argument('--undirected', action='store_true')
    parser.add_argument('--directed', action='store_true')
    parser.add_argument('--undirected_asymm_norm', action='store_true')
    parser.add_argument('--directed_asymm_norm', action='store_true')
    parser.add_argument('--undirected_remove_diag', action='store_true')
    parser.add_argument('--undirected_set_diag', action='store_true')
    parser.add_argument('--directed_remove_diag', action='store_true')
    parser.add_argument('--directed_set_diag', action='store_true')
    args = parser.parse_args()

    if not args.directed and not args.undirected:
        raise ValueError(
            'Please specify whether you want to use undirected or directed operators (or both).'
        )

    # pre-processing ######################################################

    dataset = PygNodePropPredDataset('ogbn-papers100M')
    split_idx = dataset.get_idx_split()
    data = dataset[0]

    x = data.x.numpy()
    N = data.num_nodes

    train_idx, valid_idx, test_idx = split_idx['train'], split_idx[
        'valid'], split_idx['test']
    all_idx = torch.cat([train_idx, valid_idx, test_idx])
    mapped_train_idx = torch.arange(len(train_idx))
    mapped_valid_idx = torch.arange(len(train_idx),
                                    len(train_idx) + len(valid_idx))
    mapped_test_idx = torch.arange(
        len(train_idx) + len(valid_idx),
        len(train_idx) + len(valid_idx) + len(test_idx))

    op_dict = {}
    op_dict['label'] = data.y.data[all_idx].to(torch.long)
    op_dict['split_idx'] = {
        'train': mapped_train_idx,
        'valid': mapped_valid_idx,
        'test': mapped_test_idx
    }

    op_dict['op_embedding'] = []
    op_dict['op_embedding'].append(
        torch.from_numpy(x[all_idx]).to(torch.float))

    print('Start processing')

    if args.undirected:  # preprocess undirected operators

        print('Preparing undirected operators...')
        # subsample operator
        print('Subsampling (dropping {} %)'.format(
            100 * args.undirected_dropedge_rate))
        edge_index, _ = dropout_adj(data.edge_index,
                                    p=args.undirected_dropedge_rate,
                                    num_nodes=data.num_nodes)

        # to undirected
        print('Making the graph undirected')
        edge_index = to_undirected(edge_index, data.num_nodes)
        row, col = edge_index

        # get adj
        print('Getting adj matrix')
        adj = get_adj(row,
                      col,
                      N,
                      asymm_norm=args.undirected_asymm_norm,
                      set_diag=args.undirected_set_diag,
                      remove_diag=args.undirected_remove_diag)

        # preprocessing of features
        print('Diffusing node features')
        x = data.x.numpy()
        for _ in tqdm(range(args.undirected_num_propagations)):
            x = adj @ x
            op_dict['op_embedding'].append(
                torch.from_numpy(x[all_idx]).to(torch.float))

    if args.directed:  # preprocess directed operators

        print('Preparing directed operators...')
        # subsample operator
        print('Subsampling (dropping {} %)'.format(
            100 * args.directed_dropedge_rate))
        edge_index, _ = dropout_adj(data.edge_index,
                                    p=args.directed_dropedge_rate,
                                    num_nodes=data.num_nodes)
        row, col = edge_index

        # get adj
        print('Getting adj matrix')
        adj = get_adj(row,
                      col,
                      N,
                      asymm_norm=args.directed_asymm_norm,
                      set_diag=args.directed_set_diag,
                      remove_diag=args.directed_remove_diag)

        # preprocessing of features
        print('Diffusing node features')
        x = data.x.numpy()
        for _ in tqdm(range(args.directed_num_propagations)):
            x = adj @ x
            op_dict['op_embedding'].append(
                torch.from_numpy(x[all_idx]).to(torch.float))

        # get adj
        print('Getting transpose adj matrix')
        adj = get_adj(col,
                      row,
                      N,
                      asymm_norm=args.directed_asymm_norm,
                      set_diag=args.directed_set_diag,
                      remove_diag=args.directed_remove_diag)

        # preprocessing of features
        print('Diffusing node features')
        x = data.x.numpy()
        for _ in tqdm(range(args.directed_num_propagations)):
            x = adj @ x
            op_dict['op_embedding'].append(
                torch.from_numpy(x[all_idx]).to(torch.float))

    torch.save(op_dict, '{}.pt'.format(args.file_name))
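A sketch of consuming the saved operator dictionary (the file name matches `--file_name`, default 'test'); the shapes in the comments are inferred from the code above:

op_dict = torch.load('test.pt')
xs = op_dict['op_embedding']      # list of [num_split_nodes, num_features] tensors
split_idx = op_dict['split_idx']  # remapped train/valid/test indices
print(len(xs), xs[0].size(), split_idx['train'].size())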
Esempio n. 24
0
File: rgsn.py Progetto: zbn123/R-GSN
print(data)
edge_index_dict = data.edge_index_dict

# We need to add reverse edges to the heterogeneous graph.
r, c = edge_index_dict[('author', 'affiliated_with', 'institution')]
edge_index_dict[('institution', 'to', 'author')] = torch.stack([c, r])

r, c = edge_index_dict[('author', 'writes', 'paper')]
edge_index_dict[('paper', 'to', 'author')] = torch.stack([c, r])

r, c = edge_index_dict[('paper', 'has_topic', 'field_of_study')]
edge_index_dict[('field_of_study', 'to', 'paper')] = torch.stack([c, r])

# Convert to undirected paper <-> paper relation.
edge_index = to_undirected(edge_index_dict[('paper', 'cites', 'paper')])
edge_index_dict[('paper', 'cites', 'paper')] = edge_index

if not os.path.exists(args.feat_dir):
    os.mkdir(args.feat_dir)
    ###### for field_of_study
    print('###### for field_of_study')
    rows = edge_index_dict[('field_of_study', 'to', 'paper')][0]
    cols = edge_index_dict[('field_of_study', 'to', 'paper')][1]
    v = torch.ones(rows.size())
    m, n = data.num_nodes_dict['field_of_study'], data.num_nodes_dict['paper']
    y = data.x_dict['paper']
    out = gen_features(rows, cols, v, m, n, y)
    np.save(f'{args.feat_dir}/field_of_study_FEAT.npy', out)

    ###### for author
Esempio n. 25
0
def train_test_split_edges(data, val_ratio=0.05, test_ratio=0.1):
    r"""Splits the edges of a :obj:`torch_geometric.data.Data` object
    into positive and negative train/val/test edges, and adds attributes of
    `train_pos_edge_index`, `train_neg_adj_mask`, `val_pos_edge_index`,
    `val_neg_edge_index`, `test_pos_edge_index`, and `test_neg_edge_index`
    to :attr:`data`.

    Args:
        data (Data): The data object.
        val_ratio (float, optional): The ratio of positive validation
            edges. (default: :obj:`0.05`)
        test_ratio (float, optional): The ratio of positive test
            edges. (default: :obj:`0.1`)

    :rtype: :class:`torch_geometric.data.Data`
    """

    assert 'batch' not in data  # No batch-mode.

    num_nodes = data.num_nodes
    row, col = data.edge_index
    data.edge_index = None

    # Return upper triangular portion.
    mask = row < col
    row, col = row[mask], col[mask]

    n_v = int(math.floor(val_ratio * row.size(0)))
    n_t = int(math.floor(test_ratio * row.size(0)))

    # Positive edges.
    perm = torch.randperm(row.size(0))
    row, col = row[perm], col[perm]

    r, c = row[:n_v], col[:n_v]
    data.val_pos_edge_index = torch.stack([r, c], dim=0)
    r, c = row[n_v:n_v + n_t], col[n_v:n_v + n_t]
    data.test_pos_edge_index = torch.stack([r, c], dim=0)

    r, c = row[n_v + n_t:], col[n_v + n_t:]
    data.train_pos_edge_index = torch.stack([r, c], dim=0)
    data.train_pos_edge_index = to_undirected(data.train_pos_edge_index)

    # Negative edges.
    neg_adj_mask = torch.ones(num_nodes, num_nodes, dtype=torch.uint8)
    neg_adj_mask = neg_adj_mask.triu(diagonal=1).to(torch.bool)
    neg_adj_mask[row, col] = 0

    neg_row, neg_col = neg_adj_mask.nonzero(as_tuple=False).t()
    perm = random.sample(range(neg_row.size(0)), min(n_v + n_t,
                                                     neg_row.size(0)))
    perm = torch.tensor(perm)
    perm = perm.to(torch.long)
    neg_row, neg_col = neg_row[perm], neg_col[perm]

    neg_adj_mask[neg_row, neg_col] = 0
    data.train_neg_adj_mask = neg_adj_mask

    row, col = neg_row[:n_v], neg_col[:n_v]
    data.val_neg_edge_index = torch.stack([row, col], dim=0)

    row, col = neg_row[n_v:n_v + n_t], neg_col[n_v:n_v + n_t]
    data.test_neg_edge_index = torch.stack([row, col], dim=0)

    return data
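A minimal usage sketch on a toy 8-node cycle (toy graph, not from the original source; assumes the usual torch_geometric imports):

num_nodes = 8
row = torch.arange(num_nodes)
col = (row + 1) % num_nodes
data = Data(edge_index=to_undirected(torch.stack([row, col], dim=0)),
            num_nodes=num_nodes)
data = train_test_split_edges(data, val_ratio=0.25, test_ratio=0.25)
print(data.train_pos_edge_index.size(1),  # 8 (4 undirected train edges, stored in both directions)
      data.val_pos_edge_index.size(1),    # 2
      data.test_pos_edge_index.size(1))   # 2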
def main():
    parser = argparse.ArgumentParser(description='OGBN-Arxiv (GAT Full-Batch)')
    parser.add_argument('--device', type=int, default=0)
    parser.add_argument('--log_steps', type=int, default=1)
    parser.add_argument("--num-layers",
                        type=int,
                        default=3,
                        help="number of hidden layers")
    parser.add_argument("--lr",
                        type=float,
                        default=0.0029739421726400865,
                        help="learning rate")
    parser.add_argument('--weight-decay',
                        type=float,
                        default=2.4222556964495987e-05,
                        help="weight decay")
    parser.add_argument("--num-hidden",
                        type=int,
                        default=16,
                        help="number of hidden units")
    parser.add_argument("--dropout",
                        type=float,
                        default=0.18074706609292976,
                        help="Dropout to use")
    parser.add_argument('--epochs', type=int, default=500)
    parser.add_argument('--runs', type=int, default=10)
    parser.add_argument("--eval",
                        action='store_true',
                        help='If not set, we will only do the training part.')
    args = parser.parse_args()
    print(args)

    device = f'cuda:{args.device}' if torch.cuda.is_available() else 'cpu'
    device = torch.device(device)

    dataset = PygNodePropPredDataset(name='ogbn-arxiv')
    split_idx = dataset.get_idx_split()

    data = dataset[0]
    x = data.x.to(device)
    y_true = data.y.to(device)
    train_idx = split_idx['train'].to(device)

    edge_index = data.edge_index.to(device)
    edge_index = to_undirected(edge_index, data.num_nodes)
    edge_index, _ = remove_self_loops(edge_index)
    edge_index, _ = add_self_loops(edge_index, num_nodes=x.size(0))

    model = GAT(num_layers=args.num_layers,
                in_feats=data.x.size(-1),
                num_hidden=args.num_hidden,
                num_classes=dataset.num_classes,
                heads=[4, 4, 4],
                dropout=args.dropout).to(device)

    evaluator = Evaluator(name='ogbn-arxiv')
    logger = Logger(args.runs, args)

    dur = []
    for run in range(args.runs):
        model.reset_parameters()
        optimizer = torch.optim.Adam(model.parameters(),
                                     lr=args.lr,
                                     weight_decay=args.weight_decay)
        for epoch in range(1, 1 + args.epochs):
            t0 = time.time()
            loss = train(model, x, edge_index, y_true, train_idx, optimizer)
            if epoch >= 3:
                dur.append(time.time() - t0)
                print('Training time/epoch {}'.format(np.mean(dur)))

            if not args.eval:
                continue

            result = test(model, x, edge_index, y_true, split_idx, evaluator)
            logger.add_result(run, result)

            if epoch % args.log_steps == 0:
                train_acc, valid_acc, test_acc = result
                print(f'Run: {run + 1:02d}, '
                      f'Epoch: {epoch:02d}, '
                      f'Loss: {loss:.4f}, '
                      f'Train: {100 * train_acc:.2f}%, '
                      f'Valid: {100 * valid_acc:.2f}% '
                      f'Test: {100 * test_acc:.2f}%')
        if args.eval:
            logger.print_statistics(run)
    if args.eval:
        logger.print_statistics()
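The edge preprocessing above (make the graph undirected, drop existing self-loops, then add one self-loop per node) can be illustrated on a toy graph (not from the original script; same torch_geometric.utils functions):

edge_index = torch.tensor([[0, 1, 2, 2],
                           [1, 2, 0, 2]])  # includes one self-loop at node 2
edge_index = to_undirected(edge_index, 3)
edge_index, _ = remove_self_loops(edge_index)
edge_index, _ = add_self_loops(edge_index, num_nodes=3)
print(edge_index.size(1))  # 6 undirected edges + 3 self-loops = 9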
Esempio n. 27
0
def main():
    parser = argparse.ArgumentParser(description='OGBN-papers100M (MLP)')
    parser.add_argument('--num_propagations', type=int, default=3)
    parser.add_argument('--dropedge_rate', type=float, default=0.4)
    args = parser.parse_args()

    # SGC pre-processing ######################################################

    dataset = PygNodePropPredDataset('ogbn-papers100M')
    split_idx = dataset.get_idx_split()
    data = dataset[0]

    x = data.x.numpy()
    N = data.num_nodes

    print('Making the graph undirected.')
    ### Randomly drop some edges to save computation
    data.edge_index, _ = dropout_adj(data.edge_index,
                                     p=args.dropedge_rate,
                                     num_nodes=data.num_nodes)
    data.edge_index = to_undirected(data.edge_index, data.num_nodes)

    print(data)

    row, col = data.edge_index

    print('Computing adj...')

    adj = SparseTensor(row=row, col=col, sparse_sizes=(N, N))
    adj = adj.set_diag()
    deg = adj.sum(dim=1).to(torch.float)
    deg_inv_sqrt = deg.pow(-0.5)
    deg_inv_sqrt[deg_inv_sqrt == float('inf')] = 0
    adj = deg_inv_sqrt.view(-1, 1) * adj * deg_inv_sqrt.view(1, -1)

    adj = adj.to_scipy(layout='csr')

    train_idx, valid_idx, test_idx = split_idx['train'], split_idx[
        'valid'], split_idx['test']
    all_idx = torch.cat([train_idx, valid_idx, test_idx])
    mapped_train_idx = torch.arange(len(train_idx))
    mapped_valid_idx = torch.arange(len(train_idx),
                                    len(train_idx) + len(valid_idx))
    mapped_test_idx = torch.arange(
        len(train_idx) + len(valid_idx),
        len(train_idx) + len(valid_idx) + len(test_idx))

    sgc_dict = {}
    sgc_dict['label'] = data.y.data[all_idx].to(torch.long)
    sgc_dict['split_idx'] = {
        'train': mapped_train_idx,
        'valid': mapped_valid_idx,
        'test': mapped_test_idx
    }

    sgc_dict['sgc_embedding'] = []
    sgc_dict['sgc_embedding'].append(
        torch.from_numpy(x[all_idx]).to(torch.float))

    print('Start SGC processing')

    for _ in tqdm(range(args.num_propagations)):
        x = adj @ x
        sgc_dict['sgc_embedding'].append(
            torch.from_numpy(x[all_idx]).to(torch.float))

    print(sgc_dict)

    torch.save(sgc_dict, 'sgc_dict.pt')
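Each entry appended to `sgc_dict['sgc_embedding']` is the symmetrically normalized propagation D^-1/2 (A + I) D^-1/2 applied one more time to the node features, restricted to the labeled nodes. A toy sketch of the same normalization on a 3-node path graph (not from the original source):

row = torch.tensor([0, 1, 1, 2])
col = torch.tensor([1, 0, 2, 1])
adj = SparseTensor(row=row, col=col, sparse_sizes=(3, 3)).set_diag()
deg_inv_sqrt = adj.sum(dim=1).to(torch.float).pow(-0.5)  # no isolated nodes in this toy graph
adj = deg_inv_sqrt.view(-1, 1) * adj * deg_inv_sqrt.view(1, -1)
print(adj.to_dense())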
Esempio n. 28
0
def main():

    args = ArgsInit().save_exp()

    if args.use_gpu:
        device = torch.device("cuda:" + str(args.device)) if torch.cuda.is_available() else torch.device("cpu")
    else:
        device = torch.device('cpu')

    dataset = PygNodePropPredDataset(name=args.dataset)
    data = dataset[0]
    split_idx = dataset.get_idx_split()
    
    evaluator = Evaluator(args.dataset)

    x = data.x.to(device)
    y_true = data.y.to(device)
    train_idx = split_idx['train'].to(device)

    edge_index = data.edge_index.to(device)
    edge_index = to_undirected(edge_index, data.num_nodes)

    if args.self_loop:
        edge_index = add_self_loops(edge_index, num_nodes=data.num_nodes)[0]

    sub_dir = 'SL_{}'.format(args.self_loop)

    args.in_channels = data.x.size(-1)
    args.num_tasks = dataset.num_classes

    model = DeeperGCN(args).to(device)
    
    optimizer = torch.optim.Adam(model.parameters(), lr=args.lr)
    results = {'highest_valid': 0, 'final_train': 0, 'final_test': 0, 'highest_train': 0}
    start_time = time.time()
    test_accuracy = 0.0

    for epoch in range(1, args.epochs + 1):
        
        epoch_loss = train(model, x, edge_index, y_true, train_idx, optimizer, args)
        logging.info('Epoch {}, training loss {:.4f}'.format(epoch, epoch_loss))
        model.print_params(epoch=epoch)

        result = test(model, x, edge_index, y_true, split_idx, evaluator)
        logging.info(result)
        train_accuracy, valid_accuracy, test_accuracy = result

        if train_accuracy > results['highest_train']:
            results['highest_train'] = train_accuracy

        if valid_accuracy > results['highest_valid']:
            results['highest_valid'] = valid_accuracy
            results['final_train'] = train_accuracy
            results['final_test'] = test_accuracy

            #save_ckpt(model, optimizer, round(epoch_loss, 4), epoch, args.model_save_path, sub_dir, name_post='valid_best')

        print(time.strftime("%Y-%m-%d %H:%M:%S", time.localtime()) + ' | ' +
              'Epoch:[{}/{}]\t Results LOSS:[{:.4f}] Train :[{:.2f}] Valid:[{:.2f}] Test:[{:.2f}] | Update Test:[{:.2f}]'
              .format(epoch, args.epochs, epoch_loss, train_accuracy * 100, valid_accuracy * 100, test_accuracy * 100, results['final_test'] * 100))

    #save_ckpt(model, optimizer, round(epoch_loss, 4), epoch, args.model_save_path, sub_dir, name_post='last_epoch')

    end_time = time.time()
    total_time = end_time - start_time
    logging.info('Total time: {}'.format(time.strftime('%H:%M:%S', time.gmtime(total_time))))
    print('-' * 100)
    print("syd : Final Result Train:[{:.2f}]  Valid:[{:.2f}]  Test:[{:.2f}]"
        .format(results['final_train'] * 100, results['highest_valid'] * 100, results['final_test'] * 100))
    print('-' * 100)
Esempio n. 29
0
def run(RunnerObj, fID):
    '''
    Function to run GCN algorithm
    Requires the decoder parameter
    '''
    rSeed = RunnerObj.randSeed
    torch.backends.cudnn.deterministic = True
    torch.backends.cudnn.benchmark = False

    random.seed(rSeed)
    np.random.seed(rSeed)
    torch.manual_seed(rSeed)
    torch.cuda.manual_seed(rSeed)
    
    def train():
        model.train()
        optimizer.zero_grad()
        z = model.encode(x, train_pos_edge_index)
        loss = model.recon_loss(z, train_pos_only_edge_index, train_neg_edge_index)
        loss.backward()
        optimizer.step()
        return (loss)

    def test(pos_edge_index, neg_edge_index):
        model.eval()
        with torch.no_grad():
            z = model.encode(x, train_pos_edge_index)
        yTrue, yPred = model.test(z, pos_edge_index, neg_edge_index)
        return yTrue, yPred#z, epr, ap, pred, act

    def val(pos_edge_index, neg_edge_index):
        model.eval()
        with torch.no_grad():
            z = model.encode(x, train_pos_edge_index)
        loss = model.recon_loss(z, pos_edge_index, neg_edge_index)
        return loss

    print("\n Running fold: ", fID)
    start = time.process_time()
    early_stopping = EarlyStopping(patience=100)

    read_time = time.process_time()
    print("Reading necessary input files...")
    
    exprDF = pd.read_csv(RunnerObj.inputDir.joinpath("normExp.csv"), header = 0, index_col =0)
    posE = np.load(RunnerObj.inputDir.joinpath("posE.npy"))
    negE = np.load(RunnerObj.inputDir.joinpath("negE.npy"))
    nodeDict = np.load(RunnerObj.inputDir.joinpath("nodeDict.npy"), allow_pickle = True)
    geneTFDict = np.load(RunnerObj.inputDir.joinpath("GeneTFs.npy"), allow_pickle = True)
    onlyGenes = geneTFDict.item().get('Gene')
    onlyTFs = geneTFDict.item().get('TF')
                             
    foldData = np.load(RunnerObj.inputDir.joinpath("{}CV/fold-".format(RunnerObj.CVType)+str(RunnerObj.randSeed)+"-"+str(fID)+".npy"), allow_pickle = True)
    
    train_posIdx = foldData.item().get('train_posIdx')
    test_posIdx = foldData.item().get('test_posIdx')
    
    train_negIdx = foldData.item().get('train_negIdx')
    test_negIdx = foldData.item().get('test_negIdx')

    print("Done reading inputs...")
    logging.info("Reading input files took %.3f seconds" %(time.process_time()-read_time))

    setup_time = time.process_time()
    val_posIdx = random.sample(list(train_posIdx), int(0.1*len(train_posIdx)))
    train_posIdx = list(set(train_posIdx).difference(set(val_posIdx)))

    val_negIdx = random.sample(list(train_negIdx), int(0.1*len(train_negIdx)))
    train_negIdx = list(set(train_negIdx).difference(set(val_negIdx)))
    #print(val_posIdx,val_negIdx)
    sourceNodes = posE[train_posIdx , 0]
    targetNodes = posE[train_posIdx , 1]

    # Additionally, keep a copy of sourceNodes and targetNodes that contains only
    # the edges actually present in the network, i.e. without the dummy edges added below.
    sourceNodesCPY = posE[train_posIdx , 0]
    targetNodesCPY = posE[train_posIdx , 1]
    
    presentNodesSet = set(sourceNodes).union(set(targetNodes))
    allNodes = set(nodeDict.item().keys())
    missingSet = allNodes.difference(presentNodesSet)
    presentNodes = np.array(list(presentNodesSet))
    missingNodes = np.array(list(missingSet))
    missingTFs = np.array(list(missingSet.intersection(set(onlyTFs))))
    presentTFs = np.array(list(set(sourceNodes)))

    #print(len(missing)*len(presentTF)+len(sourceNodes)+len(missingTF)*len(allNodes))

    # For missing TFs, additionally add edges outgoing to present nodes
    for tf in missingTFs:
        for node in presentNodes:
            sourceNodes = np.append(sourceNodes, tf)
            targetNodes = np.append(targetNodes, node)
    
    # Find unlinked genes and TFs and give them incoming edges from all TFs:
    # add an edge from every TF to every gene that is missing from the network.
    # This connects those genes to the graph so that information can flow from the
    # TFs to them, potentially yielding better embeddings for these genes.
    # Note: this is only one way of making them part of the graph.
    if RunnerObj.params['reconnect_disconnected_nodes']:
        for node in missingNodes:
            for tf in onlyTFs:
                sourceNodes = np.append(sourceNodes, tf)
                targetNodes = np.append(targetNodes, node)
    
            
    nodeFeatures = torch.Tensor(exprDF.values)

    if RunnerObj.params['encoder'] == 'GCN':
        eIndex = to_undirected(torch.LongTensor([sourceNodes, targetNodes]))
    elif RunnerObj.params['encoder'] == 'DGCN':
        eIndex = torch.LongTensor([sourceNodes, targetNodes])
    else:
        print("Invalid encoder name: ", RunnerObj.params.encoder)
        sys.exit()

    data = Data(x=nodeFeatures, edge_index=eIndex)

    if RunnerObj.params['encoder'] == 'GCN':
        data.train_pos_edge_index = to_undirected(torch.stack([torch.LongTensor(sourceNodes),
                                                               torch.LongTensor(targetNodes)], dim=0))
        data.train_pos_only_edge_index = torch.stack([torch.LongTensor(sourceNodesCPY),
                                                      torch.LongTensor(targetNodesCPY)], dim=0)

    elif RunnerObj.params['encoder'] == 'DGCN':
        data.train_pos_edge_index = torch.stack([torch.LongTensor(sourceNodes),
                                                 torch.LongTensor(targetNodes)], dim=0)
        data.train_pos_only_edge_index = torch.stack([torch.LongTensor(sourceNodesCPY),
                                                      torch.LongTensor(targetNodesCPY)], dim=0)
    else:
        print("Invalid encoder name: ", RunnerObj.params.encoder)
        sys.exit()

        

    data.test_pos_edge_index = torch.stack([torch.LongTensor(posE[test_posIdx,0]),
                                            torch.LongTensor(posE[test_posIdx,1])], dim=0)
    data.val_pos_edge_index = torch.stack([torch.LongTensor(posE[val_posIdx,0]),
                                           torch.LongTensor(posE[val_posIdx,1])], dim=0)
    
    data.train_neg_edge_index = torch.stack([torch.LongTensor(negE[train_negIdx,0]),
                                             torch.LongTensor(negE[train_negIdx,1])], dim=0)
    data.test_neg_edge_index = torch.stack([torch.LongTensor(negE[test_negIdx,0]),
                                            torch.LongTensor(negE[test_negIdx,1])], dim=0)
    data.val_neg_edge_index = torch.stack([torch.LongTensor(negE[val_negIdx,0]),
                                           torch.LongTensor(negE[val_negIdx,1])], dim=0)

    print("Done setting up data structures...")
    logging.info("Setting up data structures took %.3f seconds" %(time.process_time()-setup_time))
    channels = RunnerObj.params['channels']
    
    dev = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
    h_sizes = [data.num_features]
    if RunnerObj.params['hidden'] >= 1:
        for i in reversed(range(RunnerObj.params['hidden']-1)):
            h_sizes.append((i+2)*channels)
        h_sizes.append(channels)

    #model = kwargs[modelName](Encoder(data.num_features, channels)).to(dev)
    if RunnerObj.params['decoder'] == 'IP':
        model = GAEwithK(Encoder(h_sizes)).to(dev)
    elif RunnerObj.params['decoder'] == 'NW':
        model = GAEwithK(Encoder(h_sizes), TFDecoder(data.num_nodes, onlyTFs)).to(dev)
    elif RunnerObj.params['decoder'] == 'RS':
        model = GAEwithK(Encoder(h_sizes), RESCALDecoder(channels)).to(dev)
    else:
        print("Invalid decoder name:", RunnerObj.params['decoder'])
        sys.exit()
    x = data.x.to(dev)
    train_pos_edge_index = data.train_pos_edge_index.to(dev)
    train_pos_only_edge_index = data.train_pos_only_edge_index.to(dev)


    train_neg_edge_index = data.train_neg_edge_index.to(dev)

    test_pos_edge_index, test_neg_edge_index = data.test_pos_edge_index.to(dev), data.test_neg_edge_index.to(dev)

    val_pos_edge_index, val_neg_edge_index = data.val_pos_edge_index.to(dev), data.val_neg_edge_index.to(dev)

    optimizer = torch.optim.Adam(model.parameters(), lr=0.001, weight_decay=5e-4)

    lossDict = {'epoch':[],'TrLoss':[], 'valLoss':  []}
    last10Models = []
    print("Running  %s-%s..." %(RunnerObj.params['encoder'],RunnerObj.params['decoder']))

    if not os.path.exists(RunnerObj.outPrefix):
        os.mkdir(RunnerObj.outPrefix)
    fullPath = Path(str(RunnerObj.outPrefix) + '/randID-' +  str(RunnerObj.randSeed) + '/' + RunnerObj.params['encoder'] + '-' +RunnerObj.params['decoder'])
    if not os.path.exists(fullPath):
        os.makedirs(fullPath)

    training_summary_path = os.path.join(fullPath, 'trainingSummary', 'hiddenlayer-'+str(RunnerObj.params['hidden']))
    if not os.path.exists(training_summary_path):
        os.makedirs(training_summary_path)

    writer  = SummaryWriter(os.path.join(training_summary_path, 'fold-'+str(fID)))

    for epoch in tqdm(range(1, RunnerObj.params['epochs'])):
        TrLoss = train()
        valLoss = val(val_pos_edge_index, val_neg_edge_index)
        
        lossDict['epoch'].append(epoch)
        lossDict['TrLoss'].append(TrLoss.item())
        lossDict['valLoss'].append(valLoss.item())
        

        #print(TrLoss.item(), valLoss.item())

        writer.add_scalar("TrainingLoss/train", TrLoss.item(), epoch)
        writer.add_scalar("ValLoss/train", valLoss.item(), epoch)
        print(TrLoss.item(), valLoss.item())

        early_stopping(valLoss.item())
        if early_stopping.early_stop:
            break


        #if np.mean(lossDict['valLoss'][-10:]) - valLoss.item()<= 1e-6 and epoch > RunnerObj.params['min_epochs']:
            #break

    logging.info("[Fold %s]: %.3f seconds in %s epochs" %(fID, time.process_time()-start, epoch))
    writer.flush()

    yTrue, yPred = test(data.test_pos_edge_index, data.test_neg_edge_index)
    torch.save(model.state_dict(), os.path.join(training_summary_path, 'fold-'+str(fID), 'model'))
    
    testIndices = torch.cat((data.test_pos_edge_index, data.test_neg_edge_index), axis=1).detach().cpu().numpy()
    edgeLength = testIndices.shape[1]
    outMatrix = np.vstack((testIndices, yTrue, yPred, np.array([fID]*edgeLength), np.array([RunnerObj.params['hidden']]*edgeLength), np.array([RunnerObj.params['channels']]*edgeLength)))
    
    output_path =  fullPath / 'rankedEdges.csv'
    training_stats_file_name = fullPath / 'trainingstats.csv'
    
    outDF = pd.DataFrame(outMatrix.T, columns=['Gene1','Gene2','TrueScore','PredScore', 'CVID', 'HiddenLayers', 'Channels'])
    outDF = outDF.astype({'Gene1': int,'Gene2': int, 'CVID': int, 'HiddenLayers': int, 'Channels': int})
    
    outDF['Gene1'] = outDF.Gene1.map(nodeDict.item())
    outDF['Gene2'] = outDF.Gene2.map(nodeDict.item())
    outDF.to_csv(output_path, index=False, mode='a', header=not os.path.exists(output_path))

    if os.path.isfile(training_stats_file_name):
        training_stats_file = open(training_stats_file_name,'a')
        training_stats_file.write('{}\t{}\t{}\t{}\t{}\t{}\t{}\t{}\t{}\t{}\t{}\t{}\t{}\t{}\t{}\t{}\t{}\t{}\n'.format(fID, RunnerObj.params['encoder']+'-'+RunnerObj.params['decoder'], RunnerObj.randSeed, RunnerObj.params['hidden'], \
            RunnerObj.params['channels'], len(presentNodesSet), len(allNodes), len(set(missingNodes)), len(missingTFs),  len(presentTFs),
            len(onlyTFs),len(sourceNodesCPY),len(sourceNodes), len(train_negIdx), len(test_posIdx), len(test_negIdx), len(val_posIdx), len(val_negIdx)))
    else:
        training_stats_file = open(training_stats_file_name, 'w')
        training_stats_file.write('Fold\tAlgorithm\trandID\t#HiddenLayers\tChannels\tPresentNodes\tAllNodes\tMissingNodes\tMissingTFs\tPresentTFs\tOnlyTFs\tPositiveTrainingEdges \
            \tPositiveTrainingEdgesWithDummyEdges\tNegativeTrainingEdges\tPositiveTestEdges\tNegativeTestEdges\tPositiveValidationEdges\tNegativeValidationEdges\n')
        training_stats_file.write('{}\t{}\t{}\t{}\t{}\t{}\t{}\t{}\t{}\t{}\t{}\t{}\t{}\t{}\t{}\t{}\t{}\t{}\n'.format(fID, RunnerObj.params['encoder']+'-'+RunnerObj.params['decoder'], RunnerObj.randSeed, RunnerObj.params['hidden'], \
            RunnerObj.params['channels'], len(presentNodesSet), len(allNodes), len(set(missingNodes)), len(missingTFs),  len(presentTFs),
            len(onlyTFs),len(sourceNodesCPY),len(sourceNodes), len(train_negIdx), len(test_posIdx), len(test_negIdx), len(val_posIdx), len(val_negIdx)))
    
    writer.close()
Esempio n. 30
0
parser.add_argument('--epochs', type=int, default=500)
parser.add_argument('--runs', type=int, default=10)
parser.add_argument('--rezero', action='store_true')
args = parser.parse_args()
print(args)

device = f'cuda:{args.device}' if torch.cuda.is_available() else 'cpu'
device = torch.device(device)

dataset = PygNodePropPredDataset(name='ogbn-arxiv')
split_idx = dataset.get_idx_split()

data = dataset[0]

edge_index = data.edge_index.to(device)
edge_index = to_undirected(edge_index, data.num_nodes)
adj_0 = SparseTensor(row=edge_index[0], col=edge_index[1])
# Pre-compute GCN normalization.
adj = adj_0.set_diag()
deg = adj.sum(dim=1).to(torch.float)
deg_inv_sqrt = deg.pow(-0.5)
deg_inv_sqrt[deg_inv_sqrt == float('inf')] = 0
adj = deg_inv_sqrt.view(-1, 1) * adj * deg_inv_sqrt.view(1, -1)


class l_GCN(torch.nn.Module):
    def __init__(self, in_channels, out_channels):
        super(l_GCN, self).__init__()

        self.in_channels = in_channels
        self.out_channels = out_channels