Example #1
    def prepare_data(self):
        dataset = MAG240MDataset(self.data_dir)
        path = f'{dataset.dir}/paper_to_paper_symmetric.pt'
        if not osp.exists(path):
            t = time.perf_counter()
            print('Converting adjacency matrix...', end=' ', flush=True)
            edge_index = dataset.edge_index('paper', 'cites', 'paper')
            edge_index = torch.from_numpy(edge_index)
            adj_t = SparseTensor(
                row=edge_index[0], col=edge_index[1],
                sparse_sizes=(dataset.num_papers, dataset.num_papers),
                is_sorted=True)
            torch.save(adj_t.to_symmetric(), path)
            print(f'Done! [{time.perf_counter() - t:.2f}s]')
Example #2
def test_to_symmetric(device):
    row = torch.tensor([0, 0, 0, 1, 1], device=device)
    col = torch.tensor([0, 1, 2, 0, 2], device=device)
    value = torch.arange(1, 6, device=device)
    mat = SparseTensor(row=row, col=col, value=value)
    assert not mat.is_symmetric()

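    # `to_symmetric()` mirrors every entry across the diagonal and, by
    # default, sums values where both (i, j) and (j, i) are present.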
    mat = mat.to_symmetric()

    assert mat.is_symmetric()
    assert mat.to_dense().tolist() == [
        [2, 6, 3],
        [6, 0, 5],
        [3, 5, 0],
    ]
Example #3
    def prepare_data(self):
        dataset = MAG240MDataset(self.data_dir)

        path = f'{dataset.dir}/paper_to_paper_symmetric.pt'
        if not osp.exists(path):  # Will take approximately 5 minutes...
            t = time.perf_counter()
            print('Converting adjacency matrix...', end=' ', flush=True)
            edge_index = dataset.edge_index('paper', 'cites', 'paper')
            edge_index = torch.from_numpy(edge_index)
            adj_t = SparseTensor(row=edge_index[0],
                                 col=edge_index[1],
                                 sparse_sizes=(dataset.num_papers,
                                               dataset.num_papers),
                                 is_sorted=True)
            torch.save(adj_t.to_symmetric(), path)
            print(f'Done! [{time.perf_counter() - t:.2f}s]')

        path = f'{dataset.dir}/full_adj_t.pt'
        if not osp.exists(path):  # Will take approximately 16 minutes...
            t = time.perf_counter()
            print('Merging adjacency matrices...', end=' ', flush=True)

            row, col, _ = torch.load(
                f'{dataset.dir}/paper_to_paper_symmetric.pt').coo()
            rows, cols = [row], [col]

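            # Authors come after papers in the merged node ID space, so
            # author indices are shifted by num_papers.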
            edge_index = dataset.edge_index('author', 'writes', 'paper')
            row, col = torch.from_numpy(edge_index)
            row += dataset.num_papers
            rows += [row, col]
            cols += [col, row]

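            # Institutions come after both papers and authors, so every node
            # type owns a disjoint ID range in the merged graph.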
            edge_index = dataset.edge_index('author', 'institution')
            row, col = torch.from_numpy(edge_index)
            row += dataset.num_papers
            col += dataset.num_papers + dataset.num_authors
            rows += [row, col]
            cols += [col, row]

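            # Tag every edge with the index of the directed relation it came
            # from, so the merged SparseTensor stores edge types as values.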
            edge_types = [
                torch.full(x.size(), i, dtype=torch.int8)
                for i, x in enumerate(rows)
            ]

            row = torch.cat(rows, dim=0)
            del rows
            col = torch.cat(cols, dim=0)
            del cols

            N = (dataset.num_papers + dataset.num_authors +
                 dataset.num_institutions)

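            # Sort edges into row-major order (encode (row, col) as
            # N * row + col) so the final SparseTensor can use is_sorted=True.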
            perm = (N * row).add_(col).numpy().argsort()
            perm = torch.from_numpy(perm)
            row = row[perm]
            col = col[perm]

            edge_type = torch.cat(edge_types, dim=0)[perm]
            del edge_types

            full_adj_t = SparseTensor(row=row,
                                      col=col,
                                      value=edge_type,
                                      sparse_sizes=(N, N),
                                      is_sorted=True)

            torch.save(full_adj_t, path)
            print(f'Done! [{time.perf_counter() - t:.2f}s]')

        path = f'{dataset.dir}/full_feat.npy'
        done_flag_path = f'{dataset.dir}/full_feat_done.txt'
        if not osp.exists(done_flag_path):  # Will take ~3 hours...
            t = time.perf_counter()
            print('Generating full feature matrix...')

            node_chunk_size = 100000
            dim_chunk_size = 64
            N = (dataset.num_papers + dataset.num_authors +
                 dataset.num_institutions)

            paper_feat = dataset.paper_feat
            x = np.memmap(path,
                          dtype=np.float16,
                          mode='w+',
                          shape=(N, self.num_features))

            print('Copying paper features...')
            for i in tqdm(range(0, dataset.num_papers, node_chunk_size)):
                j = min(i + node_chunk_size, dataset.num_papers)
                x[i:j] = paper_feat[i:j]

            edge_index = dataset.edge_index('author', 'writes', 'paper')
            row, col = torch.from_numpy(edge_index)
            adj_t = SparseTensor(row=row,
                                 col=col,
                                 sparse_sizes=(dataset.num_authors,
                                               dataset.num_papers),
                                 is_sorted=True)

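            # Author features are the mean of the features of the papers each
            # author wrote, computed as a sparse-dense matmul.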
            # Processing 64-dim subfeatures at a time for memory efficiency.
            print('Generating author features...')
            for i in tqdm(range(0, self.num_features, dim_chunk_size)):
                j = min(i + dim_chunk_size, self.num_features)
                inputs = get_col_slice(paper_feat,
                                       start_row_idx=0,
                                       end_row_idx=dataset.num_papers,
                                       start_col_idx=i,
                                       end_col_idx=j)
                inputs = torch.from_numpy(inputs)
                outputs = adj_t.matmul(inputs, reduce='mean').numpy()
                del inputs
                save_col_slice(x_src=outputs,
                               x_dst=x,
                               start_row_idx=dataset.num_papers,
                               end_row_idx=dataset.num_papers +
                               dataset.num_authors,
                               start_col_idx=i,
                               end_col_idx=j)
                del outputs

            edge_index = dataset.edge_index('author', 'institution')
            row, col = torch.from_numpy(edge_index)
            adj_t = SparseTensor(row=col,
                                 col=row,
                                 sparse_sizes=(dataset.num_institutions,
                                               dataset.num_authors),
                                 is_sorted=False)

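            # Institution features are likewise the mean over the
            # just-computed features of each institution's affiliated authors.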
            print('Generating institution features...')
            # Processing 64-dim subfeatures at a time for memory efficiency.
            for i in tqdm(range(0, self.num_features, dim_chunk_size)):
                j = min(i + dim_chunk_size, self.num_features)
                inputs = get_col_slice(x,
                                       start_row_idx=dataset.num_papers,
                                       end_row_idx=dataset.num_papers +
                                       dataset.num_authors,
                                       start_col_idx=i,
                                       end_col_idx=j)
                inputs = torch.from_numpy(inputs)
                outputs = adj_t.matmul(inputs, reduce='mean').numpy()
                del inputs
                save_col_slice(x_src=outputs,
                               x_dst=x,
                               start_row_idx=dataset.num_papers +
                               dataset.num_authors,
                               end_row_idx=N,
                               start_col_idx=i,
                               end_col_idx=j)
                del outputs

            x.flush()
            del x
            print(f'Done! [{time.perf_counter() - t:.2f}s]')

            with open(done_flag_path, 'w') as f:
                f.write('done')
Example #4
    path = dataset.root + '/mag240m/paper_to_paper_symmetric_gcn.pt'
    if osp.exists(path):
        adj_t = torch.load(path)
    else:
        path_sym = dataset.root + '/mag240m/paper_to_paper_symmetric.pt'
        if osp.exists(path_sym):
            adj_t = torch.load(path_sym)
        else:
            edge_index = dataset.edge_index('paper', 'cites', 'paper')
            edge_index = torch.from_numpy(edge_index)
            adj_t = SparseTensor(row=edge_index[0],
                                 col=edge_index[1],
                                 sparse_sizes=(dataset.num_papers,
                                               dataset.num_papers),
                                 is_sorted=True)
            adj_t = adj_t.to_symmetric()
            torch.save(adj_t, path_sym)
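        # gcn_norm applies the symmetric GCN normalization
        # D^{-1/2} (A + I) D^{-1/2} (self-loops added before normalizing).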
        adj_t = gcn_norm(adj_t, add_self_loops=True)
        torch.save(adj_t, path)
    print(f'Done! [{time.perf_counter() - t:.2f}s]')

    train_idx = dataset.get_idx_split('train')
    valid_idx = dataset.get_idx_split('valid')
    test_idx = dataset.get_idx_split('test')
    num_features = dataset.num_paper_features

    pbar = tqdm(total=args.num_layers * (num_features // 128))
    pbar.set_description('Pre-processing node features')

    for j in range(0, num_features, 128):  # Run spmm in chunks...
        x = dataset.paper_feat[:, j:min(j + 128, num_features)]
Example #5
def main():
    parser = argparse.ArgumentParser(description='OGBN-MAG (Full-Batch)')
    parser.add_argument('--device', type=int, default=0)
    parser.add_argument('--log_steps', type=int, default=1)
    parser.add_argument('--num_layers', type=int, default=2)
    parser.add_argument('--hidden_channels', type=int, default=64)
    parser.add_argument('--dropout', type=float, default=0.5)
    parser.add_argument('--lr', type=float, default=0.01)
    parser.add_argument('--epochs', type=int, default=50)
    parser.add_argument('--runs', type=int, default=10)
    args = parser.parse_args()
    print(args)

    device = f'cuda:{args.device}' if torch.cuda.is_available() else 'cpu'
    device = torch.device(device)

    dataset = PygNodePropPredDataset(name='ogbn-mag')
    split_idx = dataset.get_idx_split()
    data = dataset[0]

    # We do not consider those attributes for now.
    data.node_year_dict = None
    data.edge_reltype_dict = None

    print(data)

    # Convert to new transposed `SparseTensor` format and add reverse edges.
    data.adj_t_dict = {}
    for keys, (row, col) in data.edge_index_dict.items():
        sizes = (data.num_nodes_dict[keys[0]], data.num_nodes_dict[keys[2]])
        adj = SparseTensor(row=row, col=col, sparse_sizes=sizes)
        # adj = SparseTensor(row=row, col=col)[:sizes[0], :sizes[1]] # TEST
        if keys[0] != keys[2]:
            data.adj_t_dict[keys] = adj.t()
            data.adj_t_dict[(keys[2], 'to', keys[0])] = adj
        else:
            data.adj_t_dict[keys] = adj.to_symmetric()
    data.edge_index_dict = None

    x_types = list(data.x_dict.keys())
    edge_types = list(data.adj_t_dict.keys())

    model = RGCN(data.x_dict['paper'].size(-1), args.hidden_channels,
                 dataset.num_classes, args.num_layers, args.dropout,
                 data.num_nodes_dict, x_types, edge_types)

    data = data.to(device)
    model = model.to(device)
    train_idx = split_idx['train']['paper'].to(device)

    evaluator = Evaluator(name='ogbn-mag')
    logger = Logger(args.runs, args)

    for run in range(args.runs):
        model.reset_parameters()
        optimizer = torch.optim.Adam(model.parameters(), lr=args.lr)
        for epoch in range(1, 1 + args.epochs):
            loss = train(model, data.x_dict, data.adj_t_dict,
                         data.y_dict['paper'], train_idx, optimizer)
            result = test(model, data.x_dict, data.adj_t_dict,
                          data.y_dict['paper'], split_idx, evaluator)
            logger.add_result(run, result)

            if epoch % args.log_steps == 0:
                train_acc, valid_acc, test_acc = result
                print(f'Run: {run + 1:02d}, '
                      f'Epoch: {epoch:02d}, '
                      f'Loss: {loss:.4f}, '
                      f'Train: {100 * train_acc:.2f}%, '
                      f'Valid: {100 * valid_acc:.2f}%, '
                      f'Test: {100 * test_acc:.2f}%')

        logger.print_statistics(run)
    logger.print_statistics()
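
The `train` and `test` helpers (like `RGCN` and `Logger`) are defined
elsewhere in this example's file and are not shown in the excerpt. A minimal
sketch of what `train` might look like for this full-batch setup, assuming
the model's forward returns a dict of per-node-type log-probabilities (not
the repository's exact implementation):

import torch.nn.functional as F

def train(model, x_dict, adj_t_dict, y_true, train_idx, optimizer):
    model.train()
    optimizer.zero_grad()
    out = model(x_dict, adj_t_dict)['paper']  # log-probs for paper nodes
    # y_true has shape [num_papers, 1] in OGB datasets, hence the squeeze.
    loss = F.nll_loss(out[train_idx], y_true[train_idx].squeeze())
    loss.backward()
    optimizer.step()
    return loss.item()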
Example #6
    def prepare_data(self):
        dataset = MAG240MDataset(self.data_dir)

        path = f'{dataset.root}/mag240m/paper_to_paper_symmetric.pt'
        if not osp.exists(path):  # Will take approximately 5 minutes...
            t = time.perf_counter()
            print('Converting adjacency matrix...', end=' ', flush=True)
            edge_index = dataset.edge_index('paper', 'cites', 'paper')
            edge_index = torch.from_numpy(edge_index)
            adj_t = SparseTensor(row=edge_index[0],
                                 col=edge_index[1],
                                 sparse_sizes=(dataset.num_papers,
                                               dataset.num_papers),
                                 is_sorted=True)
            torch.save(adj_t.to_symmetric(), path)
            print(f'Done! [{time.perf_counter() - t:.2f}s]')

        path = f'{dataset.root}/mag240m/full_adj_t.pt'
        if not osp.exists(path):  # Will take approximately 16 minutes...
            t = time.perf_counter()
            print('Merging adjacency matrices...', end=' ', flush=True)

            row, col, _ = torch.load(
                f'{dataset.root}/mag240m/paper_to_paper_symmetric.pt').coo()
            rows, cols = [row], [col]

            edge_index = dataset.edge_index('author', 'writes', 'paper')
            row, col = torch.from_numpy(edge_index)
            row += dataset.num_papers
            rows += [row, col]
            cols += [col, row]

            edge_index = dataset.edge_index('author', 'institution')
            row, col = torch.from_numpy(edge_index)
            row += dataset.num_papers
            col += dataset.num_papers + dataset.num_authors
            rows += [row, col]
            cols += [col, row]

            edge_types = [
                torch.full(x.size(), i, dtype=torch.int8)
                for i, x in enumerate(rows)
            ]

            row = torch.cat(rows, dim=0)
            del rows
            col = torch.cat(cols, dim=0)
            del cols

            N = (dataset.num_papers + dataset.num_authors +
                 dataset.num_institutions)

            perm = (N * row).add_(col).numpy().argsort()
            perm = torch.from_numpy(perm)
            row = row[perm]
            col = col[perm]

            edge_type = torch.cat(edge_types, dim=0)[perm]
            del edge_types

            full_adj_t = SparseTensor(row=row,
                                      col=col,
                                      value=edge_type,
                                      sparse_sizes=(N, N),
                                      is_sorted=True)

            torch.save(full_adj_t, path)
            print(f'Done! [{time.perf_counter() - t:.2f}s]')

        path = f'{dataset.root}/mag240m/full_feat.npy'
        # Flag file indicating whether full_feat.npy has been fully written.
        done_flag_path = f'{dataset.root}/mag240m/full_feat_done.txt'
        if not osp.exists(
                done_flag_path):  # Will take approximately 3 hours...
            if os.path.exists(path):
                print('Removing unfinished full_feat.npy')
                os.remove(path)

            try:
                t = time.perf_counter()
                print('Generating full feature matrix...')

                N = (dataset.num_papers + dataset.num_authors +
                     dataset.num_institutions)

                x = np.memmap(path,
                              dtype=np.float16,
                              mode='w+',
                              shape=(N, self.num_features))
                paper_feat = dataset.paper_feat
                dim_chunk = 64
                chunk = 100000

                print('Copying paper features...')
                for i in tqdm(range(0, dataset.num_papers,
                                    chunk)):  # Copy paper features.
                    end_idx = min(i + chunk, dataset.num_papers)
                    x[i:end_idx] = paper_feat[i:end_idx]

                edge_index = dataset.edge_index('author', 'writes', 'paper')
                row, col = torch.from_numpy(edge_index)
                adj_t = SparseTensor(row=row,
                                     col=col,
                                     sparse_sizes=(dataset.num_authors,
                                                   dataset.num_papers),
                                     is_sorted=True)

                print('Generating author features...')
                # processing 64-dim subfeatures at a time for memory efficiency
                for i in tqdm(range(0, self.num_features, dim_chunk)):
                    end_idx = min(i + dim_chunk, self.num_features)
                    inputs = torch.from_numpy(
                        get_col_slice(paper_feat,
                                      start_row_idx=0,
                                      end_row_idx=len(paper_feat),
                                      start_col_idx=i,
                                      end_col_idx=end_idx))
                    outputs = adj_t.matmul(inputs, reduce='mean').numpy()
                    del inputs
                    save_col_slice(x_from=outputs,
                                   x_to=x,
                                   start_row_idx=dataset.num_papers,
                                   end_row_idx=dataset.num_papers +
                                   dataset.num_authors,
                                   start_col_idx=i,
                                   end_col_idx=end_idx)
                    del outputs

                edge_index = dataset.edge_index('author', 'institution')
                row, col = torch.from_numpy(edge_index)
                adj_t = SparseTensor(row=col,
                                     col=row,
                                     sparse_sizes=(dataset.num_institutions,
                                                   dataset.num_authors),
                                     is_sorted=False)

                print('Generating institution features...')
                # processing 64-dim subfeatures at a time for memory efficiency
                for i in tqdm(range(0, self.num_features, dim_chunk)):
                    end_idx = min(i + dim_chunk, self.num_features)
                    inputs = torch.from_numpy(
                        get_col_slice(x,
                                      start_row_idx=dataset.num_papers,
                                      end_row_idx=dataset.num_papers +
                                      dataset.num_authors,
                                      start_col_idx=i,
                                      end_col_idx=end_idx))
                    outputs = adj_t.matmul(inputs, reduce='mean').numpy()
                    del inputs
                    save_col_slice(x_from=outputs,
                                   x_to=x,
                                   start_row_idx=dataset.num_papers +
                                   dataset.num_authors,
                                   end_row_idx=N,
                                   start_col_idx=i,
                                   end_col_idx=end_idx)
                    del outputs

                x.flush()
                del x
                print(f'Done! [{time.perf_counter() - t:.2f}s]')

                with open(done_flag_path, 'w') as f:
                    f.write('done')

            except Exception:
                traceback.print_exc()
                if os.path.exists(path):
                    print(
                        'Removing unfinished full feat file due to exception')
                    os.remove(path)
                exit(-1)
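
The try/except plus done-flag structure in Example #6 makes the multi-hour
feature generation restartable: an interrupted run never writes the flag, so
the next run deletes the partial file and starts over. A minimal sketch of the
pattern in isolation (function and argument names here are illustrative, not
from the original repository):

import os
import numpy as np

def build_memmap_safely(path, done_flag, shape, fill_fn):
    """Write a np.memmap, marking completion only after a successful flush."""
    if os.path.exists(done_flag):
        return  # already finished on a previous run
    if os.path.exists(path):
        os.remove(path)  # a previous run died mid-write; start over
    try:
        x = np.memmap(path, dtype=np.float16, mode='w+', shape=shape)
        fill_fn(x)  # caller writes the contents, typically in chunks
        x.flush()
        del x
        with open(done_flag, 'w') as f:
            f.write('done')
    except Exception:
        if os.path.exists(path):
            os.remove(path)  # never leave a half-written file behind
        raise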