Example #1
    def setup(self, stage: Optional[str] = None):
        t = time.perf_counter()
        print("Reading dataset...", end=" ", flush=True)
        dataset = MAG240MDataset(self.data_dir)

        self.train_idx = torch.from_numpy(dataset.get_idx_split("train"))
        self.train_idx.share_memory_()
        self.val_idx = torch.from_numpy(dataset.get_idx_split("valid"))
        self.val_idx.share_memory_()
        self.test_idx = torch.from_numpy(dataset.get_idx_split("test"))
        self.test_idx.share_memory_()

        N = dataset.num_papers + dataset.num_authors + dataset.num_institutions

        self.x = np.memmap(
            f"{dataset.dir}/full_feat.npy",
            dtype=np.float16,
            mode="r",
            shape=(N, self.num_features),
        )
        self.y = torch.from_numpy(dataset.all_paper_label)

        path = f"{dataset.dir}/full_adj_t.pt"
        self.adj_t = torch.load(path)
        print(f"Done! [{time.perf_counter() - t:.2f}s]")
Example #2
    def prepare_data(self):
        dataset = MAG240MDataset(self.data_dir)
        edge_path = f'{dataset.dir}/paper_to_paper_symmetric_pgl'

        t = time.perf_counter()
        if not osp.exists(edge_path):
            log.info('Converting adjacency matrix...')
            edge_index = dataset.edge_index('paper', 'cites', 'paper')
            edge_index = edge_index.T

            # keep the integer dtype; a bare np.zeros would upcast the
            # stacked edge array to float64
            edges_new = np.zeros((edge_index.shape[0], 2), dtype=edge_index.dtype)
            edges_new[:, 0] = edge_index[:, 1]
            edges_new[:, 1] = edge_index[:, 0]

            edge_index = np.vstack((edge_index, edges_new))
            edge_index = np.unique(edge_index, axis=0)

            graph = Graph(edge_index, sorted=True)
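            # Touching adj_dst_index here builds the destination-sorted
            # index so it is materialized before the graph is dumped.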
            graph.adj_dst_index
            graph.dump(edge_path)
            log.info(f'Done! [{time.perf_counter() - t:.2f}s]')

        np.random.seed(self.seed)
        self.train_idx = dataset.get_idx_split('train')
        np.random.shuffle(self.train_idx)

        self.val_idx = dataset.get_idx_split('valid')
        self.test_idx = dataset.get_idx_split('test')

        self.x = dataset.paper_feat
        self.y = dataset.all_paper_label

        self.graph = Graph.load(edge_path, mmap_mode='r+')
        log.info(f'Done! [{time.perf_counter() - t:.2f}s]')
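
For reference, the symmetrization step in the block above (append the reversed edges, then deduplicate) can be sanity-checked on a toy edge list; this sketch is illustration only and does not touch MAG240M:

import numpy as np

edges = np.array([[0, 1], [1, 2], [2, 0]])        # directed citations
rev = edges[:, ::-1]                              # reversed copies
sym = np.unique(np.vstack([edges, rev]), axis=0)  # drop duplicate pairs
print(sym)  # every edge now appears in both directions exactly once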
Example #3
    def setup(self, stage: Optional[str] = None):
        t = time.perf_counter()
        print('Reading dataset...', end=' ', flush=True)
        dataset = MAG240MDataset(self.data_dir)

        self.train_idx = torch.from_numpy(dataset.get_idx_split('train'))
        self.train_idx.share_memory_()
        self.val_idx = torch.from_numpy(dataset.get_idx_split('valid'))
        self.val_idx.share_memory_()
        self.test_idx = torch.from_numpy(dataset.get_idx_split('test'))
        self.test_idx.share_memory_()

        N = dataset.num_papers + dataset.num_authors + dataset.num_institutions

        x = np.memmap(f'{dataset.dir}/full_feat.npy',
                      dtype=np.float16,
                      mode='r',
                      shape=(N, self.num_features))

        if self.in_memory:
            self.x = np.empty((N, self.num_features), dtype=np.float16)
            self.x[:] = x
            self.x = torch.from_numpy(self.x).share_memory_()
        else:
            self.x = x

        self.y = torch.from_numpy(dataset.all_paper_label)

        path = f'{dataset.dir}/full_adj_t.pt'
        self.adj_t = torch.load(path)
        print(f'Done! [{time.perf_counter() - t:.2f}s]')
Example #4
    def prepare_data(self):
        dataset = MAG240MDataset(self.data_dir)

        log.info(dataset.num_authors)
        log.info(dataset.num_institutions)

        author_path = f'{dataset.dir}/author_feat_year.npy'
        path = f'{dataset.dir}/institution_feat_year.npy'
        t = time.perf_counter()
        if not osp.exists(path):
            log.info('get institution_feat...')

            author_feat = np.memmap(author_path,
                                    dtype=np.int32,
                                    mode='r',
                                    shape=(dataset.num_authors, ))
            author_feat = author_feat[:]
            author_feat = np.expand_dims(author_feat, axis=1)
            # author
            edge_index = dataset.edge_index('author', 'institution')
            edge_index = edge_index.T
            log.info(edge_index.shape)
            institution_graph = Graph(edge_index,
                                      num_nodes=dataset.num_institutions)
            institution_graph.tensor()
            log.info('finish institution graph')

            institution_x = np.memmap(path,
                                      dtype=np.int32,
                                      mode='w+',
                                      shape=(dataset.num_institutions, ))

            degree = paddle.zeros(shape=[dataset.num_institutions, 1],
                                  dtype='float32')
            degree += 1e-10  # avoid divide-by-zero for isolated institutions
            temp_one = paddle.ones(shape=[edge_index.shape[0], 1],
                                   dtype='float32')
            degree = scatter(degree,
                             overwrite=False,
                             index=institution_graph.edges[:, 1],
                             updates=temp_one)
            log.info('finish degree')

            inputs = author_feat

            inputs = paddle.to_tensor(inputs, dtype='float32')
            outputs = institution_graph.send_recv(inputs)
            outputs = outputs / degree
            outputs = outputs.astype('int32').numpy()

            del inputs
            save_col_slice(x_src=outputs,
                           x_dst=institution_x,
                           start_row_idx=0,
                           end_row_idx=dataset.num_institutions)
            del outputs

            institution_x.flush()
            del institution_x
            log.info(f'Done! [{time.perf_counter() - t:.2f}s]')
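
The send_recv-plus-degree division above computes, for each institution, the mean publication year of its authors. The same mean aggregation can be written with plain NumPy on toy sizes (np.add.at playing the role of the overwrite=False scatter), which may help when checking the logic:

import numpy as np

author_year = np.array([2000.0, 2010.0, 2020.0, 1990.0])  # 4 authors
dst = np.array([0, 0, 1, 1])  # author -> institution assignment

total = np.zeros(2)                 # 2 institutions
np.add.at(total, dst, author_year)  # scatter-sum over destinations
degree = np.zeros(2)
np.add.at(degree, dst, 1.0)
mean_year = (total / np.maximum(degree, 1e-10)).astype(np.int32)
print(mean_year)  # [2005 2005]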
Example #5
    def prepare_data(self):
        dataset = MAG240MDataset(self.data_dir)
        path = f'{dataset.dir}/paper_to_paper_symmetric.pt'
        if not osp.exists(path):
            t = time.perf_counter()
            print('Converting adjacency matrix...', end=' ', flush=True)
            edge_index = dataset.edge_index('paper', 'cites', 'paper')
            edge_index = torch.from_numpy(edge_index)
            adj_t = SparseTensor(
                row=edge_index[0], col=edge_index[1],
                sparse_sizes=(dataset.num_papers, dataset.num_papers),
                is_sorted=True)
            torch.save(adj_t.to_symmetric(), path)
            print(f'Done! [{time.perf_counter() - t:.2f}s]')
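
to_symmetric() above returns the union of the sparse matrix with its transpose. A tiny stand-alone check with torch_sparse (illustrative values only):

import torch
from torch_sparse import SparseTensor

adj = SparseTensor(row=torch.tensor([0, 1]),
                   col=torch.tensor([1, 2]),
                   sparse_sizes=(3, 3))
row, col, _ = adj.to_symmetric().coo()
print(list(zip(row.tolist(), col.tolist())))
# [(0, 1), (1, 0), (1, 2), (2, 1)] -- both directions present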
Example #6
def split_dir(data_dir, output_dir):
    dataset = MAG240MDataset(data_dir)
    valid_idx = dataset.get_idx_split("valid")
    np.random.seed(999)
    np.random.shuffle(valid_idx)
    end = len(valid_idx)
    part = len(valid_idx) // 5 + 1

    for idx, x in enumerate(range(0, end, part)):
        y = min(x + part, end)
        valid_part = valid_idx[x:y]
        print(valid_part.shape)
        path_p = f"{output_dir}/valid_{idx}"
        np.save(path_p, valid_part)
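
np.array_split gives a near-equivalent five-way split without the manual `// 5 + 1` stride arithmetic (chunk sizes differ by at most one, instead of leaving one small remainder chunk); a sketch with a toy index array:

import numpy as np

valid_idx = np.arange(17)
np.random.seed(999)
np.random.shuffle(valid_idx)
for idx, valid_part in enumerate(np.array_split(valid_idx, 5)):
    print(idx, valid_part.shape)  # sizes 4, 4, 3, 3, 3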
Example #7
    def prepare_data(self):
        dataset = MAG240MDataset(self.data_dir)
        
        log.info(dataset.num_authors)
        log.info(dataset.num_papers)
        
        path = f'{dataset.dir}/author_feat_year.npy'
        t = time.perf_counter()
        if not osp.exists(path):
            log.info('get author_feat...')
            paper_feat = dataset.all_paper_year
            paper_feat = np.expand_dims(paper_feat, axis=1)
            # author
            edge_index = dataset.edge_index('author', 'writes', 'paper')
            edge_index = edge_index.T
            row, col = edge_index[:, 0], edge_index[:, 1]
            edge_index = np.stack([col, row], axis=1)
            log.info(edge_index.shape)
            author_graph = BiGraph(edge_index, dst_num_nodes=dataset.num_authors)
            author_graph.tensor()
            log.info('finish author graph')
            
            author_x_year = np.memmap(path, dtype=np.int32, mode='w+',
                          shape=(dataset.num_authors,))
            
            degree = paddle.zeros(shape=[dataset.num_authors, 1], dtype='float32')
            degree += 1e-10
            temp_one = paddle.ones(shape=[edge_index.shape[0], 1], dtype='float32')
            degree = scatter(degree, author_graph.edges[:, 1], temp_one, overwrite=False)
            log.info('finish degree')
            
#             inputs = get_col_slice(paper_feat, start_row_idx=0,
#                                    end_row_idx=dataset.num_papers)
            inputs = paper_feat
            inputs = paddle.to_tensor(inputs, dtype='float32')
            outputs = author_graph.send_recv(inputs)
            outputs = outputs / degree
            outputs = outputs.astype('int32').numpy()
                
            del inputs
            save_col_slice(
                x_src=outputs, x_dst=author_x_year, start_row_idx=0,
                end_row_idx=dataset.num_authors)
            del outputs
                
            author_x_year.flush()
            del author_x_year
            log.info(f'Done! [{time.perf_counter() - t:.2f}s]')
Example #8
def create_split_dir(source_dir, split_dir):
    dataset = MAG240MDataset(source_dir)
    split_dir_exists = os.path.exists(split_dir)
    if not split_dir_exists:
        os.mkdir(split_dir)

    valid_idx = dataset.get_idx_split("valid")
    np.random.seed(999)
    np.random.shuffle(valid_idx)
    end = len(valid_idx)
    part = len(valid_idx) // 5 + 1

    for idx, x in enumerate(range(0, end, part)):
        y = min(x + part, end)
        valid_part = valid_idx[x: y]
        print(valid_part.shape)
        split_file = f"{split_dir}/valid_{idx}"
        np.save(split_file, valid_part)
Example #9
    def setup(self, stage: Optional[str] = None):
        t = time.perf_counter()
        print('Reading dataset...', end=' ', flush=True)
        dataset = MAG240MDataset(self.data_dir)

        self.train_idx = torch.from_numpy(dataset.get_idx_split('train'))
        self.train_idx.share_memory_()
        self.val_idx = torch.from_numpy(dataset.get_idx_split('valid'))
        self.val_idx.share_memory_()
        self.test_idx = torch.from_numpy(dataset.get_idx_split('test'))
        self.test_idx.share_memory_()

        self.x = dataset.paper_feat
        self.y = torch.from_numpy(dataset.all_paper_label)

        path = f'{dataset.dir}/paper_to_paper_symmetric.pt'
        self.adj_t = torch.load(path)
        print(f'Done! [{time.perf_counter() - t:.2f}s]')
Example #10
def get_result(config, eval_all=False):
    dataset = MAG240MDataset(config.data_dir)
    evaluator = MAG240MEvaluator()
    file = 'model_result_temp'
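    # Memmapped model predictions over every paper: (num_papers, num_classes).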
    sudo_label = np.memmap(file,
                           dtype=np.float32,
                           mode='r',
                           shape=(121751666, 153))
    file = "ck_result.txt"
    wf = open(file, 'a', encoding='utf-8')
    label = dataset.all_paper_label
    if eval_all:
        valid_idx = dataset.get_idx_split('valid')
        pred = sudo_label[valid_idx]
        save_path = os.path.join(config.valid_path, "all_eval_result")
        np.save(save_path, pred)
        y_pred = pred.argmax(1)
        y_true = label[valid_idx]
        valid_acc = evaluator.eval({'y_true': y_true, 'y_pred': y_pred})['acc']
        print("all eval result\n")
        print(f"valid_acc: {valid_acc}\n")
        wf.write("all eval result\n")
        wf.write(f"valid_acc: {valid_acc}\n")

    else:
        valid_path = os.path.join(config.valid_path, config.valid_name)
        valid_idx = np.load(valid_path)
        test_idx = dataset.get_idx_split('test')

        pred = sudo_label[valid_idx]
        y_pred = pred.argmax(1)
        y_true = label[valid_idx]
        valid_acc = evaluator.eval({'y_true': y_true, 'y_pred': y_pred})['acc']
        print(f"eval cv {config.valid_name} result\n")
        print(f"valid_acc: {valid_acc}\n")
        wf.write(f"eval cv {config.valid_name} result\n")
        wf.write(f"valid_acc: {valid_acc}\n")

        save_path_test = os.path.join(config.valid_path, config.test_name)
        pred_test = sudo_label[test_idx]
        print(pred_test.shape)
        np.save(save_path_test, pred_test)

    wf.close()
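
The accuracy reported above is plain top-1 agreement between the argmax predictions and the labels; for this single-label task, MAG240MEvaluator (from ogb.lsc) computes exactly that, as a toy check shows:

import numpy as np
from ogb.lsc import MAG240MEvaluator

y_true = np.array([0, 1, 2, 1])
y_pred = np.array([0, 2, 2, 1])
acc = MAG240MEvaluator().eval({'y_true': y_true, 'y_pred': y_pred})['acc']
print(acc, (y_true == y_pred).mean())  # both print 0.75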
Example #11
import argparse
import time
import os.path as osp

from tqdm import tqdm

import torch
import numpy as np
from torch_sparse import SparseTensor
from torch_geometric.nn.conv.gcn_conv import gcn_norm
from ogb.lsc import MAG240MDataset
from root import ROOT

if __name__ == '__main__':
    parser = argparse.ArgumentParser()
    parser.add_argument('--num_layers', type=int, default=3)
    args = parser.parse_args()
    print(args)

    dataset = MAG240MDataset(ROOT)

    t = time.perf_counter()
    print('Reading adjacency matrix...', end=' ', flush=True)
    path = dataset.root + '/mag240m/paper_to_paper_symmetric_gcn.pt'
    if osp.exists(path):
        adj_t = torch.load(path)
    else:
        path_sym = dataset.root + '/mag240m/paper_to_paper_symmetric.pt'
        if osp.exists(path_sym):
            adj_t = torch.load(path_sym)
        else:
            edge_index = dataset.edge_index('paper', 'cites', 'paper')
            edge_index = torch.from_numpy(edge_index)
            adj_t = SparseTensor(row=edge_index[0],
                                 col=edge_index[1],
                                 sparse_sizes=(dataset.num_papers,
                                               dataset.num_papers),
                                 is_sorted=True)
            adj_t = adj_t.to_symmetric()
            torch.save(adj_t, path_sym)
Example #12
    def prepare_data(self):
        dataset = MAG240MDataset(self.data_dir)

        graph_file_list = []
        paper_edge_path = f'{dataset.dir}/paper_to_paper_symmetric_pgl_split'
        graph_file_list.append(paper_edge_path)
        t = time.perf_counter()
        if not osp.exists(paper_edge_path):
            log.info('Converting adjacency matrix...')
            edge_index = dataset.edge_index('paper', 'cites', 'paper')
            edge_index = edge_index.T

            edges_new = np.zeros((edge_index.shape[0], 2), dtype=edge_index.dtype)
            edges_new[:, 0] = edge_index[:, 1]
            edges_new[:, 1] = edge_index[:, 0]
            edge_index = np.vstack((edge_index, edges_new))
            edge_types = np.full([edge_index.shape[0]], 0, dtype='int32')

            graph = Graph(edge_index,
                          num_nodes=dataset.num_papers,
                          edge_feat={'edge_type': edge_types})
            graph.adj_dst_index
            graph.dump(paper_edge_path)
            log.info(f'Done! [{time.perf_counter() - t:.2f}s]')

        author_edge_path = f'{dataset.dir}/paper_to_author_symmetric_pgl_split_src'
        graph_file_list.append(author_edge_path)
        t = time.perf_counter()
        if not osp.exists(author_edge_path):
            log.info('Converting author matrix...')

            # author
            log.info('adding author edges')
            edge_index = dataset.edge_index('author', 'writes', 'paper')
            edge_index = edge_index.T
            row, col = edge_index[:, 0], edge_index[:, 1]
            log.info(row[:10])
            row += dataset.num_papers

            edge_types = np.full(row.shape, 1, dtype='int32')
            edge_index = np.stack([row, col], axis=1)

            graph = Graph(edge_index, edge_feat={'edge_type': edge_types})
            graph.adj_dst_index
            graph.dump(author_edge_path)
            log.info(
                f'Done! finish author_edge [{time.perf_counter() - t:.2f}s]')

        author_edge_path = f'{dataset.dir}/paper_to_author_symmetric_pgl_split_dst'
        graph_file_list.append(author_edge_path)
        t = time.perf_counter()
        if not osp.exists(author_edge_path):
            log.info('Converting author matrix...')

            # author
            log.info('adding author edges')
            edge_index = dataset.edge_index('author', 'writes', 'paper')
            edge_index = edge_index.T
            row, col = edge_index[:, 0], edge_index[:, 1]
            log.info(row[:10])
            row += dataset.num_papers

            edge_types = np.full(row.shape, 2, dtype='int32')
            edge_index = np.stack([col, row], axis=1)

            graph = Graph(edge_index, edge_feat={'edge_type': edge_types})
            graph.adj_dst_index
            graph.dump(author_edge_path)
            log.info(
                f'Done! finish author_edge [{time.perf_counter() - t:.2f}s]')

        institution_edge_path = f'{dataset.dir}/institution_edge_symmetric_pgl_split_src'
        graph_file_list.append(institution_edge_path)
        t = time.perf_counter()
        if not osp.exists(institution_edge_path):
            log.info('Converting institution matrix...')

            # institution
            log.info('adding institution edges')
            edge_index = dataset.edge_index('author', 'institution')
            edge_index = edge_index.T
            row, col = edge_index[:, 0], edge_index[:, 1]
            log.info(row[:10])
            row += dataset.num_papers
            col += dataset.num_papers + dataset.num_authors

            # edge_type
            log.info('building edge type')
            edge_types = np.full(row.shape, 3, dtype='int32')
            edge_index = np.stack([row, col], axis=1)

            graph = Graph(edge_index, edge_feat={'edge_type': edge_types})
            graph.adj_dst_index
            graph.dump(institution_edge_path)
            log.info(
                f'Done! finish institution_edge [{time.perf_counter() - t:.2f}s]'
            )

        institution_edge_path = f'{dataset.dir}/institution_edge_symmetric_pgl_split_dst'
        graph_file_list.append(institution_edge_path)
        t = time.perf_counter()
        if not osp.exists(institution_edge_path):
            log.info('Converting institution matrix...')

            # institution
            log.info('adding institution edges')
            edge_index = dataset.edge_index('author', 'institution')
            edge_index = edge_index.T
            row, col = edge_index[:, 0], edge_index[:, 1]
            log.info(row[:10])
            row += dataset.num_papers
            col += dataset.num_papers + dataset.num_authors

            # edge_type
            log.info('building edge type')
            edge_types = np.full(row.shape, 4, dtype='int32')
            edge_index = np.stack([col, row], axis=1)

            graph = Graph(edge_index, edge_feat={'edge_type': edge_types})
            graph.adj_dst_index
            graph.dump(institution_edge_path)
            log.info(
                f'Done! finish institution_edge [{time.perf_counter() - t:.2f}s]'
            )

        path = f'{dataset.dir}/full_feat.npy'

        author_feat_path = f'{dataset.dir}/author_feat.npy'

        institution_feat_path = f'{dataset.dir}/institution_feat.npy'

        t = time.perf_counter()
        if not osp.exists(path):  # Will take ~3 hours...
            print('Generating full feature matrix...')

            node_chunk_size = 100000
            N = (dataset.num_papers + dataset.num_authors +
                 dataset.num_institutions)

            paper_feat = dataset.paper_feat

            author_feat = np.memmap(author_feat_path,
                                    dtype=np.float16,
                                    shape=(dataset.num_authors,
                                           self.num_features),
                                    mode='r')

            institution_feat = np.memmap(institution_feat_path,
                                         dtype=np.float16,
                                         shape=(dataset.num_institutions,
                                                self.num_features),
                                         mode='r')

            x = np.memmap(path,
                          dtype=np.float16,
                          mode='w+',
                          shape=(N, self.num_features))

            print('Copying paper features...')
            start_idx = 0
            end_idx = dataset.num_papers
            for i in tqdm(range(start_idx, end_idx, node_chunk_size)):
                j = min(i + node_chunk_size, end_idx)
                x[i:j] = paper_feat[i:j]
            del paper_feat

            print('Copying author feature...')
            start_idx = dataset.num_papers
            end_idx = dataset.num_papers + dataset.num_authors
            for i in tqdm(range(start_idx, end_idx, node_chunk_size)):
                j = min(i + node_chunk_size, end_idx)
                x[i:j] = author_feat[i - start_idx:j - start_idx]
            del author_feat

            print('Copying institution feature...')
            start_idx = dataset.num_papers + dataset.num_authors
            end_idx = dataset.num_papers + dataset.num_authors + dataset.num_institutions
            for i in tqdm(range(start_idx, end_idx, node_chunk_size)):
                j = min(i + node_chunk_size, end_idx)
                x[i:j] = institution_feat[i - start_idx:j - start_idx]
            del institution_feat

            x.flush()
            del x
            print(f'Done! [{time.perf_counter() - t:.2f}s]')

        np.random.seed(self.seed)
        self.train_idx = dataset.get_idx_split('train')
        self.val_idx = dataset.get_idx_split('valid')
        valid_name = os.path.join(self.valid_path, self.valid_name)
        self.val_idx_cv = np.load(valid_name)
        log.info(self.train_idx.shape)
        log.info(self.val_idx.shape)
        log.info(self.val_idx_cv.shape)
        self.test_idx = dataset.get_idx_split('test')

        ##self.val_idx = np.load('valid_idx_eval.npy')
        def get_sinusoid_encoding_table(n_position, d_hid, padding_idx=None):
            def cal_angle(position, hid_idx):
                return position / np.power(10000, 2 * (hid_idx // 2) / d_hid)

            def get_posi_angle_vec(position):
                return [cal_angle(position, hid_j) for hid_j in range(d_hid)]

            sinusoid_table = np.array(
                [get_posi_angle_vec(pos_i) for pos_i in range(n_position)])
            sinusoid_table[:, 0::2] = np.sin(sinusoid_table[:, 0::2])  # dim 2i
            sinusoid_table[:, 1::2] = np.cos(sinusoid_table[:,
                                                            1::2])  # dim 2i+1
            return sinusoid_table

        N = dataset.num_papers + dataset.num_authors + dataset.num_institutions
        self.x = np.memmap(f'{dataset.dir}/full_feat.npy',
                           dtype=np.float16,
                           mode='r',
                           shape=(N, self.num_features))

        self.id_x = np.memmap(f'{dataset.dir}/{self.m2v_file}',
                              dtype=np.float16,
                              mode='r',
                              shape=(N, self.m2v_dim))

        self.y = dataset.all_paper_label

        self.graph = [
            Graph.load(edge_path, mmap_mode='r+')
            for edge_path in graph_file_list
        ]

        self.pos = get_sinusoid_encoding_table(200, 768)
        #self.year = dataset.all_paper_year
        year_file = f'{dataset.dir}/all_feat_year.npy'
        self.year = np.memmap(year_file, dtype=np.int32, mode='r', shape=(N, ))
        self.num_papers = dataset.num_papers
        self.train_idx_label = None
        self.train_idx_data = None
        log.info(f'Done! [{time.perf_counter() - t:.2f}s]')
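
get_sinusoid_encoding_table above is the standard Transformer positional encoding (sin on even dimensions, cos on odd ones). A vectorized version of the same construction, handy as a shape/value check against the nested-loop variant:

import numpy as np

def sinusoid_table(n_position, d_hid):
    pos = np.arange(n_position)[:, None]
    dim = np.arange(d_hid)[None, :]
    angle = pos / np.power(10000, 2 * (dim // 2) / d_hid)
    return np.where(dim % 2 == 0, np.sin(angle), np.cos(angle))

table = sinusoid_table(200, 768)
print(table.shape, table[0, 0], table[0, 1])  # (200, 768) 0.0 1.0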
Example #13
    parser.add_argument('--full-feature-path',
                        type=str,
                        help='Path to the features of all nodes.')
    parser.add_argument('--epochs',
                        type=int,
                        default=100,
                        help='Number of epochs.')
    parser.add_argument('--model-path',
                        type=str,
                        default='./model.pt',
                        help='Path to store the best model.')
    parser.add_argument('--submission-path',
                        type=str,
                        default='./results',
                        help='Submission directory.')
    args = parser.parse_args()

    dataset = MAG240MDataset(root=args.rootdir)

    print('Loading graph')
    (g, ), _ = dgl.load_graphs(args.graph_path)
    g = g.formats(['csc'])

    print('Loading features')
    paper_offset = dataset.num_authors + dataset.num_institutions
    num_nodes = paper_offset + dataset.num_papers
    num_features = dataset.num_paper_features
    feats = np.memmap(args.full_feature_path,
                      mode='r',
                      dtype='float16',
                      shape=(num_nodes, num_features))

    if args.epochs != 0:
Example #14
    def prepare_data(self):
        dataset = MAG240MDataset(self.data_dir)

        graph_file_list = []
        paper_edge_path = f'{dataset.dir}/paper_to_paper_symmetric_pgl_split'
        graph_file_list.append(paper_edge_path)
        t = time.perf_counter()
        if not osp.exists(paper_edge_path):
            log.info('Converting adjacency matrix...')
            edge_index = dataset.edge_index('paper', 'cites', 'paper')
            edge_index = edge_index.T

            edges_new = np.zeros((edge_index.shape[0], 2), dtype=edge_index.dtype)
            edges_new[:, 0] = edge_index[:, 1]
            edges_new[:, 1] = edge_index[:, 0]
            edge_index = np.vstack((edge_index, edges_new))
            edge_types = np.full([edge_index.shape[0]], 0, dtype='int32')

            graph = Graph(edge_index,
                          num_nodes=dataset.num_papers,
                          edge_feat={'edge_type': edge_types})
            graph.adj_dst_index
            graph.dump(paper_edge_path)
            log.info(f'Done! [{time.perf_counter() - t:.2f}s]')

        author_edge_path = f'{dataset.dir}/paper_to_author_symmetric_pgl_split_src'
        graph_file_list.append(author_edge_path)
        t = time.perf_counter()
        if not osp.exists(author_edge_path):
            log.info('Converting author matrix...')

            # author
            log.info('adding author edges')
            edge_index = dataset.edge_index('author', 'writes', 'paper')
            edge_index = edge_index.T
            row, col = edge_index[:, 0], edge_index[:, 1]
            log.info(row[:10])
            row += dataset.num_papers

            edge_types = np.full(row.shape, 1, dtype='int32')
            edge_index = np.stack([row, col], axis=1)

            graph = Graph(edge_index, edge_feat={'edge_type': edge_types})
            graph.adj_dst_index
            graph.dump(author_edge_path)
            log.info(
                f'Done! finish author_edge [{time.perf_counter() - t:.2f}s]')

        author_edge_path = f'{dataset.dir}/paper_to_author_symmetric_pgl_split_dst'
        graph_file_list.append(author_edge_path)
        t = time.perf_counter()
        if not osp.exists(author_edge_path):
            log.info('Converting author matrix...')

            # author
            log.info('adding author edges')
            edge_index = dataset.edge_index('author', 'writes', 'paper')
            edge_index = edge_index.T
            row, col = edge_index[:, 0], edge_index[:, 1]
            log.info(row[:10])
            row += dataset.num_papers

            edge_types = np.full(row.shape, 2, dtype='int32')
            edge_index = np.stack([col, row], axis=1)

            graph = Graph(edge_index, edge_feat={'edge_type': edge_types})
            graph.adj_dst_index
            graph.dump(author_edge_path)
            log.info(
                f'Done! finish author_edge [{time.perf_counter() - t:.2f}s]')

        institution_edge_path = f'{dataset.dir}/institution_edge_symmetric_pgl_split_src'
        graph_file_list.append(institution_edge_path)
        t = time.perf_counter()
        if not osp.exists(institution_edge_path):
            log.info('Converting institution matrix...')

            # institution
            log.info('adding institution edges')
            edge_index = dataset.edge_index('author', 'institution')
            edge_index = edge_index.T
            row, col = edge_index[:, 0], edge_index[:, 1]
            log.info(row[:10])
            row += dataset.num_papers
            col += dataset.num_papers + dataset.num_authors

            # edge_type
            log.info('building edge type')
            edge_types = np.full(row.shape, 3, dtype='int32')
            edge_index = np.stack([row, col], axis=1)

            graph = Graph(edge_index, edge_feat={'edge_type': edge_types})
            graph.adj_dst_index
            graph.dump(institution_edge_path)
            log.info(
                f'Done! finish institution_edge [{time.perf_counter() - t:.2f}s]'
            )

        institution_edge_path = f'{dataset.dir}/institution_edge_symmetric_pgl_split_dst'
        graph_file_list.append(institution_edge_path)
        t = time.perf_counter()
        if not osp.exists(institution_edge_path):
            log.info('Converting institution matrix...')

            # institution
            log.info('adding institution edges')
            edge_index = dataset.edge_index('author', 'institution')
            edge_index = edge_index.T
            row, col = edge_index[:, 0], edge_index[:, 1]
            log.info(row[:10])
            row += dataset.num_papers
            col += dataset.num_papers + dataset.num_authors

            # edge_type
            log.info('building edge type')
            edge_types = np.full(row.shape, 4, dtype='int32')
            edge_index = np.stack([col, row], axis=1)

            graph = Graph(edge_index, edge_feat={'edge_type': edge_types})
            graph.adj_dst_index
            graph.dump(institution_edge_path)
            log.info(
                f'Done! finish institution_edge [{time.perf_counter() - t:.2f}s]'
            )

        path = f'{dataset.dir}/full_feat.npy'

        author_feat_path = f'{dataset.dir}/author_feat.npy'

        institution_feat_path = f'{dataset.dir}/institution_feat.npy'

        t = time.perf_counter()
        if not osp.exists(path):  # Will take ~3 hours...
            print('Generating full feature matrix...')

            node_chunk_size = 100000
            N = (dataset.num_papers + dataset.num_authors +
                 dataset.num_institutions)

            paper_feat = dataset.paper_feat

            author_feat = np.memmap(author_feat_path,
                                    dtype=np.float16,
                                    shape=(dataset.num_authors,
                                           self.num_features),
                                    mode='r')

            institution_feat = np.memmap(institution_feat_path,
                                         dtype=np.float16,
                                         shape=(dataset.num_institutions,
                                                self.num_features),
                                         mode='r')

            x = np.memmap(path,
                          dtype=np.float16,
                          mode='w+',
                          shape=(N, self.num_features))

            print('Copying paper features...')
            start_idx = 0
            end_idx = dataset.num_papers
            for i in tqdm(range(start_idx, end_idx, node_chunk_size)):
                j = min(i + node_chunk_size, end_idx)
                x[i:j] = paper_feat[i:j]
            del paper_feat

            print('Copying author feature...')
            start_idx = dataset.num_papers
            end_idx = dataset.num_papers + dataset.num_authors
            for i in tqdm(range(start_idx, end_idx, node_chunk_size)):
                j = min(i + node_chunk_size, end_idx)
                x[i:j] = author_feat[i - start_idx:j - start_idx]
            del author_feat

            print('Copying institution feature...')
            start_idx = dataset.num_papers + dataset.num_authors
            end_idx = dataset.num_papers + dataset.num_authors + dataset.num_institutions
            for i in tqdm(range(start_idx, end_idx, node_chunk_size)):
                j = min(i + node_chunk_size, end_idx)
                x[i:j] = institution_feat[i - start_idx:j - start_idx]
            del institution_feat

            x.flush()
            del x
            print(f'feature x Done! [{time.perf_counter() - t:.2f}s]')

        path = f'{dataset.dir}/all_feat_year.npy'

        author_year_path = f'{dataset.dir}/author_feat_year.npy'

        institution_year_path = f'{dataset.dir}/institution_feat_year.npy'

        t = time.perf_counter()
        if not osp.exists(path):  # Will take ~3 hours...
            print('Generating full year matrix...')

            node_chunk_size = 100000
            N = (dataset.num_papers + dataset.num_authors +
                 dataset.num_institutions)

            paper_year_feat = dataset.all_paper_year

            author_year_feat = np.memmap(author_year_path,
                                         dtype=np.int32,
                                         shape=(dataset.num_authors,),
                                         mode='r')

            institution_year_feat = np.memmap(institution_year_path,
                                              dtype=np.int32,
                                              shape=(dataset.num_institutions,),
                                              mode='r')

            x = np.memmap(path, dtype=np.int32, mode='w+', shape=(N,))

            print('Copying paper features...')
            start_idx = 0
            end_idx = dataset.num_papers
            for i in tqdm(range(start_idx, end_idx, node_chunk_size)):
                j = min(i + node_chunk_size, end_idx)
                x[i:j] = paper_year_feat[i:j]
            del paper_year_feat

            print('Copying author feature...')
            start_idx = dataset.num_papers
            end_idx = dataset.num_papers + dataset.num_authors
            for i in tqdm(range(start_idx, end_idx, node_chunk_size)):
                j = min(i + node_chunk_size, end_idx)
                x[i:j] = author_year_feat[i - start_idx:j - start_idx]
            del author_year_feat

            print('Copying institution feature...')
            start_idx = dataset.num_papers + dataset.num_authors
            end_idx = dataset.num_papers + dataset.num_authors + dataset.num_institutions
            for i in tqdm(range(start_idx, end_idx, node_chunk_size)):
                j = min(i + node_chunk_size, end_idx)
                x[i:j] = institution_year_feat[i - start_idx:j - start_idx]
            del institution_year_feat

            x.flush()
            del x
            print(f'year feature Done! [{time.perf_counter() - t:.2f}s]')
Example #15
    def prepare_data(self):
        dataset = MAG240MDataset(self.data_dir)

        log.info(dataset.num_authors)
        log.info(dataset.num_institutions)

        author_path = f'{dataset.dir}/author_feat.npy'
        path = f'{dataset.dir}/institution_feat.npy'
        t = time.perf_counter()
        if not osp.exists(path):
            log.info('get institution_feat...')

            author_feat = np.memmap(author_path,
                                    dtype=np.float16,
                                    shape=(dataset.num_authors,
                                           self.num_features),
                                    mode='r')
            # author
            edge_index = dataset.edge_index('author', 'institution')
            edge_index = edge_index.T
            log.info(edge_index.shape)
            institution_graph = BiGraph(edge_index,
                                        dst_num_nodes=dataset.num_institutions)
            institution_graph.tensor()
            log.info('finish institution graph')

            institution_x = np.memmap(path,
                                      dtype=np.float16,
                                      mode='w+',
                                      shape=(dataset.num_institutions,
                                             self.num_features))
            dim_chunk_size = 64

            degree = paddle.zeros(shape=[dataset.num_institutions, 1],
                                  dtype='float32')
            degree += 1e-10  # avoid divide-by-zero for isolated institutions
            temp_one = paddle.ones(shape=[edge_index.shape[0], 1],
                                   dtype='float32')
            degree = scatter(degree,
                             overwrite=False,
                             index=institution_graph.edges[:, 1],
                             updates=temp_one)
            log.info('finish degree')

            for i in tqdm(range(0, self.num_features, dim_chunk_size)):
                j = min(i + dim_chunk_size, self.num_features)
                inputs = get_col_slice(author_feat,
                                       start_row_idx=0,
                                       end_row_idx=dataset.num_authors,
                                       start_col_idx=i,
                                       end_col_idx=j)

                inputs = paddle.to_tensor(inputs, dtype='float32')
                outputs = institution_graph.send_recv(inputs)
                outputs = outputs / degree
                outputs = outputs.astype('float16').numpy()

                del inputs
                save_col_slice(x_src=outputs,
                               x_dst=institution_x,
                               start_row_idx=0,
                               end_row_idx=dataset.num_institutions,
                               start_col_idx=i,
                               end_col_idx=j)
                del outputs

            institution_x.flush()
            del institution_x
            log.info(f'Done! [{time.perf_counter() - t:.2f}s]')
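
get_col_slice and save_col_slice are helpers these snippets rely on but never define. Judging from the call sites, they copy a row/column window of a large memmap in chunks so that only a small slab is resident in RAM at once. A hedged sketch of plausible implementations follows; the real helpers may differ (some examples pass x_from/x_to instead of x_src/x_dst, and a few call save_col_slice on 1-D memmaps without column arguments, which would need a small variant):

import numpy as np

def get_col_slice(x, start_row_idx, end_row_idx,
                  start_col_idx, end_col_idx, chunk=100000):
    # Materialize x[rows, cols] out of a memmap chunk by chunk.
    out = np.empty((end_row_idx - start_row_idx,
                    end_col_idx - start_col_idx), dtype=x.dtype)
    for i in range(start_row_idx, end_row_idx, chunk):
        j = min(i + chunk, end_row_idx)
        out[i - start_row_idx:j - start_row_idx] = \
            x[i:j, start_col_idx:end_col_idx]
    return out

def save_col_slice(x_src, x_dst, start_row_idx, end_row_idx,
                   start_col_idx, end_col_idx, chunk=100000):
    # Inverse of get_col_slice: stream x_src into a window of memmap x_dst.
    for i in range(start_row_idx, end_row_idx, chunk):
        j = min(i + chunk, end_row_idx)
        x_dst[i:j, start_col_idx:end_col_idx] = \
            x_src[i - start_row_idx:j - start_row_idx]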
Example #16
    def prepare_data(self):
        dataset = MAG240MDataset(self.data_dir)

        paper_edge_path = f'{dataset.dir}/paper_to_paper_symmetric_pgl'
        t = time.perf_counter()
        if not osp.exists(paper_edge_path):
            log.info('Converting adjacency matrix...')
            edge_index = dataset.edge_index('paper', 'cites', 'paper')
            edge_index = edge_index.T

            edges_new = np.zeros((edge_index.shape[0], 2), dtype=edge_index.dtype)
            edges_new[:, 0] = edge_index[:, 1]
            edges_new[:, 1] = edge_index[:, 0]

            edge_index = np.vstack((edge_index, edges_new))
            #            edge_index = np.unique(edge_index, axis=0)

            graph = Graph(edge_index)
            graph.adj_dst_index
            graph.dump(paper_edge_path)
            log.info(f'Done! [{time.perf_counter() - t:.2f}s]')

        edge_path = f'{dataset.dir}/full_edge_symmetric_pgl'
        t = time.perf_counter()
        if not osp.exists(edge_path):
            log.info('Converting adjacency matrix...')

            # paper
            log.info('adding paper edges')
            paper_graph = Graph.load(paper_edge_path, mmap_mode='r+')
            rows, cols = [paper_graph.edges[:, 0]], [paper_graph.edges[:, 1]]

            # author
            log.info('adding author edges')
            edge_index = dataset.edge_index('author', 'writes', 'paper')
            edge_index = edge_index.T
            row, col = edge_index[:, 0], edge_index[:, 1]
            row += dataset.num_papers
            rows += [row, col]
            cols += [col, row]

            # institution
            log.info('adding institution edges')
            edge_index = dataset.edge_index('author', 'institution')
            edge_index = edge_index.T
            row, col = edge_index[:, 0], edge_index[:, 1]
            row += dataset.num_papers
            col += dataset.num_papers + dataset.num_authors
            rows += [row, col]
            cols += [col, row]

            # edge_type
            log.info('building edge type')
            edge_types = [
                np.full(x.shape, i, dtype='int32') for i, x in enumerate(rows)
            ]
            edge_types = np.concatenate(edge_types, axis=0)

            log.info('building edges')
            row = np.concatenate(rows, axis=0)
            del rows

            col = np.concatenate(cols, axis=0)
            del cols

            edge_index = np.stack([row, col], axis=1)
            N = dataset.num_papers + dataset.num_authors + dataset.num_institutions
            full_graph = Graph(edge_index,
                               num_nodes=N,
                               edge_feat={'edge_type': edge_types})
            full_graph.adj_dst_index
            full_graph.dump(edge_path)
            log.info(
                f'Done! finish full_edge [{time.perf_counter() - t:.2f}s]')

        path = f'{dataset.dir}/full_feat.npy'

        author_feat_path = f'{dataset.dir}/author_feat.npy'

        institution_feat_path = f'{dataset.dir}/institution_feat.npy'

        t = time.perf_counter()
        if not osp.exists(path):  # Will take ~3 hours...
            print('Generating full feature matrix...')

            node_chunk_size = 100000
            N = (dataset.num_papers + dataset.num_authors +
                 dataset.num_institutions)

            paper_feat = dataset.paper_feat

            author_feat = np.memmap(author_feat_path,
                                    dtype=np.float16,
                                    shape=(dataset.num_authors,
                                           self.num_features),
                                    mode='r')

            institution_feat = np.memmap(institution_feat_path,
                                         dtype=np.float16,
                                         shape=(dataset.num_institutions,
                                                self.num_features),
                                         mode='r')

            x = np.memmap(path,
                          dtype=np.float16,
                          mode='w+',
                          shape=(N, self.num_features))

            print('Copying paper features...')
            start_idx = 0
            end_idx = dataset.num_papers
            for i in tqdm(range(start_idx, end_idx, node_chunk_size)):
                j = min(i + node_chunk_size, end_idx)
                x[i:j] = paper_feat[i:j]
            del paper_feat

            print('Copying author feature...')
            start_idx = dataset.num_papers
            end_idx = dataset.num_papers + dataset.num_authors
            for i in tqdm(range(start_idx, end_idx, node_chunk_size)):
                j = min(i + node_chunk_size, end_idx)
                x[i:j] = author_feat[i - start_idx:j - start_idx]
            del author_feat

            print('Copying institution feature...')
            start_idx = dataset.num_papers + dataset.num_authors
            end_idx = dataset.num_papers + dataset.num_authors + dataset.num_institutions
            for i in tqdm(range(start_idx, end_idx, node_chunk_size)):
                j = min(i + node_chunk_size, end_idx)
                x[i:j] = institution_feat[i - start_idx:j - start_idx]
            del institution_feat

            x.flush()
            del x
            print(f'Done! [{time.perf_counter() - t:.2f}s]')

        np.random.seed(self.seed)
        self.train_idx = dataset.get_idx_split('train')
        np.random.shuffle(self.train_idx)

        self.val_idx = dataset.get_idx_split('valid')
        self.test_idx = dataset.get_idx_split('test')

        N = dataset.num_papers + dataset.num_authors + dataset.num_institutions
        self.x = np.memmap(f'{dataset.dir}/full_feat.npy',
                           dtype=np.float16,
                           mode='r',
                           shape=(N, self.num_features))

        self.y = dataset.all_paper_label

        self.graph = Graph.load(edge_path, mmap_mode='r+')
        self.graph._edge_feat['edge_type'] = self.graph._edge_feat[
            'edge_type'].astype('int32')

        log.info(f'Done! [{time.perf_counter() - t:.2f}s]')
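
In the merged heterogeneous graph above, each relation direction carries its own integer edge type: 0 for the symmetric paper-paper edges, 1 and 2 for the two directions of author-paper, and 3 and 4 for the two directions of author-institution. The enumerate-based tagging can be seen in isolation on toy arrays:

import numpy as np

rows = [np.array([0, 1]),                    # paper <-> paper
        np.array([5, 6]), np.array([2, 3]),  # author->paper and back
        np.array([7, 7]), np.array([8, 8])]  # author->inst and back
edge_types = np.concatenate(
    [np.full(r.shape, i, dtype='int32') for i, r in enumerate(rows)])
print(edge_types)  # [0 0 1 1 2 2 3 3 4 4]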
Example #17
    parser.add_argument('--device', type=int, default=0)
    parser.add_argument('--hidden_channels', type=int, default=512)
    parser.add_argument('--num_layers', type=int, default=2)
    parser.add_argument('--no_batch_norm', action='store_true')
    parser.add_argument('--relu_last', action='store_true')
    parser.add_argument('--dropout', type=float, default=0.5)
    parser.add_argument('--lr', type=float, default=0.01)
    parser.add_argument('--batch_size', type=int, default=380000)
    parser.add_argument('--epochs', type=int, default=1000)
    args = parser.parse_args()
    print(args)

    torch.manual_seed(12345)
    device = f'cuda:{args.device}' if torch.cuda.is_available() else 'cpu'

    dataset = MAG240MDataset('~/datasets/OGB')
    evaluator = MAG240MEvaluator()

    train_idx = dataset.get_idx_split('train')
    valid_idx = dataset.get_idx_split('valid')

    t = time.perf_counter()
    print('Reading training node features...', end=' ', flush=True)
    x_train = dataset.paper_feat[train_idx]
    x_train = torch.from_numpy(x_train).to(torch.float).to(device)
    print(f'Done! [{time.perf_counter() - t:.2f}s]')
    t = time.perf_counter()
    print('Reading validation node features...', end=' ', flush=True)
    x_valid = dataset.paper_feat[valid_idx]
    x_valid = torch.from_numpy(x_valid).to(torch.float).to(device)
    print(f'Done! [{time.perf_counter() - t:.2f}s]')
Example #18
    def prepare_data(self):
        dataset = MAG240MDataset(self.data_dir)

        path = f'{dataset.root}/mag240m/paper_to_paper_symmetric.pt'
        if not osp.exists(path):  # Will take approximately 5 minutes...
            t = time.perf_counter()
            print('Converting adjacency matrix...', end=' ', flush=True)
            edge_index = dataset.edge_index('paper', 'cites', 'paper')
            edge_index = torch.from_numpy(edge_index)
            adj_t = SparseTensor(row=edge_index[0],
                                 col=edge_index[1],
                                 sparse_sizes=(dataset.num_papers,
                                               dataset.num_papers),
                                 is_sorted=True)
            torch.save(adj_t.to_symmetric(), path)
            print(f'Done! [{time.perf_counter() - t:.2f}s]')

        path = f'{dataset.root}/mag240m/full_adj_t.pt'
        if not osp.exists(path):  # Will take approximately 16 minutes...
            t = time.perf_counter()
            print('Merging adjacency matrices...', end=' ', flush=True)

            row, col, _ = torch.load(
                f'{dataset.root}/mag240m/paper_to_paper_symmetric.pt').coo()
            rows, cols = [row], [col]

            edge_index = dataset.edge_index('author', 'writes', 'paper')
            row, col = torch.from_numpy(edge_index)
            row += dataset.num_papers
            rows += [row, col]
            cols += [col, row]

            edge_index = dataset.edge_index('author', 'institution')
            row, col = torch.from_numpy(edge_index)
            row += dataset.num_papers
            col += dataset.num_papers + dataset.num_authors
            rows += [row, col]
            cols += [col, row]

            edge_types = [
                torch.full(x.size(), i, dtype=torch.int8)
                for i, x in enumerate(rows)
            ]

            row = torch.cat(rows, dim=0)
            del rows
            col = torch.cat(cols, dim=0)
            del cols

            N = (dataset.num_papers + dataset.num_authors +
                 dataset.num_institutions)

            perm = (N * row).add_(col).numpy().argsort()
            perm = torch.from_numpy(perm)
            row = row[perm]
            col = col[perm]

            edge_type = torch.cat(edge_types, dim=0)[perm]
            del edge_types

            full_adj_t = SparseTensor(row=row,
                                      col=col,
                                      value=edge_type,
                                      sparse_sizes=(N, N),
                                      is_sorted=True)

            torch.save(full_adj_t, path)
            print(f'Done! [{time.perf_counter() - t:.2f}s]')

        path = f'{dataset.root}/mag240m/full_feat.npy'
        # indicate whether full_feat processing has been finished or not
        done_flag_path = f'{dataset.root}/mag240m/full_feat_done.txt'
        if not osp.exists(done_flag_path):  # Will take approximately 3 hours...
            if os.path.exists(path):
                print('Removing unfinished full_feat.npy')
                os.remove(path)

            try:
                t = time.perf_counter()
                print('Generating full feature matrix...')

                N = (dataset.num_papers + dataset.num_authors +
                     dataset.num_institutions)

                x = np.memmap(path,
                              dtype=np.float16,
                              mode='w+',
                              shape=(N, self.num_features))
                paper_feat = dataset.paper_feat
                dim_chunk = 64
                chunk = 100000

                print('Copying paper features...')
                for i in tqdm(range(0, dataset.num_papers,
                                    chunk)):  # Copy paper features.
                    end_idx = min(i + chunk, dataset.num_papers)
                    x[i:end_idx] = paper_feat[i:end_idx]

                edge_index = dataset.edge_index('author', 'writes', 'paper')
                row, col = torch.from_numpy(edge_index)
                adj_t = SparseTensor(row=row,
                                     col=col,
                                     sparse_sizes=(dataset.num_authors,
                                                   dataset.num_papers),
                                     is_sorted=True)

                print('Generating author features...')
                # processing 64-dim subfeatures at a time for memory efficiency
                for i in tqdm(range(0, self.num_features, dim_chunk)):
                    end_idx = min(i + dim_chunk, self.num_features)
                    inputs = torch.from_numpy(
                        get_col_slice(paper_feat,
                                      start_row_idx=0,
                                      end_row_idx=len(paper_feat),
                                      start_col_idx=i,
                                      end_col_idx=end_idx))
                    outputs = adj_t.matmul(inputs, reduce='mean').numpy()
                    del inputs
                    save_col_slice(x_from=outputs,
                                   x_to=x,
                                   start_row_idx=dataset.num_papers,
                                   end_row_idx=dataset.num_papers +
                                   dataset.num_authors,
                                   start_col_idx=i,
                                   end_col_idx=end_idx)
                    del outputs

                edge_index = dataset.edge_index('author', 'institution')
                row, col = torch.from_numpy(edge_index)
                adj_t = SparseTensor(row=col,
                                     col=row,
                                     sparse_sizes=(dataset.num_institutions,
                                                   dataset.num_authors),
                                     is_sorted=False)

                print('Generating institution features...')
                # processing 64-dim subfeatures at a time for memory efficiency
                for i in tqdm(range(0, self.num_features, dim_chunk)):
                    end_idx = min(i + dim_chunk, self.num_features)
                    inputs = torch.from_numpy(
                        get_col_slice(x,
                                      start_row_idx=dataset.num_papers,
                                      end_row_idx=dataset.num_papers +
                                      dataset.num_authors,
                                      start_col_idx=i,
                                      end_col_idx=end_idx))
                    outputs = adj_t.matmul(inputs, reduce='mean').numpy()
                    del inputs
                    save_col_slice(x_from=outputs,
                                   x_to=x,
                                   start_row_idx=dataset.num_papers +
                                   dataset.num_authors,
                                   end_row_idx=N,
                                   start_col_idx=i,
                                   end_col_idx=end_idx)
                    del outputs

                x.flush()
                del x
                print(f'Done! [{time.perf_counter() - t:.2f}s]')

                with open(done_flag_path, 'w') as f:
                    f.write('done')

            except Exception:
                traceback.print_exc()
                if os.path.exists(path):
                    print(
                        'Removing unfinished full feat file due to exception')
                    os.remove(path)
                exit(-1)
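
The done-flag file in the block above exists because a half-written full_feat.npy is indistinguishable from a finished one: np.memmap creates the file at its full size up front. The same resumable-step idiom, stripped to its core (names here are illustrative, not taken from the examples):

import os
import os.path as osp
import traceback

def build_once(artifact_path, flag_path, build_fn):
    # Run build_fn at most once to completion; a crash leaves no stale artifact.
    if osp.exists(flag_path):
        return
    if osp.exists(artifact_path):
        os.remove(artifact_path)  # partial output from an earlier crash
    try:
        build_fn(artifact_path)
        with open(flag_path, 'w') as f:
            f.write('done')
    except Exception:
        traceback.print_exc()
        if osp.exists(artifact_path):
            os.remove(artifact_path)
        raise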
Example #19
    def prepare_data(self):
        dataset = MAG240MDataset(self.data_dir)

        path = f'{dataset.dir}/paper_to_paper_symmetric.pt'
        if not osp.exists(path):  # Will take approximately 5 minutes...
            t = time.perf_counter()
            print('Converting adjacency matrix...', end=' ', flush=True)
            edge_index = dataset.edge_index('paper', 'cites', 'paper')
            edge_index = torch.from_numpy(edge_index)
            adj_t = SparseTensor(row=edge_index[0],
                                 col=edge_index[1],
                                 sparse_sizes=(dataset.num_papers,
                                               dataset.num_papers),
                                 is_sorted=True)
            torch.save(adj_t.to_symmetric(), path)
            print(f'Done! [{time.perf_counter() - t:.2f}s]')

        path = f'{dataset.dir}/full_adj_t.pt'
        if not osp.exists(path):  # Will take approximately 16 minutes...
            t = time.perf_counter()
            print('Merging adjacency matrices...', end=' ', flush=True)

            row, col, _ = torch.load(
                f'{dataset.dir}/paper_to_paper_symmetric.pt').coo()
            rows, cols = [row], [col]

            edge_index = dataset.edge_index('author', 'writes', 'paper')
            row, col = torch.from_numpy(edge_index)
            row += dataset.num_papers
            rows += [row, col]
            cols += [col, row]

            edge_index = dataset.edge_index('author', 'institution')
            row, col = torch.from_numpy(edge_index)
            row += dataset.num_papers
            col += dataset.num_papers + dataset.num_authors
            rows += [row, col]
            cols += [col, row]

            edge_types = [
                torch.full(x.size(), i, dtype=torch.int8)
                for i, x in enumerate(rows)
            ]

            row = torch.cat(rows, dim=0)
            del rows
            col = torch.cat(cols, dim=0)
            del cols

            N = (dataset.num_papers + dataset.num_authors +
                 dataset.num_institutions)

            perm = (N * row).add_(col).numpy().argsort()
            perm = torch.from_numpy(perm)
            row = row[perm]
            col = col[perm]

            edge_type = torch.cat(edge_types, dim=0)[perm]
            del edge_types

            full_adj_t = SparseTensor(row=row,
                                      col=col,
                                      value=edge_type,
                                      sparse_sizes=(N, N),
                                      is_sorted=True)

            torch.save(full_adj_t, path)
            print(f'Done! [{time.perf_counter() - t:.2f}s]')

        path = f'{dataset.dir}/full_feat.npy'
        done_flag_path = f'{dataset.dir}/full_feat_done.txt'
        if not osp.exists(done_flag_path):  # Will take ~3 hours...
            t = time.perf_counter()
            print('Generating full feature matrix...')

            node_chunk_size = 100000
            dim_chunk_size = 64
            N = (dataset.num_papers + dataset.num_authors +
                 dataset.num_institutions)

            paper_feat = dataset.paper_feat
            x = np.memmap(path,
                          dtype=np.float16,
                          mode='w+',
                          shape=(N, self.num_features))

            print('Copying paper features...')
            for i in tqdm(range(0, dataset.num_papers, node_chunk_size)):
                j = min(i + node_chunk_size, dataset.num_papers)
                x[i:j] = paper_feat[i:j]

            edge_index = dataset.edge_index('author', 'writes', 'paper')
            row, col = torch.from_numpy(edge_index)
            adj_t = SparseTensor(row=row,
                                 col=col,
                                 sparse_sizes=(dataset.num_authors,
                                               dataset.num_papers),
                                 is_sorted=True)

            # Processing 64-dim subfeatures at a time for memory efficiency.
            print('Generating author features...')
            for i in tqdm(range(0, self.num_features, dim_chunk_size)):
                j = min(i + dim_chunk_size, self.num_features)
                inputs = get_col_slice(paper_feat,
                                       start_row_idx=0,
                                       end_row_idx=dataset.num_papers,
                                       start_col_idx=i,
                                       end_col_idx=j)
                inputs = torch.from_numpy(inputs)
                outputs = adj_t.matmul(inputs, reduce='mean').numpy()
                del inputs
                save_col_slice(x_src=outputs,
                               x_dst=x,
                               start_row_idx=dataset.num_papers,
                               end_row_idx=dataset.num_papers +
                               dataset.num_authors,
                               start_col_idx=i,
                               end_col_idx=j)
                del outputs

            edge_index = dataset.edge_index('author', 'institution')
            row, col = torch.from_numpy(edge_index)
            adj_t = SparseTensor(row=col,
                                 col=row,
                                 sparse_sizes=(dataset.num_institutions,
                                               dataset.num_authors),
                                 is_sorted=False)

            print('Generating institution features...')
            # Processing 64-dim subfeatures at a time for memory efficiency.
            for i in tqdm(range(0, self.num_features, dim_chunk_size)):
                j = min(i + dim_chunk_size, self.num_features)
                inputs = get_col_slice(x,
                                       start_row_idx=dataset.num_papers,
                                       end_row_idx=dataset.num_papers +
                                       dataset.num_authors,
                                       start_col_idx=i,
                                       end_col_idx=j)
                inputs = torch.from_numpy(inputs)
                outputs = adj_t.matmul(inputs, reduce='mean').numpy()
                del inputs
                save_col_slice(x_src=outputs,
                               x_dst=x,
                               start_row_idx=dataset.num_papers +
                               dataset.num_authors,
                               end_row_idx=N,
                               start_col_idx=i,
                               end_col_idx=j)
                del outputs

            x.flush()
            del x
            print(f'Done! [{time.perf_counter() - t:.2f}s]')

            with open(done_flag_path, 'w') as f:
                f.write('done')
Example #20
    def prepare_data(self):
        dataset = MAG240MDataset(self.data_dir)

        log.info(dataset.num_authors)
        log.info(dataset.num_papers)

        path = f'{dataset.dir}/author_feat.npy'
        t = time.perf_counter()
        if not osp.exists(path):
            log.info('get author_feat...')
            paper_feat = dataset.paper_feat
            # author
            edge_index = dataset.edge_index('author', 'writes', 'paper')
            edge_index = edge_index.T
            row, col = edge_index[:, 0], edge_index[:, 1]
            edge_index = np.stack([col, row], axis=1)
            log.info(edge_index.shape)
            author_graph = Graph(edge_index, num_nodes=dataset.num_authors)
            author_graph.tensor()
            log.info('finish author graph')

            author_x = np.memmap(path,
                                 dtype=np.float16,
                                 mode='w+',
                                 shape=(dataset.num_authors,
                                        self.num_features))
            dim_chunk_size = 64

            degree = paddle.zeros(shape=[dataset.num_authors, 1],
                                  dtype='float32')
            degree += 1e-10
            temp_one = paddle.ones(shape=[edge_index.shape[0], 1],
                                   dtype='float32')
            degree = scatter(degree,
                             author_graph.edges[:, 1],
                             temp_one,
                             overwrite=False)
            log.info('finish degree')

            for i in tqdm(range(0, self.num_features, dim_chunk_size)):
                j = min(i + dim_chunk_size, self.num_features)
                inputs = get_col_slice(paper_feat,
                                       start_row_idx=0,
                                       end_row_idx=dataset.num_papers,
                                       start_col_idx=i,
                                       end_col_idx=j)

                inputs = paddle.to_tensor(inputs, dtype='float32')
                outputs = author_graph.send_recv(inputs)
                outputs = outputs / degree
                outputs = outputs.astype('float16').numpy()

                del inputs
                save_col_slice(x_src=outputs,
                               x_dst=author_x,
                               start_row_idx=0,
                               end_row_idx=dataset.num_authors,
                               start_col_idx=i,
                               end_col_idx=j)
                del outputs

            author_x.flush()
            del author_x
            log.info(f'Done! [{time.perf_counter() - t:.2f}s]')