Esempio n. 1
0
    def prepare_data(self):
        """Build (once) the symmetric paper citation graph and load the data.

        The converted graph is cached at
        ``<dataset.dir>/paper_to_paper_symmetric_pgl``; it is only generated
        when that path does not exist yet.  Afterwards the method sets
        ``train_idx`` (shuffled after seeding NumPy with ``self.seed``),
        ``val_idx``, ``test_idx``, ``x``, ``y`` and ``graph`` on ``self``.
        """
        dataset = MAG240MDataset(self.data_dir)
        edge_path = f'{dataset.dir}/paper_to_paper_symmetric_pgl'

        t = time.perf_counter()
        if not osp.exists(edge_path):
            log.info('Converting adjacency matrix...')
            # Transposed so that each row is one edge pair
            # (assumes the dataset returns a (2, num_edges) array -- TODO confirm).
            edge_index = dataset.edge_index('paper', 'cites', 'paper').T

            # Append every edge with its endpoints swapped.  Stacking the
            # reversed view keeps the integer dtype; staging the swapped
            # pairs in an np.zeros() buffer would promote all ids to float64.
            edge_index = np.vstack((edge_index, edge_index[:, ::-1]))
            edge_index = np.unique(edge_index, axis=0)

            graph = Graph(edge_index, sorted=True)
            # The property is read only for its side effect; presumably this
            # builds the destination index so it is dumped with the graph
            # -- TODO confirm against the Graph implementation.
            graph.adj_dst_index
            graph.dump(edge_path)
            log.info(f'Done! [{time.perf_counter() - t:.2f}s]')

        # Seed before shuffling so the training order is reproducible.
        np.random.seed(self.seed)
        self.train_idx = dataset.get_idx_split('train')
        np.random.shuffle(self.train_idx)

        self.val_idx = dataset.get_idx_split('valid')
        self.test_idx = dataset.get_idx_split('test')

        self.x = dataset.paper_feat
        self.y = dataset.all_paper_label

        self.graph = Graph.load(edge_path, mmap_mode='r+')
        # Reports the time since the start of the method, conversion included.
        log.info(f'Done! [{time.perf_counter() - t:.2f}s]')
    def prepare_data(self):
        """Build missing graph/feature caches, then load everything for use.

        Node ids share one index space: papers first, then authors (shifted
        by ``num_papers``), then institutions (shifted by
        ``num_papers + num_authors``).

        Five graphs are dumped under ``dataset.dir`` (each only if its path
        does not exist yet), one per edge type id:

        * 0 -- paper/paper citations plus their reversed copies
        * 1 -- (author, paper) pairs
        * 2 -- (paper, author) pairs
        * 3 -- (author, institution) pairs
        * 4 -- (institution, author) pairs

        ``full_feat.npy`` (paper, author and institution features stacked in
        that order) is generated if missing.  The year file
        ``all_feat_year.npy`` and the ``self.m2v_file`` embedding file are
        only read here and must already exist.

        Sets on ``self``: ``train_idx``, ``val_idx``, ``val_idx_cv``,
        ``test_idx``, ``x``, ``id_x``, ``y``, ``graph`` (list of the five
        graphs, in the order above), ``pos``, ``year``, ``num_papers``,
        ``train_idx_label`` and ``train_idx_data``.
        """
        dataset = MAG240MDataset(self.data_dir)
        num_papers = dataset.num_papers
        num_authors = dataset.num_authors
        num_institutions = dataset.num_institutions
        N = num_papers + num_authors + num_institutions

        graph_file_list = []

        def dump_relation(path, relation, label, edge_type, col_offset,
                          reverse, announce_types):
            """Dump one typed edge set to *path* unless it already exists."""
            graph_file_list.append(path)
            start = time.perf_counter()
            if osp.exists(path):
                return
            log.info(f'Converting {label} matrix...')
            log.info(f'adding {label} edges')
            edge_index = dataset.edge_index(*relation).T
            row, col = edge_index[:, 0], edge_index[:, 1]
            log.info(row[:10])
            # In-place shifts: row/col are views of the array returned by
            # dataset.edge_index (assumed to be a fresh array per call
            # -- TODO confirm).
            row += num_papers
            if col_offset:
                col += col_offset
            if announce_types:
                log.info('building edge type')
            edge_types = np.full(row.shape, edge_type, dtype='int32')
            columns = [col, row] if reverse else [row, col]
            edge_index = np.stack(columns, axis=1)

            graph = Graph(edge_index, edge_feat={'edge_type': edge_types})
            # Read only for its side effect before dumping; presumably builds
            # the destination index -- TODO confirm.
            graph.adj_dst_index
            graph.dump(path)
            log.info(
                f'Done! finish {label}_edge [{time.perf_counter() - start:.2f}s]'
            )

        def copy_rows(dst, src, offset, count, chunk):
            """Copy ``src[:count]`` to ``dst[offset:offset + count]`` in chunks."""
            for lo in tqdm(range(0, count, chunk)):
                hi = min(lo + chunk, count)
                dst[offset + lo:offset + hi] = src[lo:hi]

        # --- paper <-> paper citations (edge type 0) -----------------------
        paper_edge_path = f'{dataset.dir}/paper_to_paper_symmetric_pgl_split'
        graph_file_list.append(paper_edge_path)
        t = time.perf_counter()
        if not osp.exists(paper_edge_path):
            log.info('Converting adjacency matrix...')
            edge_index = dataset.edge_index('paper', 'cites', 'paper').T
            # Stack the reversed view instead of a float np.zeros() buffer so
            # the node ids keep their integer dtype.
            edge_index = np.vstack((edge_index, edge_index[:, ::-1]))
            edge_types = np.full(edge_index.shape[0], 0, dtype='int32')

            graph = Graph(edge_index,
                          num_nodes=num_papers,
                          edge_feat={'edge_type': edge_types})
            graph.adj_dst_index
            graph.dump(paper_edge_path)
            log.info(f'Done! [{time.perf_counter() - t:.2f}s]')

        # --- author / institution relations (edge types 1-4) ---------------
        author_rel = ('author', 'writes', 'paper')
        institution_rel = ('author', 'institution')
        institution_offset = num_papers + num_authors
        relation_table = (
            # (file name, relation, label, type, col offset, reverse, announce)
            ('paper_to_author_symmetric_pgl_split_src', author_rel, 'author',
             1, 0, False, False),
            ('paper_to_author_symmetric_pgl_split_dst', author_rel, 'author',
             2, 0, True, False),
            ('institution_edge_symmetric_pgl_split_src', institution_rel,
             'institution', 3, institution_offset, False, True),
            ('institution_edge_symmetric_pgl_split_dst', institution_rel,
             'institution', 4, institution_offset, True, True),
        )
        for (file_name, relation, label, edge_type, col_offset, reverse,
             announce_types) in relation_table:
            dump_relation(f'{dataset.dir}/{file_name}', relation, label,
                          edge_type, col_offset, reverse, announce_types)

        # --- stacked feature matrix ---------------------------------------
        path = f'{dataset.dir}/full_feat.npy'
        author_feat_path = f'{dataset.dir}/author_feat.npy'
        institution_feat_path = f'{dataset.dir}/institution_feat.npy'

        t = time.perf_counter()
        # NOTE(review): existence of the file is the only guard, and mode
        # 'w+' creates it up front, so an interrupted run leaves a partial
        # file that later runs will accept as complete.
        if not osp.exists(path):  # original author's estimate: ~3 hours
            print('Generating full feature matrix...')

            node_chunk_size = 100000

            # Open every input before creating the output so a missing input
            # fails without leaving an empty output file behind.
            paper_feat = dataset.paper_feat
            author_feat = np.memmap(author_feat_path,
                                    dtype=np.float16,
                                    shape=(num_authors, self.num_features),
                                    mode='r')
            institution_feat = np.memmap(institution_feat_path,
                                         dtype=np.float16,
                                         shape=(num_institutions,
                                                self.num_features),
                                         mode='r')
            x = np.memmap(path,
                          dtype=np.float16,
                          mode='w+',
                          shape=(N, self.num_features))

            print('Copying paper features...')
            copy_rows(x, paper_feat, 0, num_papers, node_chunk_size)
            del paper_feat

            print('Copying author feature...')
            copy_rows(x, author_feat, num_papers, num_authors,
                      node_chunk_size)
            del author_feat

            print('Copying institution feature...')
            copy_rows(x, institution_feat, num_papers + num_authors,
                      num_institutions, node_chunk_size)
            del institution_feat

            x.flush()
            del x
            print(f'Done! [{time.perf_counter() - t:.2f}s]')

        # --- splits --------------------------------------------------------
        np.random.seed(self.seed)
        self.train_idx = dataset.get_idx_split('train')
        self.val_idx = dataset.get_idx_split('valid')
        valid_name = os.path.join(self.valid_path, self.valid_name)
        self.val_idx_cv = np.load(valid_name)
        log.info(self.train_idx.shape)
        log.info(self.val_idx.shape)
        log.info(self.val_idx_cv.shape)
        self.test_idx = dataset.get_idx_split('test')

        def get_sinusoid_encoding_table(n_position, d_hid):
            """Return an (n_position, d_hid) table: sin on even, cos on odd columns."""
            def cal_angle(position, hid_idx):
                return position / np.power(10000, 2 * (hid_idx // 2) / d_hid)

            def get_posi_angle_vec(position):
                return [cal_angle(position, hid_j) for hid_j in range(d_hid)]

            sinusoid_table = np.array(
                [get_posi_angle_vec(pos_i) for pos_i in range(n_position)])
            sinusoid_table[:, 0::2] = np.sin(sinusoid_table[:, 0::2])  # dim 2i
            sinusoid_table[:, 1::2] = np.cos(sinusoid_table[:, 1::2])  # dim 2i+1
            return sinusoid_table

        # --- read-only views of the cached arrays ---------------------------
        self.x = np.memmap(f'{dataset.dir}/full_feat.npy',
                           dtype=np.float16,
                           mode='r',
                           shape=(N, self.num_features))

        self.id_x = np.memmap(f'{dataset.dir}/{self.m2v_file}',
                              dtype=np.float16,
                              mode='r',
                              shape=(N, self.m2v_dim))

        self.y = dataset.all_paper_label

        self.graph = [
            Graph.load(edge_path, mmap_mode='r+')
            for edge_path in graph_file_list
        ]

        self.pos = get_sinusoid_encoding_table(200, 768)
        year_file = f'{dataset.dir}/all_feat_year.npy'
        self.year = np.memmap(year_file, dtype=np.int32, mode='r', shape=(N, ))
        self.num_papers = num_papers
        self.train_idx_label = None
        self.train_idx_data = None
        # Time since the feature-matrix step started, not the whole method.
        log.info(f'Done! [{time.perf_counter() - t:.2f}s]')
Esempio n. 3
0
    def prepare_data(self):
        """Generate every missing on-disk cache; nothing is set on ``self``.

        Node ids share one index space: papers first, then authors (shifted
        by ``num_papers``), then institutions (shifted by
        ``num_papers + num_authors``).

        Each artefact below lives under ``dataset.dir`` and is generated
        only if its path does not exist yet:

        * five typed edge graphs (type 0: paper/paper citations plus their
          reversed copies; 1: (author, paper); 2: (paper, author);
          3: (author, institution); 4: (institution, author)),
        * ``full_feat.npy`` -- paper, author and institution features
          stacked in that order (float16),
        * ``all_feat_year.npy`` -- paper, author and institution years
          stacked in that order (int32).

        The author/institution feature and year files are inputs and must
        already exist when the corresponding output is generated.
        """
        dataset = MAG240MDataset(self.data_dir)

        def dump_relation(path, relation, label, edge_type, col_offset,
                          reverse, announce_types):
            """Dump one typed edge set to *path* unless it already exists."""
            start = time.perf_counter()
            if osp.exists(path):
                return
            log.info(f'Converting {label} matrix...')
            log.info(f'adding {label} edges')
            edge_index = dataset.edge_index(*relation).T
            row, col = edge_index[:, 0], edge_index[:, 1]
            log.info(row[:10])
            # In-place shifts: row/col are views of the array returned by
            # dataset.edge_index (assumed to be a fresh array per call
            # -- TODO confirm).
            row += dataset.num_papers
            if col_offset:
                col += col_offset
            if announce_types:
                log.info('building edge type')
            edge_types = np.full(row.shape, edge_type, dtype='int32')
            columns = [col, row] if reverse else [row, col]
            edge_index = np.stack(columns, axis=1)

            graph = Graph(edge_index, edge_feat={'edge_type': edge_types})
            # Read only for its side effect before dumping; presumably builds
            # the destination index -- TODO confirm.
            graph.adj_dst_index
            graph.dump(path)
            log.info(
                f'Done! finish {label}_edge [{time.perf_counter() - start:.2f}s]'
            )

        def copy_rows(dst, src, offset, count, chunk):
            """Copy ``src[:count]`` to ``dst[offset:offset + count]`` in chunks."""
            for lo in tqdm(range(0, count, chunk)):
                hi = min(lo + chunk, count)
                dst[offset + lo:offset + hi] = src[lo:hi]

        # --- paper <-> paper citations (edge type 0) -----------------------
        paper_edge_path = f'{dataset.dir}/paper_to_paper_symmetric_pgl_split'
        t = time.perf_counter()
        if not osp.exists(paper_edge_path):
            log.info('Converting adjacency matrix...')
            edge_index = dataset.edge_index('paper', 'cites', 'paper').T
            # Stack the reversed view instead of a float np.zeros() buffer so
            # the node ids keep their integer dtype.
            edge_index = np.vstack((edge_index, edge_index[:, ::-1]))
            edge_types = np.full(edge_index.shape[0], 0, dtype='int32')

            graph = Graph(edge_index,
                          num_nodes=dataset.num_papers,
                          edge_feat={'edge_type': edge_types})
            graph.adj_dst_index
            graph.dump(paper_edge_path)
            log.info(f'Done! [{time.perf_counter() - t:.2f}s]')

        # --- author / institution relations (edge types 1-4) ---------------
        author_rel = ('author', 'writes', 'paper')
        institution_rel = ('author', 'institution')
        relation_table = (
            # (file name, relation, label, type, shift institution col,
            #  reverse, announce)
            ('paper_to_author_symmetric_pgl_split_src', author_rel, 'author',
             1, False, False, False),
            ('paper_to_author_symmetric_pgl_split_dst', author_rel, 'author',
             2, False, True, False),
            ('institution_edge_symmetric_pgl_split_src', institution_rel,
             'institution', 3, True, False, True),
            ('institution_edge_symmetric_pgl_split_dst', institution_rel,
             'institution', 4, True, True, True),
        )
        for (file_name, relation, label, edge_type, shift_col, reverse,
             announce_types) in relation_table:
            col_offset = (dataset.num_papers +
                          dataset.num_authors) if shift_col else 0
            dump_relation(f'{dataset.dir}/{file_name}', relation, label,
                          edge_type, col_offset, reverse, announce_types)

        # --- stacked feature matrix ---------------------------------------
        path = f'{dataset.dir}/full_feat.npy'
        author_feat_path = f'{dataset.dir}/author_feat.npy'
        institution_feat_path = f'{dataset.dir}/institution_feat.npy'

        t = time.perf_counter()
        # NOTE(review): existence of the file is the only guard, and mode
        # 'w+' creates it up front, so an interrupted run leaves a partial
        # file that later runs will accept as complete.
        if not osp.exists(path):  # original author's estimate: ~3 hours
            print('Generating full feature matrix...')

            node_chunk_size = 100000
            num_papers = dataset.num_papers
            num_authors = dataset.num_authors
            num_institutions = dataset.num_institutions
            N = num_papers + num_authors + num_institutions

            # Open every input before creating the output so a missing input
            # fails without leaving an empty output file behind.
            paper_feat = dataset.paper_feat
            author_feat = np.memmap(author_feat_path,
                                    dtype=np.float16,
                                    shape=(num_authors, self.num_features),
                                    mode='r')
            institution_feat = np.memmap(institution_feat_path,
                                         dtype=np.float16,
                                         shape=(num_institutions,
                                                self.num_features),
                                         mode='r')
            x = np.memmap(path,
                          dtype=np.float16,
                          mode='w+',
                          shape=(N, self.num_features))

            print('Copying paper features...')
            copy_rows(x, paper_feat, 0, num_papers, node_chunk_size)
            del paper_feat

            print('Copying author feature...')
            copy_rows(x, author_feat, num_papers, num_authors,
                      node_chunk_size)
            del author_feat

            print('Copying institution feature...')
            copy_rows(x, institution_feat, num_papers + num_authors,
                      num_institutions, node_chunk_size)
            del institution_feat

            x.flush()
            del x
            print(f'feature x Done! [{time.perf_counter() - t:.2f}s]')

        # --- stacked year vector ------------------------------------------
        path = f'{dataset.dir}/all_feat_year.npy'
        author_year_path = f'{dataset.dir}/author_feat_year.npy'
        institution_year_path = f'{dataset.dir}/institution_feat_year.npy'

        t = time.perf_counter()
        if not osp.exists(path):
            print('Generating full year matrix...')

            node_chunk_size = 100000
            num_papers = dataset.num_papers
            num_authors = dataset.num_authors
            num_institutions = dataset.num_institutions
            N = num_papers + num_authors + num_institutions

            paper_year_feat = dataset.all_paper_year
            author_year_feat = np.memmap(author_year_path,
                                         dtype=np.int32,
                                         shape=(num_authors, ),
                                         mode='r')
            institution_year_feat = np.memmap(institution_year_path,
                                              dtype=np.int32,
                                              shape=(num_institutions, ),
                                              mode='r')
            x = np.memmap(path, dtype=np.int32, mode='w+', shape=(N, ))

            # The progress messages below reuse the feature-step wording.
            print('Copying paper features...')
            copy_rows(x, paper_year_feat, 0, num_papers, node_chunk_size)
            del paper_year_feat

            print('Copying author feature...')
            copy_rows(x, author_year_feat, num_papers, num_authors,
                      node_chunk_size)
            del author_year_feat

            print('Copying institution feature...')
            copy_rows(x, institution_year_feat, num_papers + num_authors,
                      num_institutions, node_chunk_size)
            del institution_year_feat

            x.flush()
            del x
            print(f'year feature Done! [{time.perf_counter() - t:.2f}s]')
Esempio n. 4
0
    def prepare_data(self):
        """Build missing caches for the merged graph, then load everything.

        Node ids share one index space: papers first, then authors (shifted
        by ``num_papers``), then institutions (shifted by
        ``num_papers + num_authors``).

        Generated under ``dataset.dir`` when the path does not exist yet:

        * ``paper_to_paper_symmetric_pgl`` -- citations plus reversed copies,
        * ``full_edge_symmetric_pgl`` -- one graph with all edges and an
          ``edge_type`` feature (0: paper/paper, 1: (author, paper),
          2: (paper, author), 3: (author, institution),
          4: (institution, author)),
        * ``full_feat.npy`` -- paper, author and institution features
          stacked in that order (float16).

        Sets on ``self``: ``train_idx`` (shuffled after seeding NumPy with
        ``self.seed``), ``val_idx``, ``test_idx``, ``x``, ``y`` and
        ``graph``.
        """
        dataset = MAG240MDataset(self.data_dir)
        num_papers = dataset.num_papers
        num_authors = dataset.num_authors
        num_institutions = dataset.num_institutions
        N = num_papers + num_authors + num_institutions

        def copy_rows(dst, src, offset, count, chunk):
            """Copy ``src[:count]`` to ``dst[offset:offset + count]`` in chunks."""
            for lo in tqdm(range(0, count, chunk)):
                hi = min(lo + chunk, count)
                dst[offset + lo:offset + hi] = src[lo:hi]

        # --- paper <-> paper citations ------------------------------------
        paper_edge_path = f'{dataset.dir}/paper_to_paper_symmetric_pgl'
        t = time.perf_counter()
        if not osp.exists(paper_edge_path):
            log.info('Converting adjacency matrix...')
            edge_index = dataset.edge_index('paper', 'cites', 'paper').T
            # Stack the reversed view instead of a float np.zeros() buffer so
            # the node ids keep their integer dtype.  No de-duplication is
            # applied in this variant.
            edge_index = np.vstack((edge_index, edge_index[:, ::-1]))

            graph = Graph(edge_index)
            # Read only for its side effect before dumping; presumably builds
            # the destination index -- TODO confirm.
            graph.adj_dst_index
            graph.dump(paper_edge_path)
            log.info(f'Done! [{time.perf_counter() - t:.2f}s]')

        # --- merged heterogeneous graph -----------------------------------
        edge_path = f'{dataset.dir}/full_edge_symmetric_pgl'
        t = time.perf_counter()
        if not osp.exists(edge_path):
            log.info('Converting adjacency matrix...')

            # The position of each chunk in `rows` becomes its edge type id,
            # so the append order below is significant.
            log.info('adding paper edges')
            paper_graph = Graph.load(paper_edge_path, mmap_mode='r+')
            rows, cols = [paper_graph.edges[:, 0]], [paper_graph.edges[:, 1]]

            log.info('adding author edges')
            edge_index = dataset.edge_index('author', 'writes', 'paper').T
            row, col = edge_index[:, 0], edge_index[:, 1]
            # In-place shift on a view of the array returned by
            # dataset.edge_index (assumed fresh per call -- TODO confirm).
            row += num_papers
            rows += [row, col]
            cols += [col, row]

            log.info('adding institution edges')
            edge_index = dataset.edge_index('author', 'institution').T
            row, col = edge_index[:, 0], edge_index[:, 1]
            row += num_papers
            col += num_papers + num_authors
            rows += [row, col]
            cols += [col, row]

            log.info('building edge type')
            edge_types = np.concatenate([
                np.full(part.shape, type_id, dtype='int32')
                for type_id, part in enumerate(rows)
            ],
                                        axis=0)

            log.info('building edges')
            row = np.concatenate(rows, axis=0)
            del rows  # drop the per-relation chunks as soon as possible

            col = np.concatenate(cols, axis=0)
            del cols

            edge_index = np.stack([row, col], axis=1)
            full_graph = Graph(edge_index,
                               num_nodes=N,
                               edge_feat={'edge_type': edge_types})
            full_graph.adj_dst_index
            full_graph.dump(edge_path)
            log.info(
                f'Done! finish full_edge [{time.perf_counter() - t:.2f}s]')

        # --- stacked feature matrix ---------------------------------------
        path = f'{dataset.dir}/full_feat.npy'
        author_feat_path = f'{dataset.dir}/author_feat.npy'
        institution_feat_path = f'{dataset.dir}/institution_feat.npy'

        t = time.perf_counter()
        # NOTE(review): existence of the file is the only guard, and mode
        # 'w+' creates it up front, so an interrupted run leaves a partial
        # file that later runs will accept as complete.
        if not osp.exists(path):  # original author's estimate: ~3 hours
            print('Generating full feature matrix...')

            node_chunk_size = 100000

            # Open every input before creating the output so a missing input
            # fails without leaving an empty output file behind.
            paper_feat = dataset.paper_feat
            author_feat = np.memmap(author_feat_path,
                                    dtype=np.float16,
                                    shape=(num_authors, self.num_features),
                                    mode='r')
            institution_feat = np.memmap(institution_feat_path,
                                         dtype=np.float16,
                                         shape=(num_institutions,
                                                self.num_features),
                                         mode='r')
            x = np.memmap(path,
                          dtype=np.float16,
                          mode='w+',
                          shape=(N, self.num_features))

            print('Copying paper features...')
            copy_rows(x, paper_feat, 0, num_papers, node_chunk_size)
            del paper_feat

            print('Copying author feature...')
            copy_rows(x, author_feat, num_papers, num_authors,
                      node_chunk_size)
            del author_feat

            print('Copying institution feature...')
            copy_rows(x, institution_feat, num_papers + num_authors,
                      num_institutions, node_chunk_size)
            del institution_feat

            x.flush()
            del x
            print(f'Done! [{time.perf_counter() - t:.2f}s]')

        # --- splits and read-only views -----------------------------------
        # Seed before shuffling so the training order is reproducible.
        np.random.seed(self.seed)
        self.train_idx = dataset.get_idx_split('train')
        np.random.shuffle(self.train_idx)

        self.val_idx = dataset.get_idx_split('valid')
        self.test_idx = dataset.get_idx_split('test')

        self.x = np.memmap(f'{dataset.dir}/full_feat.npy',
                           dtype=np.float16,
                           mode='r',
                           shape=(N, self.num_features))

        self.y = dataset.all_paper_label

        self.graph = Graph.load(edge_path, mmap_mode='r+')
        # Replace the loaded edge_type feature with an int32 array.
        self.graph._edge_feat['edge_type'] = self.graph._edge_feat[
            'edge_type'].astype('int32')

        # Time since the feature-matrix step started, not the whole method.
        log.info(f'Done! [{time.perf_counter() - t:.2f}s]')