Beispiel #1
0
    def prepare_data(self):
        """Build the per-institution year feature file if it does not exist.

        The int32 per-author values stored in ``author_feat_year.npy`` are
        propagated over the edges returned by
        ``dataset.edge_index('author', 'institution')`` with
        ``Graph.send_recv``, divided by the number of incoming edges of each
        institution, cast to int32 and written to
        ``institution_feat_year.npy`` inside ``dataset.dir``.

        Nothing is computed when the output file is already present.
        """
        dataset = MAG240MDataset(self.data_dir)

        log.info(dataset.num_authors)
        log.info(dataset.num_institutions)

        author_path = f'{dataset.dir}/author_feat_year.npy'
        path = f'{dataset.dir}/institution_feat_year.npy'
        t = time.perf_counter()
        # NOTE(review): np.memmap(mode='w+') creates the output file before it
        # is filled, so an interrupted run leaves a partial file that this
        # existence check will accept on the next call -- confirm acceptable.
        if not osp.exists(path):
            log.info('get institution_feat...')

            # One int32 value per author, turned into a column vector so it
            # can be used as a single-feature node matrix.
            author_feat = np.memmap(author_path,
                                    dtype=np.int32,
                                    mode='r',
                                    shape=(dataset.num_authors, ))
            author_feat = author_feat[:]
            author_feat = np.expand_dims(author_feat, axis=1)

            # Transpose so that each row is one edge; column 1 is used below
            # as the receiving node of the edge.
            # assumes dataset.edge_index returns a (2, num_edges) array --
            # TODO confirm
            edge_index = dataset.edge_index('author', 'institution')
            edge_index = edge_index.T
            log.info(edge_index.shape)
            institution_graph = Graph(edge_index,
                                      num_nodes=dataset.num_institutions)
            institution_graph.tensor()
            log.info('finish institution graph')

            institution_x = np.memmap(path,
                                      dtype=np.int32,
                                      mode='w+',
                                      shape=(dataset.num_institutions, ))

            degree = paddle.zeros(shape=[dataset.num_institutions, 1],
                                  dtype='float32')
            # Institutions that receive no edge would otherwise keep a degree
            # of exactly zero; the division below would then produce NaN/inf
            # and the int32 cast of such values is not well-defined. The tiny
            # offset is far below float32 resolution for any real count, so
            # non-empty institutions are unaffected.
            degree += 1e-10
            temp_one = paddle.ones(shape=[edge_index.shape[0], 1],
                                   dtype='float32')
            # overwrite=False makes scatter accumulate the ones that share a
            # destination index, yielding the incoming-edge count.
            degree = scatter(degree,
                             overwrite=False,
                             index=institution_graph.edges[:, 1],
                             updates=temp_one)
            log.info('finish degree')

            inputs = author_feat

            inputs = paddle.to_tensor(inputs, dtype='float32')
            # presumably a sum over incoming edges (send_recv's default
            # reduction) -- verify against the Graph implementation
            outputs = institution_graph.send_recv(inputs)
            outputs = outputs / degree
            outputs = outputs.astype('int32').numpy()

            # Drop the device-side input before the copy to disk.
            del inputs
            save_col_slice(x_src=outputs,
                           x_dst=institution_x,
                           start_row_idx=0,
                           end_row_idx=dataset.num_institutions)
            del outputs

            institution_x.flush()
            del institution_x
            log.info(f'Done! [{time.perf_counter() - t:.2f}s]')
Beispiel #2
0
    def prepare_data(self):
        """Create the cached float16 author feature matrix when it is missing.

        Paper features are pushed over the reversed
        ``('author', 'writes', 'paper')`` edges with ``Graph.send_recv`` and
        divided by each author's incoming-edge count. The work is done in
        column chunks of 64 features, and every chunk is written into
        ``author_feat.npy`` inside ``dataset.dir``.

        Nothing is computed when the output file is already present.
        """
        dataset = MAG240MDataset(self.data_dir)

        log.info(dataset.num_authors)
        log.info(dataset.num_papers)

        t = time.perf_counter()
        path = f'{dataset.dir}/author_feat.npy'
        # NOTE(review): np.memmap(mode='w+') creates the file before it is
        # filled, so an interrupted run leaves a partial file that this
        # existence check will accept on the next call -- confirm acceptable.
        if not osp.exists(path):
            log.info('get author_feat...')
            papers = dataset.paper_feat

            # One edge per row, then swap the two columns so that the author
            # id ends up in column 1 (the column used as receiver below).
            # assumes dataset.edge_index returns a (2, num_edges) array --
            # TODO confirm
            writes = dataset.edge_index('author', 'writes', 'paper').T
            reversed_edges = np.stack([writes[:, 1], writes[:, 0]], axis=1)
            log.info(reversed_edges.shape)
            graph = Graph(reversed_edges, num_nodes=dataset.num_authors)
            graph.tensor()
            log.info('finish author graph')

            total_cols = self.num_features
            out = np.memmap(path,
                            dtype=np.float16,
                            mode='w+',
                            shape=(dataset.num_authors, total_cols))
            chunk = 64

            # Incoming-edge count per author; the small offset keeps authors
            # that receive no edge from being divided by exactly zero.
            base = paddle.zeros(shape=[dataset.num_authors, 1],
                                dtype='float32') + 1e-10
            ones = paddle.ones(shape=[reversed_edges.shape[0], 1],
                               dtype='float32')
            degree = scatter(base,
                             index=graph.edges[:, 1],
                             updates=ones,
                             overwrite=False)
            log.info('finish degree')

            for lo in tqdm(range(0, total_cols, chunk)):
                hi = min(lo + chunk, total_cols)
                block = paddle.to_tensor(
                    get_col_slice(papers,
                                  start_row_idx=0,
                                  end_row_idx=dataset.num_papers,
                                  start_col_idx=lo,
                                  end_col_idx=hi),
                    dtype='float32')

                # presumably a sum over incoming edges (send_recv's default
                # reduction) -- verify against the Graph implementation
                averaged = graph.send_recv(block) / degree
                averaged = averaged.astype('float16').numpy()

                # Release the input chunk before copying the result to disk.
                del block
                save_col_slice(x_src=averaged,
                               x_dst=out,
                               start_row_idx=0,
                               end_row_idx=dataset.num_authors,
                               start_col_idx=lo,
                               end_col_idx=hi)
                del averaged

            out.flush()
            del out
            log.info(f'Done! [{time.perf_counter() - t:.2f}s]')