def prepare_data(self):
    """Build and cache the per-institution feature file derived from authors.

    Reads one int32 value per author from ``author_feat_year.npy``, pushes
    those values along the author -> institution edges with
    ``Graph.send_recv``, divides the aggregated result by the number of
    incoming edges of each institution, truncates to int32 and stores it in
    ``institution_feat_year.npy`` inside the dataset directory.

    The work is skipped entirely when the output file already exists.
    Reads ``self.data_dir``; returns nothing.
    """
    dataset = MAG240MDataset(self.data_dir)

    log.info(dataset.num_authors)
    log.info(dataset.num_institutions)
    author_path = f'{dataset.dir}/author_feat_year.npy'
    path = f'{dataset.dir}/institution_feat_year.npy'
    t = time.perf_counter()
    if not osp.exists(path):
        log.info('get institution_feat...')

        # One int32 value per author, viewed as a column so that it can be
        # propagated as a single-channel node feature.
        author_feat = np.memmap(
            author_path,
            dtype=np.int32,
            mode='r',
            shape=(dataset.num_authors, ))
        author_feat = np.expand_dims(author_feat, axis=1)

        # author -> institution edges, transposed to one (src, dst) pair
        # per row.
        edge_index = dataset.edge_index('author', 'institution')
        edge_index = edge_index.T
        log.info(edge_index.shape)
        institution_graph = Graph(
            edge_index, num_nodes=dataset.num_institutions)
        institution_graph.tensor()
        log.info('finish institution graph')

        institution_x = np.memmap(
            path,
            dtype=np.int32,
            mode='w+',
            shape=(dataset.num_institutions, ))

        # Number of incoming edges per institution. The tiny offset keeps
        # the division below finite for institutions that receive no edge
        # (0 / 0 would otherwise produce NaN, whose int32 cast is garbage);
        # it is far below float32 resolution for any non-zero count.
        degree = paddle.zeros(
            shape=[dataset.num_institutions, 1], dtype='float32')
        degree += 1e-10
        temp_one = paddle.ones(shape=[edge_index.shape[0], 1], dtype='float32')
        degree = scatter(
            degree,
            overwrite=False,
            index=institution_graph.edges[:, 1],
            updates=temp_one)
        log.info('finish degree')

        inputs = paddle.to_tensor(author_feat, dtype='float32')
        # NOTE(review): relies on send_recv's default reduction being a sum
        # so that dividing by the degree yields a mean -- confirm.
        outputs = institution_graph.send_recv(inputs)
        outputs = outputs / degree
        # The int32 cast truncates toward zero rather than rounding.
        outputs = outputs.astype('int32').numpy()
        del inputs

        save_col_slice(
            x_src=outputs,
            x_dst=institution_x,
            start_row_idx=0,
            end_row_idx=dataset.num_institutions)
        del outputs

        # Persist the memmap and release it before reporting completion.
        institution_x.flush()
        del institution_x
        log.info(f'Done! [{time.perf_counter() - t:.2f}s]')
def prepare_data(self):
    """Create the cached author feature matrix when it is not on disk yet.

    Paper features are propagated to authors over the reversed
    ('author', 'writes', 'paper') edges with ``Graph.send_recv`` and divided
    by each author's incoming edge count. The result is written as float16
    to ``author_feat.npy`` in the dataset directory, 64 feature columns at a
    time.

    Reads ``self.data_dir`` and ``self.num_features``; returns nothing.
    """
    dataset = MAG240MDataset(self.data_dir)

    log.info(dataset.num_authors)
    log.info(dataset.num_papers)
    path = f'{dataset.dir}/author_feat.npy'
    t = time.perf_counter()

    # Nothing to do when a previous run already produced the file.
    if osp.exists(path):
        return

    log.info('get author_feat...')
    paper_x = dataset.paper_feat

    # Flip each (author, paper) pair into (paper, author) so that messages
    # travel from papers to the authors who wrote them.
    writes = dataset.edge_index('author', 'writes', 'paper').T
    paper_to_author = np.stack([writes[:, 1], writes[:, 0]], axis=1)
    log.info(paper_to_author.shape)

    author_graph = Graph(paper_to_author, num_nodes=dataset.num_authors)
    author_graph.tensor()
    log.info('finish author graph')

    feat_dim = self.num_features
    author_feat_out = np.memmap(
        path,
        dtype=np.float16,
        mode='w+',
        shape=(dataset.num_authors, feat_dim))
    chunk_width = 64

    # Incoming edge count per author; the small offset avoids dividing by
    # an exact zero for authors without any edge.
    degree = paddle.zeros(shape=[dataset.num_authors, 1], dtype='float32')
    degree += 1e-10
    edge_ones = paddle.ones(
        shape=[paper_to_author.shape[0], 1], dtype='float32')
    degree = scatter(
        degree, author_graph.edges[:, 1], edge_ones, overwrite=False)
    log.info('finish degree')

    # Work on a slab of columns at a time instead of the full matrix.
    for col_begin in tqdm(range(0, feat_dim, chunk_width)):
        col_end = min(col_begin + chunk_width, feat_dim)

        slab = paddle.to_tensor(
            get_col_slice(
                paper_x,
                start_row_idx=0,
                end_row_idx=dataset.num_papers,
                start_col_idx=col_begin,
                end_col_idx=col_end),
            dtype='float32')
        reduced = (author_graph.send_recv(slab) / degree)
        reduced = reduced.astype('float16').numpy()
        del slab

        save_col_slice(
            x_src=reduced,
            x_dst=author_feat_out,
            start_row_idx=0,
            end_row_idx=dataset.num_authors,
            start_col_idx=col_begin,
            end_col_idx=col_end)
        del reduced

    # Persist the memmap and release it before reporting completion.
    author_feat_out.flush()
    del author_feat_out
    log.info(f'Done! [{time.perf_counter() - t:.2f}s]')