def prepare_data(self):
    """Pre-process the MAG240M dataset into on-disk artifacts (idempotent).

    Creates, inside ``dataset.dir``:
      * ``paper_to_paper_symmetric.pt`` — symmetric paper-cites-paper adjacency.
      * ``full_adj_t.pt``              — merged heterogeneous adjacency over
                                         papers + authors + institutions.
      * ``full_feat.npy``              — float16 memmap of node features for
                                         all node types (``full_feat_done.txt``
                                         marks successful completion).

    Each step is skipped when its output file already exists, so reruns only
    redo missing work.  NOTE(review): if a run dies while writing
    ``full_feat.npy``, a partial file is left behind; it is regenerated on the
    next run (``mode='w+'`` recreates it), but never cleaned up on failure.
    """
    dataset = MAG240MDataset(self.data_dir)

    # --- Step 1: symmetric paper<->paper citation adjacency -----------------
    path = f'{dataset.dir}/paper_to_paper_symmetric.pt'
    if not osp.exists(path):  # Will take approximately 5 minutes...
        t = time.perf_counter()
        print('Converting adjacency matrix...', end=' ', flush=True)
        edge_index = dataset.edge_index('paper', 'cites', 'paper')
        edge_index = torch.from_numpy(edge_index)
        adj_t = SparseTensor(
            row=edge_index[0], col=edge_index[1],
            sparse_sizes=(dataset.num_papers, dataset.num_papers),
            is_sorted=True)
        # to_symmetric() adds reverse edges so citations become undirected.
        torch.save(adj_t.to_symmetric(), path)
        print(f'Done! [{time.perf_counter() - t:.2f}s]')

    # --- Step 2: merge all relations into one homogeneous adjacency --------
    # Node-id layout: [0, num_papers) papers,
    # [num_papers, num_papers+num_authors) authors, then institutions.
    path = f'{dataset.dir}/full_adj_t.pt'
    if not osp.exists(path):  # Will take approximately 16 minutes...
        t = time.perf_counter()
        print('Merging adjacency matrices...', end=' ', flush=True)

        row, col, _ = torch.load(
            f'{dataset.dir}/paper_to_paper_symmetric.pt').coo()
        rows, cols = [row], [col]

        # author-writes-paper edges, added in both directions.
        edge_index = dataset.edge_index('author', 'writes', 'paper')
        row, col = torch.from_numpy(edge_index)
        row += dataset.num_papers  # shift author ids past the paper range
        rows += [row, col]
        cols += [col, row]

        # author-affiliated-with-institution edges, both directions.
        edge_index = dataset.edge_index('author', 'institution')
        row, col = torch.from_numpy(edge_index)
        row += dataset.num_papers
        col += dataset.num_papers + dataset.num_authors  # institution offset
        rows += [row, col]
        cols += [col, row]

        # One int8 edge-type id per source list (5 lists -> types 0..4),
        # stored as the sparse tensor's value.
        edge_types = [
            torch.full(x.size(), i, dtype=torch.int8)
            for i, x in enumerate(rows)
        ]

        row = torch.cat(rows, dim=0)
        del rows  # free memory eagerly; these tensors are huge
        col = torch.cat(cols, dim=0)
        del cols

        N = (dataset.num_papers + dataset.num_authors +
             dataset.num_institutions)

        # Sort edges by (row, col) via the scalar key N*row + col so the
        # SparseTensor can be built with is_sorted=True.
        perm = (N * row).add_(col).numpy().argsort()
        perm = torch.from_numpy(perm)
        row = row[perm]
        col = col[perm]

        edge_type = torch.cat(edge_types, dim=0)[perm]
        del edge_types

        full_adj_t = SparseTensor(row=row, col=col, value=edge_type,
                                  sparse_sizes=(N, N), is_sorted=True)

        torch.save(full_adj_t, path)
        print(f'Done! [{time.perf_counter() - t:.2f}s]')

    # --- Step 3: full feature matrix for all node types ---------------------
    # Paper features are copied; author features are the mean of their
    # papers' features; institution features are the mean of their authors'.
    path = f'{dataset.dir}/full_feat.npy'
    done_flag_path = f'{dataset.dir}/full_feat_done.txt'
    if not osp.exists(done_flag_path):  # Will take ~3 hours...
        t = time.perf_counter()
        print('Generating full feature matrix...')

        node_chunk_size = 100000  # rows copied per iteration
        dim_chunk_size = 64       # feature columns aggregated per iteration
        N = (dataset.num_papers + dataset.num_authors +
             dataset.num_institutions)

        paper_feat = dataset.paper_feat
        # Disk-backed output; too large to hold in RAM.
        x = np.memmap(path, dtype=np.float16, mode='w+',
                      shape=(N, self.num_features))

        print('Copying paper features...')
        for i in tqdm(range(0, dataset.num_papers, node_chunk_size)):
            j = min(i + node_chunk_size, dataset.num_papers)
            x[i:j] = paper_feat[i:j]

        edge_index = dataset.edge_index('author', 'writes', 'paper')
        row, col = torch.from_numpy(edge_index)
        # author x paper incidence; matmul with reduce='mean' averages each
        # author's papers' features.
        adj_t = SparseTensor(
            row=row, col=col,
            sparse_sizes=(dataset.num_authors, dataset.num_papers),
            is_sorted=True)

        # Processing 64-dim subfeatures at a time for memory efficiency.
        print('Generating author features...')
        for i in tqdm(range(0, self.num_features, dim_chunk_size)):
            j = min(i + dim_chunk_size, self.num_features)
            inputs = get_col_slice(paper_feat, start_row_idx=0,
                                   end_row_idx=dataset.num_papers,
                                   start_col_idx=i, end_col_idx=j)
            inputs = torch.from_numpy(inputs)
            outputs = adj_t.matmul(inputs, reduce='mean').numpy()
            del inputs
            # Write the aggregated slice into the author row range of x.
            save_col_slice(
                x_src=outputs, x_dst=x, start_row_idx=dataset.num_papers,
                end_row_idx=dataset.num_papers + dataset.num_authors,
                start_col_idx=i, end_col_idx=j)
            del outputs

        edge_index = dataset.edge_index('author', 'institution')
        row, col = torch.from_numpy(edge_index)
        # institution x author incidence (row/col swapped relative to the
        # stored edge direction); not sorted by institution, hence
        # is_sorted=False.
        adj_t = SparseTensor(
            row=col, col=row,
            sparse_sizes=(dataset.num_institutions, dataset.num_authors),
            is_sorted=False)

        print('Generating institution features...')
        # Processing 64-dim subfeatures at a time for memory efficiency.
        for i in tqdm(range(0, self.num_features, dim_chunk_size)):
            j = min(i + dim_chunk_size, self.num_features)
            # Read the freshly written author rows back from the memmap.
            inputs = get_col_slice(
                x, start_row_idx=dataset.num_papers,
                end_row_idx=dataset.num_papers + dataset.num_authors,
                start_col_idx=i, end_col_idx=j)
            inputs = torch.from_numpy(inputs)
            outputs = adj_t.matmul(inputs, reduce='mean').numpy()
            del inputs
            save_col_slice(
                x_src=outputs, x_dst=x,
                start_row_idx=dataset.num_papers + dataset.num_authors,
                end_row_idx=N, start_col_idx=i, end_col_idx=j)
            del outputs

        x.flush()  # force memmap contents to disk before marking done
        del x
        print(f'Done! [{time.perf_counter() - t:.2f}s]')

        # Completion marker: written only after a fully successful pass.
        with open(done_flag_path, 'w') as f:
            f.write('done')
def prepare_data(self):
    """Pre-process the MAG240M dataset into on-disk artifacts (idempotent).

    Variant of the preprocessing routine above that writes its outputs under
    ``{dataset.root}/mag240m/`` and adds crash robustness for the feature
    step: a stale ``full_feat.npy`` without its done-flag is deleted before
    regeneration, and on any exception the partial file is removed and the
    process exits with status -1.

    Creates:
      * ``paper_to_paper_symmetric.pt`` — symmetric paper-cites-paper adjacency.
      * ``full_adj_t.pt``              — merged heterogeneous adjacency over
                                         papers + authors + institutions.
      * ``full_feat.npy``              — float16 memmap of node features for
                                         all node types (``full_feat_done.txt``
                                         marks successful completion).
    """
    dataset = MAG240MDataset(self.data_dir)

    # --- Step 1: symmetric paper<->paper citation adjacency -----------------
    path = f'{dataset.root}/mag240m/paper_to_paper_symmetric.pt'
    if not osp.exists(path):  # Will take approximately 5 minutes...
        t = time.perf_counter()
        print('Converting adjacency matrix...', end=' ', flush=True)
        edge_index = dataset.edge_index('paper', 'cites', 'paper')
        edge_index = torch.from_numpy(edge_index)
        adj_t = SparseTensor(
            row=edge_index[0], col=edge_index[1],
            sparse_sizes=(dataset.num_papers, dataset.num_papers),
            is_sorted=True)
        # to_symmetric() adds reverse edges so citations become undirected.
        torch.save(adj_t.to_symmetric(), path)
        print(f'Done! [{time.perf_counter() - t:.2f}s]')

    # --- Step 2: merge all relations into one homogeneous adjacency --------
    # Node-id layout: [0, num_papers) papers,
    # [num_papers, num_papers+num_authors) authors, then institutions.
    path = f'{dataset.root}/mag240m/full_adj_t.pt'
    if not osp.exists(path):  # Will take approximately 16 minutes...
        t = time.perf_counter()
        print('Merging adjacency matrices...', end=' ', flush=True)

        row, col, _ = torch.load(
            f'{dataset.root}/mag240m/paper_to_paper_symmetric.pt').coo()
        rows, cols = [row], [col]

        # author-writes-paper edges, added in both directions.
        edge_index = dataset.edge_index('author', 'writes', 'paper')
        row, col = torch.from_numpy(edge_index)
        row += dataset.num_papers  # shift author ids past the paper range
        rows += [row, col]
        cols += [col, row]

        # author-affiliated-with-institution edges, both directions.
        edge_index = dataset.edge_index('author', 'institution')
        row, col = torch.from_numpy(edge_index)
        row += dataset.num_papers
        col += dataset.num_papers + dataset.num_authors  # institution offset
        rows += [row, col]
        cols += [col, row]

        # One int8 edge-type id per source list (5 lists -> types 0..4),
        # stored as the sparse tensor's value.
        edge_types = [
            torch.full(x.size(), i, dtype=torch.int8)
            for i, x in enumerate(rows)
        ]

        row = torch.cat(rows, dim=0)
        del rows  # free memory eagerly; these tensors are huge
        col = torch.cat(cols, dim=0)
        del cols

        N = (dataset.num_papers + dataset.num_authors +
             dataset.num_institutions)

        # Sort edges by (row, col) via the scalar key N*row + col so the
        # SparseTensor can be built with is_sorted=True.
        perm = (N * row).add_(col).numpy().argsort()
        perm = torch.from_numpy(perm)
        row = row[perm]
        col = col[perm]

        edge_type = torch.cat(edge_types, dim=0)[perm]
        del edge_types

        full_adj_t = SparseTensor(row=row, col=col, value=edge_type,
                                  sparse_sizes=(N, N), is_sorted=True)

        torch.save(full_adj_t, path)
        print(f'Done! [{time.perf_counter() - t:.2f}s]')

    # --- Step 3: full feature matrix for all node types ---------------------
    # Paper features are copied; author features are the mean of their
    # papers' features; institution features are the mean of their authors'.
    path = f'{dataset.root}/mag240m/full_feat.npy'
    # Done-flag indicates whether full_feat processing finished successfully.
    done_flag_path = f'{dataset.root}/mag240m/full_feat_done.txt'
    if not osp.exists(
            done_flag_path):  # Will take approximately 3 hours...
        # A full_feat.npy without its done-flag is a leftover from a crashed
        # run; delete it so we regenerate from scratch.
        if os.path.exists(path):
            print('Removing unfinished full_feat.npy')
            os.remove(path)

        try:
            t = time.perf_counter()
            print('Generating full feature matrix...')

            N = (dataset.num_papers + dataset.num_authors +
                 dataset.num_institutions)
            # Disk-backed output; too large to hold in RAM.
            x = np.memmap(path, dtype=np.float16, mode='w+',
                          shape=(N, self.num_features))
            paper_feat = dataset.paper_feat
            dim_chunk = 64   # feature columns aggregated per iteration
            chunk = 100000   # rows copied per iteration

            print('Copying paper features...')
            for i in tqdm(range(0, dataset.num_papers, chunk)):
                # Copy paper features.
                end_idx = min(i + chunk, dataset.num_papers)
                x[i:end_idx] = paper_feat[i:end_idx]

            edge_index = dataset.edge_index('author', 'writes', 'paper')
            row, col = torch.from_numpy(edge_index)
            # author x paper incidence; matmul with reduce='mean' averages
            # each author's papers' features.
            adj_t = SparseTensor(
                row=row, col=col,
                sparse_sizes=(dataset.num_authors, dataset.num_papers),
                is_sorted=True)

            print('Generating author features...')
            # processing 64-dim subfeatures at a time for memory efficiency
            for i in tqdm(range(0, self.num_features, dim_chunk)):
                end_idx = min(i + dim_chunk, self.num_features)
                inputs = torch.from_numpy(
                    get_col_slice(paper_feat, start_row_idx=0,
                                  end_row_idx=len(paper_feat),
                                  start_col_idx=i, end_col_idx=end_idx))
                outputs = adj_t.matmul(inputs, reduce='mean').numpy()
                del inputs
                # Write the aggregated slice into the author row range of x.
                save_col_slice(
                    x_from=outputs, x_to=x,
                    start_row_idx=dataset.num_papers,
                    end_row_idx=dataset.num_papers + dataset.num_authors,
                    start_col_idx=i, end_col_idx=end_idx)
                del outputs

            edge_index = dataset.edge_index('author', 'institution')
            row, col = torch.from_numpy(edge_index)
            # institution x author incidence (row/col swapped relative to the
            # stored edge direction); not sorted by institution, hence
            # is_sorted=False.
            adj_t = SparseTensor(
                row=col, col=row,
                sparse_sizes=(dataset.num_institutions, dataset.num_authors),
                is_sorted=False)

            print('Generating institution features...')
            # processing 64-dim subfeatures at a time for memory efficiency
            for i in tqdm(range(0, self.num_features, dim_chunk)):
                end_idx = min(i + dim_chunk, self.num_features)
                # Read the freshly written author rows back from the memmap.
                inputs = torch.from_numpy(
                    get_col_slice(
                        x, start_row_idx=dataset.num_papers,
                        end_row_idx=dataset.num_papers + dataset.num_authors,
                        start_col_idx=i, end_col_idx=end_idx))
                outputs = adj_t.matmul(inputs, reduce='mean').numpy()
                del inputs
                save_col_slice(
                    x_from=outputs, x_to=x,
                    start_row_idx=dataset.num_papers + dataset.num_authors,
                    end_row_idx=N, start_col_idx=i, end_col_idx=end_idx)
                del outputs

            x.flush()  # force memmap contents to disk before marking done
            del x
            print(f'Done! [{time.perf_counter() - t:.2f}s]')

            # Completion marker: written only after a fully successful pass.
            with open(done_flag_path, 'w') as f:
                f.write('done')
        except Exception:
            # Best-effort cleanup: report the failure, drop the partial
            # feature file, and abort so the next run starts clean.
            traceback.print_exc()
            if os.path.exists(path):
                print(
                    'Removing unfinished full feat file due to exception')
                os.remove(path)
            exit(-1)