def test_to_symmetric(device):
    row = torch.tensor([0, 0, 0, 1, 1], device=device)
    col = torch.tensor([0, 1, 2, 0, 2], device=device)
    value = torch.arange(1, 6, device=device)

    mat = SparseTensor(row=row, col=col, value=value)
    assert not mat.is_symmetric()

    mat = mat.to_symmetric()
    assert mat.is_symmetric()
    assert mat.to_dense().tolist() == [
        [2, 6, 3],
        [6, 0, 5],
        [3, 5, 0],
    ]
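# For reference: `to_symmetric()` coalesces a matrix with its transpose,
# summing values on duplicate (i, j)/(j, i) positions, so diagonal entries
# double. A dense NumPy sketch of the result the test above expects
# (`dense_symmetric` is a hypothetical helper, not a library API):
import numpy as np


def dense_symmetric(row, col, value, n):
    A = np.zeros((n, n), dtype=np.int64)
    A[row, col] = value  # scatter the COO entries
    return A + A.T       # (i, j) and (j, i) values sum up


print(dense_symmetric([0, 0, 0, 1, 1], [0, 1, 2, 0, 2], [1, 2, 3, 4, 5], 3))
# -> [[2 6 3], [6 0 5], [3 5 0]]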
def prepare_data(self):
    dataset = MAG240MDataset(self.data_dir)

    path = f'{dataset.dir}/paper_to_paper_symmetric.pt'
    if not osp.exists(path):  # Will take approximately 5 minutes...
        t = time.perf_counter()
        print('Converting adjacency matrix...', end=' ', flush=True)
        edge_index = dataset.edge_index('paper', 'cites', 'paper')
        edge_index = torch.from_numpy(edge_index)
        adj_t = SparseTensor(
            row=edge_index[0], col=edge_index[1],
            sparse_sizes=(dataset.num_papers, dataset.num_papers),
            is_sorted=True)
        torch.save(adj_t.to_symmetric(), path)
        print(f'Done! [{time.perf_counter() - t:.2f}s]')

    path = f'{dataset.dir}/full_adj_t.pt'
    if not osp.exists(path):  # Will take approximately 16 minutes...
        t = time.perf_counter()
        print('Merging adjacency matrices...', end=' ', flush=True)

        row, col, _ = torch.load(
            f'{dataset.dir}/paper_to_paper_symmetric.pt').coo()
        rows, cols = [row], [col]

        edge_index = dataset.edge_index('author', 'writes', 'paper')
        row, col = torch.from_numpy(edge_index)
        row += dataset.num_papers
        rows += [row, col]
        cols += [col, row]

        edge_index = dataset.edge_index('author', 'institution')
        row, col = torch.from_numpy(edge_index)
        row += dataset.num_papers
        col += dataset.num_papers + dataset.num_authors
        rows += [row, col]
        cols += [col, row]

        edge_types = [
            torch.full(x.size(), i, dtype=torch.int8)
            for i, x in enumerate(rows)
        ]

        row = torch.cat(rows, dim=0)
        del rows
        col = torch.cat(cols, dim=0)
        del cols

        N = (dataset.num_papers + dataset.num_authors +
             dataset.num_institutions)

        perm = (N * row).add_(col).numpy().argsort()
        perm = torch.from_numpy(perm)
        row = row[perm]
        col = col[perm]

        edge_type = torch.cat(edge_types, dim=0)[perm]
        del edge_types

        full_adj_t = SparseTensor(row=row, col=col, value=edge_type,
                                  sparse_sizes=(N, N), is_sorted=True)

        torch.save(full_adj_t, path)
        print(f'Done! [{time.perf_counter() - t:.2f}s]')

    path = f'{dataset.dir}/full_feat.npy'
    done_flag_path = f'{dataset.dir}/full_feat_done.txt'
    if not osp.exists(done_flag_path):  # Will take ~3 hours...
        t = time.perf_counter()
        print('Generating full feature matrix...')

        node_chunk_size = 100000
        dim_chunk_size = 64
        N = (dataset.num_papers + dataset.num_authors +
             dataset.num_institutions)

        paper_feat = dataset.paper_feat
        x = np.memmap(path, dtype=np.float16, mode='w+',
                      shape=(N, self.num_features))

        print('Copying paper features...')
        for i in tqdm(range(0, dataset.num_papers, node_chunk_size)):
            j = min(i + node_chunk_size, dataset.num_papers)
            x[i:j] = paper_feat[i:j]

        edge_index = dataset.edge_index('author', 'writes', 'paper')
        row, col = torch.from_numpy(edge_index)
        adj_t = SparseTensor(
            row=row, col=col,
            sparse_sizes=(dataset.num_authors, dataset.num_papers),
            is_sorted=True)

        print('Generating author features...')
        # Processing 64-dim subfeatures at a time for memory efficiency.
        for i in tqdm(range(0, self.num_features, dim_chunk_size)):
            j = min(i + dim_chunk_size, self.num_features)
            inputs = get_col_slice(paper_feat, start_row_idx=0,
                                   end_row_idx=dataset.num_papers,
                                   start_col_idx=i, end_col_idx=j)
            inputs = torch.from_numpy(inputs)
            outputs = adj_t.matmul(inputs, reduce='mean').numpy()
            del inputs
            save_col_slice(
                x_src=outputs, x_dst=x, start_row_idx=dataset.num_papers,
                end_row_idx=dataset.num_papers + dataset.num_authors,
                start_col_idx=i, end_col_idx=j)
            del outputs

        edge_index = dataset.edge_index('author', 'institution')
        row, col = torch.from_numpy(edge_index)
        adj_t = SparseTensor(
            row=col, col=row,
            sparse_sizes=(dataset.num_institutions, dataset.num_authors),
            is_sorted=False)

        print('Generating institution features...')
        # Processing 64-dim subfeatures at a time for memory efficiency.
        for i in tqdm(range(0, self.num_features, dim_chunk_size)):
            j = min(i + dim_chunk_size, self.num_features)
            inputs = get_col_slice(
                x, start_row_idx=dataset.num_papers,
                end_row_idx=dataset.num_papers + dataset.num_authors,
                start_col_idx=i, end_col_idx=j)
            inputs = torch.from_numpy(inputs)
            outputs = adj_t.matmul(inputs, reduce='mean').numpy()
            del inputs
            save_col_slice(
                x_src=outputs, x_dst=x,
                start_row_idx=dataset.num_papers + dataset.num_authors,
                end_row_idx=N, start_col_idx=i, end_col_idx=j)
            del outputs

        x.flush()
        del x
        print(f'Done! [{time.perf_counter() - t:.2f}s]')

        with open(done_flag_path, 'w') as f:
            f.write('done')
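# The helpers `get_col_slice` / `save_col_slice` used above are not shown in
# this excerpt. A plausible sketch, assuming NumPy arrays or memmaps and a
# fixed row chunk size: they move one column slice at a time, in row chunks,
# so only a small block is ever resident in memory.
import numpy as np


def get_col_slice(x, start_row_idx, end_row_idx, start_col_idx, end_col_idx):
    outs = []
    chunk = 100000
    for i in range(start_row_idx, end_row_idx, chunk):
        j = min(i + chunk, end_row_idx)
        outs.append(x[i:j, start_col_idx:end_col_idx].copy())
    return np.concatenate(outs, axis=0)


def save_col_slice(x_src, x_dst, start_row_idx, end_row_idx, start_col_idx,
                   end_col_idx):
    assert x_src.shape[0] == end_row_idx - start_row_idx
    assert x_src.shape[1] == end_col_idx - start_col_idx
    chunk, offset = 100000, start_row_idx
    for i in range(0, end_row_idx - start_row_idx, chunk):
        j = min(i + chunk, end_row_idx - start_row_idx)
        x_dst[offset + i:offset + j, start_col_idx:end_col_idx] = x_src[i:j]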
path = dataset.root + '/mag240m/paper_to_paper_symmetric_gcn.pt'
if osp.exists(path):
    adj_t = torch.load(path)
else:
    path_sym = dataset.root + '/mag240m/paper_to_paper_symmetric.pt'
    if osp.exists(path_sym):
        adj_t = torch.load(path_sym)
    else:
        edge_index = dataset.edge_index('paper', 'cites', 'paper')
        edge_index = torch.from_numpy(edge_index)
        adj_t = SparseTensor(
            row=edge_index[0], col=edge_index[1],
            sparse_sizes=(dataset.num_papers, dataset.num_papers),
            is_sorted=True)
        adj_t = adj_t.to_symmetric()
        torch.save(adj_t, path_sym)
    adj_t = gcn_norm(adj_t, add_self_loops=True)
    torch.save(adj_t, path)
print(f'Done! [{time.perf_counter() - t:.2f}s]')

train_idx = dataset.get_idx_split('train')
valid_idx = dataset.get_idx_split('valid')
test_idx = dataset.get_idx_split('test')
num_features = dataset.num_paper_features

pbar = tqdm(total=args.num_layers * (num_features // 128))
pbar.set_description('Pre-processing node features')

for j in range(0, num_features, 128):  # Run spmm in chunks...
    x = dataset.paper_feat[:, j:min(j + 128, num_features)]
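    # NOTE: the original excerpt breaks off here. A plausible continuation of
    # the chunked-SpMM loop, sketched from the surrounding code; the float32
    # cast and the accumulator `xs` are assumptions, not source code:
    x = torch.from_numpy(x.astype(np.float32))
    for _ in range(args.num_layers):
        x = adj_t @ x  # one propagation hop over the GCN-normalized adjacency
        pbar.update(1)
    xs.append(x.to(torch.float16))  # hypothetical per-slice accumulator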
def main():
    parser = argparse.ArgumentParser(description='OGBN-MAG (Full-Batch)')
    parser.add_argument('--device', type=int, default=0)
    parser.add_argument('--log_steps', type=int, default=1)
    parser.add_argument('--num_layers', type=int, default=2)
    parser.add_argument('--hidden_channels', type=int, default=64)
    parser.add_argument('--dropout', type=float, default=0.5)
    parser.add_argument('--lr', type=float, default=0.01)
    parser.add_argument('--epochs', type=int, default=50)
    parser.add_argument('--runs', type=int, default=10)
    args = parser.parse_args()
    print(args)

    device = f'cuda:{args.device}' if torch.cuda.is_available() else 'cpu'
    device = torch.device(device)

    dataset = PygNodePropPredDataset(name='ogbn-mag')
    split_idx = dataset.get_idx_split()
    data = dataset[0]

    # We do not consider those attributes for now.
    data.node_year_dict = None
    data.edge_reltype_dict = None
    print(data)

    # Convert to new transposed `SparseTensor` format and add reverse edges.
    data.adj_t_dict = {}
    for keys, (row, col) in data.edge_index_dict.items():
        sizes = (data.num_nodes_dict[keys[0]], data.num_nodes_dict[keys[2]])
        adj = SparseTensor(row=row, col=col, sparse_sizes=sizes)
        if keys[0] != keys[2]:
            data.adj_t_dict[keys] = adj.t()
            data.adj_t_dict[(keys[2], 'to', keys[0])] = adj
        else:
            data.adj_t_dict[keys] = adj.to_symmetric()
    data.edge_index_dict = None

    x_types = list(data.x_dict.keys())
    edge_types = list(data.adj_t_dict.keys())

    model = RGCN(data.x_dict['paper'].size(-1), args.hidden_channels,
                 dataset.num_classes, args.num_layers, args.dropout,
                 data.num_nodes_dict, x_types, edge_types)

    data = data.to(device)
    model = model.to(device)
    train_idx = split_idx['train']['paper'].to(device)

    evaluator = Evaluator(name='ogbn-mag')
    logger = Logger(args.runs, args)

    for run in range(args.runs):
        model.reset_parameters()
        optimizer = torch.optim.Adam(model.parameters(), lr=args.lr)
        for epoch in range(1, 1 + args.epochs):
            loss = train(model, data.x_dict, data.adj_t_dict,
                         data.y_dict['paper'], train_idx, optimizer)
            result = test(model, data.x_dict, data.adj_t_dict,
                          data.y_dict['paper'], split_idx, evaluator)
            logger.add_result(run, result)
            if epoch % args.log_steps == 0:
                train_acc, valid_acc, test_acc = result
                print(f'Run: {run + 1:02d}, '
                      f'Epoch: {epoch:02d}, '
                      f'Loss: {loss:.4f}, '
                      f'Train: {100 * train_acc:.2f}%, '
                      f'Valid: {100 * valid_acc:.2f}%, '
                      f'Test: {100 * test_acc:.2f}%')
        logger.print_statistics(run)
    logger.print_statistics()
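# Standard entry-point guard, with an example invocation (the script name is
# an assumption):
#
#   python ogbn_mag_full_batch.py --device 0 --num_layers 2 \
#       --hidden_channels 64 --dropout 0.5 --lr 0.01 --epochs 50 --runs 10
if __name__ == '__main__':
    main()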
def prepare_data(self):
    dataset = MAG240MDataset(self.data_dir)

    path = f'{dataset.root}/mag240m/paper_to_paper_symmetric.pt'
    if not osp.exists(path):  # Will take approximately 5 minutes...
        t = time.perf_counter()
        print('Converting adjacency matrix...', end=' ', flush=True)
        edge_index = dataset.edge_index('paper', 'cites', 'paper')
        edge_index = torch.from_numpy(edge_index)
        adj_t = SparseTensor(
            row=edge_index[0], col=edge_index[1],
            sparse_sizes=(dataset.num_papers, dataset.num_papers),
            is_sorted=True)
        torch.save(adj_t.to_symmetric(), path)
        print(f'Done! [{time.perf_counter() - t:.2f}s]')

    path = f'{dataset.root}/mag240m/full_adj_t.pt'
    if not osp.exists(path):  # Will take approximately 16 minutes...
        t = time.perf_counter()
        print('Merging adjacency matrices...', end=' ', flush=True)

        row, col, _ = torch.load(
            f'{dataset.root}/mag240m/paper_to_paper_symmetric.pt').coo()
        rows, cols = [row], [col]

        edge_index = dataset.edge_index('author', 'writes', 'paper')
        row, col = torch.from_numpy(edge_index)
        row += dataset.num_papers
        rows += [row, col]
        cols += [col, row]

        edge_index = dataset.edge_index('author', 'institution')
        row, col = torch.from_numpy(edge_index)
        row += dataset.num_papers
        col += dataset.num_papers + dataset.num_authors
        rows += [row, col]
        cols += [col, row]

        edge_types = [
            torch.full(x.size(), i, dtype=torch.int8)
            for i, x in enumerate(rows)
        ]

        row = torch.cat(rows, dim=0)
        del rows
        col = torch.cat(cols, dim=0)
        del cols

        N = (dataset.num_papers + dataset.num_authors +
             dataset.num_institutions)

        perm = (N * row).add_(col).numpy().argsort()
        perm = torch.from_numpy(perm)
        row = row[perm]
        col = col[perm]

        edge_type = torch.cat(edge_types, dim=0)[perm]
        del edge_types

        full_adj_t = SparseTensor(row=row, col=col, value=edge_type,
                                  sparse_sizes=(N, N), is_sorted=True)

        torch.save(full_adj_t, path)
        print(f'Done! [{time.perf_counter() - t:.2f}s]')

    path = f'{dataset.root}/mag240m/full_feat.npy'
    # Indicates whether full_feat processing has finished or not.
    done_flag_path = f'{dataset.root}/mag240m/full_feat_done.txt'
    if not osp.exists(done_flag_path):  # Will take approximately 3 hours...
        if os.path.exists(path):
            print('Removing unfinished full_feat.npy')
            os.remove(path)

        try:
            t = time.perf_counter()
            print('Generating full feature matrix...')

            N = (dataset.num_papers + dataset.num_authors +
                 dataset.num_institutions)
            x = np.memmap(path, dtype=np.float16, mode='w+',
                          shape=(N, self.num_features))
            paper_feat = dataset.paper_feat
            dim_chunk = 64
            chunk = 100000

            print('Copying paper features...')
            for i in tqdm(range(0, dataset.num_papers, chunk)):
                # Copy paper features.
                end_idx = min(i + chunk, dataset.num_papers)
                x[i:end_idx] = paper_feat[i:end_idx]

            edge_index = dataset.edge_index('author', 'writes', 'paper')
            row, col = torch.from_numpy(edge_index)
            adj_t = SparseTensor(
                row=row, col=col,
                sparse_sizes=(dataset.num_authors, dataset.num_papers),
                is_sorted=True)

            print('Generating author features...')
            # Processing 64-dim subfeatures at a time for memory efficiency.
            for i in tqdm(range(0, self.num_features, dim_chunk)):
                end_idx = min(i + dim_chunk, self.num_features)
                inputs = torch.from_numpy(
                    get_col_slice(paper_feat, start_row_idx=0,
                                  end_row_idx=len(paper_feat),
                                  start_col_idx=i, end_col_idx=end_idx))
                outputs = adj_t.matmul(inputs, reduce='mean').numpy()
                del inputs
                save_col_slice(
                    x_from=outputs, x_to=x,
                    start_row_idx=dataset.num_papers,
                    end_row_idx=dataset.num_papers + dataset.num_authors,
                    start_col_idx=i, end_col_idx=end_idx)
                del outputs

            edge_index = dataset.edge_index('author', 'institution')
            row, col = torch.from_numpy(edge_index)
            adj_t = SparseTensor(
                row=col, col=row,
                sparse_sizes=(dataset.num_institutions, dataset.num_authors),
                is_sorted=False)

            print('Generating institution features...')
            # Processing 64-dim subfeatures at a time for memory efficiency.
            for i in tqdm(range(0, self.num_features, dim_chunk)):
                end_idx = min(i + dim_chunk, self.num_features)
                inputs = torch.from_numpy(
                    get_col_slice(
                        x, start_row_idx=dataset.num_papers,
                        end_row_idx=dataset.num_papers + dataset.num_authors,
                        start_col_idx=i, end_col_idx=end_idx))
                outputs = adj_t.matmul(inputs, reduce='mean').numpy()
                del inputs
                save_col_slice(
                    x_from=outputs, x_to=x,
                    start_row_idx=dataset.num_papers + dataset.num_authors,
                    end_row_idx=N, start_col_idx=i, end_col_idx=end_idx)
                del outputs

            x.flush()
            del x
            print(f'Done! [{time.perf_counter() - t:.2f}s]')

            with open(done_flag_path, 'w') as f:
                f.write('done')

        except Exception:
            traceback.print_exc()
            if os.path.exists(path):
                print('Removing unfinished full feat file due to exception')
                os.remove(path)
            exit(-1)
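# Once the done flag exists, downstream code can open the finished feature
# matrix read-only without loading it into RAM. A minimal sketch; note that
# `np.memmap` stores no shape metadata, so the shape must be passed again:
N = dataset.num_papers + dataset.num_authors + dataset.num_institutions
full_feat = np.memmap(f'{dataset.root}/mag240m/full_feat.npy',
                      dtype=np.float16, mode='r',
                      shape=(N, dataset.num_paper_features))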