def setup(self, stage: Optional[str] = None): t = time.perf_counter() print("Reading dataset...", end=" ", flush=True) dataset = MAG240MDataset(self.data_dir) self.train_idx = torch.from_numpy(dataset.get_idx_split("train")) self.train_idx = self.train_idx self.train_idx.share_memory_() self.val_idx = torch.from_numpy(dataset.get_idx_split("valid")) self.val_idx.share_memory_() self.test_idx = torch.from_numpy(dataset.get_idx_split("test")) self.test_idx.share_memory_() N = dataset.num_papers + dataset.num_authors + dataset.num_institutions self.x = np.memmap( f"{dataset.dir}/full_feat.npy", dtype=np.float16, mode="r", shape=(N, self.num_features), ) self.y = torch.from_numpy(dataset.all_paper_label) path = f"{dataset.dir}/full_adj_t.pt" self.adj_t = torch.load(path) print(f"Done! [{time.perf_counter() - t:.2f}s]")
def prepare_data(self):
    dataset = MAG240MDataset(self.data_dir)
    edge_path = f'{dataset.dir}/paper_to_paper_symmetric_pgl'
    t = time.perf_counter()
    if not osp.exists(edge_path):
        log.info('Converting adjacency matrix...')
        edge_index = dataset.edge_index('paper', 'cites', 'paper')
        edge_index = edge_index.T
        # Add reverse edges so the citation graph becomes symmetric.
        edges_new = np.zeros((edge_index.shape[0], 2))
        edges_new[:, 0] = edge_index[:, 1]
        edges_new[:, 1] = edge_index[:, 0]
        edge_index = np.vstack((edge_index, edges_new))
        edge_index = np.unique(edge_index, axis=0)

        graph = Graph(edge_index, sorted=True)
        graph.adj_dst_index  # Build the destination index before dumping.
        graph.dump(edge_path)
        log.info(f'Done! [{time.perf_counter() - t:.2f}s]')

    np.random.seed(self.seed)
    self.train_idx = dataset.get_idx_split('train')
    np.random.shuffle(self.train_idx)
    self.val_idx = dataset.get_idx_split('valid')
    self.test_idx = dataset.get_idx_split('test')

    self.x = dataset.paper_feat
    self.y = dataset.all_paper_label
    self.graph = Graph.load(edge_path, mmap_mode='r+')
    log.info(f'Done! [{time.perf_counter() - t:.2f}s]')
def setup(self, stage: Optional[str] = None):
    t = time.perf_counter()
    print('Reading dataset...', end=' ', flush=True)
    dataset = MAG240MDataset(self.data_dir)

    self.train_idx = torch.from_numpy(dataset.get_idx_split('train'))
    self.train_idx.share_memory_()
    self.val_idx = torch.from_numpy(dataset.get_idx_split('valid'))
    self.val_idx.share_memory_()
    self.test_idx = torch.from_numpy(dataset.get_idx_split('test'))
    self.test_idx.share_memory_()

    N = dataset.num_papers + dataset.num_authors + dataset.num_institutions
    x = np.memmap(f'{dataset.dir}/full_feat.npy', dtype=np.float16,
                  mode='r', shape=(N, self.num_features))
    if self.in_memory:
        # Materialize the memmap into shared memory for faster access.
        self.x = np.empty((N, self.num_features), dtype=np.float16)
        self.x[:] = x
        self.x = torch.from_numpy(self.x).share_memory_()
    else:
        self.x = x

    self.y = torch.from_numpy(dataset.all_paper_label)
    path = f'{dataset.dir}/full_adj_t.pt'
    self.adj_t = torch.load(path)
    print(f'Done! [{time.perf_counter() - t:.2f}s]')
def prepare_data(self):
    dataset = MAG240MDataset(self.data_dir)
    log.info(dataset.num_authors)
    log.info(dataset.num_institutions)
    author_path = f'{dataset.dir}/author_feat_year.npy'
    path = f'{dataset.dir}/institution_feat_year.npy'
    t = time.perf_counter()
    if not osp.exists(path):
        log.info('get institution_feat...')
        author_feat = np.memmap(author_path, dtype=np.int32, mode='r',
                                shape=(dataset.num_authors, ))
        author_feat = author_feat[:]
        author_feat = np.expand_dims(author_feat, axis=1)

        # author -> institution edges
        edge_index = dataset.edge_index('author', 'institution')
        edge_index = edge_index.T
        log.info(edge_index.shape)
        institution_graph = Graph(edge_index,
                                  num_nodes=dataset.num_institutions)
        institution_graph.tensor()
        log.info('finish institution graph')

        institution_x = np.memmap(path, dtype=np.int32, mode='w+',
                                  shape=(dataset.num_institutions, ))

        # In-degree of each institution, used to turn the summed author
        # years into a mean.
        degree = paddle.zeros(shape=[dataset.num_institutions, 1],
                              dtype='float32')
        temp_one = paddle.ones(shape=[edge_index.shape[0], 1],
                               dtype='float32')
        degree = scatter(degree, overwrite=False,
                         index=institution_graph.edges[:, 1],
                         updates=temp_one)
        log.info('finish degree')

        inputs = paddle.to_tensor(author_feat, dtype='float32')
        outputs = institution_graph.send_recv(inputs)
        outputs = outputs / degree
        outputs = outputs.astype('int32').numpy()
        del inputs
        save_col_slice(x_src=outputs, x_dst=institution_x,
                       start_row_idx=0,
                       end_row_idx=dataset.num_institutions)
        del outputs
        institution_x.flush()
        del institution_x
        log.info(f'Done! [{time.perf_counter() - t:.2f}s]')
def prepare_data(self):
    dataset = MAG240MDataset(self.data_dir)
    path = f'{dataset.dir}/paper_to_paper_symmetric.pt'
    if not osp.exists(path):
        t = time.perf_counter()
        print('Converting adjacency matrix...', end=' ', flush=True)
        edge_index = dataset.edge_index('paper', 'cites', 'paper')
        edge_index = torch.from_numpy(edge_index)
        adj_t = SparseTensor(
            row=edge_index[0], col=edge_index[1],
            sparse_sizes=(dataset.num_papers, dataset.num_papers),
            is_sorted=True)
        torch.save(adj_t.to_symmetric(), path)
        print(f'Done! [{time.perf_counter() - t:.2f}s]')
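# A minimal sketch (not part of the pipeline above) of loading and
# inspecting the saved symmetric adjacency; the local path is an
# assumption, and torch_sparse must be installed so the pickled
# SparseTensor can be restored.
import torch

adj_t = torch.load('paper_to_paper_symmetric.pt')
row, col, _ = adj_t.coo()          # COO view: row/col index tensors
print(adj_t.sizes(), adj_t.nnz())  # (num_papers, num_papers), edge count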
def split_dir(data_dir, output_dir):
    dataset = MAG240MDataset(data_dir)
    valid_idx = dataset.get_idx_split("valid")
    np.random.seed(999)
    np.random.shuffle(valid_idx)
    end = len(valid_idx)
    part = len(valid_idx) // 5 + 1
    # Split the shuffled validation indices into 5 roughly equal parts.
    for idx, x in enumerate(range(0, end, part)):
        y = min(x + part, end)
        valid_part = valid_idx[x:y]
        print(valid_part.shape)
        path_p = f"{output_dir}/valid_{idx}"
        np.save(path_p, valid_part)
def prepare_data(self):
    dataset = MAG240MDataset(self.data_dir)
    log.info(dataset.num_authors)
    log.info(dataset.num_papers)
    path = f'{dataset.dir}/author_feat_year.npy'
    t = time.perf_counter()
    if not osp.exists(path):
        log.info('get author_feat...')
        paper_feat = dataset.all_paper_year
        paper_feat = np.expand_dims(paper_feat, axis=1)

        # paper -> author edges
        edge_index = dataset.edge_index('author', 'writes', 'paper')
        edge_index = edge_index.T
        row, col = edge_index[:, 0], edge_index[:, 1]
        edge_index = np.stack([col, row], axis=1)
        log.info(edge_index.shape)
        author_graph = BiGraph(edge_index, dst_num_nodes=dataset.num_authors)
        author_graph.tensor()
        log.info('finish author graph')

        author_x_year = np.memmap(path, dtype=np.int32, mode='w+',
                                  shape=(dataset.num_authors, ))

        # In-degree of each author (number of written papers); the small
        # epsilon avoids division by zero for isolated authors.
        degree = paddle.zeros(shape=[dataset.num_authors, 1], dtype='float32')
        degree += 1e-10
        temp_one = paddle.ones(shape=[edge_index.shape[0], 1], dtype='float32')
        degree = scatter(degree, author_graph.edges[:, 1], temp_one,
                         overwrite=False)
        log.info('finish degree')

        inputs = paddle.to_tensor(paper_feat, dtype='float32')
        outputs = author_graph.send_recv(inputs)
        outputs = outputs / degree
        outputs = outputs.astype('int32').numpy()
        del inputs
        save_col_slice(x_src=outputs, x_dst=author_x_year,
                       start_row_idx=0, end_row_idx=dataset.num_authors)
        del outputs
        author_x_year.flush()
        del author_x_year
        log.info(f'Done! [{time.perf_counter() - t:.2f}s]')
def create_split_dir(source_dir, split_dir):
    dataset = MAG240MDataset(source_dir)
    if not os.path.exists(split_dir):
        os.mkdir(split_dir)
    valid_idx = dataset.get_idx_split("valid")
    np.random.seed(999)
    np.random.shuffle(valid_idx)
    end = len(valid_idx)
    part = len(valid_idx) // 5 + 1
    for idx, x in enumerate(range(0, end, part)):
        y = min(x + part, end)
        valid_part = valid_idx[x:y]
        print(valid_part.shape)
        split_file = f"{split_dir}/valid_{idx}"
        np.save(split_file, valid_part)
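# Hedged usage sketch: np.save appends the '.npy' suffix, so the five
# validation folds written by create_split_dir are read back like this
# (the directory name is an assumption).
import numpy as np

split_dir = './splits'
folds = [np.load(f'{split_dir}/valid_{i}.npy') for i in range(5)]
print([fold.shape for fold in folds])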
def setup(self, stage: Optional[str] = None):
    t = time.perf_counter()
    print('Reading dataset...', end=' ', flush=True)
    dataset = MAG240MDataset(self.data_dir)

    self.train_idx = torch.from_numpy(dataset.get_idx_split('train'))
    self.train_idx.share_memory_()
    self.val_idx = torch.from_numpy(dataset.get_idx_split('valid'))
    self.val_idx.share_memory_()
    self.test_idx = torch.from_numpy(dataset.get_idx_split('test'))
    self.test_idx.share_memory_()

    self.x = dataset.paper_feat
    self.y = torch.from_numpy(dataset.all_paper_label)
    path = f'{dataset.dir}/paper_to_paper_symmetric.pt'
    self.adj_t = torch.load(path)
    print(f'Done! [{time.perf_counter() - t:.2f}s]')
def get_result(config, eval_all=False):
    dataset = MAG240MDataset(config.data_dir)
    evaluator = MAG240MEvaluator()

    # Memmap of the soft pseudo-labels produced by the model
    # (121751666 papers x 153 classes).
    pseudo_label = np.memmap('model_result_temp', dtype=np.float32,
                             mode='r', shape=(121751666, 153))
    label = dataset.all_paper_label

    with open('ck_result.txt', 'a', encoding='utf-8') as wf:
        if eval_all:
            valid_idx = dataset.get_idx_split('valid')
            pred = pseudo_label[valid_idx]
            save_path = os.path.join(config.valid_path, "all_eval_result")
            np.save(save_path, pred)
            y_pred = pred.argmax(1)
            y_true = label[valid_idx]
            valid_acc = evaluator.eval({'y_true': y_true,
                                        'y_pred': y_pred})['acc']
            print("all eval result")
            print(f"valid_acc: {valid_acc}")
            wf.write("all eval result\n")
            wf.write(f"valid_acc: {valid_acc}\n")
        else:
            valid_path = os.path.join(config.valid_path, config.valid_name)
            valid_idx = np.load(valid_path)
            test_idx = dataset.get_idx_split('test')
            pred = pseudo_label[valid_idx]
            y_pred = pred.argmax(1)
            y_true = label[valid_idx]
            valid_acc = evaluator.eval({'y_true': y_true,
                                        'y_pred': y_pred})['acc']
            print(f"eval cv {config.valid_name} result")
            print(f"valid_acc: {valid_acc}")
            wf.write(f"eval cv {config.valid_name} result\n")
            wf.write(f"valid_acc: {valid_acc}\n")
            save_path_test = os.path.join(config.valid_path, config.test_name)
            pred_test = pseudo_label[test_idx]
            print(pred_test.shape)
            np.save(save_path_test, pred_test)
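# Hedged sketch of packaging the saved test predictions into an OGB-LSC
# submission; the file name is hypothetical, and newer ogb releases also
# take a `mode` argument ('test-dev' or 'test-challenge') on
# save_test_submission.
from ogb.lsc import MAG240MEvaluator
import numpy as np

evaluator = MAG240MEvaluator()
pred_test = np.load('test_result.npy')  # soft predictions saved above
evaluator.save_test_submission({'y_pred': pred_test.argmax(1)},
                               dir_path='./results')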
import argparse
import os.path as osp
import time

from tqdm import tqdm
import torch
import numpy as np
from torch_sparse import SparseTensor
from torch_geometric.nn.conv.gcn_conv import gcn_norm
from ogb.lsc import MAG240MDataset

from root import ROOT

if __name__ == '__main__':
    parser = argparse.ArgumentParser()
    parser.add_argument('--num_layers', type=int, default=3)
    args = parser.parse_args()
    print(args)

    dataset = MAG240MDataset(ROOT)

    t = time.perf_counter()
    print('Reading adjacency matrix...', end=' ', flush=True)
    path = dataset.root + '/mag240m/paper_to_paper_symmetric_gcn.pt'
    if osp.exists(path):
        adj_t = torch.load(path)
    else:
        path_sym = dataset.root + '/mag240m/paper_to_paper_symmetric.pt'
        if osp.exists(path_sym):
            adj_t = torch.load(path_sym)
        else:
            edge_index = dataset.edge_index('paper', 'cites', 'paper')
            edge_index = torch.from_numpy(edge_index)
            adj_t = SparseTensor(
                row=edge_index[0], col=edge_index[1],
                sparse_sizes=(dataset.num_papers, dataset.num_papers),
                is_sorted=True)
            # The original snippet is truncated here; by analogy with the
            # prepare_data variants above, the matrix would next be
            # symmetrized (and, given the file name, normalized with
            # gcn_norm before being saved).
def prepare_data(self):
    dataset = MAG240MDataset(self.data_dir)
    graph_file_list = []

    paper_edge_path = f'{dataset.dir}/paper_to_paper_symmetric_pgl_split'
    graph_file_list.append(paper_edge_path)
    t = time.perf_counter()
    if not osp.exists(paper_edge_path):
        log.info('Converting adjacency matrix...')
        edge_index = dataset.edge_index('paper', 'cites', 'paper')
        edge_index = edge_index.T
        edges_new = np.zeros((edge_index.shape[0], 2))
        edges_new[:, 0] = edge_index[:, 1]
        edges_new[:, 1] = edge_index[:, 0]
        edge_index = np.vstack((edge_index, edges_new))
        edge_types = np.full([edge_index.shape[0], ], 0, dtype='int32')
        graph = Graph(edge_index,
                      num_nodes=dataset.num_papers,
                      edge_feat={'edge_type': edge_types})
        graph.adj_dst_index
        graph.dump(paper_edge_path)
        log.info(f'Done! [{time.perf_counter() - t:.2f}s]')

    author_edge_path = f'{dataset.dir}/paper_to_author_symmetric_pgl_split_src'
    graph_file_list.append(author_edge_path)
    t = time.perf_counter()
    if not osp.exists(author_edge_path):
        log.info('Converting author matrix...')
        # author
        log.info('adding author edges')
        edge_index = dataset.edge_index('author', 'writes', 'paper')
        edge_index = edge_index.T
        row, col = edge_index[:, 0], edge_index[:, 1]
        log.info(row[:10])
        row += dataset.num_papers
        edge_types = np.full(row.shape, 1, dtype='int32')
        edge_index = np.stack([row, col], axis=1)
        graph = Graph(edge_index, edge_feat={'edge_type': edge_types})
        graph.adj_dst_index
        graph.dump(author_edge_path)
        log.info(f'Done! finish author_edge [{time.perf_counter() - t:.2f}s]')

    author_edge_path = f'{dataset.dir}/paper_to_author_symmetric_pgl_split_dst'
    graph_file_list.append(author_edge_path)
    t = time.perf_counter()
    if not osp.exists(author_edge_path):
        log.info('Converting author matrix...')
        # author
        log.info('adding author edges')
        edge_index = dataset.edge_index('author', 'writes', 'paper')
        edge_index = edge_index.T
        row, col = edge_index[:, 0], edge_index[:, 1]
        log.info(row[:10])
        row += dataset.num_papers
        edge_types = np.full(row.shape, 2, dtype='int32')
        edge_index = np.stack([col, row], axis=1)
        graph = Graph(edge_index, edge_feat={'edge_type': edge_types})
        graph.adj_dst_index
        graph.dump(author_edge_path)
        log.info(f'Done! finish author_edge [{time.perf_counter() - t:.2f}s]')

    institution_edge_path = f'{dataset.dir}/institution_edge_symmetric_pgl_split_src'
    graph_file_list.append(institution_edge_path)
    t = time.perf_counter()
    if not osp.exists(institution_edge_path):
        log.info('Converting institution matrix...')
        # institution
        log.info('adding institution edges')
        edge_index = dataset.edge_index('author', 'institution')
        edge_index = edge_index.T
        row, col = edge_index[:, 0], edge_index[:, 1]
        log.info(row[:10])
        row += dataset.num_papers
        col += dataset.num_papers + dataset.num_authors
        # edge_type
        log.info('building edge type')
        edge_types = np.full(row.shape, 3, dtype='int32')
        edge_index = np.stack([row, col], axis=1)
        graph = Graph(edge_index, edge_feat={'edge_type': edge_types})
        graph.adj_dst_index
        graph.dump(institution_edge_path)
        log.info(
            f'Done! finish institution_edge [{time.perf_counter() - t:.2f}s]')

    institution_edge_path = f'{dataset.dir}/institution_edge_symmetric_pgl_split_dst'
    graph_file_list.append(institution_edge_path)
    t = time.perf_counter()
    if not osp.exists(institution_edge_path):
        log.info('Converting institution matrix...')
        # institution
        log.info('adding institution edges')
        edge_index = dataset.edge_index('author', 'institution')
        edge_index = edge_index.T
        row, col = edge_index[:, 0], edge_index[:, 1]
        log.info(row[:10])
        row += dataset.num_papers
        col += dataset.num_papers + dataset.num_authors
        # edge_type
        log.info('building edge type')
        edge_types = np.full(row.shape, 4, dtype='int32')
        edge_index = np.stack([col, row], axis=1)
        graph = Graph(edge_index, edge_feat={'edge_type': edge_types})
        graph.adj_dst_index
        graph.dump(institution_edge_path)
        log.info(
            f'Done! finish institution_edge [{time.perf_counter() - t:.2f}s]')

    path = f'{dataset.dir}/full_feat.npy'
    author_feat_path = f'{dataset.dir}/author_feat.npy'
    institution_feat_path = f'{dataset.dir}/institution_feat.npy'
    t = time.perf_counter()
    if not osp.exists(path):  # Will take ~3 hours...
        print('Generating full feature matrix...')
        node_chunk_size = 100000
        N = (dataset.num_papers + dataset.num_authors +
             dataset.num_institutions)
        paper_feat = dataset.paper_feat
        author_feat = np.memmap(author_feat_path, dtype=np.float16,
                                shape=(dataset.num_authors,
                                       self.num_features),
                                mode='r')
        institution_feat = np.memmap(institution_feat_path, dtype=np.float16,
                                     shape=(dataset.num_institutions,
                                            self.num_features),
                                     mode='r')
        x = np.memmap(path, dtype=np.float16, mode='w+',
                      shape=(N, self.num_features))

        print('Copying paper features...')
        start_idx = 0
        end_idx = dataset.num_papers
        for i in tqdm(range(start_idx, end_idx, node_chunk_size)):
            j = min(i + node_chunk_size, end_idx)
            x[i:j] = paper_feat[i:j]
        del paper_feat

        print('Copying author feature...')
        start_idx = dataset.num_papers
        end_idx = dataset.num_papers + dataset.num_authors
        for i in tqdm(range(start_idx, end_idx, node_chunk_size)):
            j = min(i + node_chunk_size, end_idx)
            x[i:j] = author_feat[i - start_idx:j - start_idx]
        del author_feat

        print('Copying institution feature...')
        start_idx = dataset.num_papers + dataset.num_authors
        end_idx = N
        for i in tqdm(range(start_idx, end_idx, node_chunk_size)):
            j = min(i + node_chunk_size, end_idx)
            x[i:j] = institution_feat[i - start_idx:j - start_idx]
        del institution_feat

        x.flush()
        del x
        print(f'Done! [{time.perf_counter() - t:.2f}s]')

    np.random.seed(self.seed)
    self.train_idx = dataset.get_idx_split('train')
    self.val_idx = dataset.get_idx_split('valid')
    valid_name = os.path.join(self.valid_path, self.valid_name)
    self.val_idx_cv = np.load(valid_name)
    log.info(self.train_idx.shape)
    log.info(self.val_idx.shape)
    log.info(self.val_idx_cv.shape)
    self.test_idx = dataset.get_idx_split('test')

    def get_sinusoid_encoding_table(n_position, d_hid, padding_idx=None):
        def cal_angle(position, hid_idx):
            return position / np.power(10000, 2 * (hid_idx // 2) / d_hid)

        def get_posi_angle_vec(position):
            return [cal_angle(position, hid_j) for hid_j in range(d_hid)]

        sinusoid_table = np.array(
            [get_posi_angle_vec(pos_i) for pos_i in range(n_position)])
        sinusoid_table[:, 0::2] = np.sin(sinusoid_table[:, 0::2])  # dim 2i
        sinusoid_table[:, 1::2] = np.cos(sinusoid_table[:, 1::2])  # dim 2i+1
        return sinusoid_table

    N = dataset.num_papers + dataset.num_authors + dataset.num_institutions
    self.x = np.memmap(f'{dataset.dir}/full_feat.npy', dtype=np.float16,
                       mode='r', shape=(N, self.num_features))
    self.id_x = np.memmap(f'{dataset.dir}/{self.m2v_file}', dtype=np.float16,
                          mode='r', shape=(N, self.m2v_dim))
    self.y = dataset.all_paper_label
    self.graph = [
        Graph.load(edge_path, mmap_mode='r+')
        for edge_path in graph_file_list
    ]
    self.pos = get_sinusoid_encoding_table(200, 768)
    year_file = f'{dataset.dir}/all_feat_year.npy'
    self.year = np.memmap(year_file, dtype=np.int32, mode='r', shape=(N, ))
    self.num_papers = dataset.num_papers
    self.train_idx_label = None
    self.train_idx_data = None
    log.info(f'Done! [{time.perf_counter() - t:.2f}s]')
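# Illustrative sanity check of get_sinusoid_encoding_table (assuming the
# helper is lifted to module scope): even columns hold sines, odd columns
# cosines, so position 0 yields all zeros and all ones respectively.
import numpy as np

table = get_sinusoid_encoding_table(200, 768)
print(table.shape)                       # (200, 768)
print(np.allclose(table[0, 0::2], 0.0))  # sin(0) == 0
print(np.allclose(table[0, 1::2], 1.0))  # cos(0) == 1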
parser.add_argument('--full-feature-path', type=str,
                    help='Path to the features of all nodes.')
parser.add_argument('--epochs', type=int, default=100,
                    help='Number of epochs.')
parser.add_argument('--model-path', type=str, default='./model.pt',
                    help='Path to store the best model.')
parser.add_argument('--submission-path', type=str, default='./results',
                    help='Submission directory.')
args = parser.parse_args()

dataset = MAG240MDataset(root=args.rootdir)

print('Loading graph')
(g,), _ = dgl.load_graphs(args.graph_path)
g = g.formats(['csc'])

print('Loading features')
paper_offset = dataset.num_authors + dataset.num_institutions
num_nodes = paper_offset + dataset.num_papers
num_features = dataset.num_paper_features
feats = np.memmap(args.full_feature_path, mode='r', dtype='float16',
                  shape=(num_nodes, num_features))

if args.epochs != 0:
    # ... (training loop follows in the original script; truncated here)
    pass
def prepare_data(self):
    dataset = MAG240MDataset(self.data_dir)
    graph_file_list = []

    paper_edge_path = f'{dataset.dir}/paper_to_paper_symmetric_pgl_split'
    graph_file_list.append(paper_edge_path)
    t = time.perf_counter()
    if not osp.exists(paper_edge_path):
        log.info('Converting adjacency matrix...')
        edge_index = dataset.edge_index('paper', 'cites', 'paper')
        edge_index = edge_index.T
        edges_new = np.zeros((edge_index.shape[0], 2))
        edges_new[:, 0] = edge_index[:, 1]
        edges_new[:, 1] = edge_index[:, 0]
        edge_index = np.vstack((edge_index, edges_new))
        edge_types = np.full([edge_index.shape[0], ], 0, dtype='int32')
        graph = Graph(edge_index,
                      num_nodes=dataset.num_papers,
                      edge_feat={'edge_type': edge_types})
        graph.adj_dst_index
        graph.dump(paper_edge_path)
        log.info(f'Done! [{time.perf_counter() - t:.2f}s]')

    author_edge_path = f'{dataset.dir}/paper_to_author_symmetric_pgl_split_src'
    graph_file_list.append(author_edge_path)
    t = time.perf_counter()
    if not osp.exists(author_edge_path):
        log.info('Converting author matrix...')
        # author
        log.info('adding author edges')
        edge_index = dataset.edge_index('author', 'writes', 'paper')
        edge_index = edge_index.T
        row, col = edge_index[:, 0], edge_index[:, 1]
        log.info(row[:10])
        row += dataset.num_papers
        edge_types = np.full(row.shape, 1, dtype='int32')
        edge_index = np.stack([row, col], axis=1)
        graph = Graph(edge_index, edge_feat={'edge_type': edge_types})
        graph.adj_dst_index
        graph.dump(author_edge_path)
        log.info(f'Done! finish author_edge [{time.perf_counter() - t:.2f}s]')

    author_edge_path = f'{dataset.dir}/paper_to_author_symmetric_pgl_split_dst'
    graph_file_list.append(author_edge_path)
    t = time.perf_counter()
    if not osp.exists(author_edge_path):
        log.info('Converting author matrix...')
        # author
        log.info('adding author edges')
        edge_index = dataset.edge_index('author', 'writes', 'paper')
        edge_index = edge_index.T
        row, col = edge_index[:, 0], edge_index[:, 1]
        log.info(row[:10])
        row += dataset.num_papers
        edge_types = np.full(row.shape, 2, dtype='int32')
        edge_index = np.stack([col, row], axis=1)
        graph = Graph(edge_index, edge_feat={'edge_type': edge_types})
        graph.adj_dst_index
        graph.dump(author_edge_path)
        log.info(f'Done! finish author_edge [{time.perf_counter() - t:.2f}s]')

    institution_edge_path = f'{dataset.dir}/institution_edge_symmetric_pgl_split_src'
    graph_file_list.append(institution_edge_path)
    t = time.perf_counter()
    if not osp.exists(institution_edge_path):
        log.info('Converting institution matrix...')
        # institution
        log.info('adding institution edges')
        edge_index = dataset.edge_index('author', 'institution')
        edge_index = edge_index.T
        row, col = edge_index[:, 0], edge_index[:, 1]
        log.info(row[:10])
        row += dataset.num_papers
        col += dataset.num_papers + dataset.num_authors
        # edge_type
        log.info('building edge type')
        edge_types = np.full(row.shape, 3, dtype='int32')
        edge_index = np.stack([row, col], axis=1)
        graph = Graph(edge_index, edge_feat={'edge_type': edge_types})
        graph.adj_dst_index
        graph.dump(institution_edge_path)
        log.info(
            f'Done! finish institution_edge [{time.perf_counter() - t:.2f}s]')

    institution_edge_path = f'{dataset.dir}/institution_edge_symmetric_pgl_split_dst'
    graph_file_list.append(institution_edge_path)
    t = time.perf_counter()
    if not osp.exists(institution_edge_path):
        log.info('Converting institution matrix...')
        # institution
        log.info('adding institution edges')
        edge_index = dataset.edge_index('author', 'institution')
        edge_index = edge_index.T
        row, col = edge_index[:, 0], edge_index[:, 1]
        log.info(row[:10])
        row += dataset.num_papers
        col += dataset.num_papers + dataset.num_authors
        # edge_type
        log.info('building edge type')
        edge_types = np.full(row.shape, 4, dtype='int32')
        edge_index = np.stack([col, row], axis=1)
        graph = Graph(edge_index, edge_feat={'edge_type': edge_types})
        graph.adj_dst_index
        graph.dump(institution_edge_path)
        log.info(
            f'Done! finish institution_edge [{time.perf_counter() - t:.2f}s]')

    path = f'{dataset.dir}/full_feat.npy'
    author_feat_path = f'{dataset.dir}/author_feat.npy'
    institution_feat_path = f'{dataset.dir}/institution_feat.npy'
    t = time.perf_counter()
    if not osp.exists(path):  # Will take ~3 hours...
        print('Generating full feature matrix...')
        node_chunk_size = 100000
        N = (dataset.num_papers + dataset.num_authors +
             dataset.num_institutions)
        paper_feat = dataset.paper_feat
        author_feat = np.memmap(author_feat_path, dtype=np.float16,
                                shape=(dataset.num_authors,
                                       self.num_features),
                                mode='r')
        institution_feat = np.memmap(institution_feat_path, dtype=np.float16,
                                     shape=(dataset.num_institutions,
                                            self.num_features),
                                     mode='r')
        x = np.memmap(path, dtype=np.float16, mode='w+',
                      shape=(N, self.num_features))

        print('Copying paper features...')
        start_idx = 0
        end_idx = dataset.num_papers
        for i in tqdm(range(start_idx, end_idx, node_chunk_size)):
            j = min(i + node_chunk_size, end_idx)
            x[i:j] = paper_feat[i:j]
        del paper_feat

        print('Copying author feature...')
        start_idx = dataset.num_papers
        end_idx = dataset.num_papers + dataset.num_authors
        for i in tqdm(range(start_idx, end_idx, node_chunk_size)):
            j = min(i + node_chunk_size, end_idx)
            x[i:j] = author_feat[i - start_idx:j - start_idx]
        del author_feat

        print('Copying institution feature...')
        start_idx = dataset.num_papers + dataset.num_authors
        end_idx = N
        for i in tqdm(range(start_idx, end_idx, node_chunk_size)):
            j = min(i + node_chunk_size, end_idx)
            x[i:j] = institution_feat[i - start_idx:j - start_idx]
        del institution_feat

        x.flush()
        del x
        print(f'feature x Done! [{time.perf_counter() - t:.2f}s]')

    path = f'{dataset.dir}/all_feat_year.npy'
    author_year_path = f'{dataset.dir}/author_feat_year.npy'
    institution_year_path = f'{dataset.dir}/institution_feat_year.npy'
    t = time.perf_counter()
    if not osp.exists(path):  # Will take ~3 hours...
        print('Generating full year matrix...')
        node_chunk_size = 100000
        N = (dataset.num_papers + dataset.num_authors +
             dataset.num_institutions)
        paper_year_feat = dataset.all_paper_year
        author_year_feat = np.memmap(author_year_path, dtype=np.int32,
                                     shape=(dataset.num_authors, ),
                                     mode='r')
        institution_year_feat = np.memmap(institution_year_path,
                                          dtype=np.int32,
                                          shape=(dataset.num_institutions, ),
                                          mode='r')
        x = np.memmap(path, dtype=np.int32, mode='w+', shape=(N, ))

        print('Copying paper features...')
        start_idx = 0
        end_idx = dataset.num_papers
        for i in tqdm(range(start_idx, end_idx, node_chunk_size)):
            j = min(i + node_chunk_size, end_idx)
            x[i:j] = paper_year_feat[i:j]
        del paper_year_feat

        print('Copying author feature...')
        start_idx = dataset.num_papers
        end_idx = dataset.num_papers + dataset.num_authors
        for i in tqdm(range(start_idx, end_idx, node_chunk_size)):
            j = min(i + node_chunk_size, end_idx)
            x[i:j] = author_year_feat[i - start_idx:j - start_idx]
        del author_year_feat

        print('Copying institution feature...')
        start_idx = dataset.num_papers + dataset.num_authors
        end_idx = N
        for i in tqdm(range(start_idx, end_idx, node_chunk_size)):
            j = min(i + node_chunk_size, end_idx)
            x[i:j] = institution_year_feat[i - start_idx:j - start_idx]
        del institution_year_feat

        x.flush()
        del x
        print(f'year feature Done! [{time.perf_counter() - t:.2f}s]')
def prepare_data(self):
    dataset = MAG240MDataset(self.data_dir)
    log.info(dataset.num_authors)
    log.info(dataset.num_institutions)
    author_path = f'{dataset.dir}/author_feat.npy'
    path = f'{dataset.dir}/institution_feat.npy'
    t = time.perf_counter()
    if not osp.exists(path):
        log.info('get institution_feat...')
        author_feat = np.memmap(author_path, dtype=np.float16,
                                shape=(dataset.num_authors,
                                       self.num_features),
                                mode='r')

        # author -> institution edges
        edge_index = dataset.edge_index('author', 'institution')
        edge_index = edge_index.T
        log.info(edge_index.shape)
        institution_graph = BiGraph(edge_index,
                                    dst_num_nodes=dataset.num_institutions)
        institution_graph.tensor()
        log.info('finish institution graph')

        institution_x = np.memmap(
            path, dtype=np.float16, mode='w+',
            shape=(dataset.num_institutions, self.num_features))

        dim_chunk_size = 64
        # In-degree of each institution, used to average the summed
        # author features.
        degree = paddle.zeros(shape=[dataset.num_institutions, 1],
                              dtype='float32')
        temp_one = paddle.ones(shape=[edge_index.shape[0], 1],
                               dtype='float32')
        degree = scatter(degree, overwrite=False,
                         index=institution_graph.edges[:, 1],
                         updates=temp_one)
        log.info('finish degree')

        # Process 64-dim column slices at a time for memory efficiency.
        for i in tqdm(range(0, self.num_features, dim_chunk_size)):
            j = min(i + dim_chunk_size, self.num_features)
            inputs = get_col_slice(author_feat, start_row_idx=0,
                                   end_row_idx=dataset.num_authors,
                                   start_col_idx=i, end_col_idx=j)
            inputs = paddle.to_tensor(inputs, dtype='float32')
            outputs = institution_graph.send_recv(inputs)
            outputs = outputs / degree
            outputs = outputs.astype('float16').numpy()
            del inputs
            save_col_slice(x_src=outputs, x_dst=institution_x,
                           start_row_idx=0,
                           end_row_idx=dataset.num_institutions,
                           start_col_idx=i, end_col_idx=j)
            del outputs

        institution_x.flush()
        del institution_x
        log.info(f'Done! [{time.perf_counter() - t:.2f}s]')
def prepare_data(self):
    dataset = MAG240MDataset(self.data_dir)
    paper_edge_path = f'{dataset.dir}/paper_to_paper_symmetric_pgl'
    t = time.perf_counter()
    if not osp.exists(paper_edge_path):
        log.info('Converting adjacency matrix...')
        edge_index = dataset.edge_index('paper', 'cites', 'paper')
        edge_index = edge_index.T
        edges_new = np.zeros((edge_index.shape[0], 2))
        edges_new[:, 0] = edge_index[:, 1]
        edges_new[:, 1] = edge_index[:, 0]
        edge_index = np.vstack((edge_index, edges_new))
        # edge_index = np.unique(edge_index, axis=0)
        graph = Graph(edge_index)
        graph.adj_dst_index
        graph.dump(paper_edge_path)
        log.info(f'Done! [{time.perf_counter() - t:.2f}s]')

    edge_path = f'{dataset.dir}/full_edge_symmetric_pgl'
    t = time.perf_counter()
    if not osp.exists(edge_path):
        log.info('Converting adjacency matrix...')

        # paper
        log.info('adding paper edges')
        paper_graph = Graph.load(paper_edge_path, mmap_mode='r+')
        rows, cols = [paper_graph.edges[:, 0]], [paper_graph.edges[:, 1]]

        # author
        log.info('adding author edges')
        edge_index = dataset.edge_index('author', 'writes', 'paper')
        edge_index = edge_index.T
        row, col = edge_index[:, 0], edge_index[:, 1]
        row += dataset.num_papers
        rows += [row, col]
        cols += [col, row]

        # institution
        log.info('adding institution edges')
        edge_index = dataset.edge_index('author', 'institution')
        edge_index = edge_index.T
        row, col = edge_index[:, 0], edge_index[:, 1]
        row += dataset.num_papers
        col += dataset.num_papers + dataset.num_authors
        rows += [row, col]
        cols += [col, row]

        # edge_type
        log.info('building edge type')
        edge_types = [
            np.full(x.shape, i, dtype='int32') for i, x in enumerate(rows)
        ]
        edge_types = np.concatenate(edge_types, axis=0)

        log.info('building edges')
        row = np.concatenate(rows, axis=0)
        del rows
        col = np.concatenate(cols, axis=0)
        del cols
        edge_index = np.stack([row, col], axis=1)

        N = dataset.num_papers + dataset.num_authors + dataset.num_institutions
        full_graph = Graph(edge_index, num_nodes=N,
                           edge_feat={'edge_type': edge_types})
        full_graph.adj_dst_index
        full_graph.dump(edge_path)
        log.info(f'Done! finish full_edge [{time.perf_counter() - t:.2f}s]')

    path = f'{dataset.dir}/full_feat.npy'
    author_feat_path = f'{dataset.dir}/author_feat.npy'
    institution_feat_path = f'{dataset.dir}/institution_feat.npy'
    t = time.perf_counter()
    if not osp.exists(path):  # Will take ~3 hours...
        print('Generating full feature matrix...')
        node_chunk_size = 100000
        N = (dataset.num_papers + dataset.num_authors +
             dataset.num_institutions)
        paper_feat = dataset.paper_feat
        author_feat = np.memmap(author_feat_path, dtype=np.float16,
                                shape=(dataset.num_authors,
                                       self.num_features),
                                mode='r')
        institution_feat = np.memmap(institution_feat_path, dtype=np.float16,
                                     shape=(dataset.num_institutions,
                                            self.num_features),
                                     mode='r')
        x = np.memmap(path, dtype=np.float16, mode='w+',
                      shape=(N, self.num_features))

        print('Copying paper features...')
        start_idx = 0
        end_idx = dataset.num_papers
        for i in tqdm(range(start_idx, end_idx, node_chunk_size)):
            j = min(i + node_chunk_size, end_idx)
            x[i:j] = paper_feat[i:j]
        del paper_feat

        print('Copying author feature...')
        start_idx = dataset.num_papers
        end_idx = dataset.num_papers + dataset.num_authors
        for i in tqdm(range(start_idx, end_idx, node_chunk_size)):
            j = min(i + node_chunk_size, end_idx)
            x[i:j] = author_feat[i - start_idx:j - start_idx]
        del author_feat

        print('Copying institution feature...')
        start_idx = dataset.num_papers + dataset.num_authors
        end_idx = N
        for i in tqdm(range(start_idx, end_idx, node_chunk_size)):
            j = min(i + node_chunk_size, end_idx)
            x[i:j] = institution_feat[i - start_idx:j - start_idx]
        del institution_feat

        x.flush()
        del x
        print(f'Done! [{time.perf_counter() - t:.2f}s]')

    np.random.seed(self.seed)
    self.train_idx = dataset.get_idx_split('train')
    np.random.shuffle(self.train_idx)
    self.val_idx = dataset.get_idx_split('valid')
    self.test_idx = dataset.get_idx_split('test')

    N = dataset.num_papers + dataset.num_authors + dataset.num_institutions
    self.x = np.memmap(f'{dataset.dir}/full_feat.npy', dtype=np.float16,
                       mode='r', shape=(N, self.num_features))
    self.y = dataset.all_paper_label
    self.graph = Graph.load(edge_path, mmap_mode='r+')
    self.graph._edge_feat['edge_type'] = self.graph._edge_feat[
        'edge_type'].astype('int32')
    log.info(f'Done! [{time.perf_counter() - t:.2f}s]')
parser.add_argument('--device', type=int, default=0)
parser.add_argument('--hidden_channels', type=int, default=512)
parser.add_argument('--num_layers', type=int, default=2)
parser.add_argument('--no_batch_norm', action='store_true')
parser.add_argument('--relu_last', action='store_true')
parser.add_argument('--dropout', type=float, default=0.5)
parser.add_argument('--lr', type=float, default=0.01)
parser.add_argument('--batch_size', type=int, default=380000)
parser.add_argument('--epochs', type=int, default=1000)
args = parser.parse_args()
print(args)

torch.manual_seed(12345)
device = f'cuda:{args.device}' if torch.cuda.is_available() else 'cpu'

dataset = MAG240MDataset('~/datasets/OGB')
evaluator = MAG240MEvaluator()

train_idx = dataset.get_idx_split('train')
valid_idx = dataset.get_idx_split('valid')

t = time.perf_counter()
print('Reading training node features...', end=' ', flush=True)
x_train = dataset.paper_feat[train_idx]
x_train = torch.from_numpy(x_train).to(torch.float).to(device)
print(f'Done! [{time.perf_counter() - t:.2f}s]')

t = time.perf_counter()
print('Reading validation node features...', end=' ', flush=True)
x_valid = dataset.paper_feat[valid_idx]
x_valid = torch.from_numpy(x_valid).to(torch.float).to(device)
print(f'Done! [{time.perf_counter() - t:.2f}s]')
def prepare_data(self):
    dataset = MAG240MDataset(self.data_dir)

    path = f'{dataset.root}/mag240m/paper_to_paper_symmetric.pt'
    if not osp.exists(path):  # Will take approximately 5 minutes...
        t = time.perf_counter()
        print('Converting adjacency matrix...', end=' ', flush=True)
        edge_index = dataset.edge_index('paper', 'cites', 'paper')
        edge_index = torch.from_numpy(edge_index)
        adj_t = SparseTensor(
            row=edge_index[0], col=edge_index[1],
            sparse_sizes=(dataset.num_papers, dataset.num_papers),
            is_sorted=True)
        torch.save(adj_t.to_symmetric(), path)
        print(f'Done! [{time.perf_counter() - t:.2f}s]')

    path = f'{dataset.root}/mag240m/full_adj_t.pt'
    if not osp.exists(path):  # Will take approximately 16 minutes...
        t = time.perf_counter()
        print('Merging adjacency matrices...', end=' ', flush=True)
        row, col, _ = torch.load(
            f'{dataset.root}/mag240m/paper_to_paper_symmetric.pt').coo()
        rows, cols = [row], [col]

        edge_index = dataset.edge_index('author', 'writes', 'paper')
        row, col = torch.from_numpy(edge_index)
        row += dataset.num_papers
        rows += [row, col]
        cols += [col, row]

        edge_index = dataset.edge_index('author', 'institution')
        row, col = torch.from_numpy(edge_index)
        row += dataset.num_papers
        col += dataset.num_papers + dataset.num_authors
        rows += [row, col]
        cols += [col, row]

        edge_types = [
            torch.full(x.size(), i, dtype=torch.int8)
            for i, x in enumerate(rows)
        ]

        row = torch.cat(rows, dim=0)
        del rows
        col = torch.cat(cols, dim=0)
        del cols

        N = (dataset.num_papers + dataset.num_authors +
             dataset.num_institutions)

        perm = (N * row).add_(col).numpy().argsort()
        perm = torch.from_numpy(perm)
        row = row[perm]
        col = col[perm]

        edge_type = torch.cat(edge_types, dim=0)[perm]
        del edge_types

        full_adj_t = SparseTensor(row=row, col=col, value=edge_type,
                                  sparse_sizes=(N, N), is_sorted=True)
        torch.save(full_adj_t, path)
        print(f'Done! [{time.perf_counter() - t:.2f}s]')

    path = f'{dataset.root}/mag240m/full_feat.npy'
    # Indicates whether full_feat processing has been finished or not.
    done_flag_path = f'{dataset.root}/mag240m/full_feat_done.txt'
    if not osp.exists(done_flag_path):  # Will take approximately 3 hours...
        if os.path.exists(path):
            print('Removing unfinished full_feat.npy')
            os.remove(path)

        try:
            t = time.perf_counter()
            print('Generating full feature matrix...')

            N = (dataset.num_papers + dataset.num_authors +
                 dataset.num_institutions)
            x = np.memmap(path, dtype=np.float16, mode='w+',
                          shape=(N, self.num_features))
            paper_feat = dataset.paper_feat
            dim_chunk = 64
            chunk = 100000

            print('Copying paper features...')
            for i in tqdm(range(0, dataset.num_papers, chunk)):
                end_idx = min(i + chunk, dataset.num_papers)
                x[i:end_idx] = paper_feat[i:end_idx]

            edge_index = dataset.edge_index('author', 'writes', 'paper')
            row, col = torch.from_numpy(edge_index)
            adj_t = SparseTensor(
                row=row, col=col,
                sparse_sizes=(dataset.num_authors, dataset.num_papers),
                is_sorted=True)

            print('Generating author features...')
            # Processing 64-dim subfeatures at a time for memory efficiency.
            for i in tqdm(range(0, self.num_features, dim_chunk)):
                end_idx = min(i + dim_chunk, self.num_features)
                inputs = torch.from_numpy(
                    get_col_slice(paper_feat, start_row_idx=0,
                                  end_row_idx=len(paper_feat),
                                  start_col_idx=i, end_col_idx=end_idx))
                outputs = adj_t.matmul(inputs, reduce='mean').numpy()
                del inputs
                save_col_slice(x_from=outputs, x_to=x,
                               start_row_idx=dataset.num_papers,
                               end_row_idx=dataset.num_papers +
                               dataset.num_authors,
                               start_col_idx=i, end_col_idx=end_idx)
                del outputs

            edge_index = dataset.edge_index('author', 'institution')
            row, col = torch.from_numpy(edge_index)
            adj_t = SparseTensor(
                row=col, col=row,
                sparse_sizes=(dataset.num_institutions, dataset.num_authors),
                is_sorted=False)

            print('Generating institution features...')
            # Processing 64-dim subfeatures at a time for memory efficiency.
            for i in tqdm(range(0, self.num_features, dim_chunk)):
                end_idx = min(i + dim_chunk, self.num_features)
                inputs = torch.from_numpy(
                    get_col_slice(x, start_row_idx=dataset.num_papers,
                                  end_row_idx=dataset.num_papers +
                                  dataset.num_authors,
                                  start_col_idx=i, end_col_idx=end_idx))
                outputs = adj_t.matmul(inputs, reduce='mean').numpy()
                del inputs
                save_col_slice(x_from=outputs, x_to=x,
                               start_row_idx=dataset.num_papers +
                               dataset.num_authors,
                               end_row_idx=N,
                               start_col_idx=i, end_col_idx=end_idx)
                del outputs

            x.flush()
            del x
            print(f'Done! [{time.perf_counter() - t:.2f}s]')

            with open(done_flag_path, 'w') as f:
                f.write('done')
        except Exception:
            traceback.print_exc()
            if os.path.exists(path):
                print('Removing unfinished full feat file due to exception')
                os.remove(path)
            exit(-1)
def prepare_data(self):
    dataset = MAG240MDataset(self.data_dir)

    path = f'{dataset.dir}/paper_to_paper_symmetric.pt'
    if not osp.exists(path):  # Will take approximately 5 minutes...
        t = time.perf_counter()
        print('Converting adjacency matrix...', end=' ', flush=True)
        edge_index = dataset.edge_index('paper', 'cites', 'paper')
        edge_index = torch.from_numpy(edge_index)
        adj_t = SparseTensor(
            row=edge_index[0], col=edge_index[1],
            sparse_sizes=(dataset.num_papers, dataset.num_papers),
            is_sorted=True)
        torch.save(adj_t.to_symmetric(), path)
        print(f'Done! [{time.perf_counter() - t:.2f}s]')

    path = f'{dataset.dir}/full_adj_t.pt'
    if not osp.exists(path):  # Will take approximately 16 minutes...
        t = time.perf_counter()
        print('Merging adjacency matrices...', end=' ', flush=True)
        row, col, _ = torch.load(
            f'{dataset.dir}/paper_to_paper_symmetric.pt').coo()
        rows, cols = [row], [col]

        edge_index = dataset.edge_index('author', 'writes', 'paper')
        row, col = torch.from_numpy(edge_index)
        row += dataset.num_papers
        rows += [row, col]
        cols += [col, row]

        edge_index = dataset.edge_index('author', 'institution')
        row, col = torch.from_numpy(edge_index)
        row += dataset.num_papers
        col += dataset.num_papers + dataset.num_authors
        rows += [row, col]
        cols += [col, row]

        edge_types = [
            torch.full(x.size(), i, dtype=torch.int8)
            for i, x in enumerate(rows)
        ]

        row = torch.cat(rows, dim=0)
        del rows
        col = torch.cat(cols, dim=0)
        del cols

        N = (dataset.num_papers + dataset.num_authors +
             dataset.num_institutions)

        perm = (N * row).add_(col).numpy().argsort()
        perm = torch.from_numpy(perm)
        row = row[perm]
        col = col[perm]

        edge_type = torch.cat(edge_types, dim=0)[perm]
        del edge_types

        full_adj_t = SparseTensor(row=row, col=col, value=edge_type,
                                  sparse_sizes=(N, N), is_sorted=True)
        torch.save(full_adj_t, path)
        print(f'Done! [{time.perf_counter() - t:.2f}s]')

    path = f'{dataset.dir}/full_feat.npy'
    done_flag_path = f'{dataset.dir}/full_feat_done.txt'
    if not osp.exists(done_flag_path):  # Will take ~3 hours...
        t = time.perf_counter()
        print('Generating full feature matrix...')

        node_chunk_size = 100000
        dim_chunk_size = 64
        N = (dataset.num_papers + dataset.num_authors +
             dataset.num_institutions)

        paper_feat = dataset.paper_feat
        x = np.memmap(path, dtype=np.float16, mode='w+',
                      shape=(N, self.num_features))

        print('Copying paper features...')
        for i in tqdm(range(0, dataset.num_papers, node_chunk_size)):
            j = min(i + node_chunk_size, dataset.num_papers)
            x[i:j] = paper_feat[i:j]

        edge_index = dataset.edge_index('author', 'writes', 'paper')
        row, col = torch.from_numpy(edge_index)
        adj_t = SparseTensor(
            row=row, col=col,
            sparse_sizes=(dataset.num_authors, dataset.num_papers),
            is_sorted=True)

        print('Generating author features...')
        # Processing 64-dim subfeatures at a time for memory efficiency.
        for i in tqdm(range(0, self.num_features, dim_chunk_size)):
            j = min(i + dim_chunk_size, self.num_features)
            inputs = get_col_slice(paper_feat, start_row_idx=0,
                                   end_row_idx=dataset.num_papers,
                                   start_col_idx=i, end_col_idx=j)
            inputs = torch.from_numpy(inputs)
            outputs = adj_t.matmul(inputs, reduce='mean').numpy()
            del inputs
            save_col_slice(x_src=outputs, x_dst=x,
                           start_row_idx=dataset.num_papers,
                           end_row_idx=dataset.num_papers +
                           dataset.num_authors,
                           start_col_idx=i, end_col_idx=j)
            del outputs

        edge_index = dataset.edge_index('author', 'institution')
        row, col = torch.from_numpy(edge_index)
        adj_t = SparseTensor(
            row=col, col=row,
            sparse_sizes=(dataset.num_institutions, dataset.num_authors),
            is_sorted=False)

        print('Generating institution features...')
        # Processing 64-dim subfeatures at a time for memory efficiency.
        for i in tqdm(range(0, self.num_features, dim_chunk_size)):
            j = min(i + dim_chunk_size, self.num_features)
            inputs = get_col_slice(x, start_row_idx=dataset.num_papers,
                                   end_row_idx=dataset.num_papers +
                                   dataset.num_authors,
                                   start_col_idx=i, end_col_idx=j)
            inputs = torch.from_numpy(inputs)
            outputs = adj_t.matmul(inputs, reduce='mean').numpy()
            del inputs
            save_col_slice(x_src=outputs, x_dst=x,
                           start_row_idx=dataset.num_papers +
                           dataset.num_authors,
                           end_row_idx=N,
                           start_col_idx=i, end_col_idx=j)
            del outputs

        x.flush()
        del x
        print(f'Done! [{time.perf_counter() - t:.2f}s]')

        with open(done_flag_path, 'w') as f:
            f.write('done')
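# get_col_slice / save_col_slice are used throughout but not defined in
# this section. Below is a sketch consistent with how they are called
# (chunked reads/writes of a column slice of a large memmap); it is an
# assumed implementation, not the original helpers. Callers that omit the
# column arguments (the *_feat_year variants) would additionally need
# defaults covering the full width.
import numpy as np
from tqdm import tqdm


def get_col_slice(x, start_row_idx, end_row_idx, start_col_idx, end_col_idx):
    # Read a column slice row-chunk by row-chunk to bound peak memory.
    outs, chunk = [], 100000
    for i in tqdm(range(start_row_idx, end_row_idx, chunk)):
        j = min(i + chunk, end_row_idx)
        outs.append(x[i:j, start_col_idx:end_col_idx].copy())
    return np.concatenate(outs, axis=0)


def save_col_slice(x_src, x_dst, start_row_idx, end_row_idx,
                   start_col_idx, end_col_idx):
    # Write a column slice into the destination memmap in row chunks.
    assert x_src.shape[0] == end_row_idx - start_row_idx
    assert x_src.shape[1] == end_col_idx - start_col_idx
    chunk, offset = 100000, start_row_idx
    for i in tqdm(range(0, end_row_idx - start_row_idx, chunk)):
        j = min(i + chunk, end_row_idx - start_row_idx)
        x_dst[offset + i:offset + j, start_col_idx:end_col_idx] = x_src[i:j]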
def prepare_data(self):
    dataset = MAG240MDataset(self.data_dir)
    log.info(dataset.num_authors)
    log.info(dataset.num_papers)
    path = f'{dataset.dir}/author_feat.npy'
    t = time.perf_counter()
    if not osp.exists(path):
        log.info('get author_feat...')
        paper_feat = dataset.paper_feat

        # paper -> author edges
        edge_index = dataset.edge_index('author', 'writes', 'paper')
        edge_index = edge_index.T
        row, col = edge_index[:, 0], edge_index[:, 1]
        edge_index = np.stack([col, row], axis=1)
        log.info(edge_index.shape)
        author_graph = Graph(edge_index, num_nodes=dataset.num_authors)
        author_graph.tensor()
        log.info('finish author graph')

        author_x = np.memmap(path, dtype=np.float16, mode='w+',
                             shape=(dataset.num_authors, self.num_features))

        dim_chunk_size = 64
        # In-degree of each author; the epsilon avoids division by zero
        # for authors without papers.
        degree = paddle.zeros(shape=[dataset.num_authors, 1], dtype='float32')
        degree += 1e-10
        temp_one = paddle.ones(shape=[edge_index.shape[0], 1], dtype='float32')
        degree = scatter(degree, author_graph.edges[:, 1], temp_one,
                         overwrite=False)
        log.info('finish degree')

        # Process 64-dim column slices at a time for memory efficiency.
        for i in tqdm(range(0, self.num_features, dim_chunk_size)):
            j = min(i + dim_chunk_size, self.num_features)
            inputs = get_col_slice(paper_feat, start_row_idx=0,
                                   end_row_idx=dataset.num_papers,
                                   start_col_idx=i, end_col_idx=j)
            inputs = paddle.to_tensor(inputs, dtype='float32')
            outputs = author_graph.send_recv(inputs)
            outputs = outputs / degree
            outputs = outputs.astype('float16').numpy()
            del inputs
            save_col_slice(x_src=outputs, x_dst=author_x,
                           start_row_idx=0, end_row_idx=dataset.num_authors,
                           start_col_idx=i, end_col_idx=j)
            del outputs

        author_x.flush()
        del author_x
        log.info(f'Done! [{time.perf_counter() - t:.2f}s]')
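# Illustrative NumPy check (toy sizes, not part of the pipeline) that the
# degree-normalized send_recv above computes a mean over neighbors:
# summed paper features divided by each author's in-degree.
import numpy as np

edges = np.array([[0, 0], [1, 0], [2, 1]])  # (paper, author) pairs
feat = np.array([[2.0], [4.0], [5.0]])      # one feature per paper
summed = np.zeros((2, 1))
degree = np.full((2, 1), 1e-10)             # same epsilon as above
np.add.at(summed, edges[:, 1], feat[edges[:, 0]])
np.add.at(degree, edges[:, 1], 1.0)
print(summed / degree)  # author 0 -> 3.0 (mean of 2, 4); author 1 -> 5.0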