def prepare_data(self):
    """Build (once) the symmetric paper-citation graph, then load the data.

    If ``<dataset.dir>/paper_to_paper_symmetric_pgl`` does not exist, the
    ('paper', 'cites', 'paper') edges are loaded, every edge is duplicated
    in the reverse direction, duplicate rows are removed and the resulting
    graph is dumped to that path.

    Afterwards NumPy's global RNG is seeded with ``self.seed`` and the
    following attributes are set on ``self``:

    * ``train_idx`` (shuffled in place), ``val_idx``, ``test_idx`` -- the
      'train' / 'valid' / 'test' index splits of the dataset;
    * ``x`` -- ``dataset.paper_feat``;
    * ``y`` -- ``dataset.all_paper_label``;
    * ``graph`` -- the dumped graph, loaded with ``mmap_mode='r+'``.
    """
    dataset = MAG240MDataset(self.data_dir)
    edge_path = f'{dataset.dir}/paper_to_paper_symmetric_pgl'

    t = time.perf_counter()
    if not osp.exists(edge_path):
        log.info('Converting adjacency matrix...')
        # After the transpose each row is one edge (columns 0 and 1 are
        # its two endpoints) -- assumes the dataset returns a
        # (2, num_edges) array; TODO confirm.
        forward = dataset.edge_index('paper', 'cites', 'paper').T
        # Stack a reversed *view* instead of filling an np.zeros buffer:
        # np.zeros defaults to float64, which silently promoted every
        # node id to float and cost one extra edge-sized allocation.
        edge_index = np.vstack((forward, forward[:, ::-1]))
        edge_index = np.unique(edge_index, axis=0)
        graph = Graph(edge_index, sorted=True)
        # Bare attribute access kept from the original; presumably it
        # forces the lazily built destination index to exist before the
        # dump -- verify against the Graph implementation.
        graph.adj_dst_index
        graph.dump(edge_path)
        log.info(f'Done! [{time.perf_counter() - t:.2f}s]')

    np.random.seed(self.seed)
    self.train_idx = dataset.get_idx_split('train')
    np.random.shuffle(self.train_idx)
    self.val_idx = dataset.get_idx_split('valid')
    self.test_idx = dataset.get_idx_split('test')

    self.x = dataset.paper_feat
    self.y = dataset.all_paper_label
    self.graph = Graph.load(edge_path, mmap_mode='r+')
    log.info(f'Done! [{time.perf_counter() - t:.2f}s]')
def prepare_data(self):
    """Create any missing on-disk caches and load everything onto ``self``.

    Caches, each built only when its path does not exist yet
    (all under ``dataset.dir``):

    * ``paper_to_paper_symmetric_pgl_split`` -- paper-cites-paper edges
      plus their reverses, ``edge_type`` 0;
    * ``paper_to_author_symmetric_pgl_split_src`` / ``..._dst`` --
      author-writes-paper edges in the stored and the swapped direction,
      ``edge_type`` 1 and 2;
    * ``institution_edge_symmetric_pgl_split_src`` / ``..._dst`` --
      author-institution edges in the stored and the swapped direction,
      ``edge_type`` 3 and 4;
    * ``full_feat.npy`` -- float16 matrix with paper, author and
      institution feature rows stacked in that order. Requires
      ``author_feat.npy`` and ``institution_feat.npy`` to exist already.

    Node ids share one index space: the first endpoint of author edges is
    shifted by ``num_papers``; for institution edges the first endpoint is
    shifted by ``num_papers`` and the second by
    ``num_papers + num_authors``.

    Attributes set on ``self``: ``train_idx``, ``val_idx``,
    ``val_idx_cv`` (loaded from ``self.valid_path/self.valid_name``),
    ``test_idx``, ``x``, ``id_x`` (read from ``self.m2v_file``), ``y``,
    ``graph`` (list of the five graphs, in the order above), ``pos``
    (a 200 x 768 sinusoid table), ``year`` (read from
    ``all_feat_year.npy``, which this method does not create),
    ``num_papers``, ``train_idx_label`` and ``train_idx_data``
    (both ``None``).
    """
    dataset = MAG240MDataset(self.data_dir)
    graph_file_list = []

    def load_author_paper_edges():
        """Return both endpoint columns of the author-writes-paper edges."""
        log.info('Converting author matrix...')
        log.info('adding author edges')
        edge_index = dataset.edge_index('author', 'writes', 'paper').T
        row, col = edge_index[:, 0], edge_index[:, 1]
        log.info(row[:10])
        # In-place shift on a view of the freshly loaded array: avoids
        # copying the whole edge list.
        row += dataset.num_papers
        return row, col

    def load_author_institution_edges():
        """Return both endpoint columns of the author-institution edges."""
        log.info('Converting institution matrix...')
        log.info('adding institution edges')
        edge_index = dataset.edge_index('author', 'institution').T
        row, col = edge_index[:, 0], edge_index[:, 1]
        log.info(row[:10])
        row += dataset.num_papers
        col += dataset.num_papers + dataset.num_authors
        log.info('building edge type')
        return row, col

    def dump_typed_graph(path, src, dst, edge_type):
        """Dump a graph of (src, dst) edges all tagged with ``edge_type``."""
        edge_types = np.full(src.shape, edge_type, dtype='int32')
        edge_index = np.stack([src, dst], axis=1)
        graph = Graph(edge_index, edge_feat={'edge_type': edge_types})
        # Bare attribute access kept from the original; presumably forces
        # the destination index to be built before dumping -- verify.
        graph.adj_dst_index
        graph.dump(path)

    node_chunk_size = 100000

    def copy_rows(dst, src, offset, count, message):
        """Copy ``src[0:count]`` to ``dst[offset:offset + count]`` in chunks."""
        print(message)
        stop = offset + count
        for i in tqdm(range(offset, stop, node_chunk_size)):
            j = min(i + node_chunk_size, stop)
            dst[i:j] = src[i - offset:j - offset]

    # ---- paper <-> paper -------------------------------------------------
    paper_edge_path = f'{dataset.dir}/paper_to_paper_symmetric_pgl_split'
    graph_file_list.append(paper_edge_path)
    t = time.perf_counter()
    if not osp.exists(paper_edge_path):
        log.info('Converting adjacency matrix...')
        forward = dataset.edge_index('paper', 'cites', 'paper').T
        # Reversed view keeps the integer dtype; the former np.zeros
        # buffer was float64 and promoted every node id to float.
        edge_index = np.vstack((forward, forward[:, ::-1]))
        edge_types = np.full([edge_index.shape[0]], 0, dtype='int32')
        graph = Graph(edge_index,
                      num_nodes=dataset.num_papers,
                      edge_feat={'edge_type': edge_types})
        graph.adj_dst_index
        graph.dump(paper_edge_path)
        log.info(f'Done! [{time.perf_counter() - t:.2f}s]')

    # ---- author / paper, stored direction --------------------------------
    author_edge_path = (
        f'{dataset.dir}/paper_to_author_symmetric_pgl_split_src')
    graph_file_list.append(author_edge_path)
    t = time.perf_counter()
    if not osp.exists(author_edge_path):
        row, col = load_author_paper_edges()
        dump_typed_graph(author_edge_path, row, col, 1)
        log.info(
            f'Done! finish author_edge [{time.perf_counter() - t:.2f}s]')

    # ---- author / paper, swapped direction -------------------------------
    author_edge_path = (
        f'{dataset.dir}/paper_to_author_symmetric_pgl_split_dst')
    graph_file_list.append(author_edge_path)
    t = time.perf_counter()
    if not osp.exists(author_edge_path):
        row, col = load_author_paper_edges()
        dump_typed_graph(author_edge_path, col, row, 2)
        log.info(
            f'Done! finish author_edge [{time.perf_counter() - t:.2f}s]')

    # ---- author / institution, stored direction --------------------------
    institution_edge_path = (
        f'{dataset.dir}/institution_edge_symmetric_pgl_split_src')
    graph_file_list.append(institution_edge_path)
    t = time.perf_counter()
    if not osp.exists(institution_edge_path):
        row, col = load_author_institution_edges()
        dump_typed_graph(institution_edge_path, row, col, 3)
        log.info(
            f'Done! finish institution_edge [{time.perf_counter() - t:.2f}s]'
        )

    # ---- author / institution, swapped direction -------------------------
    institution_edge_path = (
        f'{dataset.dir}/institution_edge_symmetric_pgl_split_dst')
    graph_file_list.append(institution_edge_path)
    t = time.perf_counter()
    if not osp.exists(institution_edge_path):
        row, col = load_author_institution_edges()
        dump_typed_graph(institution_edge_path, col, row, 4)
        log.info(
            f'Done! finish institution_edge [{time.perf_counter() - t:.2f}s]'
        )

    # ---- stacked feature matrix ------------------------------------------
    path = f'{dataset.dir}/full_feat.npy'
    author_feat_path = f'{dataset.dir}/author_feat.npy'
    institution_feat_path = f'{dataset.dir}/institution_feat.npy'
    t = time.perf_counter()
    if not osp.exists(path):  # Will take ~3 hours...
        print('Generating full feature matrix...')
        N = (dataset.num_papers + dataset.num_authors +
             dataset.num_institutions)

        # Inputs are opened before the output is created, so a missing
        # input fails before an empty output file appears on disk.
        paper_feat = dataset.paper_feat
        author_feat = np.memmap(author_feat_path,
                                dtype=np.float16,
                                shape=(dataset.num_authors,
                                       self.num_features),
                                mode='r')
        institution_feat = np.memmap(institution_feat_path,
                                     dtype=np.float16,
                                     shape=(dataset.num_institutions,
                                            self.num_features),
                                     mode='r')
        x = np.memmap(path,
                      dtype=np.float16,
                      mode='w+',
                      shape=(N, self.num_features))

        copy_rows(x, paper_feat, 0, dataset.num_papers,
                  'Copying paper features...')
        del paper_feat

        copy_rows(x, author_feat, dataset.num_papers, dataset.num_authors,
                  'Copying author feature...')
        del author_feat

        copy_rows(x, institution_feat,
                  dataset.num_papers + dataset.num_authors,
                  dataset.num_institutions,
                  'Copying institution feature...')
        del institution_feat

        x.flush()
        del x
        print(f'Done! [{time.perf_counter() - t:.2f}s]')

    # ---- splits ----------------------------------------------------------
    np.random.seed(self.seed)
    self.train_idx = dataset.get_idx_split('train')
    self.val_idx = dataset.get_idx_split('valid')
    valid_name = os.path.join(self.valid_path, self.valid_name)
    self.val_idx_cv = np.load(valid_name)
    log.info(self.train_idx.shape)
    log.info(self.val_idx.shape)
    log.info(self.val_idx_cv.shape)
    self.test_idx = dataset.get_idx_split('test')

    def get_sinusoid_encoding_table(n_position, d_hid, padding_idx=None):
        """Return an (n_position, d_hid) sin/cos table.

        ``padding_idx`` is accepted but not used.
        """

        def cal_angle(position, hid_idx):
            return position / np.power(10000, 2 * (hid_idx // 2) / d_hid)

        def get_posi_angle_vec(position):
            return [cal_angle(position, hid_j) for hid_j in range(d_hid)]

        sinusoid_table = np.array(
            [get_posi_angle_vec(pos_i) for pos_i in range(n_position)])
        sinusoid_table[:, 0::2] = np.sin(sinusoid_table[:, 0::2])  # dim 2i
        sinusoid_table[:, 1::2] = np.cos(sinusoid_table[:, 1::2])  # dim 2i+1
        return sinusoid_table

    # ---- memory-mapped arrays and graphs ---------------------------------
    N = dataset.num_papers + dataset.num_authors + dataset.num_institutions
    self.x = np.memmap(f'{dataset.dir}/full_feat.npy',
                       dtype=np.float16,
                       mode='r',
                       shape=(N, self.num_features))
    self.id_x = np.memmap(f'{dataset.dir}/{self.m2v_file}',
                          dtype=np.float16,
                          mode='r',
                          shape=(N, self.m2v_dim))
    self.y = dataset.all_paper_label
    self.graph = [
        Graph.load(edge_path, mmap_mode='r+')
        for edge_path in graph_file_list
    ]
    self.pos = get_sinusoid_encoding_table(200, 768)

    year_file = f'{dataset.dir}/all_feat_year.npy'
    self.year = np.memmap(year_file, dtype=np.int32, mode='r', shape=(N, ))
    self.num_papers = dataset.num_papers
    self.train_idx_label = None
    self.train_idx_data = None
    # Elapsed time here is measured from the start of the feature step.
    log.info(f'Done! [{time.perf_counter() - t:.2f}s]')
def prepare_data(self):
    """Create every missing on-disk cache; nothing is stored on ``self``.

    Each cache lives under ``dataset.dir`` and is built only when its
    path does not exist yet:

    * ``paper_to_paper_symmetric_pgl_split`` -- paper-cites-paper edges
      plus their reverses, ``edge_type`` 0;
    * ``paper_to_author_symmetric_pgl_split_src`` / ``..._dst`` --
      author-writes-paper edges in the stored and the swapped direction,
      ``edge_type`` 1 and 2;
    * ``institution_edge_symmetric_pgl_split_src`` / ``..._dst`` --
      author-institution edges in the stored and the swapped direction,
      ``edge_type`` 3 and 4;
    * ``full_feat.npy`` -- float16 matrix with paper, author and
      institution feature rows stacked in that order (needs
      ``author_feat.npy`` and ``institution_feat.npy``);
    * ``all_feat_year.npy`` -- int32 vector with paper, author and
      institution years stacked in that order (needs
      ``author_feat_year.npy`` and ``institution_feat_year.npy``).

    Node ids share one index space: the first endpoint of author edges is
    shifted by ``num_papers``; for institution edges the first endpoint is
    shifted by ``num_papers`` and the second by
    ``num_papers + num_authors``.
    """
    dataset = MAG240MDataset(self.data_dir)
    graph_file_list = []

    def load_author_paper_edges():
        """Return both endpoint columns of the author-writes-paper edges."""
        log.info('Converting author matrix...')
        log.info('adding author edges')
        edge_index = dataset.edge_index('author', 'writes', 'paper').T
        row, col = edge_index[:, 0], edge_index[:, 1]
        log.info(row[:10])
        # In-place shift on a view of the freshly loaded array: avoids
        # copying the whole edge list.
        row += dataset.num_papers
        return row, col

    def load_author_institution_edges():
        """Return both endpoint columns of the author-institution edges."""
        log.info('Converting institution matrix...')
        log.info('adding institution edges')
        edge_index = dataset.edge_index('author', 'institution').T
        row, col = edge_index[:, 0], edge_index[:, 1]
        log.info(row[:10])
        row += dataset.num_papers
        col += dataset.num_papers + dataset.num_authors
        log.info('building edge type')
        return row, col

    def dump_typed_graph(path, src, dst, edge_type):
        """Dump a graph of (src, dst) edges all tagged with ``edge_type``."""
        edge_types = np.full(src.shape, edge_type, dtype='int32')
        edge_index = np.stack([src, dst], axis=1)
        graph = Graph(edge_index, edge_feat={'edge_type': edge_types})
        # Bare attribute access kept from the original; presumably forces
        # the destination index to be built before dumping -- verify.
        graph.adj_dst_index
        graph.dump(path)

    node_chunk_size = 100000

    def copy_rows(dst, src, offset, count, message):
        """Copy ``src[0:count]`` to ``dst[offset:offset + count]`` in chunks."""
        print(message)
        stop = offset + count
        for i in tqdm(range(offset, stop, node_chunk_size)):
            j = min(i + node_chunk_size, stop)
            dst[i:j] = src[i - offset:j - offset]

    # ---- paper <-> paper -------------------------------------------------
    paper_edge_path = f'{dataset.dir}/paper_to_paper_symmetric_pgl_split'
    graph_file_list.append(paper_edge_path)
    t = time.perf_counter()
    if not osp.exists(paper_edge_path):
        log.info('Converting adjacency matrix...')
        forward = dataset.edge_index('paper', 'cites', 'paper').T
        # Reversed view keeps the integer dtype; the former np.zeros
        # buffer was float64 and promoted every node id to float.
        edge_index = np.vstack((forward, forward[:, ::-1]))
        edge_types = np.full([edge_index.shape[0]], 0, dtype='int32')
        graph = Graph(edge_index,
                      num_nodes=dataset.num_papers,
                      edge_feat={'edge_type': edge_types})
        graph.adj_dst_index
        graph.dump(paper_edge_path)
        log.info(f'Done! [{time.perf_counter() - t:.2f}s]')

    # ---- author / paper, stored direction --------------------------------
    author_edge_path = (
        f'{dataset.dir}/paper_to_author_symmetric_pgl_split_src')
    graph_file_list.append(author_edge_path)
    t = time.perf_counter()
    if not osp.exists(author_edge_path):
        row, col = load_author_paper_edges()
        dump_typed_graph(author_edge_path, row, col, 1)
        log.info(
            f'Done! finish author_edge [{time.perf_counter() - t:.2f}s]')

    # ---- author / paper, swapped direction -------------------------------
    author_edge_path = (
        f'{dataset.dir}/paper_to_author_symmetric_pgl_split_dst')
    graph_file_list.append(author_edge_path)
    t = time.perf_counter()
    if not osp.exists(author_edge_path):
        row, col = load_author_paper_edges()
        dump_typed_graph(author_edge_path, col, row, 2)
        log.info(
            f'Done! finish author_edge [{time.perf_counter() - t:.2f}s]')

    # ---- author / institution, stored direction --------------------------
    institution_edge_path = (
        f'{dataset.dir}/institution_edge_symmetric_pgl_split_src')
    graph_file_list.append(institution_edge_path)
    t = time.perf_counter()
    if not osp.exists(institution_edge_path):
        row, col = load_author_institution_edges()
        dump_typed_graph(institution_edge_path, row, col, 3)
        log.info(
            f'Done! finish institution_edge [{time.perf_counter() - t:.2f}s]'
        )

    # ---- author / institution, swapped direction -------------------------
    institution_edge_path = (
        f'{dataset.dir}/institution_edge_symmetric_pgl_split_dst')
    graph_file_list.append(institution_edge_path)
    t = time.perf_counter()
    if not osp.exists(institution_edge_path):
        row, col = load_author_institution_edges()
        dump_typed_graph(institution_edge_path, col, row, 4)
        log.info(
            f'Done! finish institution_edge [{time.perf_counter() - t:.2f}s]'
        )

    # ---- stacked feature matrix ------------------------------------------
    path = f'{dataset.dir}/full_feat.npy'
    author_feat_path = f'{dataset.dir}/author_feat.npy'
    institution_feat_path = f'{dataset.dir}/institution_feat.npy'
    t = time.perf_counter()
    if not osp.exists(path):  # Will take ~3 hours...
        print('Generating full feature matrix...')
        N = (dataset.num_papers + dataset.num_authors +
             dataset.num_institutions)

        # Inputs are opened before the output is created, so a missing
        # input fails before an empty output file appears on disk.
        paper_feat = dataset.paper_feat
        author_feat = np.memmap(author_feat_path,
                                dtype=np.float16,
                                shape=(dataset.num_authors,
                                       self.num_features),
                                mode='r')
        institution_feat = np.memmap(institution_feat_path,
                                     dtype=np.float16,
                                     shape=(dataset.num_institutions,
                                            self.num_features),
                                     mode='r')
        x = np.memmap(path,
                      dtype=np.float16,
                      mode='w+',
                      shape=(N, self.num_features))

        copy_rows(x, paper_feat, 0, dataset.num_papers,
                  'Copying paper features...')
        del paper_feat

        copy_rows(x, author_feat, dataset.num_papers, dataset.num_authors,
                  'Copying author feature...')
        del author_feat

        copy_rows(x, institution_feat,
                  dataset.num_papers + dataset.num_authors,
                  dataset.num_institutions,
                  'Copying institution feature...')
        del institution_feat

        x.flush()
        del x
        print(f'feature x Done! [{time.perf_counter() - t:.2f}s]')

    # ---- stacked year vector ---------------------------------------------
    path = f'{dataset.dir}/all_feat_year.npy'
    author_year_path = f'{dataset.dir}/author_feat_year.npy'
    institution_year_path = f'{dataset.dir}/institution_feat_year.npy'
    t = time.perf_counter()
    if not osp.exists(path):
        print('Generating full year matrix...')
        N = (dataset.num_papers + dataset.num_authors +
             dataset.num_institutions)

        paper_year_feat = dataset.all_paper_year
        # One-element tuples spell out the 1-D shape that the original
        # bare-int ``shape=(n)`` arguments produced.
        author_year_feat = np.memmap(author_year_path,
                                     dtype=np.int32,
                                     shape=(dataset.num_authors, ),
                                     mode='r')
        institution_year_feat = np.memmap(institution_year_path,
                                          dtype=np.int32,
                                          shape=(dataset.num_institutions, ),
                                          mode='r')
        x = np.memmap(path, dtype=np.int32, mode='w+', shape=(N, ))

        # The progress messages are shared with the feature step above.
        copy_rows(x, paper_year_feat, 0, dataset.num_papers,
                  'Copying paper features...')
        del paper_year_feat

        copy_rows(x, author_year_feat, dataset.num_papers,
                  dataset.num_authors, 'Copying author feature...')
        del author_year_feat

        copy_rows(x, institution_year_feat,
                  dataset.num_papers + dataset.num_authors,
                  dataset.num_institutions,
                  'Copying institution feature...')
        del institution_year_feat

        x.flush()
        del x
        print(f'year feature Done! [{time.perf_counter() - t:.2f}s]')
def prepare_data(self):
    """Build (once) the merged heterogeneous graph and features, then load.

    Caches, each built only when its path does not exist yet
    (all under ``dataset.dir``):

    * ``paper_to_paper_symmetric_pgl`` -- paper-cites-paper edges plus
      their reverses (duplicates are not removed);
    * ``full_edge_symmetric_pgl`` -- one graph over papers, authors and
      institutions. It concatenates, in this order, the paper graph's
      edges (``edge_type`` 0), the author-writes-paper edges in the
      stored and swapped direction (1 and 2) and the author-institution
      edges in the stored and swapped direction (3 and 4);
    * ``full_feat.npy`` -- float16 matrix with paper, author and
      institution feature rows stacked in that order (needs
      ``author_feat.npy`` and ``institution_feat.npy``).

    Node ids share one index space: the first endpoint of author edges is
    shifted by ``num_papers``; for institution edges the first endpoint is
    shifted by ``num_papers`` and the second by
    ``num_papers + num_authors``.

    Afterwards NumPy's global RNG is seeded with ``self.seed`` and
    ``train_idx`` (shuffled in place), ``val_idx``, ``test_idx``, ``x``,
    ``y`` and ``graph`` are set on ``self``; the graph's ``edge_type``
    feature is converted to int32 after loading.
    """
    dataset = MAG240MDataset(self.data_dir)

    node_chunk_size = 100000

    def copy_rows(dst, src, offset, count, message):
        """Copy ``src[0:count]`` to ``dst[offset:offset + count]`` in chunks."""
        print(message)
        stop = offset + count
        for i in tqdm(range(offset, stop, node_chunk_size)):
            j = min(i + node_chunk_size, stop)
            dst[i:j] = src[i - offset:j - offset]

    # ---- paper <-> paper -------------------------------------------------
    paper_edge_path = f'{dataset.dir}/paper_to_paper_symmetric_pgl'
    t = time.perf_counter()
    if not osp.exists(paper_edge_path):
        log.info('Converting adjacency matrix...')
        forward = dataset.edge_index('paper', 'cites', 'paper').T
        # Reversed view keeps the integer dtype; the former np.zeros
        # buffer was float64 and promoted every node id to float, which
        # could then leak into the concatenation with the integer
        # author/institution ids below.
        edge_index = np.vstack((forward, forward[:, ::-1]))
        graph = Graph(edge_index)
        # Bare attribute access kept from the original; presumably forces
        # the destination index to be built before dumping -- verify.
        graph.adj_dst_index
        graph.dump(paper_edge_path)
        log.info(f'Done! [{time.perf_counter() - t:.2f}s]')

    # ---- merged heterogeneous graph --------------------------------------
    edge_path = f'{dataset.dir}/full_edge_symmetric_pgl'
    t = time.perf_counter()
    if not osp.exists(edge_path):
        log.info('Converting adjacency matrix...')

        # paper
        log.info('adding paper edges')
        paper_graph = Graph.load(paper_edge_path, mmap_mode='r+')
        rows = [paper_graph.edges[:, 0]]
        cols = [paper_graph.edges[:, 1]]

        # author
        log.info('adding author edges')
        edge_index = dataset.edge_index('author', 'writes', 'paper').T
        row, col = edge_index[:, 0], edge_index[:, 1]
        row += dataset.num_papers
        # Each relation is appended in both directions.
        rows += [row, col]
        cols += [col, row]

        # institution
        log.info('adding institution edges')
        edge_index = dataset.edge_index('author', 'institution').T
        row, col = edge_index[:, 0], edge_index[:, 1]
        row += dataset.num_papers
        col += dataset.num_papers + dataset.num_authors
        rows += [row, col]
        cols += [col, row]

        # edge_type: the position of a segment in ``rows`` is its type id.
        log.info('building edge type')
        edge_types = np.concatenate(
            [
                np.full(segment.shape, type_id, dtype='int32')
                for type_id, segment in enumerate(rows)
            ],
            axis=0)

        log.info('building edges')
        # Segment lists are dropped as soon as they are merged.
        row = np.concatenate(rows, axis=0)
        del rows
        col = np.concatenate(cols, axis=0)
        del cols

        edge_index = np.stack([row, col], axis=1)
        N = (dataset.num_papers + dataset.num_authors +
             dataset.num_institutions)
        full_graph = Graph(edge_index,
                           num_nodes=N,
                           edge_feat={'edge_type': edge_types})
        full_graph.adj_dst_index
        full_graph.dump(edge_path)
        log.info(
            f'Done! finish full_edge [{time.perf_counter() - t:.2f}s]')

    # ---- stacked feature matrix ------------------------------------------
    path = f'{dataset.dir}/full_feat.npy'
    author_feat_path = f'{dataset.dir}/author_feat.npy'
    institution_feat_path = f'{dataset.dir}/institution_feat.npy'
    t = time.perf_counter()
    if not osp.exists(path):  # Will take ~3 hours...
        print('Generating full feature matrix...')
        N = (dataset.num_papers + dataset.num_authors +
             dataset.num_institutions)

        # Inputs are opened before the output is created, so a missing
        # input fails before an empty output file appears on disk.
        paper_feat = dataset.paper_feat
        author_feat = np.memmap(author_feat_path,
                                dtype=np.float16,
                                shape=(dataset.num_authors,
                                       self.num_features),
                                mode='r')
        institution_feat = np.memmap(institution_feat_path,
                                     dtype=np.float16,
                                     shape=(dataset.num_institutions,
                                            self.num_features),
                                     mode='r')
        x = np.memmap(path,
                      dtype=np.float16,
                      mode='w+',
                      shape=(N, self.num_features))

        copy_rows(x, paper_feat, 0, dataset.num_papers,
                  'Copying paper features...')
        del paper_feat

        copy_rows(x, author_feat, dataset.num_papers, dataset.num_authors,
                  'Copying author feature...')
        del author_feat

        copy_rows(x, institution_feat,
                  dataset.num_papers + dataset.num_authors,
                  dataset.num_institutions,
                  'Copying institution feature...')
        del institution_feat

        x.flush()
        del x
        print(f'Done! [{time.perf_counter() - t:.2f}s]')

    # ---- splits and loaded arrays ----------------------------------------
    np.random.seed(self.seed)
    self.train_idx = dataset.get_idx_split('train')
    np.random.shuffle(self.train_idx)
    self.val_idx = dataset.get_idx_split('valid')
    self.test_idx = dataset.get_idx_split('test')

    N = dataset.num_papers + dataset.num_authors + dataset.num_institutions
    self.x = np.memmap(f'{dataset.dir}/full_feat.npy',
                       dtype=np.float16,
                       mode='r',
                       shape=(N, self.num_features))
    self.y = dataset.all_paper_label
    self.graph = Graph.load(edge_path, mmap_mode='r+')
    # Reaches into the graph's private feature dict to convert the loaded
    # edge types to int32.
    self.graph._edge_feat['edge_type'] = self.graph._edge_feat[
        'edge_type'].astype('int32')
    # Elapsed time here is measured from the start of the feature step.
    log.info(f'Done! [{time.perf_counter() - t:.2f}s]')