def read_graph_pyg(raw_dir, add_inverse_edge=False, additional_node_files=[], additional_edge_files=[], binary=False):

    if binary:
        # npz
        graph_list = read_binary_graph_raw(raw_dir, add_inverse_edge)
    else:
        # csv
        graph_list = read_csv_graph_raw(raw_dir, add_inverse_edge,
                                        additional_node_files=additional_node_files,
                                        additional_edge_files=additional_edge_files)

    pyg_graph_list = []

    print('Converting graphs into PyG objects...')

    for graph in tqdm(graph_list):
        g = Data()
        g.__num_nodes__ = graph['num_nodes']
        g.edge_index = torch.from_numpy(graph['edge_index'])

        del graph['num_nodes']
        del graph['edge_index']

        if graph['edge_feat'] is not None:
            g.edge_attr = torch.from_numpy(graph['edge_feat'])
            del graph['edge_feat']

        if graph['node_feat'] is not None:
            g.x = torch.from_numpy(graph['node_feat'])
            del graph['node_feat']

        for key in additional_node_files:
            g[key] = torch.from_numpy(graph[key])
            del graph[key]

        for key in additional_edge_files:
            g[key] = torch.from_numpy(graph[key])
            del graph[key]

        # DAGNN: attach bi-directional layer (topological order) indices.
        add_order_info_01(g)

        # Length of the longest path in the DAG: layer ids start at 0, so the
        # maximum layer id already equals the path length (no -1 needed).
        g.len_longest_path = float(torch.max(g._bi_layer_idx0).item())

        pyg_graph_list.append(g)

    return pyg_graph_list
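# --- Usage sketch (illustrative, not part of the library) ---
# Assumes an OGB-style raw folder (edge.csv.gz, num-node-list.csv.gz,
# num-edge-list.csv.gz, ...); the path below is hypothetical.
#
#     graphs = read_graph_pyg('dataset/my_dag_dataset/raw', add_inverse_edge=False)
#     g = graphs[0]
#     # each Data object now carries DAGNN layer indices and the DAG depth
#     print(g.num_nodes, g.edge_index.shape, g.len_longest_path)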
def read_graph_pyg(raw_dir, add_inverse_edge=False, additional_node_files=[], additional_edge_files=[], binary=False):

    if binary:
        # npz
        graph_list = read_binary_graph_raw(raw_dir, add_inverse_edge)
    else:
        # csv
        graph_list = read_csv_graph_raw(raw_dir, add_inverse_edge,
                                        additional_node_files=additional_node_files,
                                        additional_edge_files=additional_edge_files)

    pyg_graph_list = []

    print('Converting graphs into PyG objects...')

    for graph in tqdm(graph_list):
        g = Data()
        g.__num_nodes__ = graph['num_nodes']
        g.edge_index = torch.from_numpy(graph['edge_index'])

        del graph['num_nodes']
        del graph['edge_index']

        if graph['edge_feat'] is not None:
            g.edge_attr = torch.from_numpy(graph['edge_feat'])
            del graph['edge_feat']

        if graph['node_feat'] is not None:
            g.x = torch.from_numpy(graph['node_feat'])
            del graph['node_feat']

        for key in additional_node_files:
            g[key] = torch.from_numpy(graph[key])
            del graph[key]

        for key in additional_edge_files:
            g[key] = torch.from_numpy(graph[key])
            del graph[key]

        pyg_graph_list.append(g)

    return pyg_graph_list
def read_graph_dgl(raw_dir, add_inverse_edge=False, additional_node_files=[], additional_edge_files=[], binary=False):

    if binary:
        # npz
        graph_list = read_binary_graph_raw(raw_dir, add_inverse_edge)
    else:
        # csv
        graph_list = read_csv_graph_raw(raw_dir, add_inverse_edge,
                                        additional_node_files=additional_node_files,
                                        additional_edge_files=additional_edge_files)

    dgl_graph_list = []

    print('Converting graphs into DGL objects...')

    for graph in tqdm(graph_list):
        g = dgl.graph((graph['edge_index'][0], graph['edge_index'][1]),
                      num_nodes=graph['num_nodes'])

        if graph['edge_feat'] is not None:
            g.edata['feat'] = torch.from_numpy(graph['edge_feat'])

        if graph['node_feat'] is not None:
            g.ndata['feat'] = torch.from_numpy(graph['node_feat'])

        # additional keys are named 'node_*'/'edge_*'; strip the 5-character
        # prefix so the DGL feature name matches the raw file suffix
        for key in additional_node_files:
            g.ndata[key[5:]] = torch.from_numpy(graph[key])

        for key in additional_edge_files:
            g.edata[key[5:]] = torch.from_numpy(graph[key])

        dgl_graph_list.append(g)

    return dgl_graph_list
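# --- Usage sketch (illustrative; the path is hypothetical) ---
# Both readers consume the same raw folder, so PyG and DGL versions of a
# dataset can be built from a single download:
#
#     dgl_graphs = read_graph_dgl('dataset/my_dag_dataset/raw', add_inverse_edge=True)
#     print(dgl_graphs[0].num_nodes(), dgl_graphs[0].num_edges())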
def pre_process(self):
    processed_dir = osp.join(self.root, 'processed')
    pre_processed_file_path = osp.join(processed_dir, 'data_processed')

    if osp.exists(pre_processed_file_path):
        # self.graph = torch.load(pre_processed_file_path, 'rb')
        self.graph = load_pickle(pre_processed_file_path)

    else:
        ### check download
        if self.binary:
            # npz format
            has_necessary_file_simple = osp.exists(osp.join(self.root, 'raw', 'data.npz')) and (not self.is_hetero)
            has_necessary_file_hetero = osp.exists(osp.join(self.root, 'raw', 'edge_index_dict.npz')) and self.is_hetero
        else:
            # csv file
            has_necessary_file_simple = osp.exists(osp.join(self.root, 'raw', 'edge.csv.gz')) and (not self.is_hetero)
            has_necessary_file_hetero = osp.exists(osp.join(self.root, 'raw', 'triplet-type-list.csv.gz')) and self.is_hetero

        has_necessary_file = has_necessary_file_simple or has_necessary_file_hetero

        if not has_necessary_file:
            url = self.meta_info['url']
            if decide_download(url):
                path = download_url(url, self.original_root)
                extract_zip(path, self.original_root)
                os.unlink(path)
                # delete the stale dataset folder if it already exists
                try:
                    shutil.rmtree(self.root)
                except FileNotFoundError:
                    pass
                shutil.move(osp.join(self.original_root, self.download_name), self.root)
            else:
                print('Stop download.')
                exit(-1)

        raw_dir = osp.join(self.root, 'raw')

        ### pre-process and save
        add_inverse_edge = self.meta_info['add_inverse_edge'] == 'True'

        if self.meta_info['additional node files'] == 'None':
            additional_node_files = []
        else:
            additional_node_files = self.meta_info['additional node files'].split(',')

        if self.meta_info['additional edge files'] == 'None':
            additional_edge_files = []
        else:
            additional_edge_files = self.meta_info['additional edge files'].split(',')

        if self.is_hetero:
            if self.binary:
                self.graph = read_binary_heterograph_raw(raw_dir, add_inverse_edge=add_inverse_edge)[0]  # only a single graph
            else:
                self.graph = read_csv_heterograph_raw(raw_dir, add_inverse_edge=add_inverse_edge,
                                                      additional_node_files=additional_node_files,
                                                      additional_edge_files=additional_edge_files)[0]  # only a single graph
        else:
            if self.binary:
                self.graph = read_binary_graph_raw(raw_dir, add_inverse_edge=add_inverse_edge)[0]  # only a single graph
            else:
                self.graph = read_csv_graph_raw(raw_dir, add_inverse_edge=add_inverse_edge,
                                                additional_node_files=additional_node_files,
                                                additional_edge_files=additional_edge_files)[0]  # only a single graph

        print('Saving...')
        # torch.save(self.graph, pre_processed_file_path, pickle_protocol=4)
        dump_pickle(self.graph, pre_processed_file_path)
def pre_process(self):
    processed_dir = osp.join(self.root, 'processed')
    pre_processed_file_path = osp.join(processed_dir, 'data_processed')

    if osp.exists(pre_processed_file_path):
        # loaded_dict = torch.load(pre_processed_file_path)
        loaded_dict = load_pickle(pre_processed_file_path)
        self.graph, self.labels = loaded_dict['graph'], loaded_dict['labels']

    else:
        ### check download
        if self.binary:
            # npz format
            has_necessary_file_simple = osp.exists(osp.join(self.root, 'raw', 'data.npz')) and (not self.is_hetero)
            has_necessary_file_hetero = osp.exists(osp.join(self.root, 'raw', 'edge_index_dict.npz')) and self.is_hetero
        else:
            # csv file
            has_necessary_file_simple = osp.exists(osp.join(self.root, 'raw', 'edge.csv.gz')) and (not self.is_hetero)
            has_necessary_file_hetero = osp.exists(osp.join(self.root, 'raw', 'triplet-type-list.csv.gz')) and self.is_hetero

        has_necessary_file = has_necessary_file_simple or has_necessary_file_hetero

        if not has_necessary_file:
            url = self.meta_info['url']
            if decide_download(url):
                path = download_url(url, self.original_root)
                extract_zip(path, self.original_root)
                os.unlink(path)
                # delete the stale dataset folder if it already exists
                try:
                    shutil.rmtree(self.root)
                except FileNotFoundError:
                    pass
                shutil.move(osp.join(self.original_root, self.download_name), self.root)
            else:
                print('Stop download.')
                exit(-1)

        raw_dir = osp.join(self.root, 'raw')

        ### pre-process and save
        add_inverse_edge = self.meta_info['add_inverse_edge'] == 'True'

        if self.meta_info['additional node files'] == 'None':
            additional_node_files = []
        else:
            additional_node_files = self.meta_info['additional node files'].split(',')

        if self.meta_info['additional edge files'] == 'None':
            additional_edge_files = []
        else:
            additional_edge_files = self.meta_info['additional edge files'].split(',')

        if self.is_hetero:
            if self.binary:
                self.graph = read_binary_heterograph_raw(raw_dir, add_inverse_edge=add_inverse_edge)[0]  # only a single graph
                tmp = np.load(osp.join(raw_dir, 'node-label.npz'))
                self.labels = {}
                for key in list(tmp.keys()):
                    self.labels[key] = tmp[key]
                del tmp
            else:
                self.graph = read_csv_heterograph_raw(raw_dir, add_inverse_edge=add_inverse_edge,
                                                      additional_node_files=additional_node_files,
                                                      additional_edge_files=additional_edge_files)[0]  # only a single graph
                self.labels = read_node_label_hetero(raw_dir)
        else:
            if self.binary:
                self.graph = read_binary_graph_raw(raw_dir, add_inverse_edge=add_inverse_edge)[0]  # only a single graph
                self.labels = np.load(osp.join(raw_dir, 'node-label.npz'))['node_label']
            else:
                self.graph = read_csv_graph_raw(raw_dir, add_inverse_edge=add_inverse_edge,
                                                additional_node_files=additional_node_files,
                                                additional_edge_files=additional_edge_files)[0]  # only a single graph
                self.labels = pd.read_csv(osp.join(raw_dir, 'node-label.csv.gz'),
                                          compression='gzip', header=None).values

        print('Saving...')
        # torch.save({'graph': self.graph, 'labels': self.labels}, pre_processed_file_path, pickle_protocol=4)
        dump_pickle({'graph': self.graph, 'labels': self.labels}, pre_processed_file_path)
def pre_process(self):
    processed_dir = osp.join(self.root, 'processed')
    raw_dir = osp.join(self.root, 'raw')
    pre_processed_file_path = osp.join(processed_dir, 'data_processed')

    if osp.exists(pre_processed_file_path):
        # note: torch.load takes no file-mode argument; the stray 'rb' was a bug
        loaded_dict = torch.load(pre_processed_file_path)
        self.graphs, self.labels = loaded_dict['graphs'], loaded_dict['labels']

    else:
        ### check download
        if self.binary:
            # npz format
            has_necessary_file = osp.exists(osp.join(self.root, 'raw', 'data.npz'))
        else:
            # csv file
            has_necessary_file = osp.exists(osp.join(self.root, 'raw', 'edge.csv.gz'))

        ### download
        if not has_necessary_file:
            url = self.meta_info['url']
            if decide_download(url):
                path = download_url(url, self.original_root)
                extract_zip(path, self.original_root)
                os.unlink(path)
                # delete the stale dataset folder if it already exists
                try:
                    shutil.rmtree(self.root)
                except FileNotFoundError:
                    pass
                shutil.move(osp.join(self.original_root, self.download_name), self.root)
            else:
                print('Stop download.')
                exit(-1)

        ### preprocess
        add_inverse_edge = self.meta_info['add_inverse_edge'] == 'True'

        if self.meta_info['additional node files'] == 'None':
            additional_node_files = []
        else:
            additional_node_files = self.meta_info['additional node files'].split(',')

        if self.meta_info['additional edge files'] == 'None':
            additional_edge_files = []
        else:
            additional_edge_files = self.meta_info['additional edge files'].split(',')

        if self.binary:
            self.graphs = read_binary_graph_raw(raw_dir, add_inverse_edge=add_inverse_edge)
        else:
            self.graphs = read_csv_graph_raw(raw_dir, add_inverse_edge=add_inverse_edge,
                                             additional_node_files=additional_node_files,
                                             additional_edge_files=additional_edge_files)

        if self.task_type == 'subtoken prediction':
            labels_joined = pd.read_csv(osp.join(raw_dir, 'graph-label.csv.gz'),
                                        compression='gzip', header=None).values
            # need to split each element into subtokens
            self.labels = [str(labels_joined[i][0]).split(' ') for i in range(len(labels_joined))]
        else:
            if self.binary:
                self.labels = np.load(osp.join(raw_dir, 'graph-label.npz'))['graph_label']
            else:
                self.labels = pd.read_csv(osp.join(raw_dir, 'graph-label.csv.gz'),
                                          compression='gzip', header=None).values

        print('Saving...')
        torch.save({'graphs': self.graphs, 'labels': self.labels},
                   pre_processed_file_path, pickle_protocol=4)
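# --- Illustrative consumer of the cached file (hypothetical path) ---
# The saved dict pairs graphs and labels index-by-index:
#
#     loaded = torch.load('dataset/my_dataset/processed/data_processed')
#     assert len(loaded['graphs']) == len(loaded['labels'])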
def _save_graph_list_homo(self, graph_list):

    dict_keys = graph_list[0].keys()
    # check necessary keys
    if 'edge_index' not in dict_keys:
        raise RuntimeError('edge_index needs to be provided in graph objects')
    if 'num_nodes' not in dict_keys:
        raise RuntimeError('num_nodes needs to be provided in graph objects')

    print(dict_keys)

    data_dict = {}
    # Store the following keys
    # - edge_index (necessary)
    # - num_nodes_list (necessary)
    # - num_edges_list (necessary)
    # - node_** (optional, node_feat is the default node features)
    # - edge_** (optional, edge_feat is the default edge features)

    # saving num_nodes_list
    num_nodes_list = np.array([graph['num_nodes'] for graph in graph_list]).astype(np.int64)
    data_dict['num_nodes_list'] = num_nodes_list

    # saving edge_index and num_edges_list
    print('Saving edge_index')
    edge_index = np.concatenate([graph['edge_index'] for graph in graph_list], axis=1).astype(np.int64)
    num_edges_list = np.array([graph['edge_index'].shape[1] for graph in graph_list]).astype(np.int64)
    if edge_index.shape[0] != 2:
        raise RuntimeError('edge_index must have shape (2, num_edges)')

    data_dict['edge_index'] = edge_index
    data_dict['num_edges_list'] = num_edges_list

    for key in dict_keys:
        if key == 'edge_index' or key == 'num_nodes':
            continue
        if graph_list[0][key] is None:
            continue

        if 'node_' == key[:5]:
            # make sure it is saved as np.int64 or np.float32
            dtype = np.int64 if 'int' in str(graph_list[0][key].dtype) else np.float32
            # check num_nodes
            for i in range(len(graph_list)):
                if len(graph_list[i][key]) != num_nodes_list[i]:
                    raise RuntimeError(f'num_nodes mismatches with {key}')

            cat_feat = np.concatenate([graph[key] for graph in graph_list], axis=0).astype(dtype)
            data_dict[key] = cat_feat

        elif 'edge_' == key[:5]:
            # make sure it is saved as np.int64 or np.float32
            dtype = np.int64 if 'int' in str(graph_list[0][key].dtype) else np.float32
            # check num_edges
            for i in range(len(graph_list)):
                if len(graph_list[i][key]) != num_edges_list[i]:
                    raise RuntimeError(f'num_edges mismatches with {key}')

            cat_feat = np.concatenate([graph[key] for graph in graph_list], axis=0).astype(dtype)
            data_dict[key] = cat_feat

        else:
            raise RuntimeError(f'Keys in graph object should start with either \'node_\' or \'edge_\', but \'{key}\' given.')

    print('Saving all the files!')
    np.savez_compressed(osp.join(self.raw_dir, 'data.npz'), **data_dict)

    print('Validating...')
    # testing: read the saved files back and check round-trip equality
    print('Reading saved files')
    graph_list_read = read_binary_graph_raw(self.raw_dir, False)

    print('Checking read graphs and given graphs are the same')
    for i in tqdm(range(len(graph_list))):
        # assert graph_list[i].keys() == graph_list_read[i].keys()
        for key in graph_list[i].keys():
            if graph_list[i][key] is not None:
                if isinstance(graph_list[i][key], np.ndarray):
                    assert np.allclose(graph_list[i][key], graph_list_read[i][key],
                                       rtol=1e-4, atol=1e-4, equal_nan=True)
                else:
                    assert graph_list[i][key] == graph_list_read[i][key]

    del graph_list_read
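# --- Shape of a valid graph_list entry (illustrative values) ---
# Each graph is a plain dict; only 'edge_index' and 'num_nodes' are required,
# and every other key must start with 'node_' or 'edge_':
#
#     graph = {
#         'edge_index': np.array([[0, 1, 2], [1, 2, 0]]),   # shape (2, num_edges)
#         'num_nodes': 3,
#         'node_feat': np.zeros((3, 8), dtype=np.float32),  # one row per node
#         'edge_feat': np.zeros((3, 4), dtype=np.float32),  # one row per edge
#     }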