def read_heterograph_dgl(raw_dir,
                         add_inverse_edge=False,
                         additional_node_files=[],
                         additional_edge_files=[],
                         binary=False):
    """Read raw heterogeneous graphs from ``raw_dir`` as DGL heterographs.

    Args:
        raw_dir: directory handed to the raw reader.
        add_inverse_edge: forwarded unchanged to the raw reader.
        additional_node_files: keys of extra per-node-type arrays. They are
            forwarded to the CSV reader and, for either reader, looked up in
            every raw graph and stored as node data under the key with its
            first five characters removed.
        additional_edge_files: same as ``additional_node_files`` but for
            per-edge-type arrays stored as edge data.
        binary: when true the raw graphs come from
            ``read_binary_heterograph_raw`` (npz), otherwise from
            ``read_csv_heterograph_raw`` (csv).

    Returns:
        A list with one DGL heterograph per raw graph, in reader order.
    """
    if binary:
        # npz
        raw_graphs = read_binary_heterograph_raw(raw_dir, add_inverse_edge)
    else:
        # csv
        raw_graphs = read_csv_heterograph_raw(
            raw_dir,
            add_inverse_edge,
            additional_node_files=additional_node_files,
            additional_edge_files=additional_edge_files,
        )

    print('Converting graphs into DGL objects...')
    converted = []

    for raw in tqdm(raw_graphs):
        # Edge connectivity: one list of (source, destination) pairs per
        # triplet, built from rows 0 and 1 of that triplet's edge index.
        connectivity = {
            triplet: list(zip(edge_index[0], edge_index[1]))
            for triplet, edge_index in raw['edge_index_dict'].items()
        }
        hetero = dgl.heterograph(connectivity,
                                 num_nodes_dict=raw['num_nodes_dict'])

        edge_feats = raw['edge_feat_dict']
        if edge_feats is not None:
            for triplet, feat in edge_feats.items():
                hetero.edges[triplet].data['feat'] = torch.from_numpy(feat)

        node_feats = raw['node_feat_dict']
        if node_feats is not None:
            for nodetype, feat in node_feats.items():
                hetero.nodes[nodetype].data['feat'] = torch.from_numpy(feat)

        for key in additional_node_files:
            # Presumably strips a 'node_' prefix; only the length is fixed.
            short_name = key[5:]
            for nodetype, values in raw[key].items():
                hetero.nodes[nodetype].data[short_name] = torch.from_numpy(
                    values)

        for key in additional_edge_files:
            # Presumably strips an 'edge_' prefix; only the length is fixed.
            short_name = key[5:]
            for triplet, values in raw[key].items():
                hetero.edges[triplet].data[short_name] = torch.from_numpy(
                    values)

        converted.append(hetero)

    return converted
def read_csv_heterograph_dgl(raw_dir, add_inverse_edge=False, additional_node_files=[], additional_edge_files=[]):
    """Read CSV heterogeneous graphs from ``raw_dir`` as DGL heterographs.

    Args:
        raw_dir: directory handed to ``read_csv_heterograph_raw``.
        add_inverse_edge: forwarded unchanged to the raw reader.
        additional_node_files: names of extra per-node-type arrays. A name
            that does not contain ``'node_'`` gets that prefix prepended; the
            resulting name is both the key read from the raw graph and the
            node-data field it is stored under.
        additional_edge_files: same as ``additional_node_files`` with the
            ``'edge_'`` prefix, stored as edge data.

    Returns:
        A list with one DGL heterograph per raw graph, in reader order.
    """
    graph_list = read_csv_heterograph_raw(
        raw_dir,
        add_inverse_edge,
        additional_node_files=additional_node_files,
        additional_edge_files=additional_edge_files,
    )
    dgl_graph_list = []

    print('Converting graphs into DGL objects...')

    for graph in tqdm(graph_list):
        # Edge connectivity: one list of (source, destination) pairs per
        # triplet, built from rows 0 and 1 of that triplet's edge index.
        g_dict = {
            triplet: list(zip(edge_index[0], edge_index[1]))
            for triplet, edge_index in graph["edge_index_dict"].items()
        }

        # Hand DGL the explicit per-type node counts when the raw graph
        # carries them. Without them DGL derives each count from the largest
        # node id seen in the edges, so isolated nodes with the highest ids
        # are missing and per-node features no longer line up. A missing
        # entry yields None, which is DGL's own default.
        dgl_hetero_graph = dgl.heterograph(
            g_dict, num_nodes_dict=graph.get("num_nodes_dict"))

        if graph["edge_feat_dict"] is not None:
            for triplet, feat in graph["edge_feat_dict"].items():
                dgl_hetero_graph.edges[triplet].data["feat"] = torch.from_numpy(feat)

        if graph["node_feat_dict"] is not None:
            for nodetype, feat in graph["node_feat_dict"].items():
                dgl_hetero_graph.nodes[nodetype].data["feat"] = torch.from_numpy(feat)

        for key in additional_node_files:
            feat_name = key if 'node_' in key else 'node_' + key
            for nodetype, values in graph[feat_name].items():
                dgl_hetero_graph.nodes[nodetype].data[feat_name] = torch.from_numpy(values)

        for key in additional_edge_files:
            feat_name = key if 'edge_' in key else 'edge_' + key
            for triplet, values in graph[feat_name].items():
                dgl_hetero_graph.edges[triplet].data[feat_name] = torch.from_numpy(values)

        dgl_graph_list.append(dgl_hetero_graph)

    return dgl_graph_list
def read_csv_heterograph_pyg(raw_dir, add_inverse_edge = False, additional_node_files = [], additional_edge_files = []):
    """Read CSV heterogeneous graphs from ``raw_dir`` as PyG ``Data`` objects.

    Every raw graph becomes one ``Data`` instance carrying dictionaries keyed
    by triplet (edge data) or node type (node data):

    * ``__num_nodes__`` / ``num_nodes_dict``: the raw ``num_nodes_dict``.
    * ``edge_index_dict``: edge indices converted to tensors.
    * ``edge_attr_dict`` / ``x_dict``: edge / node features as tensors; each
      is only set when the corresponding raw feature dict is not None.
    * ``<key>_dict`` for every key in ``additional_edge_files`` and
      ``additional_node_files``.

    Args:
        raw_dir: directory handed to ``read_csv_heterograph_raw``.
        add_inverse_edge: forwarded unchanged to the raw reader.
        additional_node_files: keys of extra per-node-type arrays.
        additional_edge_files: keys of extra per-edge-type arrays.

    Returns:
        A list of ``Data`` objects, one per raw graph, in reader order.
    """
    raw_graphs = read_csv_heterograph_raw(
        raw_dir,
        add_inverse_edge,
        additional_node_files = additional_node_files,
        additional_edge_files = additional_edge_files,
    )

    converted = []
    print('Converting graphs into PyG objects...')

    for raw in tqdm(raw_graphs):
        data = Data()
        node_counts = raw["num_nodes_dict"]
        data.__num_nodes__ = node_counts
        data.num_nodes_dict = node_counts

        # Edge connectivity, one tensor per triplet.
        data.edge_index_dict = {
            triplet: torch.from_numpy(edge_index)
            for triplet, edge_index in raw["edge_index_dict"].items()
        }

        edge_feats = raw["edge_feat_dict"]
        if edge_feats is not None:
            data.edge_attr_dict = {
                triplet: torch.from_numpy(feat)
                for triplet, feat in edge_feats.items()
            }

        node_feats = raw["node_feat_dict"]
        if node_feats is not None:
            data.x_dict = {
                nodetype: torch.from_numpy(feat)
                for nodetype, feat in node_feats.items()
            }

        for key in additional_edge_files:
            data[key + '_dict'] = {
                triplet: torch.from_numpy(values)
                for triplet, values in raw[key].items()
            }

        for key in additional_node_files:
            data[key + '_dict'] = {
                nodetype: torch.from_numpy(values)
                for nodetype, values in raw[key].items()
            }

        converted.append(data)

    return converted
def pre_process(self):
    """Populate ``self.graph`` from the processed cache or from raw files.

    If ``<root>/processed/data_processed`` exists it is read with
    ``load_pickle`` and nothing else happens. Otherwise the method:

    1. checks for the raw file matching ``self.binary`` / ``self.is_hetero``
       and, when it is absent, downloads and unpacks the dataset after
       ``decide_download`` agrees (replacing ``self.root`` with the
       extracted folder);
    2. reads the first graph returned by the matching raw reader;
    3. writes it to the cache path with ``dump_pickle``.

    Raises:
        SystemExit: with code -1 when the download is declined.
    """
    processed_dir = osp.join(self.root, 'processed')
    pre_processed_file_path = osp.join(processed_dir, 'data_processed')

    if osp.exists(pre_processed_file_path):
        self.graph = load_pickle(pre_processed_file_path)
        return

    ### check download
    if self.binary:
        # npz format
        required_file = 'edge_index_dict.npz' if self.is_hetero else 'data.npz'
    else:
        # csv file
        required_file = ('triplet-type-list.csv.gz'
                         if self.is_hetero else 'edge.csv.gz')
    has_necessary_file = osp.exists(osp.join(self.root, 'raw', required_file))

    if not has_necessary_file:
        url = self.meta_info['url']
        if not decide_download(url):
            print('Stop download.')
            # Raise directly rather than via the site-provided ``exit``
            # helper, which is not guaranteed to exist in every interpreter.
            raise SystemExit(-1)

        path = download_url(url, self.original_root)
        extract_zip(path, self.original_root)
        os.unlink(path)
        # Delete the target folder if it exists. Removal stays best-effort,
        # but only filesystem errors are ignored so that e.g. Ctrl-C still
        # interrupts the run.
        try:
            shutil.rmtree(self.root)
        except OSError:
            pass
        shutil.move(osp.join(self.original_root, self.download_name),
                    self.root)

    raw_dir = osp.join(self.root, 'raw')

    ### pre-process and save
    # Meta info stores these settings as strings ('True', 'None', 'a,b').
    add_inverse_edge = self.meta_info['add_inverse_edge'] == 'True'

    if self.meta_info['additional node files'] == 'None':
        additional_node_files = []
    else:
        additional_node_files = self.meta_info[
            'additional node files'].split(',')

    if self.meta_info['additional edge files'] == 'None':
        additional_edge_files = []
    else:
        additional_edge_files = self.meta_info[
            'additional edge files'].split(',')

    # Every reader returns a list; only a single graph is expected.
    if self.is_hetero:
        if self.binary:
            self.graph = read_binary_heterograph_raw(
                raw_dir, add_inverse_edge=add_inverse_edge)[0]
        else:
            self.graph = read_csv_heterograph_raw(
                raw_dir,
                add_inverse_edge=add_inverse_edge,
                additional_node_files=additional_node_files,
                additional_edge_files=additional_edge_files)[0]
    else:
        if self.binary:
            self.graph = read_binary_graph_raw(
                raw_dir, add_inverse_edge=add_inverse_edge)[0]
        else:
            self.graph = read_csv_graph_raw(
                raw_dir,
                add_inverse_edge=add_inverse_edge,
                additional_node_files=additional_node_files,
                additional_edge_files=additional_edge_files)[0]

    print('Saving...')
    dump_pickle(self.graph, pre_processed_file_path)
def pre_process(self):
    """Populate ``self.graph`` from the processed cache or from raw CSV files.

    If ``<root>/processed/data_processed`` exists it is read with
    ``torch.load`` and nothing else happens. Otherwise the method:

    1. checks for the raw CSV file matching ``self.is_hetero`` and, when it
       is absent, downloads and unpacks the dataset after
       ``decide_download`` agrees (replacing ``self.root`` with the
       extracted folder);
    2. reads the first graph returned by the matching CSV reader;
    3. writes it to the cache path with ``torch.save``.

    Raises:
        SystemExit: with code -1 when the download is declined.
    """
    processed_dir = osp.join(self.root, 'processed')
    pre_processed_file_path = osp.join(processed_dir, 'data_processed')

    if osp.exists(pre_processed_file_path):
        # The second positional parameter of torch.load is ``map_location``,
        # not a file mode; passing 'rb' there makes loading fail as soon as
        # the cached object contains a tensor, so it is omitted.
        # NOTE(review): recent PyTorch releases default to weights_only=True,
        # which may reject non-tensor payloads - confirm against the
        # supported torch versions.
        self.graph = torch.load(pre_processed_file_path)
        return

    info = self.meta_info[self.name]

    ### check download
    required_file = ("triplet-type-list.csv.gz"
                     if self.is_hetero else "edge.csv.gz")
    has_necessary_file = osp.exists(osp.join(self.root, "raw", required_file))

    if not has_necessary_file:
        url = info["url"]
        if not decide_download(url):
            print("Stop download.")
            # Raise directly rather than via the site-provided ``exit``
            # helper, which is not guaranteed to exist in every interpreter.
            raise SystemExit(-1)

        path = download_url(url, self.original_root)
        extract_zip(path, self.original_root)
        os.unlink(path)
        # Delete the target folder if it exists. Removal stays best-effort,
        # but only filesystem errors are ignored so that e.g. Ctrl-C still
        # interrupts the run.
        try:
            shutil.rmtree(self.root)
        except OSError:
            pass
        shutil.move(osp.join(self.original_root, self.download_name),
                    self.root)

    raw_dir = osp.join(self.root, "raw")

    ### pre-process and save
    # Meta info stores these settings as strings ('True', 'None', 'a,b').
    add_inverse_edge = info["add_inverse_edge"] == "True"

    if info["additional node files"] == 'None':
        additional_node_files = []
    else:
        additional_node_files = info["additional node files"].split(',')

    if info["additional edge files"] == 'None':
        additional_edge_files = []
    else:
        additional_edge_files = info["additional edge files"].split(',')

    # Both readers return a list; only a single graph is expected.
    if self.is_hetero:
        self.graph = read_csv_heterograph_raw(
            raw_dir,
            add_inverse_edge=add_inverse_edge,
            additional_node_files=additional_node_files,
            additional_edge_files=additional_edge_files)[0]
    else:
        self.graph = read_csv_graph_raw(
            raw_dir,
            add_inverse_edge=add_inverse_edge,
            additional_node_files=additional_node_files,
            additional_edge_files=additional_edge_files)[0]

    print('Saving...')
    torch.save(self.graph, pre_processed_file_path, pickle_protocol=4)
def pre_process(self):
    """Populate ``self.graph`` and ``self.labels`` from cache or raw files.

    If ``<root>/processed/data_processed`` exists it is read with
    ``load_pickle`` and its ``'graph'`` and ``'labels'`` entries are used.
    Otherwise the method:

    1. checks for the raw file matching ``self.binary`` / ``self.is_hetero``
       and, when it is absent, downloads and unpacks the dataset after
       ``decide_download`` agrees (replacing ``self.root`` with the
       extracted folder);
    2. reads the first graph returned by the matching raw reader together
       with the node labels stored next to it;
    3. writes both to the cache path with ``dump_pickle``.

    Raises:
        SystemExit: with code -1 when the download is declined.
    """
    processed_dir = osp.join(self.root, 'processed')
    pre_processed_file_path = osp.join(processed_dir, 'data_processed')

    if osp.exists(pre_processed_file_path):
        loaded_dict = load_pickle(pre_processed_file_path)
        self.graph, self.labels = loaded_dict['graph'], loaded_dict['labels']
        return

    ### check download
    if self.binary:
        # npz format
        required_file = 'edge_index_dict.npz' if self.is_hetero else 'data.npz'
    else:
        # csv file
        required_file = ('triplet-type-list.csv.gz'
                         if self.is_hetero else 'edge.csv.gz')
    has_necessary_file = osp.exists(osp.join(self.root, 'raw', required_file))

    if not has_necessary_file:
        url = self.meta_info['url']
        if not decide_download(url):
            print('Stop download.')
            # Raise directly rather than via the site-provided ``exit``
            # helper, which is not guaranteed to exist in every interpreter.
            raise SystemExit(-1)

        path = download_url(url, self.original_root)
        extract_zip(path, self.original_root)
        os.unlink(path)
        # Delete the target folder if it exists. Removal stays best-effort,
        # but only filesystem errors are ignored so that e.g. Ctrl-C still
        # interrupts the run.
        try:
            shutil.rmtree(self.root)
        except OSError:
            pass
        shutil.move(osp.join(self.original_root, self.download_name),
                    self.root)

    raw_dir = osp.join(self.root, 'raw')

    ### pre-process and save
    # Meta info stores these settings as strings ('True', 'None', 'a,b').
    add_inverse_edge = self.meta_info['add_inverse_edge'] == 'True'

    if self.meta_info['additional node files'] == 'None':
        additional_node_files = []
    else:
        additional_node_files = self.meta_info[
            'additional node files'].split(',')

    if self.meta_info['additional edge files'] == 'None':
        additional_edge_files = []
    else:
        additional_edge_files = self.meta_info[
            'additional edge files'].split(',')

    # Every reader returns a list; only a single graph is expected.
    if self.is_hetero:
        if self.binary:
            self.graph = read_binary_heterograph_raw(
                raw_dir, add_inverse_edge=add_inverse_edge)[0]
            # Copy every array out of the archive, then close it right away
            # instead of leaving the file handle to the garbage collector.
            with np.load(osp.join(raw_dir, 'node-label.npz')) as label_archive:
                self.labels = {
                    key: label_archive[key]
                    for key in list(label_archive.keys())
                }
        else:
            self.graph = read_csv_heterograph_raw(
                raw_dir,
                add_inverse_edge=add_inverse_edge,
                additional_node_files=additional_node_files,
                additional_edge_files=additional_edge_files)[0]
            self.labels = read_node_label_hetero(raw_dir)
    else:
        if self.binary:
            self.graph = read_binary_graph_raw(
                raw_dir, add_inverse_edge=add_inverse_edge)[0]
            # Indexing materialises the array, so closing afterwards is safe.
            with np.load(osp.join(raw_dir, 'node-label.npz')) as label_archive:
                self.labels = label_archive['node_label']
        else:
            self.graph = read_csv_graph_raw(
                raw_dir,
                add_inverse_edge=add_inverse_edge,
                additional_node_files=additional_node_files,
                additional_edge_files=additional_edge_files)[0]
            self.labels = pd.read_csv(osp.join(raw_dir, 'node-label.csv.gz'),
                                      compression='gzip',
                                      header=None).values

    print('Saving...')
    dump_pickle({
        'graph': self.graph,
        'labels': self.labels
    }, pre_processed_file_path)