Example #1
0
def read_heterograph_dgl(raw_dir,
                         add_inverse_edge=False,
                         additional_node_files=None,
                         additional_edge_files=None,
                         binary=False):
    """Read raw heterogeneous graphs and convert each one to a DGL heterograph.

    Args:
        raw_dir: Directory passed to the raw reader.
        add_inverse_edge: Forwarded unchanged to the raw reader.
        additional_node_files: Keys of extra per-node-type arrays to attach
            to the DGL graph. ``None`` (the default) means no extra arrays.
            Only forwarded to the CSV reader.
        additional_edge_files: Keys of extra per-edge-type arrays to attach
            to the DGL graph. ``None`` (the default) means no extra arrays.
            Only forwarded to the CSV reader.
        binary: If true, read with ``read_binary_heterograph_raw`` (npz);
            otherwise read with ``read_csv_heterograph_raw`` (csv).

    Returns:
        A list with one DGL heterograph per raw graph.
    """
    # Use None as the "nothing extra" sentinel instead of a shared mutable
    # default list.
    if additional_node_files is None:
        additional_node_files = []
    if additional_edge_files is None:
        additional_edge_files = []

    if binary:
        # npz
        graph_list = read_binary_heterograph_raw(raw_dir, add_inverse_edge)
    else:
        # csv
        graph_list = read_csv_heterograph_raw(
            raw_dir,
            add_inverse_edge,
            additional_node_files=additional_node_files,
            additional_edge_files=additional_edge_files)

    dgl_graph_list = []

    print('Converting graphs into DGL objects...')

    for graph in tqdm(graph_list):
        # Edge connectivity: one list of (src, dst) pairs per edge triplet,
        # built from rows 0 and 1 of each edge index.
        g_dict = {
            triplet: list(zip(edge_index[0], edge_index[1]))
            for triplet, edge_index in graph['edge_index_dict'].items()
        }

        dgl_hetero_graph = dgl.heterograph(
            g_dict, num_nodes_dict=graph['num_nodes_dict'])

        if graph['edge_feat_dict'] is not None:
            for triplet, feat in graph['edge_feat_dict'].items():
                dgl_hetero_graph.edges[triplet].data[
                    'feat'] = torch.from_numpy(feat)

        if graph['node_feat_dict'] is not None:
            for nodetype, feat in graph['node_feat_dict'].items():
                dgl_hetero_graph.nodes[nodetype].data[
                    'feat'] = torch.from_numpy(feat)

        # Extra arrays are stored under the key with its first five
        # characters dropped (presumably the 'node_' / 'edge_' prefix).
        # NOTE(review): with binary=True these keys are not forwarded to the
        # reader, so graph[key] must already be present -- confirm.
        for key in additional_node_files:
            for nodetype, values in graph[key].items():
                dgl_hetero_graph.nodes[nodetype].data[
                    key[5:]] = torch.from_numpy(values)

        for key in additional_edge_files:
            for triplet, values in graph[key].items():
                dgl_hetero_graph.edges[triplet].data[
                    key[5:]] = torch.from_numpy(values)

        dgl_graph_list.append(dgl_hetero_graph)

    return dgl_graph_list
Example #2
0
def read_csv_heterograph_dgl(raw_dir, add_inverse_edge = False, additional_node_files = None, additional_edge_files = None):
    """Read raw CSV heterogeneous graphs and convert each one to a DGL heterograph.

    Args:
        raw_dir: Directory passed to ``read_csv_heterograph_raw``.
        add_inverse_edge: Forwarded unchanged to the raw reader.
        additional_node_files: Names of extra per-node-type arrays to attach.
            ``None`` (the default) means no extra arrays. A name that does
            not contain ``'node_'`` gets that prefix prepended.
        additional_edge_files: Names of extra per-edge-type arrays to attach.
            ``None`` (the default) means no extra arrays. A name that does
            not contain ``'edge_'`` gets that prefix prepended.

    Returns:
        A list with one DGL heterograph per raw graph.
    """
    # Use None as the "nothing extra" sentinel instead of a shared mutable
    # default list.
    if additional_node_files is None:
        additional_node_files = []
    if additional_edge_files is None:
        additional_edge_files = []

    graph_list = read_csv_heterograph_raw(raw_dir, add_inverse_edge, additional_node_files = additional_node_files, additional_edge_files = additional_edge_files)
    dgl_graph_list = []

    print('Converting graphs into DGL objects...')

    for graph in tqdm(graph_list):
        # Edge connectivity: one list of (src, dst) pairs per edge triplet,
        # built from rows 0 and 1 of each edge index.
        g_dict = {
            triplet: list(zip(edge_index[0], edge_index[1]))
            for triplet, edge_index in graph["edge_index_dict"].items()
        }

        # Pass the node counts explicitly. Without them DGL infers each count
        # from the largest node id seen in the edges, so nodes that appear in
        # no edge would be missing and the feature arrays attached below
        # would no longer match the number of nodes.
        dgl_hetero_graph = dgl.heterograph(g_dict, num_nodes_dict = graph["num_nodes_dict"])

        if graph["edge_feat_dict"] is not None:
            for triplet, feat in graph["edge_feat_dict"].items():
                dgl_hetero_graph.edges[triplet].data["feat"] = torch.from_numpy(feat)

        if graph["node_feat_dict"] is not None:
            for nodetype, feat in graph["node_feat_dict"].items():
                dgl_hetero_graph.nodes[nodetype].data["feat"] = torch.from_numpy(feat)

        for key in additional_node_files:
            # The raw graph is keyed by the 'node_'-prefixed name; the same
            # name is used as the DGL data key.
            feat_name = key if 'node_' in key else 'node_' + key

            for nodetype, values in graph[feat_name].items():
                dgl_hetero_graph.nodes[nodetype].data[feat_name] = torch.from_numpy(values)

        for key in additional_edge_files:
            # Same convention as above, with the 'edge_' prefix.
            feat_name = key if 'edge_' in key else 'edge_' + key

            for triplet, values in graph[feat_name].items():
                dgl_hetero_graph.edges[triplet].data[feat_name] = torch.from_numpy(values)

        dgl_graph_list.append(dgl_hetero_graph)

    return dgl_graph_list
Example #3
0
def read_csv_heterograph_pyg(raw_dir, add_inverse_edge = False, additional_node_files = None, additional_edge_files = None):
    """Read raw CSV heterogeneous graphs and convert each one to a PyG ``Data`` object.

    Each per-type numpy array is converted with ``torch.from_numpy`` and
    stored in a dictionary attribute on the ``Data`` object:
    ``edge_index_dict``, ``edge_attr_dict`` (only if edge features exist),
    ``x_dict`` (only if node features exist), plus ``<key>_dict`` for every
    additional node/edge file key.

    Args:
        raw_dir: Directory passed to ``read_csv_heterograph_raw``.
        add_inverse_edge: Forwarded unchanged to the raw reader.
        additional_node_files: Keys of extra per-node-type arrays to attach.
            ``None`` (the default) means no extra arrays.
        additional_edge_files: Keys of extra per-edge-type arrays to attach.
            ``None`` (the default) means no extra arrays.

    Returns:
        A list with one ``Data`` object per raw graph.
    """
    # Use None as the "nothing extra" sentinel instead of a shared mutable
    # default list.
    if additional_node_files is None:
        additional_node_files = []
    if additional_edge_files is None:
        additional_edge_files = []

    graph_list = read_csv_heterograph_raw(raw_dir, add_inverse_edge, additional_node_files = additional_node_files, additional_edge_files = additional_edge_files)
    pyg_graph_list = []

    print('Converting graphs into PyG objects...')

    for graph in tqdm(graph_list):
        g = Data()

        # The per-type node counts are stored under both attribute names.
        g.__num_nodes__ = graph["num_nodes_dict"]
        g.num_nodes_dict = graph["num_nodes_dict"]

        # add edge connectivity
        g.edge_index_dict = {
            triplet: torch.from_numpy(edge_index)
            for triplet, edge_index in graph["edge_index_dict"].items()
        }

        if graph["edge_feat_dict"] is not None:
            g.edge_attr_dict = {
                triplet: torch.from_numpy(feat)
                for triplet, feat in graph["edge_feat_dict"].items()
            }

        if graph["node_feat_dict"] is not None:
            g.x_dict = {
                nodetype: torch.from_numpy(feat)
                for nodetype, feat in graph["node_feat_dict"].items()
            }

        for key in additional_edge_files:
            g[key + '_dict'] = {
                triplet: torch.from_numpy(values)
                for triplet, values in graph[key].items()
            }

        for key in additional_node_files:
            g[key + '_dict'] = {
                nodetype: torch.from_numpy(values)
                for nodetype, values in graph[key].items()
            }

        pyg_graph_list.append(g)

    return pyg_graph_list
Example #4
0
    def pre_process(self):
        """Load the cached graph, or download/read the raw data and cache it.

        If ``<root>/processed/data_processed`` exists it is loaded into
        ``self.graph`` with ``load_pickle``. Otherwise the raw files are
        downloaded when the expected marker file is missing, the single graph
        is read with the reader matching ``self.is_hetero`` / ``self.binary``,
        stored in ``self.graph`` and written back with ``dump_pickle``.

        Raises:
            SystemExit: With code -1 if the download is needed but declined.
        """
        processed_dir = osp.join(self.root, 'processed')
        pre_processed_file_path = osp.join(processed_dir, 'data_processed')

        if osp.exists(pre_processed_file_path):
            # NOTE(review): load_pickle presumably unpickles this file; only
            # point it at caches written by this code.
            self.graph = load_pickle(pre_processed_file_path)
            return

        raw_dir = osp.join(self.root, 'raw')

        ### check download
        # One marker file per (format, graph kind) tells whether the raw data
        # is already in place.
        if self.binary:
            # npz format
            necessary_file = 'edge_index_dict.npz' if self.is_hetero else 'data.npz'
        else:
            # csv file
            necessary_file = 'triplet-type-list.csv.gz' if self.is_hetero else 'edge.csv.gz'

        if not osp.exists(osp.join(raw_dir, necessary_file)):
            url = self.meta_info['url']
            if not decide_download(url):
                print('Stop download.')
                # Same effect as exit(-1) without depending on the builtin
                # that the site module injects.
                raise SystemExit(-1)

            path = download_url(url, self.original_root)
            extract_zip(path, self.original_root)
            os.unlink(path)
            # Best-effort removal of a stale dataset folder; a missing folder
            # is fine, and KeyboardInterrupt is no longer swallowed.
            shutil.rmtree(self.root, ignore_errors=True)
            shutil.move(
                osp.join(self.original_root, self.download_name),
                self.root)

        ### pre-process and save
        add_inverse_edge = self.meta_info['add_inverse_edge'] == 'True'

        # The metadata stores either the literal string 'None' or a
        # comma-separated list of names.
        node_files_spec = self.meta_info['additional node files']
        additional_node_files = (
            [] if node_files_spec == 'None' else node_files_spec.split(','))

        edge_files_spec = self.meta_info['additional edge files']
        additional_edge_files = (
            [] if edge_files_spec == 'None' else edge_files_spec.split(','))

        # Every reader returns a list; this dataset holds only a single graph.
        if self.is_hetero:
            if self.binary:
                graphs = read_binary_heterograph_raw(
                    raw_dir, add_inverse_edge=add_inverse_edge)
            else:
                graphs = read_csv_heterograph_raw(
                    raw_dir,
                    add_inverse_edge=add_inverse_edge,
                    additional_node_files=additional_node_files,
                    additional_edge_files=additional_edge_files)
        else:
            if self.binary:
                graphs = read_binary_graph_raw(
                    raw_dir, add_inverse_edge=add_inverse_edge)
            else:
                graphs = read_csv_graph_raw(
                    raw_dir,
                    add_inverse_edge=add_inverse_edge,
                    additional_node_files=additional_node_files,
                    additional_edge_files=additional_edge_files)
        self.graph = graphs[0]

        print('Saving...')

        # Make sure the cache directory exists before writing into it.
        os.makedirs(processed_dir, exist_ok=True)
        dump_pickle(self.graph, pre_processed_file_path)
Example #5
0
    def pre_process(self):
        """Load the cached graph, or download/read the raw CSV data and cache it.

        If ``<root>/processed/data_processed`` exists it is loaded into
        ``self.graph`` with ``torch.load``. Otherwise the raw files are
        downloaded when the expected marker file is missing, the single graph
        is read with the CSV reader matching ``self.is_hetero``, stored in
        ``self.graph`` and written back with ``torch.save``.

        Raises:
            SystemExit: With code -1 if the download is needed but declined.
        """
        processed_dir = osp.join(self.root, 'processed')
        pre_processed_file_path = osp.join(processed_dir, 'data_processed')

        if osp.exists(pre_processed_file_path):
            # The second positional argument of torch.load is map_location,
            # not a file mode, so the former 'rb' is dropped.
            # NOTE(review): torch.load unpickles; only load caches written by
            # this code.
            self.graph = torch.load(pre_processed_file_path)
            return

        raw_dir = osp.join(self.root, "raw")
        meta = self.meta_info[self.name]

        ### check download
        # One marker file per graph kind tells whether the raw data is
        # already in place.
        necessary_file = (
            "triplet-type-list.csv.gz" if self.is_hetero else "edge.csv.gz")

        if not osp.exists(osp.join(raw_dir, necessary_file)):
            url = meta["url"]
            if not decide_download(url):
                print("Stop download.")
                # Same effect as exit(-1) without depending on the builtin
                # that the site module injects.
                raise SystemExit(-1)

            path = download_url(url, self.original_root)
            extract_zip(path, self.original_root)
            os.unlink(path)
            # Best-effort removal of a stale dataset folder; a missing folder
            # is fine, and KeyboardInterrupt is no longer swallowed.
            shutil.rmtree(self.root, ignore_errors=True)
            shutil.move(
                osp.join(self.original_root, self.download_name),
                self.root)

        ### pre-process and save
        add_inverse_edge = meta["add_inverse_edge"] == "True"

        # The metadata stores either the literal string 'None' or a
        # comma-separated list of names.
        node_files_spec = meta["additional node files"]
        additional_node_files = (
            [] if node_files_spec == 'None' else node_files_spec.split(','))

        edge_files_spec = meta["additional edge files"]
        additional_edge_files = (
            [] if edge_files_spec == 'None' else edge_files_spec.split(','))

        read_raw = read_csv_heterograph_raw if self.is_hetero else read_csv_graph_raw
        # The reader returns a list; this dataset holds only a single graph.
        self.graph = read_raw(
            raw_dir,
            add_inverse_edge=add_inverse_edge,
            additional_node_files=additional_node_files,
            additional_edge_files=additional_edge_files)[0]

        print('Saving...')
        # Make sure the cache directory exists before writing into it.
        os.makedirs(processed_dir, exist_ok=True)
        torch.save(self.graph, pre_processed_file_path, pickle_protocol=4)
Example #6
0
    def pre_process(self):
        """Load the cached graph and labels, or build them from raw data and cache them.

        If ``<root>/processed/data_processed`` exists it is loaded with
        ``load_pickle`` and its ``'graph'`` / ``'labels'`` entries are stored
        in ``self.graph`` / ``self.labels``. Otherwise the raw files are
        downloaded when the expected marker file is missing, the single graph
        and its node labels are read with the readers matching
        ``self.is_hetero`` / ``self.binary``, and both are written back with
        ``dump_pickle``.

        Raises:
            SystemExit: With code -1 if the download is needed but declined.
        """
        processed_dir = osp.join(self.root, 'processed')
        pre_processed_file_path = osp.join(processed_dir, 'data_processed')

        if osp.exists(pre_processed_file_path):
            # NOTE(review): load_pickle presumably unpickles this file; only
            # point it at caches written by this code.
            loaded_dict = load_pickle(pre_processed_file_path)
            self.graph = loaded_dict['graph']
            self.labels = loaded_dict['labels']
            return

        raw_dir = osp.join(self.root, 'raw')

        ### check download
        # One marker file per (format, graph kind) tells whether the raw data
        # is already in place.
        if self.binary:
            # npz format
            necessary_file = 'edge_index_dict.npz' if self.is_hetero else 'data.npz'
        else:
            # csv file
            necessary_file = 'triplet-type-list.csv.gz' if self.is_hetero else 'edge.csv.gz'

        if not osp.exists(osp.join(raw_dir, necessary_file)):
            url = self.meta_info['url']
            if not decide_download(url):
                print('Stop download.')
                # Same effect as exit(-1) without depending on the builtin
                # that the site module injects.
                raise SystemExit(-1)

            path = download_url(url, self.original_root)
            extract_zip(path, self.original_root)
            os.unlink(path)
            # Best-effort removal of a stale dataset folder; a missing folder
            # is fine, and KeyboardInterrupt is no longer swallowed.
            shutil.rmtree(self.root, ignore_errors=True)
            shutil.move(
                osp.join(self.original_root, self.download_name),
                self.root)

        ### pre-process and save
        add_inverse_edge = self.meta_info['add_inverse_edge'] == 'True'

        # The metadata stores either the literal string 'None' or a
        # comma-separated list of names.
        node_files_spec = self.meta_info['additional node files']
        additional_node_files = (
            [] if node_files_spec == 'None' else node_files_spec.split(','))

        edge_files_spec = self.meta_info['additional edge files']
        additional_edge_files = (
            [] if edge_files_spec == 'None' else edge_files_spec.split(','))

        # Every graph reader returns a list; this dataset holds only a single
        # graph, hence the [0].
        if self.is_hetero:
            if self.binary:
                self.graph = read_binary_heterograph_raw(
                    raw_dir, add_inverse_edge=add_inverse_edge)[0]

                # Copy every array out of the archive so the file can be
                # closed right away.
                with np.load(osp.join(raw_dir, 'node-label.npz')) as tmp:
                    self.labels = {key: tmp[key] for key in tmp.keys()}
            else:
                self.graph = read_csv_heterograph_raw(
                    raw_dir,
                    add_inverse_edge=add_inverse_edge,
                    additional_node_files=additional_node_files,
                    additional_edge_files=additional_edge_files)[0]
                self.labels = read_node_label_hetero(raw_dir)

        else:
            if self.binary:
                self.graph = read_binary_graph_raw(
                    raw_dir, add_inverse_edge=add_inverse_edge)[0]
                # Close the archive once the label array has been read.
                with np.load(osp.join(raw_dir, 'node-label.npz')) as tmp:
                    self.labels = tmp['node_label']
            else:
                self.graph = read_csv_graph_raw(
                    raw_dir,
                    add_inverse_edge=add_inverse_edge,
                    additional_node_files=additional_node_files,
                    additional_edge_files=additional_edge_files)[0]
                self.labels = pd.read_csv(osp.join(raw_dir,
                                                   'node-label.csv.gz'),
                                          compression='gzip',
                                          header=None).values

        print('Saving...')
        # Make sure the cache directory exists before writing into it.
        os.makedirs(processed_dir, exist_ok=True)
        dump_pickle({
            'graph': self.graph,
            'labels': self.labels
        }, pre_processed_file_path)