Beispiel #1
0
    def prepare_data(self):
        """Build (once) and load the symmetric paper-citation graph and splits.

        On the first run the ``paper -cites-> paper`` edge list is extended
        with its reversed pairs, de-duplicated and dumped to
        ``paper_to_paper_symmetric_pgl`` inside the dataset directory; later
        runs reuse that dump. Afterwards the (shuffled) train indices, the
        valid/test indices, paper features, labels and the memory-mapped graph
        are stored on ``self``.
        """
        dataset = MAG240MDataset(self.data_dir)
        edge_path = f'{dataset.dir}/paper_to_paper_symmetric_pgl'

        t = time.perf_counter()
        if not osp.exists(edge_path):
            log.info('Converting adjacency matrix...')
            edge_index = dataset.edge_index('paper', 'cites', 'paper')
            edge_index = edge_index.T

            # Append the reversed (dst, src) pairs. Indexing the columns keeps
            # the integer dtype of the node ids; filling an np.zeros buffer
            # (float64 by default) would promote the stacked edge list to
            # floating point.
            edge_index = np.vstack((edge_index, edge_index[:, [1, 0]]))
            edge_index = np.unique(edge_index, axis=0)

            graph = Graph(edge_index, sorted=True)
            # Accessed only for its side effect before dumping -- presumably
            # this builds the destination-sorted index so it is part of the
            # dump; confirm against Graph.adj_dst_index.
            graph.adj_dst_index
            graph.dump(edge_path)
            log.info(f'Done! [{time.perf_counter() - t:.2f}s]')

        # Seed before shuffling so the training order is reproducible.
        np.random.seed(self.seed)
        self.train_idx = dataset.get_idx_split('train')
        np.random.shuffle(self.train_idx)

        self.val_idx = dataset.get_idx_split('valid')
        self.test_idx = dataset.get_idx_split('test')

        self.x = dataset.paper_feat
        self.y = dataset.all_paper_label

        self.graph = Graph.load(edge_path, mmap_mode='r+')
        log.info(f'Done! [{time.perf_counter() - t:.2f}s]')
Beispiel #2
0
    def prepare_data(self):
        """Create ``institution_feat_year.npy`` from the per-author year file.

        Nothing is done when the output file already exists. Otherwise the
        int32 per-author values in ``author_feat_year.npy`` are pushed along
        the ``author -> institution`` edges with ``send_recv``, divided by the
        number of incoming edges of each institution, cast to int32 and
        written to a memory-mapped output file.
        """
        dataset = MAG240MDataset(self.data_dir)

        log.info(dataset.num_authors)
        log.info(dataset.num_institutions)

        author_path = f'{dataset.dir}/author_feat_year.npy'
        path = f'{dataset.dir}/institution_feat_year.npy'
        t = time.perf_counter()
        if not osp.exists(path):
            log.info('get institution_feat...')

            author_feat = np.memmap(author_path,
                                    dtype=np.int32,
                                    mode='r',
                                    shape=(dataset.num_authors, ))
            author_feat = author_feat[:]
            # Make the feature a column vector: (num_authors, 1).
            author_feat = np.expand_dims(author_feat, axis=1)
            # author
            edge_index = dataset.edge_index('author', 'institution')
            edge_index = edge_index.T
            log.info(edge_index.shape)
            institution_graph = Graph(edge_index,
                                      num_nodes=dataset.num_institutions)
            institution_graph.tensor()
            log.info('finish institution graph')

            institution_x = np.memmap(path,
                                      dtype=np.int32,
                                      mode='w+',
                                      shape=(dataset.num_institutions, ))

            degree = paddle.zeros(shape=[dataset.num_institutions, 1],
                                  dtype='float32')
            # Keep the denominator non-zero for institutions that receive no
            # edge; otherwise the division below yields NaN/inf, which has no
            # meaningful int32 representation.
            degree += 1e-10
            temp_one = paddle.ones(shape=[edge_index.shape[0], 1],
                                   dtype='float32')
            # Accumulate a 1 per edge at its destination node.
            degree = scatter(degree,
                             overwrite=False,
                             index=institution_graph.edges[:, 1],
                             updates=temp_one)
            log.info('finish degree')

            inputs = author_feat

            inputs = paddle.to_tensor(inputs, dtype='float32')
            outputs = institution_graph.send_recv(inputs)
            outputs = outputs / degree
            outputs = outputs.astype('int32').numpy()

            del inputs
            save_col_slice(x_src=outputs,
                           x_dst=institution_x,
                           start_row_idx=0,
                           end_row_idx=dataset.num_institutions)
            del outputs

            institution_x.flush()
            del institution_x
            log.info(f'Done! [{time.perf_counter() - t:.2f}s]')
Beispiel #3
0
 def setUp(self):
     """Create a 5-node, 3-edge graph with random node and edge features."""
     node_values = np.random.randn(5, 100)
     edge_values = np.random.randn(3, 100)
     self.graph = Graph(num_nodes=5,
                        edges=[(0, 1), (1, 2), (3, 4)],
                        node_feat={"feature": node_values},
                        edge_feat={"edge_feature": edge_values})
Beispiel #4
0
    def __init__(self,
                 edges,
                 num_nodes=None,
                 node_types=None,
                 node_feat=None,
                 edge_feat=None,
                 **kwargs):
        """Build a heterogeneous graph out of one ``Graph`` per edge type.

        Args:
            edges: Mapping from edge type to the edges of that type. May be
                None when ``multi_graph`` is supplied through ``kwargs``.
            num_nodes: Total number of nodes; defaults to ``len(node_types)``.
            node_types: Either an array of node types, or a list of pairs
                whose second column holds the node type.
            node_feat: Optional node features shared by every sub-graph (each
                sub-graph receives its own deep copy).
            edge_feat: Optional mapping from edge type to the edge features
                of that type.
            **kwargs: ``multi_graph`` may carry ready-made per-edge-type
                graphs, in which case no sub-graphs are constructed here.
        """
        self._edges_dict = edges

        if isinstance(node_types, list):
            # Keep only the type column of the pairs.
            self._node_types = np.array(node_types, dtype=object)[:, 1]
        else:
            self._node_types = node_types

        self._num_nodes = len(node_types) if num_nodes is None else num_nodes

        # Node ids grouped by node type.
        self._nodes_type_dict = {}
        for ntype in np.unique(self._node_types):
            self._nodes_type_dict[ntype] = np.where(
                self._node_types == ntype)[0]

        self._node_feat = node_feat if node_feat is not None else {}
        self._edge_feat = edge_feat if edge_feat is not None else {}

        if "multi_graph" in kwargs:
            self._multi_graph = kwargs["multi_graph"]
        else:
            self._multi_graph = {}
            for etype, _edges in self._edges_dict.items():
                if not self._edge_feat:
                    etype_feat = None
                else:
                    etype_feat = self._edge_feat[etype]

                self._multi_graph[etype] = Graph(edges=_edges,
                                                 num_nodes=self._num_nodes,
                                                 node_feat=copy.deepcopy(
                                                     self._node_feat),
                                                 edge_feat=etype_feat)

        self._edge_types = self.edge_types_info()
        self._nodes = None

        # The tensor/numpy state is taken from the first sub-graph. Default to
        # False so the attribute also exists when there are no sub-graphs.
        self._is_tensor = False
        for g in self._multi_graph.values():
            self._is_tensor = bool(g.is_tensor())
            break
Beispiel #5
0
    def load(cls, path, mmap_mode="r"):
        """Restore a HeterGraph (numpy mode) from a directory on disk.

        Args:

            path: Directory containing ``node_types.npy``, ``edge_types.pkl``
                and one sub-directory per edge type.

            mmap_mode: Default :code:`mmap_mode="r"`. Forwarded to
                ``Graph.load`` for every per-edge-type graph.
        """
        node_types_file = os.path.join(path, "node_types.npy")
        stored_node_types = np.load(node_types_file, allow_pickle=True)

        # NOTE: both allow_pickle=True above and pkl.load below unpickle file
        # contents; only load directories from a trusted source.
        with open(os.path.join(path, "edge_types.pkl"), "rb") as handle:
            stored_edge_types = pkl.load(handle)

        graphs_by_etype = {
            etype: Graph.load(os.path.join(path, etype), mmap_mode)
            for etype in stored_edge_types
        }

        return cls(
            edges=None,
            node_types=stored_node_types,
            multi_graph=graphs_by_etype,
        )
Beispiel #6
0
def subgraph(graph,
             nodes,
             eid=None,
             edges=None,
             with_node_feat=True,
             with_edge_feat=True):
    """Extract a sub-graph given its nodes and either edge ids or edges.

    The selected nodes are renumbered from 0 in the order they appear in
    ``nodes`` and the selected edges are rewritten with the new ids. At
    least one of ``eid`` and ``edges`` must be given.
    WARNING: every endpoint of a selected edge must be contained in ``nodes``.

    Args:
        graph: Parent graph in numpy mode.
        nodes: Node ids which will be included in the subgraph.
        eid (optional): Edge ids which will be included in the subgraph.
        edges (optional): Edge(src, dst) list which will be included in the
            subgraph; takes precedence over ``eid`` for the edge list.

        with_node_feat: Whether to inherit node features from parent graph.
        with_edge_feat: Whether to inherit edge features from parent graph
            (requires ``eid`` when the parent has edge features).

    Return:
        A :code:`pgl.Graph` object.
    """
    assert not graph.is_tensor(), "You must call Graph.numpy() first."

    if eid is None and edges is None:
        raise ValueError("Eid and edges can't be None at the same time.")

    # old node id -> position inside ``nodes``
    old_to_new = {node: position for position, node in enumerate(nodes)}

    if edges is not None:
        edges = np.array(edges, dtype="int64")
    else:
        edges = graph._edges[eid]

    picked_edge_feat = {}
    if with_edge_feat:
        for name, values in graph._edge_feat.items():
            # Edge features can only be sliced by edge id.
            if eid is None:
                raise ValueError("Eid can not be None with edge features.")
            picked_edge_feat[name] = values[eid]

    edge_positions = np.arange(len(edges), dtype="int64")
    remapped_edges = pgl.graph_kernel.map_edges(edge_positions, edges,
                                                old_to_new)

    picked_node_feat = {}
    if with_node_feat:
        for name, values in graph._node_feat.items():
            picked_node_feat[name] = values[nodes]

    return Graph(edges=remapped_edges,
                 num_nodes=len(nodes),
                 node_feat=picked_node_feat,
                 edge_feat=picked_edge_feat)
Beispiel #7
0
def build_graph(num_nodes, edge_path):
    """Read whitespace-separated edge files into an undirected ``Graph``.

    Every non-blank line is ``src dst [weight]``. Each edge is stored in both
    directions. If every edge carries a weight, the weights become the
    ``"weight"`` edge feature and alias sampling tables are attached as the
    ``"alias"`` / ``"events"`` node features.

    Args:
        num_nodes: Number of nodes; every node id must be smaller than this.
        edge_path: A single edge file, or a directory that is walked
            recursively for edge files.

    Returns:
        The constructed ``Graph``.

    Raises:
        ValueError: If ``edge_path`` is neither a file nor a directory, or
            if no edge could be read from it.
    """
    if os.path.isfile(edge_path):
        filelist = [edge_path]
    elif os.path.isdir(edge_path):
        filelist = [
            os.path.join(dp, f)
            for dp, _, filenames in os.walk(edge_path) for f in filenames
        ]
    else:
        raise ValueError(edge_path + " not supported")

    edges, edge_weight = [], []
    for name in filelist:
        with open(name) as inf:
            for line in inf:
                slots = line.split()
                if not slots:
                    # Tolerate blank lines (e.g. a trailing empty line).
                    continue
                edges.append([slots[0], slots[1]])
                edges.append([slots[1], slots[0]])
                if len(slots) > 2:
                    weight = float(slots[2])
                    edge_weight.extend([weight, weight])

    if not edges:
        # Fail with a clear message instead of numpy's reduction error below.
        raise ValueError(edge_path + " contains no edges")

    edges = np.array(edges, dtype="int64")
    assert num_nodes > edges.max(
    ), "Node id in any edges should be smaller then num_nodes!"

    edge_feat = dict()
    # Weights are only used when every single edge has one.
    if len(edge_weight) == len(edges):
        edge_feat["weight"] = np.array(edge_weight)

    graph = Graph(num_nodes, edges, edge_feat=edge_feat)
    log.info("Build graph done")

    # Called for its side effect; presumably this builds the adjacency index
    # up front (see the log message below) -- confirm against Graph.
    graph.outdegree()

    del edges, edge_feat

    log.info("Build graph index done")
    if "weight" in graph.edge_feat:
        graph.node_feat["alias"], graph.node_feat[
            "events"] = graph_alias_sample_table(graph, "weight")
        log.info("Build graph alias sample table done")
    return graph
Beispiel #8
0
    def _load_data(self):
        """Read ``cora.content`` / ``cora.cites`` and build graph and splits.

        Each content line is ``paper_id feature... class``; features are
        normalised to sum to one per paper. Paper ids are renumbered by their
        order of appearance. Fixed index ranges are used for the
        train/validation/test split.
        """
        content_file = os.path.join(self.path, 'cora.content')
        cites_file = os.path.join(self.path, 'cora.cites')

        feature_rows, paper_ids, label_ids = [], [], []
        class_to_id = {}
        with open(content_file, 'r') as f:
            for raw in f:
                fields = raw.strip().split()
                paper_id = int(fields[0])
                class_name = fields[-1]
                if class_name not in class_to_id:
                    # Classes are numbered in order of first appearance.
                    class_to_id[class_name] = len(class_to_id)
                counts = np.array([int(tok) for tok in fields[1:-1]],
                                  dtype="float32")
                # Normalize
                counts = counts / (np.sum(counts) + 1e-15)
                feature_rows.append(counts)
                label_ids.append(class_to_id[class_name])
                paper_ids.append(paper_id)

        # original paper id -> consecutive vertex id
        paper2vid = {pid: vid for vid, pid in enumerate(paper_ids)}
        num_nodes = len(paper_ids)
        node_feature = np.array(feature_rows, dtype="float32")

        all_edges = []
        with open(cites_file, 'r') as f:
            for raw in f:
                src, dst = raw.split()
                src = paper2vid[int(src)]
                dst = paper2vid[int(dst)]
                all_edges.append((src, dst))
                if self.symmetry_edges:
                    all_edges.append((dst, src))

        if self.self_loop:
            all_edges.extend((vid, vid) for vid in range(num_nodes))

        # Drop duplicate edges.
        all_edges = list(set(all_edges))
        self.graph = Graph(num_nodes=num_nodes,
                           edges=all_edges,
                           node_feat={"words": node_feature})

        # Unshuffled node order; the split is by fixed position ranges.
        perm = np.arange(0, num_nodes)
        self.train_index = perm[:140]
        self.val_index = perm[200:500]
        self.test_index = perm[500:1500]
        self.y = np.array(label_ids, dtype="int64")
        self.num_classes = len(class_to_id)
Beispiel #9
0
    def _load_data(self):
        """Load ``ca-AstroPh.txt`` and split it for link prediction.

        Node names are mapped to consecutive ids, edges are stored once as
        ``(small, large)`` pairs. Half as many random non-edges as there are
        edges become ``self.neg_edges``; after shuffling, the first half of
        the edges becomes ``self.pos_edges`` and the remaining half (in both
        directions) forms ``self.graph``.
        """
        np.random.seed(self.np_random_seed)
        edge_path = os.path.join(self.path, 'ca-AstroPh.txt')

        undirected = set()
        self.neg_edges = []
        self.pos_edges = []
        self.node2id = dict()

        def node_id(node):
            # Hand out the next free id the first time a node is seen.
            return self.node2id.setdefault(node, len(self.node2id))

        with io.open(edge_path) as inf:
            # The first four lines are skipped before parsing edges.
            for _ in range(4):
                inf.readline()
            for line in inf:
                u, v = line.strip('\n').split('\t')
                u, v = node_id(u), node_id(v)
                undirected.add((u, v) if u < v else (v, u))

        num_nodes = len(self.node2id)

        num_negative = len(undirected) // 2
        while len(self.neg_edges) < num_negative:
            candidates = np.random.choice(num_nodes, [len(undirected), 2])
            for (u, v) in candidates:
                # Reject self-pairs and pairs that are real edges.
                if u == v or (u, v) in undirected or (v, u) in undirected:
                    continue
                self.neg_edges.append((u, v))
                if len(self.neg_edges) == num_negative:
                    break

        shuffled = list(undirected)
        np.random.shuffle(shuffled)
        half = len(shuffled) // 2
        self.pos_edges = shuffled[:half]
        kept = shuffled[half:]

        all_edges = []
        for u, v in kept:
            all_edges.append((u, v))
            all_edges.append((v, u))
        self.graph = Graph(num_nodes=num_nodes, edges=all_edges)
Beispiel #10
0
    def __init__(self,
                 num_nodes,
                 edges,
                 node_types=None,
                 node_feat=None,
                 edge_feat=None):
        """Create one ``Graph`` per edge type over a shared node set.

        Args:
            num_nodes: Total number of nodes.
            edges: Mapping from edge type to the edges of that type.
            node_types: Array of node types, or a list of pairs whose second
                column holds the node type.
            node_feat: Optional node features, handed to every sub-graph.
            edge_feat: Optional mapping from edge type to edge features.
        """
        self._num_nodes = num_nodes
        self._edges_dict = edges

        # A list is treated as rows of pairs; only the type column is kept.
        self._node_types = (np.array(node_types, dtype=object)[:, 1]
                            if isinstance(node_types, list) else node_types)

        # Node ids grouped by node type.
        self._nodes_type_dict = {}
        for type_name in np.unique(self._node_types):
            members = np.where(self._node_types == type_name)[0]
            self._nodes_type_dict[type_name] = members

        self._node_feat = {} if node_feat is None else node_feat
        self._edge_feat = {} if edge_feat is None else edge_feat

        self._multi_graph = {}
        for etype, etype_edges in self._edges_dict.items():
            etype_feat = self._edge_feat[etype] if self._edge_feat else None
            self._multi_graph[etype] = Graph(num_nodes=self._num_nodes,
                                             edges=etype_edges,
                                             node_feat=self._node_feat,
                                             edge_feat=etype_feat)

        self._edge_types = self.edge_types_info()
Beispiel #11
0
    def build_graph(self, x_batch):
        """Replicate the template graph once per (batch, time) slice.

        ``x_batch.shape`` is unpacked as ``(B, T, n, _)``. The result holds
        ``B * T`` copies of ``self.edges``, the i-th copy offset by ``i * n``,
        with ``self.norm`` / ``self.weights`` tiled accordingly.
        """
        B, T, n, _ = x_batch.shape
        copies = B * T

        shifted = [self.edges + (index * n) for index in range(copies)]
        stacked_edges = np.vstack(shifted)

        tiled_norm = np.tile(self.norm, [copies, 1])
        tiled_weights = np.tile(self.weights, [copies, 1])

        return Graph(num_nodes=B * T * n,
                     edges=stacked_edges,
                     node_feat={'norm': tiled_norm},
                     edge_feat={'weights': tiled_weights})
def add_self_loop(graph, sub_nodes=None):
    """Return a new graph with self-loops added.

    Args:
        graph: Source graph in numpy mode.
        sub_nodes (optional): 1-D collection of node ids that receive a
            self-loop. When None, every node of ``graph`` gets one.

    Returns:
        A new ``Graph`` with the de-duplicated union of the original edges
        and the self-loops, and the same ``num_nodes``. Node and edge
        features are not passed on to the new graph.
    """
    assert not graph.is_tensor(), "You must call Graph.numpy() first."

    if sub_nodes is not None:
        loop_nodes = np.asarray(sub_nodes, dtype="int64")
    else:
        loop_nodes = np.arange(graph.num_nodes, dtype="int64")

    # Build the (n, 2) self-loop list with an integer dtype. An np.zeros
    # buffer is float64 by default and would promote the stacked edge list
    # to floating point.
    self_loop_edges = np.stack([loop_nodes, loop_nodes], axis=1)

    edges = np.vstack((graph.edges, self_loop_edges))
    # Remove loops that already existed (and any other duplicates).
    edges = np.unique(edges, axis=0)
    new_g = Graph(
        edges=edges,
        num_nodes=graph.num_nodes,
    )
    return new_g
Beispiel #13
0
    def _load_data(self):
        """Load ``nodes.csv`` / ``edges.csv`` / ``group-edges.csv``.

        Ids in the files are 1-based and shifted to 0-based. Group membership
        becomes a multi-hot ``"group_id"`` node feature. The nodes are split
        at random, half for training and the rest for testing.
        """
        edge_path = os.path.join(self.path, 'edges.csv')
        node_path = os.path.join(self.path, 'nodes.csv')
        group_edge_path = os.path.join(self.path, 'group-edges.csv')

        # One node per line of the node file.
        with io.open(node_path) as inf:
            num_nodes = sum(1 for _ in inf)

        node_feature = np.zeros((num_nodes, self.num_groups))
        with io.open(group_edge_path) as inf:
            for line in inf:
                node_str, group_str = line.strip('\n').split(',')
                node_feature[int(node_str) - 1, int(group_str) - 1] = 1

        all_edges = []
        with io.open(edge_path) as inf:
            for line in inf:
                src, dst = (int(tok) - 1 for tok in line.strip('\n').split(','))
                all_edges.append((src, dst))
                if self.symmetry_edges:
                    all_edges.append((dst, src))

        if self.self_loop:
            all_edges.extend((node, node) for node in range(num_nodes))

        # Drop duplicate edges.
        all_edges = list(set(all_edges))
        self.graph = Graph(num_nodes=num_nodes,
                           edges=all_edges,
                           node_feat={"group_id": node_feature})

        perm = np.arange(0, num_nodes)
        np.random.shuffle(perm)
        train_num = int(num_nodes * 0.5)
        self.train_index, self.test_index = perm[:train_num], perm[train_num:]
Beispiel #14
0
    def _load_data(self, normalize=True, symmetry=True):
        """Load ``reddit.npz`` / ``reddit_adj.npz`` into graph, features, splits.

        Args:
            normalize: Standardise the features with statistics fitted on the
                training rows only.
            symmetry: Add the transposed adjacency before extracting edges.
        """
        from sklearn.preprocessing import StandardScaler
        import scipy.sparse as sp

        data = np.load(os.path.join(self.path, "reddit.npz"))
        adjacency = sp.load_npz(os.path.join(self.path, "reddit_adj.npz"))
        if symmetry:
            adjacency = adjacency + adjacency.T
        adjacency = adjacency.tocoo()
        sources, targets = adjacency.row, adjacency.col

        train_label, val_label, test_label = (data['y_train'], data['y_val'],
                                              data['y_test'])
        train_index, val_index, test_index = (data['train_index'],
                                              data['val_index'],
                                              data['test_index'])

        feature = data["feats"].astype("float32")
        if normalize:
            # Fit on training nodes only, then transform every node.
            scaler = StandardScaler()
            scaler.fit(feature[train_index])
            feature = scaler.transform(feature)

        self.graph = Graph(num_nodes=feature.shape[0],
                           edges=list(zip(sources, targets)))

        self.train_index = train_index
        self.train_label = train_label
        self.val_label = val_label
        self.val_index = val_index
        self.test_index = test_index
        self.test_label = test_label
        self.feature = feature
        self.num_classes = 41
Beispiel #15
0
class GraphTest(unittest.TestCase):
    """Checks that sub-graph features match the parent graph's features."""

    def setUp(self):
        """Create a 5-node, 3-edge graph with random node/edge features."""
        node_values = np.random.randn(5, 100)
        edge_values = np.random.randn(3, 100)
        self.graph = Graph(num_nodes=5,
                           edges=[(0, 1), (1, 2), (3, 4)],
                           node_feat={"feature": node_values},
                           edge_feat={"edge_feature": edge_values})

    def _assert_same(self, actual, expected):
        """Assert that the Euclidean distance of two arrays is at most 1e-6."""
        delta = actual - expected
        distance = np.sqrt(np.sum(delta * delta))
        self.assertLessEqual(distance, 1e-6)

    def test_subgraph_consistency(self):
        """Features of the sub-graph equal the parent's rows they came from."""
        node_index = [0, 2, 3, 4]
        eid = [2]
        subgraph = self.graph.subgraph(node_index, eid)

        for key, value in subgraph.node_feat.items():
            self._assert_same(value, self.graph.node_feat[key][node_index])

        for key, value in subgraph.edge_feat.items():
            self._assert_same(value, self.graph.edge_feat[key][eid])
Beispiel #16
0
    def prepare_data(self):
        """Create ``author_feat.npy`` by aggregating paper features per author.

        Does nothing when the file already exists. Otherwise the
        ``author -writes-> paper`` edges are flipped to ``(paper, author)``,
        paper features are sent along them in column chunks with
        ``send_recv``, divided by each author's incoming edge count and
        written as float16 into a memory-mapped output file.
        """
        dataset = MAG240MDataset(self.data_dir)

        log.info(dataset.num_authors)
        log.info(dataset.num_papers)

        path = f'{dataset.dir}/author_feat.npy'
        t = time.perf_counter()
        if osp.exists(path):
            # Output already generated on an earlier run.
            return

        log.info('get author_feat...')
        paper_feat = dataset.paper_feat
        # author
        writes = dataset.edge_index('author', 'writes', 'paper').T
        # Swap the columns so each edge reads (paper, author).
        edge_index = np.stack([writes[:, 1], writes[:, 0]], axis=1)
        log.info(edge_index.shape)
        author_graph = Graph(edge_index, num_nodes=dataset.num_authors)
        author_graph.tensor()
        log.info('finish author graph')

        author_x = np.memmap(path,
                             dtype=np.float16,
                             mode='w+',
                             shape=(dataset.num_authors, self.num_features))
        dim_chunk_size = 64

        degree = paddle.zeros(shape=[dataset.num_authors, 1], dtype='float32')
        # Small epsilon so the division below never sees an exact zero.
        degree += 1e-10
        temp_one = paddle.ones(shape=[edge_index.shape[0], 1],
                               dtype='float32')
        # Accumulate a 1 per edge at its destination node.
        degree = scatter(degree,
                         author_graph.edges[:, 1],
                         temp_one,
                         overwrite=False)
        log.info('finish degree')

        # Work through the feature columns in chunks of dim_chunk_size.
        for col_begin in tqdm(range(0, self.num_features, dim_chunk_size)):
            col_end = min(col_begin + dim_chunk_size, self.num_features)
            chunk = get_col_slice(paper_feat,
                                  start_row_idx=0,
                                  end_row_idx=dataset.num_papers,
                                  start_col_idx=col_begin,
                                  end_col_idx=col_end)

            chunk = paddle.to_tensor(chunk, dtype='float32')
            averaged = author_graph.send_recv(chunk)
            averaged = averaged / degree
            averaged = averaged.astype('float16').numpy()

            del chunk
            save_col_slice(x_src=averaged,
                           x_dst=author_x,
                           start_row_idx=0,
                           end_row_idx=dataset.num_authors,
                           start_col_idx=col_begin,
                           end_col_idx=col_end)
            del averaged

        author_x.flush()
        del author_x
        log.info(f'Done! [{time.perf_counter() - t:.2f}s]')
    def prepare_data(self):
        """Prepare graph dumps, the merged feature matrix and training arrays.

        Steps (each on-disk artefact is only generated when missing):

        1. Dump five typed graphs: symmetric paper-paper citations (type 0),
           author->paper (1), paper->author (2), author->institution (3) and
           institution->author (4). Node ids are laid out as papers first,
           then authors, then institutions.
        2. Build ``full_feat.npy`` by concatenating paper, author and
           institution features in that same order.
        3. Load index splits, memory-mapped features, labels, the dumped
           graphs, a sinusoid position table and per-node years onto ``self``.
        """
        dataset = MAG240MDataset(self.data_dir)

        graph_file_list = []
        paper_edge_path = f'{dataset.dir}/paper_to_paper_symmetric_pgl_split'
        graph_file_list.append(paper_edge_path)
        t = time.perf_counter()
        if not osp.exists(paper_edge_path):
            log.info('Converting adjacency matrix...')
            edge_index = dataset.edge_index('paper', 'cites', 'paper')
            edge_index = edge_index.T

            # Append the reversed (dst, src) pairs. Indexing the columns keeps
            # the integer dtype of the node ids; filling an np.zeros buffer
            # (float64 by default) would promote the stacked edge list to
            # floating point.
            edge_index = np.vstack((edge_index, edge_index[:, [1, 0]]))
            edge_types = np.full([
                edge_index.shape[0],
            ], 0, dtype='int32')

            graph = Graph(edge_index,
                          num_nodes=dataset.num_papers,
                          edge_feat={'edge_type': edge_types})
            # Accessed only for its side effect before dumping -- presumably
            # builds the destination-sorted index; confirm against Graph.
            graph.adj_dst_index
            graph.dump(paper_edge_path)
            log.info(f'Done! [{time.perf_counter() - t:.2f}s]')

        author_edge_path = f'{dataset.dir}/paper_to_author_symmetric_pgl_split_src'
        graph_file_list.append(author_edge_path)
        t = time.perf_counter()
        if not osp.exists(author_edge_path):
            log.info('Converting author matrix...')

            # author
            log.info('adding author edges')
            edge_index = dataset.edge_index('author', 'writes', 'paper')
            edge_index = edge_index.T
            row, col = edge_index[:, 0], edge_index[:, 1]
            log.info(row[:10])
            # Author ids are shifted behind the paper id range (in place).
            row += dataset.num_papers

            edge_types = np.full(row.shape, 1, dtype='int32')
            edge_index = np.stack([row, col], axis=1)

            graph = Graph(edge_index, edge_feat={'edge_type': edge_types})
            graph.adj_dst_index
            graph.dump(author_edge_path)
            log.info(
                f'Done! finish author_edge [{time.perf_counter() - t:.2f}s]')

        author_edge_path = f'{dataset.dir}/paper_to_author_symmetric_pgl_split_dst'
        graph_file_list.append(author_edge_path)
        t = time.perf_counter()
        if not osp.exists(author_edge_path):
            log.info('Converting author matrix...')

            # author
            log.info('adding author edges')
            edge_index = dataset.edge_index('author', 'writes', 'paper')
            edge_index = edge_index.T
            row, col = edge_index[:, 0], edge_index[:, 1]
            log.info(row[:10])
            row += dataset.num_papers

            edge_types = np.full(row.shape, 2, dtype='int32')
            # Reverse direction of the previous graph: (paper, author).
            edge_index = np.stack([col, row], axis=1)

            graph = Graph(edge_index, edge_feat={'edge_type': edge_types})
            graph.adj_dst_index
            graph.dump(author_edge_path)
            log.info(
                f'Done! finish author_edge [{time.perf_counter() - t:.2f}s]')

        institution_edge_path = f'{dataset.dir}/institution_edge_symmetric_pgl_split_src'
        graph_file_list.append(institution_edge_path)
        t = time.perf_counter()
        if not osp.exists(institution_edge_path):
            log.info('Converting institution matrix...')

            # institution
            log.info('adding institution edges')
            edge_index = dataset.edge_index('author', 'institution')
            edge_index = edge_index.T
            row, col = edge_index[:, 0], edge_index[:, 1]
            log.info(row[:10])
            # Authors follow the papers, institutions follow the authors.
            row += dataset.num_papers
            col += dataset.num_papers + dataset.num_authors

            # edge_type
            log.info('building edge type')
            edge_types = np.full(row.shape, 3, dtype='int32')
            edge_index = np.stack([row, col], axis=1)

            graph = Graph(edge_index, edge_feat={'edge_type': edge_types})
            graph.adj_dst_index
            graph.dump(institution_edge_path)
            log.info(
                f'Done! finish institution_edge [{time.perf_counter() - t:.2f}s]'
            )

        institution_edge_path = f'{dataset.dir}/institution_edge_symmetric_pgl_split_dst'
        graph_file_list.append(institution_edge_path)
        t = time.perf_counter()
        if not osp.exists(institution_edge_path):
            log.info('Converting institution matrix...')

            # institution
            log.info('adding institution edges')
            edge_index = dataset.edge_index('author', 'institution')
            edge_index = edge_index.T
            row, col = edge_index[:, 0], edge_index[:, 1]
            log.info(row[:10])
            row += dataset.num_papers
            col += dataset.num_papers + dataset.num_authors

            # edge_type
            log.info('building edge type')
            edge_types = np.full(row.shape, 4, dtype='int32')
            # Reverse direction of the previous graph: (institution, author).
            edge_index = np.stack([col, row], axis=1)

            graph = Graph(edge_index, edge_feat={'edge_type': edge_types})
            graph.adj_dst_index
            graph.dump(institution_edge_path)
            log.info(
                f'Done! finish institution_edge [{time.perf_counter() - t:.2f}s]'
            )

        path = f'{dataset.dir}/full_feat.npy'

        author_feat_path = f'{dataset.dir}/author_feat.npy'

        institution_feat_path = f'{dataset.dir}/institution_feat.npy'

        t = time.perf_counter()
        if not osp.exists(path):  # Will take ~3 hours...
            print('Generating full feature matrix...')

            node_chunk_size = 100000
            N = (dataset.num_papers + dataset.num_authors +
                 dataset.num_institutions)

            paper_feat = dataset.paper_feat

            author_feat = np.memmap(author_feat_path,
                                    dtype=np.float16,
                                    shape=(dataset.num_authors,
                                           self.num_features),
                                    mode='r')

            institution_feat = np.memmap(institution_feat_path,
                                         dtype=np.float16,
                                         shape=(dataset.num_institutions,
                                                self.num_features),
                                         mode='r')

            x = np.memmap(path,
                          dtype=np.float16,
                          mode='w+',
                          shape=(N, self.num_features))

            # Rows are copied in chunks of node_chunk_size.
            print('Copying paper features...')
            start_idx = 0
            end_idx = dataset.num_papers
            for i in tqdm(range(start_idx, end_idx, node_chunk_size)):
                j = min(i + node_chunk_size, end_idx)
                x[i:j] = paper_feat[i:j]
            del paper_feat

            print('Copying author feature...')
            start_idx = dataset.num_papers
            end_idx = dataset.num_papers + dataset.num_authors
            for i in tqdm(range(start_idx, end_idx, node_chunk_size)):
                j = min(i + node_chunk_size, end_idx)
                # Source rows are relative to the start of the author block.
                x[i:j] = author_feat[i - start_idx:j - start_idx]
            del author_feat

            print('Copying institution feature...')
            start_idx = dataset.num_papers + dataset.num_authors
            end_idx = dataset.num_papers + dataset.num_authors + dataset.num_institutions
            for i in tqdm(range(start_idx, end_idx, node_chunk_size)):
                j = min(i + node_chunk_size, end_idx)
                x[i:j] = institution_feat[i - start_idx:j - start_idx]
            del institution_feat

            x.flush()
            del x
            print(f'Done! [{time.perf_counter() - t:.2f}s]')

        np.random.seed(self.seed)
        self.train_idx = dataset.get_idx_split('train')
        self.val_idx = dataset.get_idx_split('valid')
        valid_name = os.path.join(self.valid_path, self.valid_name)
        self.val_idx_cv = np.load(valid_name)
        log.info(self.train_idx.shape)
        log.info(self.val_idx.shape)
        log.info(self.val_idx_cv.shape)
        self.test_idx = dataset.get_idx_split('test')

        ##self.val_idx = np.load('valid_idx_eval.npy')
        def get_sinusoid_encoding_table(n_position, d_hid, padding_idx=None):
            """Return an (n_position, d_hid) table: sin on even, cos on odd dims.

            ``padding_idx`` is accepted but not used.
            """

            def cal_angle(position, hid_idx):
                return position / np.power(10000, 2 * (hid_idx // 2) / d_hid)

            def get_posi_angle_vec(position):
                return [cal_angle(position, hid_j) for hid_j in range(d_hid)]

            sinusoid_table = np.array(
                [get_posi_angle_vec(pos_i) for pos_i in range(n_position)])
            sinusoid_table[:, 0::2] = np.sin(sinusoid_table[:, 0::2])  # dim 2i
            sinusoid_table[:, 1::2] = np.cos(sinusoid_table[:,
                                                            1::2])  # dim 2i+1
            return sinusoid_table

        N = dataset.num_papers + dataset.num_authors + dataset.num_institutions
        self.x = np.memmap(f'{dataset.dir}/full_feat.npy',
                           dtype=np.float16,
                           mode='r',
                           shape=(N, self.num_features))

        self.id_x = np.memmap(f'{dataset.dir}/{self.m2v_file}',
                              dtype=np.float16,
                              mode='r',
                              shape=(N, self.m2v_dim))

        self.y = dataset.all_paper_label

        # One graph per dump, in the order the dumps were registered above.
        self.graph = [
            Graph.load(edge_path, mmap_mode='r+')
            for edge_path in graph_file_list
        ]

        self.pos = get_sinusoid_encoding_table(200, 768)
        #self.year = dataset.all_paper_year
        year_file = f'{dataset.dir}/all_feat_year.npy'
        self.year = np.memmap(year_file, dtype=np.int32, mode='r', shape=(N, ))
        self.num_papers = dataset.num_papers
        self.train_idx_label = None
        self.train_idx_data = None
        log.info(f'Done! [{time.perf_counter() - t:.2f}s]')
Beispiel #18
0
    def _load_data(self):
        """Load the pickled ``ind.<name>.*`` files and build graph and splits.

        Reads the seven ``ind.<name>.{x,y,tx,ty,allx,ally,graph}`` files plus
        the test index file, restores the test rows to their indexed
        positions, row-normalises the features and stores graph, labels and
        train/validation/test indices on ``self``.
        """
        import networkx as nx
        objnames = ['x', 'y', 'tx', 'ty', 'allx', 'ally', 'graph']
        objects = []
        for objname in objnames:
            file_name = "{}/ind.{}.{}".format(self.path, self.name, objname)
            with open(file_name, 'rb') as f:
                objects.append(_pickle_load(f))

        x, y, tx, ty, allx, ally, _graph = objects
        test_idx_reorder = _parse_index_file("{}/ind.{}.test.index".format(
            self.path, self.name))
        test_idx_range = np.sort(test_idx_reorder)

        allx = allx.todense()
        tx = tx.todense()
        if self.name == 'citeseer':
            # Fix citeseer dataset (there are some isolated nodes in the graph)
            # Find isolated nodes, add them as zero-vecs into the right position
            full_span = range(min(test_idx_reorder),
                              max(test_idx_reorder) + 1)
            present_rows = test_idx_range - min(test_idx_range)

            padded_tx = np.zeros((len(full_span), x.shape[1]),
                                 dtype="float32")
            padded_tx[present_rows, :] = tx
            tx = padded_tx

            padded_ty = np.zeros((len(full_span), y.shape[1]),
                                 dtype="float32")
            padded_ty[present_rows, :] = ty
            ty = padded_ty

        features = np.vstack([allx, tx])
        # Move the test rows from sorted order to their listed positions.
        features[test_idx_reorder, :] = features[test_idx_range, :]
        features = features / (np.sum(features, axis=-1) + 1e-15)
        features = np.array(features, dtype="float32")
        _graph = nx.DiGraph(nx.from_dict_of_lists(_graph))

        onehot_labels = np.vstack((ally, ty))
        onehot_labels[test_idx_reorder, :] = onehot_labels[test_idx_range, :]
        labels = np.argmax(onehot_labels, 1)

        idx_test = test_idx_range.tolist()
        idx_train = range(len(y))
        idx_val = range(len(y), len(y) + 500)

        all_edges = []
        for edge in _graph.edges():
            src, dst = tuple(edge)
            all_edges.append((src, dst))
            if self.symmetry_edges:
                all_edges.append((dst, src))

        if self.self_loop:
            all_edges.extend(
                (node, node) for node in range(_graph.number_of_nodes()))
        # Drop duplicate edges.
        all_edges = list(set(all_edges))

        self.graph = Graph(num_nodes=_graph.number_of_nodes(),
                           edges=all_edges,
                           node_feat={"words": features})
        self.y = np.array(labels, dtype="int64")
        self.num_classes = onehot_labels.shape[1]
        self.train_index = np.array(idx_train, dtype="int32")
        self.val_index = np.array(idx_val, dtype="int32")
        self.test_index = np.array(idx_test, dtype="int32")
Beispiel #19
0
def build_graph(num_nodes, edge_path, output_path, undigraph=True):
    """Build a ``Graph`` from edge-list files, reusing cached arrays if present.

    When ``edge.npy`` exists under ``output_path`` the edges are loaded from
    it, together with ``edge_weight.npy``, ``alias.npy`` and ``events.npy``
    whenever those files exist. Otherwise every file returned by
    ``get_file_list(edge_path)`` is parsed as whitespace-separated
    ``src dst [weight]`` lines; blank lines are ignored.

    This function itself only writes the ``alias.npy`` / ``events.npy``
    caches; ``edge.npy`` and ``edge_weight.npy`` are read but never written
    here.

    Args:
        num_nodes: Number of nodes; every parsed node id must be smaller.
        edge_path: Location handed to ``get_file_list`` to find edge files.
        output_path: Directory containing the ``*.npy`` cache files.
        undigraph: When exactly ``True``, every edge is also added in the
            reverse direction and the weights are duplicated to match.

    Returns:
        The constructed ``Graph``.

    Raises:
        AssertionError: If a parsed node id is not smaller than ``num_nodes``.
        IndexError: If a non-blank line contains fewer than two fields.
    """
    edge_file = os.path.join(output_path, "edge.npy")
    edge_weight_file = os.path.join(output_path, "edge_weight.npy")
    alias_file = os.path.join(output_path, "alias.npy")
    events_file = os.path.join(output_path, "events.npy")
    if os.path.isfile(edge_file):
        edges = np.load(edge_file)
        edge_feat = dict()
        if os.path.isfile(edge_weight_file):
            log.info("Loading weight from cache")
            edge_feat["weight"] = np.load(edge_weight_file, allow_pickle=True)
        node_feat = dict()
        if os.path.isfile(alias_file):
            log.info("Loading alias from cache")
            node_feat["alias"] = np.load(alias_file, allow_pickle=True)
        if os.path.isfile(events_file):
            log.info("Loading events from cache")
            node_feat["events"] = np.load(events_file, allow_pickle=True)
    else:
        filelist = get_file_list(edge_path)
        edges, edge_weight = [], []
        log.info("Reading edge files")
        for name in filelist:
            with open(name) as inf:
                for line in inf:
                    # split() without a separator already discards the
                    # trailing newline and any surrounding whitespace.
                    slots = line.split()
                    if not slots:
                        # Blank / whitespace-only line: previously this hit
                        # slots[0] and aborted the load with IndexError.
                        continue
                    edges.append([slots[0], slots[1]])
                    if len(slots) > 2:
                        edge_weight.append(slots[2])
        edges = np.array(edges, dtype="int64")
        assert num_nodes > edges.max(
        ), "Node id in any edges should be smaller then num_nodes!"

        log.info("Read edge files done.")
        edge_feat = dict()
        node_feat = dict()
        # Weights are only kept when every parsed edge carried one.
        if len(edge_weight) == len(edges):
            edge_feat["weight"] = np.array(edge_weight, dtype="float32")

    if undigraph is True:
        # Append each edge with its two endpoints swapped.
        edges = np.concatenate([edges, edges[:, [1, 0]]], 0)
        if "weight" in edge_feat:
            edge_feat["weight"] = np.concatenate(
                [edge_feat["weight"], edge_feat["weight"]],
                0).astype("float64")

    graph = Graph(num_nodes, edges, node_feat, edge_feat=edge_feat)
    log.info("Build graph done")
    graph.outdegree()
    log.info("Build graph index done")
    # Build the alias sampling tables only for weighted graphs that did not
    # get either table from the cache above.
    if "weight" in graph.edge_feat and "alias" not in graph.node_feat and "events" not in graph.node_feat:
        graph.node_feat["alias"], graph.node_feat[
            "events"] = graph_alias_sample_table(graph, "weight")
        log.info(
            "Build graph alias sample table done, and saving alias & evnets cache"
        )
        np.save(alias_file, graph.node_feat["alias"])
        np.save(events_file, graph.node_feat["events"])
    return graph
Beispiel #20
0
    def prepare_data(self):
        """Build (or reuse) the cached MAG240M graph/feature files and load them.

        Each of the following stages is skipped when its output path already
        exists under ``dataset.dir``:

        1. Paper-cites-paper edges plus their reversed copies, dumped to
           ``paper_to_paper_symmetric_pgl``.
        2. One graph over papers, authors and institutions with an
           ``edge_type`` edge feature, dumped to ``full_edge_symmetric_pgl``.
        3. Paper, author and institution features stacked into the float16
           memmap ``full_feat.npy``.

        Afterwards sets ``self.train_idx`` (shuffled after seeding NumPy with
        ``self.seed``), ``self.val_idx``, ``self.test_idx``, ``self.x``,
        ``self.y`` and ``self.graph``.
        """
        dataset = MAG240MDataset(self.data_dir)
        # Global node-id layout used below: [papers | authors | institutions].
        num_nodes = (dataset.num_papers + dataset.num_authors +
                     dataset.num_institutions)

        paper_edge_path = f'{dataset.dir}/paper_to_paper_symmetric_pgl'
        t = time.perf_counter()
        if not osp.exists(paper_edge_path):
            log.info('Converting adjacency matrix...')
            edge_index = dataset.edge_index('paper', 'cites', 'paper')
            edge_index = edge_index.T

            # Append every edge with its columns swapped. Reversing the
            # columns of the id array keeps its dtype, whereas filling an
            # np.zeros buffer (float64 by default) promoted all node ids to
            # float64. Duplicate edges are not removed (no np.unique pass).
            edge_index = np.vstack((edge_index, edge_index[:, ::-1]))

            graph = Graph(edge_index)
            # Attribute access only; presumably materialises the index so
            # that it is included in the dump -- TODO confirm against Graph.
            graph.adj_dst_index
            graph.dump(paper_edge_path)
            log.info(f'Done! [{time.perf_counter() - t:.2f}s]')

        edge_path = f'{dataset.dir}/full_edge_symmetric_pgl'
        t = time.perf_counter()
        if not osp.exists(edge_path):
            log.info('Converting adjacency matrix...')

            # paper
            log.info('adding paper edges')
            paper_graph = Graph.load(paper_edge_path, mmap_mode='r+')
            rows, cols = [paper_graph.edges[:, 0]], [paper_graph.edges[:, 1]]

            # author
            log.info('adding author edges')
            edge_index = dataset.edge_index('author', 'writes', 'paper')
            edge_index = edge_index.T
            row, col = edge_index[:, 0], edge_index[:, 1]
            # In-place shift of author ids past the paper id range.
            row += dataset.num_papers
            rows += [row, col]
            cols += [col, row]

            # institution
            log.info('adding institution edges')
            edge_index = dataset.edge_index('author', 'institution')
            edge_index = edge_index.T
            row, col = edge_index[:, 0], edge_index[:, 1]
            # In-place shifts: authors follow papers, institutions follow
            # authors.
            row += dataset.num_papers
            col += dataset.num_papers + dataset.num_authors
            rows += [row, col]
            cols += [col, row]

            # edge_type: the position of each chunk in `rows` is its type id
            # (0 paper-paper, 1 author-paper, 2 paper-author,
            #  3 author-institution, 4 institution-author).
            log.info('building edge type')
            edge_types = [
                np.full(part.shape, type_id, dtype='int32')
                for type_id, part in enumerate(rows)
            ]
            edge_types = np.concatenate(edge_types, axis=0)

            log.info('building edges')
            row = np.concatenate(rows, axis=0)
            del rows

            col = np.concatenate(cols, axis=0)
            del cols

            edge_index = np.stack([row, col], axis=1)
            full_graph = Graph(edge_index,
                               num_nodes=num_nodes,
                               edge_feat={'edge_type': edge_types})
            full_graph.adj_dst_index
            full_graph.dump(edge_path)
            log.info(
                f'Done! finish full_edge [{time.perf_counter() - t:.2f}s]')

        feat_path = f'{dataset.dir}/full_feat.npy'

        author_feat_path = f'{dataset.dir}/author_feat.npy'

        institution_feat_path = f'{dataset.dir}/institution_feat.npy'

        t = time.perf_counter()
        if not osp.exists(feat_path):  # Will take ~3 hours...
            print('Generating full feature matrix...')

            # Rows copied per slice assignment into the output memmap.
            node_chunk_size = 100000

            paper_feat = dataset.paper_feat

            author_feat = np.memmap(author_feat_path,
                                    dtype=np.float16,
                                    shape=(dataset.num_authors,
                                           self.num_features),
                                    mode='r')

            institution_feat = np.memmap(institution_feat_path,
                                         dtype=np.float16,
                                         shape=(dataset.num_institutions,
                                                self.num_features),
                                         mode='r')

            x = np.memmap(feat_path,
                          dtype=np.float16,
                          mode='w+',
                          shape=(num_nodes, self.num_features))

            print('Copying paper features...')
            start_idx = 0
            end_idx = dataset.num_papers
            for i in tqdm(range(start_idx, end_idx, node_chunk_size)):
                j = min(i + node_chunk_size, end_idx)
                x[i:j] = paper_feat[i:j]
            del paper_feat

            print('Copying author feature...')
            start_idx = dataset.num_papers
            end_idx = dataset.num_papers + dataset.num_authors
            for i in tqdm(range(start_idx, end_idx, node_chunk_size)):
                j = min(i + node_chunk_size, end_idx)
                # Source rows are indexed relative to the author block.
                x[i:j] = author_feat[i - start_idx:j - start_idx]
            del author_feat

            print('Copying institution feature...')
            start_idx = dataset.num_papers + dataset.num_authors
            end_idx = num_nodes
            for i in tqdm(range(start_idx, end_idx, node_chunk_size)):
                j = min(i + node_chunk_size, end_idx)
                # Source rows are indexed relative to the institution block.
                x[i:j] = institution_feat[i - start_idx:j - start_idx]
            del institution_feat

            x.flush()
            del x
            print(f'Done! [{time.perf_counter() - t:.2f}s]')

        np.random.seed(self.seed)
        self.train_idx = dataset.get_idx_split('train')
        np.random.shuffle(self.train_idx)

        self.val_idx = dataset.get_idx_split('valid')
        self.test_idx = dataset.get_idx_split('test')

        self.x = np.memmap(feat_path,
                           dtype=np.float16,
                           mode='r',
                           shape=(num_nodes, self.num_features))

        self.y = dataset.all_paper_label

        self.graph = Graph.load(edge_path, mmap_mode='r+')
        self.graph._edge_feat['edge_type'] = self.graph._edge_feat[
            'edge_type'].astype('int32')

        # `t` was last reset just before the feature stage, so this reports
        # the time spent since then.
        log.info(f'Done! [{time.perf_counter() - t:.2f}s]')
Beispiel #21
0
    def prepare_data(self):
        dataset = MAG240MDataset(self.data_dir)

        graph_file_list = []
        paper_edge_path = f'{dataset.dir}/paper_to_paper_symmetric_pgl_split'
        graph_file_list.append(paper_edge_path)
        t = time.perf_counter()
        if not osp.exists(paper_edge_path):
            log.info('Converting adjacency matrix...')
            edge_index = dataset.edge_index('paper', 'cites', 'paper')
            edge_index = edge_index.T

            edges_new = np.zeros((edge_index.shape[0], 2))
            edges_new[:, 0] = edge_index[:, 1]
            edges_new[:, 1] = edge_index[:, 0]
            edge_index = np.vstack((edge_index, edges_new))
            edge_types = np.full([
                edge_index.shape[0],
            ], 0, dtype='int32')

            graph = Graph(edge_index,
                          num_nodes=dataset.num_papers,
                          edge_feat={'edge_type': edge_types})
            graph.adj_dst_index
            graph.dump(paper_edge_path)
            log.info(f'Done! [{time.perf_counter() - t:.2f}s]')

        author_edge_path = f'{dataset.dir}/paper_to_author_symmetric_pgl_split_src'
        graph_file_list.append(author_edge_path)
        t = time.perf_counter()
        if not osp.exists(author_edge_path):
            log.info('Converting author matrix...')

            # author
            log.info('adding author edges')
            edge_index = dataset.edge_index('author', 'writes', 'paper')
            edge_index = edge_index.T
            row, col = edge_index[:, 0], edge_index[:, 1]
            log.info(row[:10])
            row += dataset.num_papers

            edge_types = np.full(row.shape, 1, dtype='int32')
            edge_index = np.stack([row, col], axis=1)

            graph = Graph(edge_index, edge_feat={'edge_type': edge_types})
            graph.adj_dst_index
            graph.dump(author_edge_path)
            log.info(
                f'Done! finish author_edge [{time.perf_counter() - t:.2f}s]')

        author_edge_path = f'{dataset.dir}/paper_to_author_symmetric_pgl_split_dst'
        graph_file_list.append(author_edge_path)
        t = time.perf_counter()
        if not osp.exists(author_edge_path):
            log.info('Converting author matrix...')

            # author
            log.info('adding author edges')
            edge_index = dataset.edge_index('author', 'writes', 'paper')
            edge_index = edge_index.T
            row, col = edge_index[:, 0], edge_index[:, 1]
            log.info(row[:10])
            row += dataset.num_papers

            edge_types = np.full(row.shape, 2, dtype='int32')
            edge_index = np.stack([col, row], axis=1)

            graph = Graph(edge_index, edge_feat={'edge_type': edge_types})
            graph.adj_dst_index
            graph.dump(author_edge_path)
            log.info(
                f'Done! finish author_edge [{time.perf_counter() - t:.2f}s]')

        institution_edge_path = f'{dataset.dir}/institution_edge_symmetric_pgl_split_src'
        graph_file_list.append(institution_edge_path)
        t = time.perf_counter()
        if not osp.exists(institution_edge_path):
            log.info('Converting institution matrix...')

            # institution
            log.info('adding institution edges')
            edge_index = dataset.edge_index('author', 'institution')
            edge_index = edge_index.T
            row, col = edge_index[:, 0], edge_index[:, 1]
            log.info(row[:10])
            row += dataset.num_papers
            col += dataset.num_papers + dataset.num_authors

            # edge_type
            log.info('building edge type')
            edge_types = np.full(row.shape, 3, dtype='int32')
            edge_index = np.stack([row, col], axis=1)

            graph = Graph(edge_index, edge_feat={'edge_type': edge_types})
            graph.adj_dst_index
            graph.dump(institution_edge_path)
            log.info(
                f'Done! finish institution_edge [{time.perf_counter() - t:.2f}s]'
            )

        institution_edge_path = f'{dataset.dir}/institution_edge_symmetric_pgl_split_dst'
        graph_file_list.append(institution_edge_path)
        t = time.perf_counter()
        if not osp.exists(institution_edge_path):
            log.info('Converting institution matrix...')

            # institution
            log.info('adding institution edges')
            edge_index = dataset.edge_index('author', 'institution')
            edge_index = edge_index.T
            row, col = edge_index[:, 0], edge_index[:, 1]
            log.info(row[:10])
            row += dataset.num_papers
            col += dataset.num_papers + dataset.num_authors

            # edge_type
            log.info('building edge type')
            edge_types = np.full(row.shape, 4, dtype='int32')
            edge_index = np.stack([col, row], axis=1)

            graph = Graph(edge_index, edge_feat={'edge_type': edge_types})
            graph.adj_dst_index
            graph.dump(institution_edge_path)
            log.info(
                f'Done! finish institution_edge [{time.perf_counter() - t:.2f}s]'
            )

        path = f'{dataset.dir}/full_feat.npy'

        author_feat_path = f'{dataset.dir}/author_feat.npy'

        institution_feat_path = f'{dataset.dir}/institution_feat.npy'

        t = time.perf_counter()
        if not osp.exists(path):  # Will take ~3 hours...
            print('Generating full feature matrix...')

            node_chunk_size = 100000
            N = (dataset.num_papers + dataset.num_authors +
                 dataset.num_institutions)

            paper_feat = dataset.paper_feat

            author_feat = np.memmap(author_feat_path,
                                    dtype=np.float16,
                                    shape=(dataset.num_authors,
                                           self.num_features),
                                    mode='r')

            institution_feat = np.memmap(institution_feat_path,
                                         dtype=np.float16,
                                         shape=(dataset.num_institutions,
                                                self.num_features),
                                         mode='r')

            x = np.memmap(path,
                          dtype=np.float16,
                          mode='w+',
                          shape=(N, self.num_features))

            print('Copying paper features...')
            start_idx = 0
            end_idx = dataset.num_papers
            for i in tqdm(range(start_idx, end_idx, node_chunk_size)):
                j = min(i + node_chunk_size, end_idx)
                x[i:j] = paper_feat[i:j]
            del paper_feat

            print('Copying author feature...')
            start_idx = dataset.num_papers
            end_idx = dataset.num_papers + dataset.num_authors
            for i in tqdm(range(start_idx, end_idx, node_chunk_size)):
                j = min(i + node_chunk_size, end_idx)
                x[i:j] = author_feat[i - start_idx:j - start_idx]
            del author_feat

            print('Copying institution feature...')
            start_idx = dataset.num_papers + dataset.num_authors
            end_idx = dataset.num_papers + dataset.num_authors + dataset.num_institutions
            for i in tqdm(range(start_idx, end_idx, node_chunk_size)):
                j = min(i + node_chunk_size, end_idx)
                x[i:j] = institution_feat[i - start_idx:j - start_idx]
            del institution_feat

            x.flush()
            del x
            print(f'feature x Done! [{time.perf_counter() - t:.2f}s]')

        path = f'{dataset.dir}/all_feat_year.npy'

        author_year_path = f'{dataset.dir}/author_feat_year.npy'

        institution_year_path = f'{dataset.dir}/institution_feat_year.npy'

        t = time.perf_counter()
        if not osp.exists(path):  # Will take ~3 hours...
            print('Generating full year matrix...')

            node_chunk_size = 100000
            N = (dataset.num_papers + dataset.num_authors +
                 dataset.num_institutions)

            paper_year_feat = dataset.all_paper_year

            author_year_feat = np.memmap(author_year_path,
                                         dtype=np.int32,
                                         shape=(dataset.num_authors),
                                         mode='r')

            institution_year_feat = np.memmap(institution_year_path,
                                              dtype=np.int32,
                                              shape=(dataset.num_institutions),
                                              mode='r')

            x = np.memmap(path, dtype=np.int32, mode='w+', shape=(N))

            print('Copying paper features...')
            start_idx = 0
            end_idx = dataset.num_papers
            for i in tqdm(range(start_idx, end_idx, node_chunk_size)):
                j = min(i + node_chunk_size, end_idx)
                x[i:j] = paper_year_feat[i:j]
            del paper_year_feat

            print('Copying author feature...')
            start_idx = dataset.num_papers
            end_idx = dataset.num_papers + dataset.num_authors
            for i in tqdm(range(start_idx, end_idx, node_chunk_size)):
                j = min(i + node_chunk_size, end_idx)
                x[i:j] = author_year_feat[i - start_idx:j - start_idx]
            del author_year_feat

            print('Copying institution feature...')
            start_idx = dataset.num_papers + dataset.num_authors
            end_idx = dataset.num_papers + dataset.num_authors + dataset.num_institutions
            for i in tqdm(range(start_idx, end_idx, node_chunk_size)):
                j = min(i + node_chunk_size, end_idx)
                x[i:j] = institution_year_feat[i - start_idx:j - start_idx]
            del institution_year_feat

            x.flush()
            del x
            print(f'year feature Done! [{time.perf_counter() - t:.2f}s]')