Beispiel #1
0
def create_test_heterograph():
    """Build the small heterograph shared by the tests.

    This is the heterograph from the docstring plus a user -- wishes -- game
    relation: 3 users, 2 games and 2 developers, with the metagraph

        ('user', 'follows', 'user')
        ('user', 'plays', 'game')
        ('user', 'wishes', 'game')
        ('developer', 'develops', 'game')

    The relations are deliberately created from different input kinds
    (edge list, scipy COO matrix, networkx digraph).

    Returns
    -------
    The heterograph returned by ``dgl.hetero_from_relations``.
    """
    # 'plays' comes from a scipy COO matrix given as (data, (row, col)).
    rows = [0, 1, 2, 1]
    cols = [0, 0, 1, 1]
    plays_adj = ssp.coo_matrix(([1, 1, 1, 1], (rows, cols)))

    # 'wishes' comes from a networkx digraph whose edges carry explicit ids.
    wishes_digraph = nx.DiGraph()
    wishes_digraph.add_nodes_from(['u0', 'u1', 'u2'], bipartite=0)
    wishes_digraph.add_nodes_from(['g0', 'g1'], bipartite=1)
    for edge_id, (user, game) in enumerate([('u0', 'g1'), ('u2', 'g0')]):
        wishes_digraph.add_edge(user, game, id=edge_id)

    relations = [
        dgl.graph([(0, 1), (1, 2)], 'user', 'follows'),
        dgl.bipartite(plays_adj, 'user', 'plays', 'game'),
        dgl.bipartite(wishes_digraph, 'user', 'wishes', 'game'),
        dgl.bipartite([(0, 0), (1, 1)], 'developer', 'develops', 'game'),
    ]
    return dgl.hetero_from_relations(relations)
Beispiel #2
0
def create_heterographs2(index_dtype):
    """Create three relation graphs with features and merge them.

    Parameters
    ----------
    index_dtype
        Forwarded unchanged as ``index_dtype`` to every graph constructor.

    Returns
    -------
    list
        ``[merged, follows, knows, knows_knowledge]``: the merged heterograph
        followed by the three relation graphs it was built from.
    """
    follows_rel = dgl.graph(([0, 1, 2], [1, 2, 3]), 'user', 'follows',
                            index_dtype=index_dtype, restrict_format='any')
    knows_rel = dgl.graph(([0, 2], [2, 3]), 'user', 'knows',
                          index_dtype=index_dtype, restrict_format='csr')
    knowledge_rel = dgl.bipartite(([0, 1, 3], [2, 3, 4]),
                                  'user', 'knows', 'knowledge',
                                  index_dtype=index_dtype)

    # Features are attached to the relation graphs before merging.
    follows_rel.nodes['user'].data['h'] = F.randn((4, 3))
    follows_rel.edges['follows'].data['w'] = F.randn((3, 2))
    knows_rel.nodes['user'].data['hh'] = F.ones((4, 5))
    knows_rel.edges['knows'].data['ww'] = F.randn((2, 10))

    relations = [follows_rel, knows_rel, knowledge_rel]
    merged = dgl.hetero_from_relations(relations)
    return [merged] + relations
Beispiel #3
0
def test_in_subgraph(index_dtype):
    """Check ``dgl.in_subgraph`` on a four-relation heterograph.

    The subgraph is taken for users 0 and 1 and game 0. For every relation
    the test verifies both the surviving edges and that the stored
    ``dgl.EID`` values match the parent graph's edge ids.
    """
    follow_rel = dgl.graph([(1,0),(2,0),(3,0),(0,1),(2,1),(3,1),(0,2)],
                           'user', 'follow', index_dtype=index_dtype)
    play_rel = dgl.bipartite([(0,0),(0,1),(1,2),(3,2)],
                             'user', 'play', 'game', index_dtype=index_dtype)
    liked_rel = dgl.bipartite([(2,0),(2,1),(2,2),(1,0),(1,3),(0,0)],
                              'game', 'liked-by', 'user', index_dtype=index_dtype)
    flips_rel = dgl.bipartite([(0,0),(1,0),(2,0),(3,0)],
                              'user', 'flips', 'coin', index_dtype=index_dtype)
    hg = dgl.hetero_from_relations([follow_rel, play_rel, liked_rel, flips_rel])

    subg = dgl.in_subgraph(hg, {'user' : [0,1], 'game' : 0})
    assert subg._idtype_str == index_dtype
    assert len(subg.ntypes) == 3
    assert len(subg.etypes) == 4

    # Expected surviving edges per relation, checked in this order.
    expected_by_etype = (
        ('follow', {(1,0),(2,0),(3,0),(0,1),(2,1),(3,1)}),
        ('play', {(0,0)}),
        ('liked-by', {(2,0),(2,1),(1,0),(0,0)}),
    )
    for etype, expected in expected_by_etype:
        src, dst = subg[etype].edges()
        found = set(zip(list(F.asnumpy(src)), list(F.asnumpy(dst))))
        assert F.array_equal(hg[etype].edge_ids(src, dst), subg[etype].edata[dgl.EID])
        assert found == expected

    # No seed node of type 'coin' was given, so 'flips' must be empty.
    assert subg['flips'].number_of_edges() == 0
Beispiel #4
0
def test_metapath_random_walk(idtype):
    """Check that metapath random walks only follow existing edges.

    Builds an 'a' -> 'b' relation and a 'b' -> 'a' relation, samples three
    walks per seed along the metapath ('ab', 'ba') repeated four times, and
    verifies each hop against the relation graph it should come from.
    """
    ab_rel = dgl.bipartite(([0, 1, 2, 3], [0, 1, 2, 3]),
                           'a', 'ab', 'b', idtype=idtype)
    ba_rel = dgl.bipartite(([0, 0, 1, 1, 2, 2, 3, 3], [1, 3, 2, 0, 3, 1, 0, 2]),
                           'b', 'ba', 'a', idtype=idtype)
    hetero = dgl.hetero_from_relations([ab_rel, ba_rel])

    seeds = [0, 1]
    metapath = ['ab', 'ba'] * 4
    num_traces = 3
    traces = dgl.contrib.sampling.metapath_random_walk(
        hetero, metapath, seeds, num_traces)

    for seed, seed_traces in zip(seeds, traces):
        assert len(seed_traces) == num_traces
        for trace in seed_traces:
            assert len(trace) == len(metapath)
            # Prepend the seed so the walk starts at the 'a' node it left from.
            walk = np.insert(F.asnumpy(trace), 0, seed)
            for a_pos in range(0, len(metapath), 2):
                assert ab_rel.has_edge_between(walk[a_pos], walk[a_pos + 1])
                assert ba_rel.has_edge_between(walk[a_pos + 1], walk[a_pos + 2])
Beispiel #5
0
def _gen_neighbor_sampling_test_graph(hypersparse, reverse):
    """Create the graphs used by the neighbor sampling tests.

    Parameters
    ----------
    hypersparse : bool
        If true, every graph is declared with 2**50 nodes per node type.
    reverse : bool
        If true, every edge is flipped and the endpoint node types of each
        bipartite relation are swapped accordingly.

    Returns
    -------
    tuple
        ``(g, hg)``: the 'follow' graph on its own, and the heterograph made
        of the 'follow', 'play', 'liked-by' and 'flips' relations.
    """
    # should crash if allocated a CSR
    card = (1 << 50) if hypersparse else None
    card2 = (1 << 50, 1 << 50) if hypersparse else None

    def oriented(edges, src_type, dst_type):
        """Return (edges, src_type, dst_type), flipped when ``reverse`` is set."""
        if reverse:
            return [(dst, src) for src, dst in edges], dst_type, src_type
        return edges, src_type, dst_type

    edges, _, _ = oriented(
        [(1, 0), (2, 0), (3, 0), (0, 1), (2, 1), (3, 1), (0, 2)], 'user', 'user')
    g = dgl.graph(edges, 'user', 'follow', num_nodes=card)
    g.edata['prob'] = F.tensor([.5, .5, 0., .5, .5, 0., 1.],
                               dtype=F.float32)

    edges, utype, vtype = oriented(
        [(0, 0), (0, 1), (1, 2), (3, 2)], 'user', 'game')
    play_g = dgl.bipartite(edges, utype, 'play', vtype, num_nodes=card2)
    play_g.edata['prob'] = F.tensor([.8, .5, .5, .5], dtype=F.float32)

    edges, utype, vtype = oriented(
        [(2, 0), (2, 1), (2, 2), (1, 0), (1, 3), (0, 0)], 'game', 'user')
    liked_g = dgl.bipartite(edges, utype, 'liked-by', vtype, num_nodes=card2)
    liked_g.edata['prob'] = F.tensor([.3, .5, .2, .5, .1, .1], dtype=F.float32)

    edges, utype, vtype = oriented(
        [(0, 0), (1, 0), (2, 0), (3, 0)], 'user', 'coin')
    flips_g = dgl.bipartite(edges, utype, 'flips', vtype, num_nodes=card2)

    hg = dgl.hetero_from_relations([g, play_g, liked_g, flips_g])
    return g, hg
Beispiel #6
0
    def sample_blocks(self, seeds):
        """Sample the sub-graphs needed to predict ratings for a batch of edges.

        ``seeds`` holds the ids of the edges to compute predictions for. The
        procedure is:

          1. Look up the head and tail nodes of the seed edges and group the
             edges by their true rating value into one relation per rating.
          2. For each head and tail node, extract the entire in-coming
             neighborhood from the encoder graph.
          3. Copy the node features/embeddings from the full graph onto the
             sampled block.

        Returns
        -------
        tuple
            ``(g, frontier, head_feat, tail_feat, true_rel_labels,
            true_rel_ratings)``; ``true_rel_labels`` is ``None`` when the
            sampler has no labels.
        """
        dataset = self.dataset
        enc_graph = self.enc_graph
        has_labels = self.labels is not None

        edge_ids = th.stack(seeds)
        batch_ratings = self.truths[edge_ids]
        batch_labels = self.labels[edge_ids] if has_labels else None

        # 1. Get the head and tail nodes from both the decoder and encoder graphs.
        head_id, tail_id = self.dec_graph.find_edges(edge_ids)
        utype, _, vtype = enc_graph.canonical_etypes[0]
        node_counts = (enc_graph.number_of_nodes(utype),
                       enc_graph.number_of_nodes(vtype))

        relation_graphs = []
        rating_chunks = []
        label_chunks = []
        for rating_value in dataset.possible_rating_values:
            selected = (batch_ratings == rating_value)
            rating_chunks.append(batch_ratings[selected])
            if has_labels:
                label_chunks.append(batch_labels[selected])
            relation_graphs.append(
                dgl.bipartite((head_id[selected], tail_id[selected]),
                              utype=utype,
                              etype=str(rating_value),
                              vtype=vtype,
                              num_nodes=node_counts))

        # Compact the per-rating relations so that only nodes touched by the
        # seed edges remain.
        g = dgl.compact_graphs(dgl.hetero_from_relations(relation_graphs))

        # 2. For each head and tail node, extract the entire in-coming neighborhood.
        seed_nodes = {ntype: g.nodes[ntype].data[dgl.NID] for ntype in g.ntypes}
        frontier = dgl.to_block(dgl.in_subgraph(enc_graph, seed_nodes), seed_nodes)

        # 3. Copy the node features/embeddings from the full graph to the sampled subgraphs.
        for ntype in ('user', 'movie'):
            full_data = enc_graph.nodes[ntype].data
            dst_ids = frontier.dstnodes[ntype].data[dgl.NID]
            src_ids = frontier.srcnodes[ntype].data[dgl.NID]
            frontier.dstnodes[ntype].data['ci'] = full_data['ci'][dst_ids]
            frontier.srcnodes[ntype].data['cj'] = full_data['cj'][src_ids]

        # handle features: fall back to the node ids when no feature table exists
        user_src_ids = frontier.srcnodes['user'].data[dgl.NID]
        if dataset.user_feature is None:
            head_feat = user_src_ids.long()
        else:
            head_feat = dataset.user_feature[user_src_ids]

        movie_src_ids = frontier.srcnodes['movie'].data[dgl.NID]
        if dataset.movie_feature is None:
            tail_feat = movie_src_ids.long()
        else:
            tail_feat = dataset.movie_feature[movie_src_ids]

        true_rel_labels = th.cat(label_chunks, dim=0) if has_labels else None
        true_rel_ratings = th.cat(rating_chunks, dim=0)
        return (g, frontier, head_feat, tail_feat, true_rel_labels,
                true_rel_ratings)
Beispiel #7
0
def construct_graph():
    """Load the paper/author/conference tables under ``path`` and build a heterograph.

    Reads the id tables ``id_author.txt``, ``id_conf.txt`` and ``paper.txt``
    (whitespace-separated, first field an integer id) and the tab-separated
    edge files ``paper_author.txt`` and ``paper_conf.txt``. Raw ids are
    remapped to consecutive indices in file order before the edges are handed
    to DGL.

    Returns
    -------
    tuple
        ``(hg, author_names, conf_names, paper_names)`` where ``hg`` combines
        the relations 'pa', 'ap', 'pc' and 'cp', and each name list follows
        the order of its table file.

    Raises
    ------
    KeyError
        If an edge file references an id that is missing from its table.
    """
    def read_table(filename, make_name):
        """Return (ids, names) parsed from one whitespace-separated id table."""
        ids = []
        names = []
        # The context manager closes the handle even when a line fails to parse;
        # the previous manual close() calls were skipped on any exception.
        with open(os.path.join(path, filename), encoding="ISO-8859-1") as table:
            for line in table:
                fields = line.strip().split()
                ids.append(int(fields[0]))
                names.append(make_name(fields))
        return ids, names

    def read_edges(filename, src_index, dst_index):
        """Return (src, dst) index lists parsed from one tab-separated edge file."""
        src = []
        dst = []
        with open(os.path.join(path, filename), "r") as edge_file:
            for line in edge_file:
                fields = line.split('\t')
                src_id = int(fields[0])
                dst_id = int(fields[1].strip('\n'))
                src.append(src_index[src_id])
                dst.append(dst_index[dst_id])
        return src, dst

    author_ids, author_names = read_table("id_author.txt", lambda f: f[1])
    conf_ids, conf_names = read_table("id_conf.txt", lambda f: f[1])
    # Paper titles span several fields; they are glued together behind a 'p' prefix.
    paper_ids, paper_names = read_table("paper.txt",
                                        lambda f: 'p' + ''.join(f[1:]))

    # Map raw ids to consecutive node indices.
    author_ids_invmap = {x: i for i, x in enumerate(author_ids)}
    conf_ids_invmap = {x: i for i, x in enumerate(conf_ids)}
    paper_ids_invmap = {x: i for i, x in enumerate(paper_ids)}

    paper_author_src, paper_author_dst = read_edges(
        "paper_author.txt", paper_ids_invmap, author_ids_invmap)
    paper_conf_src, paper_conf_dst = read_edges(
        "paper_conf.txt", paper_ids_invmap, conf_ids_invmap)

    # Each relation is added in both directions.
    pa = dgl.bipartite((paper_author_src, paper_author_dst), 'paper', 'pa',
                       'author')
    ap = dgl.bipartite((paper_author_dst, paper_author_src), 'author', 'ap',
                       'paper')
    pc = dgl.bipartite((paper_conf_src, paper_conf_dst), 'paper', 'pc', 'conf')
    cp = dgl.bipartite((paper_conf_dst, paper_conf_src), 'conf', 'cp', 'paper')
    hg = dgl.hetero_from_relations([pa, ap, pc, cp])
    return hg, author_names, conf_names, paper_names
Beispiel #8
0
def test_flatten():
    """Exercise wildcard-slice flattening (``g[src, :, dst]``) of heterographs."""

    def check_mapping(g, fg):
        """Check each edge of the flattened ``fg`` against its origin in ``g``."""
        src_ntype = fg.ntypes[0]
        dst_ntype = src_ntype if len(fg.ntypes) == 1 else fg.ntypes[1]

        etype_ids = F.asnumpy(fg.edata[dgl.ETYPE]).tolist()
        parent_eids = F.asnumpy(fg.edata[dgl.EID]).tolist()

        for flat_eid, (etype_id, parent_eid) in enumerate(zip(etype_ids, parent_eids)):
            canonical = g.canonical_etypes[etype_id]
            src_g, dst_g = g.find_edges([parent_eid], canonical)
            src_fg, dst_fg = fg.find_edges([flat_eid])
            # TODO(gq): I feel this code is quite redundant; can we just add new members (like
            # "induced_srcid") to returned heterograph object and not store them as features?
            assert src_g == fg.nodes[src_ntype].data[dgl.NID][src_fg]
            tid = F.asnumpy(fg.nodes[src_ntype].data[dgl.NTYPE][src_fg])[0]
            assert canonical[0] == g.ntypes[tid]
            assert dst_g == fg.nodes[dst_ntype].data[dgl.NID][dst_fg]
            tid = F.asnumpy(fg.nodes[dst_ntype].data[dgl.NTYPE][dst_fg])[0]
            assert canonical[2] == g.ntypes[tid]

    def check_edges_preserved(parent, flat, etype):
        """Check that ``etype`` has identical edges, in eid order, in both graphs."""
        u1, v1 = parent.edges(etype=etype, order='eid')
        u2, v2 = flat.edges(etype=etype, order='eid')
        assert F.array_equal(u1, u2)
        assert F.array_equal(v1, v2)

    # check for wildcard slices
    g = create_test_heterograph()
    g.nodes['user'].data['h'] = F.ones((3, 5))
    g.nodes['game'].data['i'] = F.ones((2, 5))
    g.edges['plays'].data['e'] = F.ones((4, 4))
    g.edges['wishes'].data['e'] = F.ones((2, 4))
    g.edges['wishes'].data['f'] = F.ones((2, 4))

    fg = g['user', :, 'game']  # user--plays->game and user--wishes->game
    assert len(fg.ntypes) == 2
    assert fg.ntypes == ['user', 'game']
    assert fg.etypes == ['plays+wishes']

    assert F.array_equal(fg.nodes['user'].data['h'], F.ones((3, 5)))
    assert F.array_equal(fg.nodes['game'].data['i'], F.ones((2, 5)))
    assert F.array_equal(fg.edata['e'], F.ones((6, 4)))
    # 'f' exists on 'wishes' only, so it is not carried over.
    assert 'f' not in fg.edata

    flat_etypes = F.asnumpy(fg.edata[dgl.ETYPE]).tolist()
    flat_eids = F.asnumpy(fg.edata[dgl.EID]).tolist()
    assert set(zip(flat_etypes, flat_eids)) == {(1, 0), (1, 1), (1, 2), (1, 3),
                                                (2, 0), (2, 1)}

    check_mapping(g, fg)

    fg = g['user', :, 'user']
    # NOTE(gq): The node/edge types from the parent graph is returned if there is only one
    # node/edge type.  This differs from the behavior above.
    assert fg.ntypes == ['user']
    assert fg.etypes == ['follows']
    check_edges_preserved(g, fg, 'follows')

    fg = g['developer', :, 'game']
    assert fg.ntypes == ['developer', 'game']
    assert fg.etypes == ['develops']
    check_edges_preserved(g, fg, 'develops')

    fg = g[:, :, :]
    assert fg.ntypes == ['developer+user', 'game+user']
    assert fg.etypes == ['develops+follows+plays+wishes']
    check_mapping(g, fg)

    # Test another heterograph
    follows_rel = dgl.graph(([0, 1, 2], [1, 2, 3]), 'user', 'follows')
    knows_rel = dgl.graph(([0, 2], [2, 3]), 'user', 'knows')
    follows_rel.nodes['user'].data['h'] = F.randn((4, 3))
    follows_rel.edges['follows'].data['w'] = F.randn((3, 2))
    knows_rel.nodes['user'].data['hh'] = F.randn((4, 5))
    knows_rel.edges['knows'].data['ww'] = F.randn((2, 10))
    g = dgl.hetero_from_relations([follows_rel, knows_rel])

    # Features of the relation graphs must survive the merge.
    assert F.array_equal(g.ndata['h'], follows_rel.ndata['h'])
    assert F.array_equal(g.ndata['hh'], knows_rel.ndata['hh'])
    assert F.array_equal(g.edges['follows'].data['w'], follows_rel.edata['w'])
    assert F.array_equal(g.edges['knows'].data['ww'], knows_rel.edata['ww'])

    for fg in (g['user', :, 'user'], g['user', :, :]):
        assert fg.ntypes == ['user']
        assert fg.etypes == ['follows+knows']
        check_mapping(g, fg)
Beispiel #9
0
import dgl

if __name__ == "__main__":
    # Round-trip a two-relation heterograph through its homogeneous form and
    # print both heterographs for comparison.
    relations = [
        dgl.graph([(0, 1), (1, 2)], 'user', 'follows'),
        dgl.bipartite([(0, 0), (1, 1)], 'developer', 'develops', 'game'),
    ]
    original = dgl.hetero_from_relations(relations)
    flattened = dgl.to_homo(original)
    rebuilt = dgl.to_hetero(flattened, original.ntypes, original.etypes)

    for graph in (original, rebuilt):
        print(graph)
    print("here")
Beispiel #10
0
    def _generate_enc_graph(self,
                            rating_pairs,
                            rating_values,
                            add_support=False):
        """Build the encoder heterograph with one relation pair per rating value.

        For every value in ``self.possible_rating_values`` a
        'user' -> 'movie' relation named ``str(rating)`` and the reversed
        'movie' -> 'user' relation named ``'rev-<rating>'`` are created from
        the pairs that carry that rating.

        Parameters
        ----------
        rating_pairs : tuple
            ``(rows, cols)`` index arrays; rows are used as user ids and
            cols as movie ids.
        rating_values
            Array compared element-wise against each possible rating value.
        add_support : bool
            If true, attach the normalisation vectors 'ci' and 'cj' to the
            'user' and 'movie' nodes.

        Returns
        -------
        The heterograph returned by ``dgl.hetero_from_relations``.
        """
        # NOTE: the dense (num_user x num_movie) rating matrix and its transpose
        # that used to be built here were never read; they only cost memory.
        rating_graphs = []
        rating_row, rating_col = rating_pairs
        for rating in self.possible_rating_values:
            ridx = np.where(rating_values == rating)
            rrow = rating_row[ridx]
            rcol = rating_col[ridx]
            bg = dgl.bipartite((rrow, rcol),
                               'user',
                               str(rating),
                               'movie',
                               num_nodes=(self._num_user, self._num_movie))
            rev_bg = dgl.bipartite((rcol, rrow),
                                   'movie',
                                   'rev-%s' % str(rating),
                                   'user',
                                   num_nodes=(self._num_movie, self._num_user))
            rating_graphs.append(bg)
            rating_graphs.append(rev_bg)
        graph = dgl.hetero_from_relations(rating_graphs)

        # sanity check: every rating pair appears once per direction
        assert len(rating_pairs[0]) == sum(
            [graph.number_of_edges(et) for et in graph.etypes]) // 2

        if add_support:

            def _calc_norm(x):
                # 1 / sqrt(degree); zero degrees become inf first so they map to 0.
                x = x.asnumpy().astype('float32')
                x[x == 0.] = np.inf
                x = mx.nd.array(1. / np.sqrt(x))
                return x.as_in_context(self._ctx).expand_dims(1)

            user_ci = []
            user_cj = []
            movie_ci = []
            movie_cj = []
            for r in self.possible_rating_values:
                r = str(r)
                user_ci.append(graph['rev-%s' % r].in_degrees())
                movie_ci.append(graph[r].in_degrees())
                # Out-degrees are only needed for the symmetric normalisation;
                # otherwise 'cj' is all ones (set below).
                if self._symm:
                    user_cj.append(graph[r].out_degrees())
                    movie_cj.append(graph['rev-%s' % r].out_degrees())
            user_ci = _calc_norm(mx.nd.add_n(*user_ci))
            movie_ci = _calc_norm(mx.nd.add_n(*movie_ci))
            if self._symm:
                user_cj = _calc_norm(mx.nd.add_n(*user_cj))
                movie_cj = _calc_norm(mx.nd.add_n(*movie_cj))
            else:
                user_cj = mx.nd.ones((self.num_user, ), ctx=self._ctx)
                movie_cj = mx.nd.ones((self.num_movie, ), ctx=self._ctx)
            graph.nodes['user'].data.update({'ci': user_ci, 'cj': user_cj})
            graph.nodes['movie'].data.update({'ci': movie_ci, 'cj': movie_cj})

        return graph
Beispiel #11
0
    mask[indices] = 1
    return mask.byte()


# Load the preprocessed DBLP data. The pickle file holds four consecutive
# objects, read back in the order they are unpacked below.
# SECURITY NOTE(review): pickle.load can execute arbitrary code from the file;
# only load DBLP.pickle from a trusted source.
with open('../dataset/DBLP/DBLP.pickle', 'rb') as f:
    a_list, p_list, c_list = pickle.load(f)
    pa_list, pc_list = pickle.load(f)
    author_features = pickle.load(f)
    labels = pickle.load(f)

# Build the heterogeneous network: each relation is added in both directions.
pa = dgl.bipartite(pa_list, 'paper', 'pa', 'author')
ap = dgl.bipartite(transpose(pa_list), 'author', 'ap', 'paper')
pc = dgl.bipartite(pc_list, 'paper', 'pc', 'conf')
cp = dgl.bipartite(transpose(pc_list), 'conf', 'cp', 'paper')
hg = dgl.hetero_from_relations([pa, ap, pc, cp])

features = torch.FloatTensor(author_features)
labels = torch.LongTensor(labels)

print(features.shape)
print(labels.shape)

num_class = 4

# Split the author indices; 20% are held out in `x` and split again below.
# NOTE(review): presumably sklearn's train_test_split — confirm against the imports.
alls = [i for i in range(len(a_list))]
train_idx, x, _, _ = train_test_split(alls,
                                      labels,
                                      test_size=0.2,
                                      random_state=52)
eval_idx, test_idx, _, _ = train_test_split(x,
Beispiel #12
0
    def _generate_enc_graph(self,
                            rating_pairs,
                            rating_values,
                            add_support=False):
        """Build the encoder heterograph with one relation pair per rating value.

        For every value in ``self.possible_rating_values`` a
        'user' -> 'movie' relation and the reversed 'movie' -> 'user'
        relation named ``'rev-<rating>'`` are created from the pairs that
        carry that rating. Relation names are ``str(rating)`` with '.'
        replaced by '_'.

        Parameters
        ----------
        rating_pairs : tuple
            ``(rows, cols)`` index arrays; rows are used as user ids and
            cols as movie ids.
        rating_values
            Array compared element-wise against each possible rating value.
        add_support : bool
            If true, attach the normalisation vectors 'ci' and 'cj' to the
            'user' and 'movie' nodes.

        Returns
        -------
        The heterograph returned by ``dgl.hetero_from_relations``.
        """
        # NOTE: the dense (num_user x num_movie) rating matrix and its transpose
        # that used to be built here were never read; they only cost memory.
        rating_graphs = []
        rating_row, rating_col = rating_pairs
        for rating in self.possible_rating_values:
            ridx = np.where(rating_values == rating)
            rrow = rating_row[ridx]
            rcol = rating_col[ridx]
            rating = str(rating).replace('.', '_')
            bg = dgl.bipartite((rrow, rcol),
                               'user',
                               rating,
                               'movie',
                               num_nodes=(self._num_user, self._num_movie))
            rev_bg = dgl.bipartite((rcol, rrow),
                                   'movie',
                                   'rev-%s' % rating,
                                   'user',
                                   num_nodes=(self._num_movie, self._num_user))
            rating_graphs.append(bg)
            rating_graphs.append(rev_bg)
        graph = dgl.hetero_from_relations(rating_graphs)

        # sanity check: every rating pair appears once per direction
        assert len(rating_pairs[0]) == sum(
            [graph.number_of_edges(et) for et in graph.etypes]) // 2

        if add_support:

            def _calc_norm(x):
                # 1 / sqrt(degree); zero degrees become inf first so they map to 0.
                x = x.numpy().astype('float32')
                x[x == 0.] = np.inf
                x = th.FloatTensor(1. / np.sqrt(x))
                return x.to(self._device).unsqueeze(1)

            user_ci = []
            user_cj = []
            movie_ci = []
            movie_cj = []
            for r in self.possible_rating_values:
                r = str(r).replace('.', '_')
                user_ci.append(graph['rev-%s' % r].in_degrees())
                movie_ci.append(graph[r].in_degrees())
                # Out-degrees are only needed for the symmetric normalisation;
                # otherwise 'cj' is all ones (set below).
                if self._symm:
                    user_cj.append(graph[r].out_degrees())
                    movie_cj.append(graph['rev-%s' % r].out_degrees())
            user_ci = _calc_norm(sum(user_ci))
            movie_ci = _calc_norm(sum(movie_ci))
            if self._symm:
                user_cj = _calc_norm(sum(user_cj))
                movie_cj = _calc_norm(sum(movie_cj))
            else:
                user_cj = th.ones(self.num_user, ).to(self._device)
                movie_cj = th.ones(self.num_movie, ).to(self._device)
            graph.nodes['user'].data.update({'ci': user_ci, 'cj': user_cj})
            graph.nodes['movie'].data.update({'ci': movie_ci, 'cj': movie_cj})

        return graph
def ACNN_graph_construction_and_featurization(ligand_mol,
                                              protein_mol,
                                              ligand_coordinates,
                                              protein_coordinates,
                                              max_num_ligand_atoms=None,
                                              max_num_protein_atoms=None,
                                              neighbor_cutoff=12.,
                                              max_num_neighbors=12,
                                              strip_hydrogens=False):
    """Graph construction and featurization for `Atomic Convolutional Networks for
    Predicting Protein-Ligand Binding Affinity <https://arxiv.org/abs/1703.10603>`__.

    Parameters
    ----------
    ligand_mol : rdkit.Chem.rdchem.Mol
        RDKit molecule instance.
    protein_mol : rdkit.Chem.rdchem.Mol
        RDKit molecule instance.
    ligand_coordinates : Float Tensor of shape (V1, 3)
        Atom coordinates in a ligand.
    protein_coordinates : Float Tensor of shape (V2, 3)
        Atom coordinates in a protein.
    max_num_ligand_atoms : int or None
        Maximum number of atoms in ligands for zero padding, which should be no smaller than
        ligand_mol.GetNumAtoms() if not None. If None, no zero padding will be performed.
        Default to None.
    max_num_protein_atoms : int or None
        Maximum number of atoms in proteins for zero padding, which should be no smaller than
        protein_mol.GetNumAtoms() if not None. If None, no zero padding will be performed.
        Default to None.
    neighbor_cutoff : float
        Distance cutoff to define 'neighboring'. Default to 12.
    max_num_neighbors : int
        Maximum number of neighbors allowed for each atom. Default to 12.
    strip_hydrogens : bool
        Whether to exclude hydrogen atoms. Default to False.

    Returns
    -------
    Heterograph with node types 'ligand_atom' and 'protein_atom'. Every edge
    type carries a 'distance' feature of shape (E, 1); both node types carry
    'atomic_number' and 'mask' features of shape (V, 1), where padded nodes
    have atomic number 0 and mask 0.
    """
    assert ligand_coordinates is not None, 'Expect ligand_coordinates to be provided.'
    assert protein_coordinates is not None, 'Expect protein_coordinates to be provided.'
    if max_num_ligand_atoms is not None:
        assert max_num_ligand_atoms >= ligand_mol.GetNumAtoms(), \
            'Expect max_num_ligand_atoms to be no smaller than ligand_mol.GetNumAtoms(), ' \
            'got {:d} and {:d}'.format(max_num_ligand_atoms, ligand_mol.GetNumAtoms())
    if max_num_protein_atoms is not None:
        assert max_num_protein_atoms >= protein_mol.GetNumAtoms(), \
            'Expect max_num_protein_atoms to be no smaller than protein_mol.GetNumAtoms(), ' \
            'got {:d} and {:d}'.format(max_num_protein_atoms, protein_mol.GetNumAtoms())

    if strip_hydrogens:
        # Remove hydrogen atoms and their corresponding coordinates
        ligand_atom_indices_left = filter_out_hydrogens(ligand_mol)
        protein_atom_indices_left = filter_out_hydrogens(protein_mol)
        ligand_coordinates = ligand_coordinates.take(ligand_atom_indices_left, axis=0)
        protein_coordinates = protein_coordinates.take(protein_atom_indices_left, axis=0)
    else:
        ligand_atom_indices_left = list(range(ligand_mol.GetNumAtoms()))
        protein_atom_indices_left = list(range(protein_mol.GetNumAtoms()))

    # Compute number of nodes for each type (including zero padding, if any)
    if max_num_ligand_atoms is None:
        num_ligand_atoms = len(ligand_atom_indices_left)
    else:
        num_ligand_atoms = max_num_ligand_atoms

    if max_num_protein_atoms is None:
        num_protein_atoms = len(protein_atom_indices_left)
    else:
        num_protein_atoms = max_num_protein_atoms

    # Construct graph for atoms in the ligand
    ligand_srcs, ligand_dsts, ligand_dists = k_nearest_neighbors(
        ligand_coordinates, neighbor_cutoff, max_num_neighbors)
    ligand_graph = graph((ligand_srcs, ligand_dsts),
                         'ligand_atom', 'ligand', num_ligand_atoms)
    ligand_graph.edata['distance'] = F.reshape(F.zerocopy_from_numpy(
        np.array(ligand_dists).astype(np.float32)), (-1, 1))

    # Construct graph for atoms in the protein
    protein_srcs, protein_dsts, protein_dists = k_nearest_neighbors(
        protein_coordinates, neighbor_cutoff, max_num_neighbors)
    protein_graph = graph((protein_srcs, protein_dsts),
                          'protein_atom', 'protein', num_protein_atoms)
    protein_graph.edata['distance'] = F.reshape(F.zerocopy_from_numpy(
        np.array(protein_dists).astype(np.float32)), (-1, 1))

    # Construct 4 graphs for complex representation, including the connection within
    # protein atoms, the connection within ligand atoms and the connection between
    # protein and ligand atoms.
    complex_srcs, complex_dsts, complex_dists = k_nearest_neighbors(
        np.concatenate([ligand_coordinates, protein_coordinates]),
        neighbor_cutoff, max_num_neighbors)
    complex_srcs = np.array(complex_srcs)
    complex_dsts = np.array(complex_dsts)
    complex_dists = np.array(complex_dists)
    # Ligand atoms occupy the first len(ligand_coordinates) rows of the
    # concatenated array. The padded node count num_ligand_atoms must not be used
    # here: when max_num_ligand_atoms exceeds the real atom count it would
    # classify protein atoms as ligand atoms and shift protein indices wrongly.
    offset = len(ligand_coordinates)

    # ('ligand_atom', 'complex', 'ligand_atom')
    inter_ligand_indices = np.intersect1d(
        (complex_srcs < offset).nonzero()[0],
        (complex_dsts < offset).nonzero()[0],
        assume_unique=True)
    inter_ligand_graph = graph(
        (complex_srcs[inter_ligand_indices].tolist(),
         complex_dsts[inter_ligand_indices].tolist()),
        'ligand_atom', 'complex', num_ligand_atoms)
    inter_ligand_graph.edata['distance'] = F.reshape(F.zerocopy_from_numpy(
        complex_dists[inter_ligand_indices].astype(np.float32)), (-1, 1))

    # ('protein_atom', 'complex', 'protein_atom')
    inter_protein_indices = np.intersect1d(
        (complex_srcs >= offset).nonzero()[0],
        (complex_dsts >= offset).nonzero()[0],
        assume_unique=True)
    inter_protein_graph = graph(
        ((complex_srcs[inter_protein_indices] - offset).tolist(),
         (complex_dsts[inter_protein_indices] - offset).tolist()),
        'protein_atom', 'complex', num_protein_atoms)
    inter_protein_graph.edata['distance'] = F.reshape(F.zerocopy_from_numpy(
        complex_dists[inter_protein_indices].astype(np.float32)), (-1, 1))

    # ('ligand_atom', 'complex', 'protein_atom')
    ligand_protein_indices = np.intersect1d(
        (complex_srcs < offset).nonzero()[0],
        (complex_dsts >= offset).nonzero()[0],
        assume_unique=True)
    ligand_protein_graph = bipartite(
        (complex_srcs[ligand_protein_indices].tolist(),
         (complex_dsts[ligand_protein_indices] - offset).tolist()),
        'ligand_atom', 'complex', 'protein_atom',
        (num_ligand_atoms, num_protein_atoms))
    ligand_protein_graph.edata['distance'] = F.reshape(F.zerocopy_from_numpy(
        complex_dists[ligand_protein_indices].astype(np.float32)), (-1, 1))

    # ('protein_atom', 'complex', 'ligand_atom')
    protein_ligand_indices = np.intersect1d(
        (complex_srcs >= offset).nonzero()[0],
        (complex_dsts < offset).nonzero()[0],
        assume_unique=True)
    protein_ligand_graph = bipartite(
        ((complex_srcs[protein_ligand_indices] - offset).tolist(),
         complex_dsts[protein_ligand_indices].tolist()),
        'protein_atom', 'complex', 'ligand_atom',
        (num_protein_atoms, num_ligand_atoms))
    protein_ligand_graph.edata['distance'] = F.reshape(F.zerocopy_from_numpy(
        complex_dists[protein_ligand_indices].astype(np.float32)), (-1, 1))

    # Merge the graphs
    g = hetero_from_relations(
        [protein_graph,
         ligand_graph,
         inter_ligand_graph,
         inter_protein_graph,
         ligand_protein_graph,
         protein_ligand_graph]
    )

    # Get atomic numbers for all atoms left and set node features
    ligand_atomic_numbers = np.array(get_atomic_numbers(ligand_mol, ligand_atom_indices_left))
    # zero padding
    ligand_atomic_numbers = np.concatenate([
        ligand_atomic_numbers, np.zeros(num_ligand_atoms - len(ligand_atom_indices_left))])
    protein_atomic_numbers = np.array(get_atomic_numbers(protein_mol, protein_atom_indices_left))
    # zero padding
    protein_atomic_numbers = np.concatenate([
        protein_atomic_numbers, np.zeros(num_protein_atoms - len(protein_atom_indices_left))])

    g.nodes['ligand_atom'].data['atomic_number'] = F.reshape(F.zerocopy_from_numpy(
        ligand_atomic_numbers.astype(np.float32)), (-1, 1))
    g.nodes['protein_atom'].data['atomic_number'] = F.reshape(F.zerocopy_from_numpy(
        protein_atomic_numbers.astype(np.float32)), (-1, 1))

    # Prepare mask indicating the existence of nodes (1 for real atoms, 0 for padding)
    ligand_masks = np.zeros((num_ligand_atoms, 1))
    ligand_masks[:len(ligand_atom_indices_left), :] = 1
    g.nodes['ligand_atom'].data['mask'] = F.zerocopy_from_numpy(
        ligand_masks.astype(np.float32))
    protein_masks = np.zeros((num_protein_atoms, 1))
    protein_masks[:len(protein_atom_indices_left), :] = 1
    g.nodes['protein_atom'].data['mask'] = F.zerocopy_from_numpy(
        protein_masks.astype(np.float32))

    return g
Beispiel #14
0
    def encode_data(self):
        """
        Encode the nodes and edges of ``self.json_data`` and build the
        heterogeneous graph ``self.hetero_g``.

        The work happens in this order:

        1. Create the vocabulary directory and the node / edge dictionary
           files when they are missing (``create_dict_node`` /
           ``create_dict_edge``).
        2. Read each dictionary file into ``self.word_dict_*`` (tokens are
           separated by single spaces) and derive ``self.word_to_ix_*``.
           When a dictionary file already existed before this call,
           ``append_dict_node`` / ``append_dict_edge`` is invoked before the
           index mapping is derived.
        3. Create ``self.embedding``, an ``nn.Embedding`` of dimension 1 with
           one row per node token plus one row per edge token.
        4. Call ``encode_node`` for every entry under the ``'nodes'`` key and
           ``encode_edge`` for every entry under each other key, printing a
           progress line every 1000 items and at the end of each group.
        5. Build five bipartite relation graphs and merge them into
           ``self.hetero_g``.
        """
        # --- 1. make sure the vocabulary files are in place -----------------
        if not os.path.isdir(self.vocab_path):
            os.makedirs(self.vocab_path)

        # Remember whether each dictionary pre-dates this call: only a
        # pre-existing dictionary goes through the append step below.
        node_dict_preexisting = os.path.exists(self.vocab_path_node)
        if not node_dict_preexisting:
            self.create_dict_node()

        edge_dict_preexisting = os.path.exists(self.vocab_path_edge)
        if not edge_dict_preexisting:
            self.create_dict_edge()

        # --- 2. load the vocabularies ---------------------------------------
        # The append call is deliberately kept inside the ``with`` block so it
        # runs while the dictionary file is still open, as before.
        with open(self.vocab_path_node, 'r') as vocab_file:
            self.word_dict_node = vocab_file.read().strip().split(' ')
            if node_dict_preexisting:
                self.append_dict_node()
            self.word_to_ix_node = {
                token: index
                for index, token in enumerate(self.word_dict_node)
            }

        with open(self.vocab_path_edge, 'r') as vocab_file:
            self.word_dict_edge = vocab_file.read().strip().split(' ')
            if edge_dict_preexisting:
                self.append_dict_edge()
            self.word_to_ix_edge = {
                token: index
                for index, token in enumerate(self.word_dict_edge)
            }

        # --- 3. one embedding table shared by node and edge tokens ----------
        vocab_size = len(self.word_dict_node) + len(self.word_dict_edge)
        self.embedding = nn.Embedding(vocab_size, 1)

        # --- 4. encode nodes, then every other group as edges ---------------
        if 'nodes' in self.json_data:
            nodes = self.json_data['nodes']
            node_total = len(nodes)
            print('\nencode_node')
            for done, node in enumerate(nodes, start=1):
                self.encode_node(node)
                if done == node_total or done % 1000 == 0:
                    print('{}/{}'.format(done, node_total))

        for key in self.json_data:
            if key == 'nodes':
                continue
            paths = self.json_data[key]
            path_total = len(paths)
            print('\nencode_edge type '+key)
            for done, path in enumerate(paths, start=1):
                self.encode_edge(path)
                if done == path_total or done % 1000 == 0:
                    print('{}/{}'.format(done, path_total))

        # --- 5. assemble the heterogeneous graph ----------------------------
        def build_relation(endpoints, src_type, relation, dst_type):
            # ``endpoints`` is indexed by the node-type names themselves.
            # NOTE(review): the two endpoint collections are handed to
            # dgl.bipartite as a two-element *list*, not a tuple -- confirm
            # against the dgl.bipartite documentation that this is read as
            # (source ids, destination ids) and not as a list of edge pairs.
            return dgl.bipartite(
                [endpoints[src_type], endpoints[dst_type]],
                src_type, relation, dst_type)

        relation_graphs = [
            build_relation(self.proc_call_api, 'proc', 'call', 'api'),
            build_relation(self.file_affect_api, 'file', 'affect', 'api'),
            build_relation(self.api_modify_file, 'api', 'modify', 'file'),
            build_relation(self.reg_affect_api, 'reg', 'affect', 'api'),
            build_relation(self.api_modify_reg, 'api', 'modify', 'reg'),
        ]
        self.hetero_g = dgl.hetero_from_relations(relation_graphs)
Beispiel #15
0
def _read_id_names(file_path):
    """Parse a UTF-8 ``<id> <name>`` listing into two parallel lists.

    Each non-blank line is split on whitespace; the first field is converted
    to ``int`` and the second field is kept as the name. Blank lines are
    skipped.

    Returns:
        tuple: ``(ids, names)`` in file order.
    """
    ids, names = [], []
    with open(file_path, encoding='utf-8') as id_file:
        for line in id_file:
            fields = line.split()
            if not fields:
                continue
            ids.append(int(fields[0]))
            names.append(fields[1])
    return ids, names


def _read_edge_indices(file_path, src_index, dst_index):
    """Parse a ``<src id> <dst id>`` edge list and remap both ends.

    Each non-blank line is split on whitespace; the first two fields are
    converted to ``int`` and looked up in ``src_index`` / ``dst_index``
    respectively. Blank lines are skipped.

    Returns:
        tuple: ``(src, dst)`` lists of remapped indices in file order.
    """
    src, dst = [], []
    with open(file_path, "r") as edge_file:
        for line in edge_file:
            fields = line.split()
            if not fields:
                continue
            src.append(src_index[int(fields[0])])
            dst.append(dst_index[int(fields[1])])
    return src, dst


def construct_graph():
    """Build the API/app heterogeneous graph from the text files under ``path``.

    Reads ``id_api_320.txt`` and ``id_app_320.txt`` for the API and app ids
    and names, then the three edge lists ``same_block_api_320.txt`` (B),
    ``api_app_320.txt`` (A) and ``same_package_api_320.txt`` (P). Raw ids are
    remapped to their position in the corresponding id file before the
    relation graphs are created.

    Every file is opened through a context manager, so handles are released
    even when a line fails to parse, and blank lines are ignored.

    Returns:
        tuple: ``(hg, api_names, app_names)`` where ``hg`` is the result of
        ``dgl.hetero_from_relations`` over the five relations
        ``app_api``, ``api_api_B``, ``api_api_P``, ``api_api_B_T`` and
        ``api_app``, and the name lists are in id-file order.

    Raises:
        KeyError: an edge list refers to an id missing from the id file.
        IndexError: a non-blank line has fewer than two fields.
        ValueError: an id field is not an integer.
    """
    api_ids, api_names = _read_id_names(os.path.join(path, "id_api_320.txt"))
    app_ids, app_names = _read_id_names(os.path.join(path, "id_app_320.txt"))

    # Raw id -> position in the id file. If an id is listed twice, the last
    # position wins.
    api_ids_invmap = {raw_id: row for row, raw_id in enumerate(api_ids)}
    app_ids_invmap = {raw_id: row for row, raw_id in enumerate(app_ids)}

    # B matrix: api -- api
    api_api_B_src, api_api_B_dst = _read_edge_indices(
        os.path.join(path, "same_block_api_320.txt"),
        api_ids_invmap, api_ids_invmap)
    # A matrix: api -- app
    api_app_src, api_app_dst = _read_edge_indices(
        os.path.join(path, "api_app_320.txt"),
        api_ids_invmap, app_ids_invmap)
    # P matrix: api -- api
    api_api_P_src, api_api_P_dst = _read_edge_indices(
        os.path.join(path, "same_package_api_320.txt"),
        api_ids_invmap, api_ids_invmap)

    # The API nodes appear under three node-type names (api1/api2/api3), all
    # indexed through the same api_ids_invmap.
    # NOTE(review): no explicit node counts are passed to dgl.bipartite, so
    # each relation presumably sizes its node sets from its own edge list --
    # confirm the relations sharing a node type agree on its size.
    app_api = dgl.bipartite((api_app_dst, api_app_src), 'app', 'app_api',
                            'api1')
    api_api_B = dgl.bipartite((api_api_B_src, api_api_B_dst), 'api1',
                              'api_api_B', 'api2')
    api_api_P = dgl.bipartite((api_api_P_src, api_api_P_dst), 'api2',
                              'api_api_P', 'api3')
    api_api_B_T = dgl.bipartite((api_api_B_dst, api_api_B_src), 'api3',
                                'api_api_B_T', 'api1')  # B transpose
    api_app = dgl.bipartite((api_app_src, api_app_dst), 'api1', 'api_app',
                            'app')  # A transpose

    hg = dgl.hetero_from_relations(
        [app_api, api_api_B, api_api_P, api_api_B_T, api_app])
    return hg, api_names, app_names
# Give every distinct item the next free vertex id and record the reverse
# mapping; ``inc`` carries on from wherever the preceding code left it.
for item in np.unique(items):
    iid2vid[item] = inc
    vid2iid[inc] = item
    inc += 1
# Sanity check: user and item vertices together account for every vertex.
assert len(iid2vid) + len(uid2vid) == total_vertices

# Translate each interaction's raw user / item id to its vertex id.
src = [uid2vid[user_id] for user_id in users]
dst = [iid2vid[item_id] for item_id in items]

# user -> item relation ('ui') and its reverse item -> user relation ('iu').
# Both carry the same timestamps and a constant rating of 1 per edge.
click_graph = dgl.bipartite(list(zip(src, dst)), 'user', 'ui', 'item')
click_graph.edges['ui'].data['timestamp'] = timestamps
click_graph.edges['ui'].data['rating'] = torch.ones(click_graph.number_of_edges())
clicked_graph = dgl.bipartite(list(zip(dst, src)), 'item', 'iu', 'user')
clicked_graph.edges['iu'].data['timestamp'] = timestamps
clicked_graph.edges['iu'].data['rating'] = torch.ones(clicked_graph.number_of_edges())
# Pass the relations as a list, not a set: a set iterates in an order that
# depends on object hashes, so the relation order of the merged graph could
# change from run to run.
g = dgl.hetero_from_relations([click_graph, clicked_graph])

# Per-user feature table. It is read twice: once as text to one-hot encode
# the third comma-separated field, once numerically for the other columns.
fn = directory+"/underexpose_train/user_generate_feat.txt"

with open(fn, "r") as f:
    lines = f.readlines()

# One row per line of the file, five feature columns, initially all zero.
usr_feat = np.zeros((len(lines), 5))
for i, line in enumerate(lines):
    flag = line.split(",")[2]
    # Column 3 marks a third field of "0", column 4 a third field of "1";
    # any other value leaves both columns at zero.
    if flag == "0":
        usr_feat[i, 3] = 1
    elif flag == "1":
        usr_feat[i, 4] = 1
del lines

# Numeric pass: file columns 0-1 fill feature columns 0-1, and file column 3
# fills feature column 2.
usr_data = genfromtxt(fn, delimiter=',', dtype=np.int16)
usr_feat[:, :2] = usr_data[:, :2]
usr_feat[:, 2] = usr_data[:, 3]
uid2feat = {}