Example #1
File: rdf.py Project: zhoujf620/dgl
    def build_graph(self, mg, src, dst, ntid, etid, ntypes, etypes):
        # create homogeneous graph
        print('Creating one whole graph ...')
        g = dgl.graph((src, dst))
        g.ndata[dgl.NTYPE] = F.tensor(ntid)
        g.edata[dgl.ETYPE] = F.tensor(etid)
        print('Total #nodes:', g.number_of_nodes())
        print('Total #edges:', g.number_of_edges())

        # rename edge types such as 'type' so that they can be used as keys
        # in nn.ModuleDict
        etypes = [RENAME_DICT.get(ty, ty) for ty in etypes]
        mg_edges = mg.edges(keys=True)
        mg = nx.MultiDiGraph()
        for sty, dty, ety in mg_edges:
            mg.add_edge(sty, dty, key=RENAME_DICT.get(ety, ety))

        # convert to heterograph
        print('Convert to heterograph ...')
        hg = dgl.to_hetero(g,
                           ntypes,
                           etypes,
                           metagraph=mg)
        print('#Node types:', len(hg.ntypes))
        print('#Canonical edge types:', len(hg.etypes))
        print('#Unique edge type names:', len(set(hg.etypes)))
        self.graph = hg
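For context, here is the same homogeneous-to-heterogeneous pattern on a toy graph; a minimal sketch assuming the older DGL API used above (dgl.to_hetero was later renamed dgl.to_heterogeneous) and PyTorch as the backend:

import dgl
import torch as th

# toy graph: nodes 0 and 1 are 'user', node 2 is 'item'; both edges are 'buys'
g = dgl.graph(([0, 1], [2, 2]))
g.ndata[dgl.NTYPE] = th.tensor([0, 0, 1])  # node type id per node
g.edata[dgl.ETYPE] = th.tensor([0, 0])     # edge type id per edge
hg = dgl.to_hetero(g, ['user', 'item'], ['buys'])
print(hg.canonical_etypes)  # [('user', 'buys', 'item')]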
Example #2
def create_partition_policy(args):
    """Create GraphPartitionBook and PartitionPolicy
    """
    g = dgl.DGLGraph()
    g.add_nodes(args.graph_size)
    g.add_edge(0, 1) # we don't use edge data in our benchmark

    global_nid = F.tensor(np.arange(args.graph_size) + args.machine_id * args.graph_size)
    global_eid = F.tensor([args.machine_id])

    node_map = np.zeros((args.graph_size*2), np.int64)
    node_map[args.graph_size:] = 1
    node_map = F.tensor(node_map)
    edge_map = F.tensor([0,1])

    g.ndata[dgl.NID] = global_nid
    g.edata[dgl.EID] = global_eid

    gpb = dgl.distributed.GraphPartitionBook(part_id=args.machine_id,
                                             num_parts=args.num_machine,
                                             node_map=node_map,
                                             edge_map=edge_map,
                                             part_graph=g)

    policy = dgl.distributed.PartitionPolicy(policy_str='node',
                                             part_id=args.machine_id,
                                             partition_book=gpb)
    return policy, gpb
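A hypothetical invocation of the function above, faking the argparse namespace with types.SimpleNamespace (the field names graph_size, machine_id and num_machine are taken from the snippet; the GraphPartitionBook constructor shown belongs to an older DGL release):

from types import SimpleNamespace

args = SimpleNamespace(graph_size=1000, machine_id=0, num_machine=2)
policy, gpb = create_partition_policy(args)
print(gpb.num_partitions())  # 2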
Example #3
    def init_data(self, name, shape, init_type='zero', low=0.0, high=0.0):
        """Initialize kvstore tensor

        Parameters
        ----------
        name : str
            data name
        shape : list of int
            shape of tensor
        init_type : str
            initialization method, either 'zero' or 'uniform'
        low : float
            lower bound when using 'uniform'
        high : float
            upper bound when using 'uniform'
        """
        self._data_size[name] = shape[0]
        count = math.ceil(shape[0] / self._server_count)
        # We hack the msg format here
        init_type = 0.0 if init_type == 'zero' else 1.0
        threshold = F.tensor([[init_type, init_type], [low, high]])
        # partition shape on server
        for server_id in range(self._server_count):
            par_shape = shape.copy()
            if shape[0] - server_id * count >= count:
                par_shape[0] = count
            else:
                par_shape[0] = shape[0] - server_id * count
            tensor_shape = F.tensor(par_shape)
            msg = KVStoreMsg(type=KVMsgType.INIT,
                             rank=self._client_id,
                             name=name,
                             id=tensor_shape,
                             data=threshold)
            _send_kv_msg(self._sender, msg, server_id)
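The per-server partitioning above is a plain ceil-divide split along the first dimension; a dependency-free sketch of the same arithmetic:

import math

def partition_shapes(shape, server_count):
    # split shape[0] rows into server_count near-equal chunks,
    # mirroring the loop in init_data above
    count = math.ceil(shape[0] / server_count)
    return [[min(count, shape[0] - sid * count)] + list(shape[1:])
            for sid in range(server_count)]

print(partition_shapes([10, 4], 3))  # [[4, 4], [4, 4], [2, 4]]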
Example #4
    def build_graph(self, mg, src, dst, ntid, etid, ntypes, etypes):
        """Build the graphs

        Parameters
        ----------
        mg: MultiDiGraph
            Input graph
        src: Numpy array
            Source nodes
        dst: Numpy array
            Destination nodes
        ntid: Numpy array
            Node types for each node
        etid: Numpy array
            Edge types for each edge
        ntypes: list
            Node types
        etypes: list
            Edge types

        Returns
        -------
        g: DGLGraph
        """
        # create homogeneous graph
        if self.verbose:
            print('Creating one whole graph ...')
        g = dgl.graph((src, dst))
        g.ndata[dgl.NTYPE] = F.tensor(ntid)
        g.edata[dgl.ETYPE] = F.tensor(etid)
        if self.verbose:
            print('Total #nodes:', g.number_of_nodes())
            print('Total #edges:', g.number_of_edges())

        # rename edge types such as 'type' so that they can be used as keys
        # in nn.ModuleDict
        etypes = [RENAME_DICT.get(ty, ty) for ty in etypes]
        mg_edges = mg.edges(keys=True)
        mg = nx.MultiDiGraph()
        for sty, dty, ety in mg_edges:
            mg.add_edge(sty, dty, key=RENAME_DICT.get(ety, ety))

        # convert to heterograph
        if self.verbose:
            print('Convert to heterograph ...')
        hg = dgl.to_hetero(g,
                           ntypes,
                           etypes,
                           metagraph=mg)
        if self.verbose:
            print('#Node types:', len(hg.ntypes))
            print('#Canonical edge types:', len(hg.etypes))
            print('#Unique edge type names:', len(set(hg.etypes)))
        return hg
Example #5
def alchemy_edges(mol, self_loop=False):
    """Featurization for all bonds in a molecule.
    The bond indices will be preserved.
    Parameters
    ----------
    mol : rdkit.Chem.rdchem.Mol
        RDKit molecule object
    self_loop : bool
        Whether to add self loops. Default to be False.
    Returns
    -------
    bond_feats_dict : dict
        Dictionary for bond features
    """
    bond_feats_dict = defaultdict(list)

    mol_conformers = mol.GetConformers()
    assert len(mol_conformers) == 1
    geom = mol_conformers[0].GetPositions()

    num_atoms = mol.GetNumAtoms()
    for u in range(num_atoms):
        for v in range(num_atoms):
            if u == v and not self_loop:
                continue

            e_uv = mol.GetBondBetweenAtoms(u, v)
            if e_uv is None:
                bond_type = None
            else:
                bond_type = e_uv.GetBondType()
            bond_feats_dict["e_feat"].append(
                [
                    float(bond_type == x)
                    for x in (
                        Chem.rdchem.BondType.SINGLE,
                        Chem.rdchem.BondType.DOUBLE,
                        Chem.rdchem.BondType.TRIPLE,
                        Chem.rdchem.BondType.AROMATIC,
                        None,
                    )
                ]
            )
            bond_feats_dict["distance"].append(np.linalg.norm(geom[u] - geom[v]))

    bond_feats_dict["e_feat"] = FF.tensor(
        np.array(bond_feats_dict["e_feat"]).astype(np.float32)
    )
    bond_feats_dict["distance"] = FF.tensor(
        np.array(bond_feats_dict["distance"]).astype(np.float32)
    ).reshape(-1, 1)

    return bond_feats_dict
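Because of the assert, a caller must embed exactly one conformer before featurizing; a hedged usage sketch with RDKit, reusing alchemy_edges as defined above:

from rdkit import Chem
from rdkit.Chem import AllChem

mol = Chem.AddHs(Chem.MolFromSmiles('CCO'))
AllChem.EmbedMolecule(mol, randomSeed=42)  # attaches a single 3D conformer
feats = alchemy_edges(mol)
# one row per ordered atom pair (self loops excluded by default)
print(feats['e_feat'].shape, feats['distance'].shape)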
Example #6
    def load_data_split(self, ent2id):
        label_dict = {}
        labels = np.zeros(
            (self.graph.number_of_nodes(self.predict_category), )) - 1
        train_idx = self.parse_idx_file(
            os.path.join(self._dir, 'trainingSet.tsv'), ent2id, label_dict,
            labels)
        test_idx = self.parse_idx_file(os.path.join(self._dir, 'testSet.tsv'),
                                       ent2id, label_dict, labels)
        self.train_idx = F.tensor(train_idx)
        self.test_idx = F.tensor(test_idx)
        self.labels = F.tensor(labels).long()
        self.num_classes = len(label_dict)
Example #7
    def generate_samplers(self):

        # Generate train samplers
        train_samplers = []
        for i in range(self.num_proc):
            print("Building training sampler for proc %d" % i)
            t1 = time.time()
            # for each GPU, allocate num_proc // num_GPU processes
            train_sampler_head = dgl.contrib.sampling.EdgeSampler(
                self.train_data,
                seed_edges=F.tensor(self.train_data.edge_parts[i]),
                batch_size=self.batch_size,
                neg_sample_size=self.neg_sample_size,
                chunk_size=self.neg_sample_size,
                negative_mode='head',
                num_workers=self.num_workers,
                shuffle=True,
                exclude_positive=False,
                return_false_neg=False,
            )

            train_sampler_tail = dgl.contrib.sampling.EdgeSampler(
                self.train_data,
                seed_edges=F.tensor(self.train_data.edge_parts[i]),
                batch_size=self.batch_size,
                neg_sample_size=self.neg_sample_size,
                chunk_size=self.neg_sample_size,
                negative_mode='tail',
                num_workers=self.num_workers,
                shuffle=True,
                exclude_positive=False,
                return_false_neg=False,
            )

            print(train_sampler_head)
            print(train_sampler_tail)

            train_samplers.append(
                NewBidirectionalOneShotIterator(
                    dataloader_head=train_sampler_head,
                    dataloader_tail=train_sampler_tail,
                    neg_chunk_size=self.neg_sample_size,
                    neg_sample_size=self.neg_sample_size,
                    is_chunked=True,
                    num_nodes=self.n_entities,
                    has_edge_importance=self.has_edge_importance,
                ))
            print("Training sampler for proc {} created, it takes {} seconds".
                  format(i,
                         time.time() - t1))
Example #8
def create_range_partition_policy(args):
    """Create RangePartitionBook and PartitionPolicy
    """
    node_map = F.tensor(np.array([args.graph_size, 2*args.graph_size], np.int64))
    edge_map = F.tensor([1,2])

    gpb = dgl.distributed.RangePartitionBook(part_id=args.machine_id,
                                             num_parts=args.num_machine,
                                             node_map=node_map,
                                             edge_map=edge_map)

    policy = dgl.distributed.PartitionPolicy(policy_str='node',
                                             partition_book=gpb)
    return policy, gpb 
Example #9
File: rdf.py Project: zhoujf620/dgl
    def load_cache(self):
        mg = nx.read_gpickle(os.path.join(self._dir, 'cached_mg.gpickle'))
        src = np.load(os.path.join(self._dir, 'cached_src.npy'))
        dst = np.load(os.path.join(self._dir, 'cached_dst.npy'))
        ntid = np.load(os.path.join(self._dir, 'cached_ntid.npy'))
        etid = np.load(os.path.join(self._dir, 'cached_etid.npy'))
        ntypes = load_strlist(os.path.join(self._dir, 'cached_ntypes.txt'))
        etypes = load_strlist(os.path.join(self._dir, 'cached_etypes.txt'))
        self.train_idx = F.tensor(np.load(os.path.join(self._dir, 'cached_train_idx.npy')))
        self.test_idx = F.tensor(np.load(os.path.join(self._dir, 'cached_test_idx.npy')))
        labels = np.load(os.path.join(self._dir, 'cached_labels.npy'))
        self.num_classes = labels.max() + 1
        self.labels = F.tensor(labels)

        self.build_graph(mg, src, dst, ntid, etid, ntypes, etypes)
Example #10
    def pull_model(self, client, pos_g, neg_g):
        with th.no_grad():
            entity_id = F.cat(seq=[pos_g.ndata["id"], neg_g.ndata["id"]], dim=0)
            relation_id = pos_g.edata["id"]
            entity_id = F.tensor(np.unique(F.asnumpy(entity_id)))
            relation_id = F.tensor(np.unique(F.asnumpy(relation_id)))

            l2g = client.get_local2global()
            global_entity_id = l2g[entity_id]

            entity_data = client.pull(name="entity_emb", id_tensor=global_entity_id)
            relation_data = client.pull(name="relation_emb", id_tensor=relation_id)

            self.entity_emb.emb[entity_id] = entity_data
            self.relation_emb.emb[relation_id] = relation_data
Example #11
    def build_training_dataset(self):

        # construct the training Graph
        print("To build training dataset")
        t1 = time.time()
        # extract source, edge type and target from the triplets
        src, etype_id, dst = self.train
        coo = sp.sparse.coo_matrix((np.ones(len(src)), (src, dst)),
                                   shape=[self.n_entities, self.n_entities])
        self.train_data = dgl.DGLGraph(coo,
                                       readonly=True,
                                       multigraph=True,
                                       sort_csr=True)
        self.train_data.edata['tid'] = F.tensor(etype_id, F.int64)
        self.train_data.edge_parts = CustomPartition(
            edges=self.train,
            n=self.num_proc,
            p=0.1,
            has_importance=self.has_edge_importance)
        # self.train_data.edge_parts = RandomPartition(edges=self.train, n=self.num_proc, has_importance=self.has_edge_importance)
        print(self.train_data.edge_parts[0])
        print(len(self.train_data.edge_parts[0]))
        self.train_data.cross_part = True
        print("Training dataset built, it takes %d seconds" %
              (time.time() - t1))
Example #12
    def build_graph(self, mg, src, dst, ntid, etid, ntypes, etypes):
        # create homogeneous graph
        print('Creating one whole graph ...')
        g = dgl.graph((src, dst))
        g.ndata[dgl.NTYPE] = F.tensor(ntid)
        g.edata[dgl.ETYPE] = F.tensor(etid)
        print('Total #nodes:', g.number_of_nodes())
        print('Total #edges:', g.number_of_edges())

        # convert to heterograph
        print('Convert to heterograph ...')
        hg = dgl.to_hetero(g, ntypes, etypes, metagraph=mg)
        print('#Node types:', len(hg.ntypes))
        print('#Canonical edge types:', len(hg.etypes))
        print('#Unique edge type names:', len(set(hg.etypes)))
        self.graph = hg
Example #13
    def _load(self):
        # load vocab file
        self.vocab = OrderedDict()
        with open(self.vocab_file, encoding='utf-8') as vf:
            for line in vf.readlines():
                line = line.strip()
                self.vocab[line] = len(self.vocab)

        # filter glove
        if self.pretrained_file != '' and os.path.exists(self.pretrained_file):
            glove_emb = {}
            with open(self.pretrained_file, 'r', encoding='utf-8') as pf:
                for line in pf.readlines():
                    sp = line.split(' ')
                    if sp[0].lower() in self.vocab:
                        glove_emb[sp[0].lower()] = np.array([float(x) for x in sp[1:]])
        files = ['{}.txt'.format(self.mode)]
        corpus = BracketParseCorpusReader('{}/sst'.format(self.dir), files)
        sents = corpus.parsed_sents(files[0])

        # initialize with GloVe
        pretrained_emb = []
        fail_cnt = 0
        for line in self.vocab.keys():
            if self.pretrained_file != '' and os.path.exists(self.pretrained_file):
                if not line.lower() in glove_emb:
                    fail_cnt += 1
                pretrained_emb.append(glove_emb.get(line.lower(), np.random.uniform(-0.05, 0.05, 300)))

        if self.pretrained_file != '' and os.path.exists(self.pretrained_file):
            self.pretrained_emb = F.tensor(np.stack(pretrained_emb, 0))
            print('GloVe miss rate: {0:.4f}'.format(
                1.0 * fail_cnt / len(self.pretrained_emb)))
        # build trees
        for sent in sents:
            self.trees.append(self._build_tree(sent))
Example #14
def ConstructGraph(edges, n_entities, args):
    """Construct Graph for training

    Parameters
    ----------
    edges : (heads, rels, tails) triple
        Edge list
    n_entities : int
        number of entities
    args :
        Global configs.
    """
    pickle_name = 'graph_train.pickle'
    if args.pickle_graph and os.path.exists(
            os.path.join(args.data_path, args.dataset, pickle_name)):
        with open(os.path.join(args.data_path, args.dataset, pickle_name),
                  'rb') as graph_file:
            g = pickle.load(graph_file)
            print('Load pickled graph.')
    else:
        src, etype_id, dst = edges
        coo = sp.sparse.coo_matrix((np.ones(len(src)), (src, dst)),
                                   shape=[n_entities, n_entities])
        g = dgl.DGLGraph(coo, readonly=True, multigraph=True, sort_csr=True)
        g.edata['tid'] = F.tensor(etype_id, F.int64)
        if args.pickle_graph:
            with open(os.path.join(args.data_path, args.dataset, pickle_name),
                      'wb') as graph_file:
                pickle.dump(g, graph_file)
    return g
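Isolating the else-branch above: the triples become a SciPy COO adjacency whose values are dummy ones (only the structure matters), and edge types are attached afterwards as edata['tid']. A minimal sketch, assuming the legacy dgl.DGLGraph constructor accepted here (readonly/multigraph/sort_csr were removed in later DGL releases) and PyTorch as the backend:

import numpy as np
import dgl
import torch as th
from scipy.sparse import coo_matrix

src = np.array([0, 1, 2])
dst = np.array([1, 2, 0])
etype_id = np.array([0, 1, 0])
coo = coo_matrix((np.ones(len(src)), (src, dst)), shape=[3, 3])
g = dgl.DGLGraph(coo, readonly=True, multigraph=True, sort_csr=True)
g.edata['tid'] = th.tensor(etype_id, dtype=th.int64)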
Example #15
def generate_rand_graph(n, func_name):
    arr = (sp.sparse.random(n, n, density=0.1, format='coo') != 0).astype(
        np.int64)
    g = dgl.DGLGraph(arr, readonly=True)
    num_rels = 10
    entity_emb = F.uniform((g.number_of_nodes(), 10), F.float32, F.cpu(), 0, 1)
    if func_name == 'RotatE':
        entity_emb = F.uniform((g.number_of_nodes(), 20), F.float32, F.cpu(),
                               0, 1)
    rel_emb = F.uniform((num_rels, 10), F.float32, F.cpu(), -1, 1)
    if func_name == 'RESCAL':
        rel_emb = F.uniform((num_rels, 10 * 10), F.float32, F.cpu(), 0, 1)
    g.ndata['id'] = F.arange(0, g.number_of_nodes())
    rel_ids = np.random.randint(0,
                                num_rels,
                                g.number_of_edges(),
                                dtype=np.int64)
    g.edata['id'] = F.tensor(rel_ids, F.int64)
    # TransR has an additional projection_emb
    if func_name == 'TransR':
        args = {'gpu': -1, 'lr': 0.1}
        args = dotdict(args)
        projection_emb = ExternalEmbedding(args, 10, 10 * 10, F.cpu())
        return g, entity_emb, rel_emb, (12.0, projection_emb, 10, 10)
    elif func_name == 'TransE':
        return g, entity_emb, rel_emb, (12.0)
    elif func_name == 'RESCAL':
        return g, entity_emb, rel_emb, (10, 10)
    elif func_name == 'RotatE':
        return g, entity_emb, rel_emb, (12.0, 1.0)
    else:
        return g, entity_emb, rel_emb, None
Example #16
def start_server(args):
    """Start kvstore service
    """
    server_namebook = dgl.contrib.read_ip_config(filename=args.ip_config)

    my_server = KVServer(server_id=args.server_id,
                         server_namebook=server_namebook,
                         num_client=args.num_client)

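    # NOTE: num_entries is a module-level constant in the original benchmark
    # script (embedding rows held per machine); it is not defined in this excerpt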
    data = F.zeros((num_entries, args.dim_size), F.float32, F.cpu())
    g2l = F.zeros(num_entries * args.num_servers, F.int64, F.cpu())
    start = num_entries * my_server.get_machine_id()
    end = num_entries * (my_server.get_machine_id() + 1)
    g2l[start:end] = F.arange(0, num_entries)

    partition = np.arange(args.num_servers)
    partition = F.tensor(np.repeat(partition, num_entries))
    if my_server.get_id() % my_server.get_group_count() == 0:  # master server
        my_server.set_global2local(name='entity_embed', global2local=g2l)
        my_server.init_data(name='entity_embed', data_tensor=data)
        my_server.set_partition_book(name='entity_embed',
                                     partition_book=partition)
    else:
        my_server.set_global2local(name='entity_embed')
        my_server.init_data(name='entity_embed')
        my_server.set_partition_book(name='entity_embed')

    my_server.print()

    my_server.start()
Example #17
    def split_dataset(self, dataset_pairs, label_dict, glb2lcl):
        total = len(dataset_pairs)
        train_set_size = int(total * 0.8)
        entities, truths = zip(*dataset_pairs)
        local_entities = []
        labels = np.zeros((self.graph.number_of_nodes(self.predict_category),)) - 1
        for i, entity in enumerate(entities):
            local_id = glb2lcl[entity]
            local_entities.append(local_id)
            labels[local_id] = truths[i]
        train_entities = local_entities[:train_set_size]
        test_entities = local_entities[train_set_size:]

        self.train_idx = F.tensor(train_entities)
        self.test_idx = F.tensor(test_entities)
        self.labels = F.tensor(labels).long()
        self.num_classes = len(label_dict)
Example #18
    def process(self):
        DS_edge_list = self._idx_from_zero(
            loadtxt(self._file_path("A"), delimiter=",").astype(int))
        DS_indicator = self._idx_from_zero(
            loadtxt(self._file_path("graph_indicator"),
                    delimiter=",").astype(int))
        DS_graph_labels = self._idx_reset(
            loadtxt(self._file_path("graph_labels"),
                    delimiter=",").astype(int))

        g = dgl_graph(([], []))
        g.add_nodes(int(DS_edge_list.max()) + 1)
        g.add_edges(DS_edge_list[:, 0], DS_edge_list[:, 1])

        node_idx_list = []
        self.max_num_node = 0
        for idx in range(np.max(DS_indicator) + 1):
            node_idx = np.where(DS_indicator == idx)
            node_idx_list.append(node_idx[0])
            if len(node_idx[0]) > self.max_num_node:
                self.max_num_node = len(node_idx[0])

        self.num_labels = max(DS_graph_labels) + 1
        self.graph_labels = F.tensor(DS_graph_labels)

        self.attr_dict = {
            'node_labels': ('ndata', 'node_labels'),
            'node_attributes': ('ndata', 'node_attr'),
            'edge_labels': ('edata', 'edge_labels'),
            'edge_attributes': ('edata', 'edge_attr'),
        }

        for filename, field_name in self.attr_dict.items():
            try:
                data = loadtxt(self._file_path(filename),
                               delimiter=',').astype(float)
                if 'label' in filename:
                    data = F.tensor(self._idx_from_zero(data))
                else:
                    data = F.tensor(data)
                getattr(g, field_name[0])[field_name[1]] = data
            except IOError:
                pass

        self.graph_lists = [g.subgraph(node_idx) for node_idx in node_idx_list]
Example #19
    def _load_node_feature(self, device):
        if len(self._features) == 1 and self._features[0].is_homo:
            features = self._features[0]
            ft = F.tensor(features.features)
            ft = F.copy_to(ft, device)
            self._g.ndata['homo_f'] = ft
        else:
            # (TODO xiangsx) hetero graph
            assert False
Example #20
    def __init__(self, dataset, args):
        pickle_name = 'graph_all.pickle'
        if args.pickle_graph and os.path.exists(
                os.path.join(args.data_path, args.dataset, pickle_name)):
            with open(os.path.join(args.data_path, args.dataset, pickle_name),
                      'rb') as graph_file:
                g = pickle.load(graph_file)
                print('Load pickled graph.')
        else:
            src = np.concatenate(
                (dataset.train[0], dataset.valid[0], dataset.test[0]))
            etype_id = np.concatenate(
                (dataset.train[1], dataset.valid[1], dataset.test[1]))
            dst = np.concatenate(
                (dataset.train[2], dataset.valid[2], dataset.test[2]))
            coo = sp.sparse.coo_matrix(
                (np.ones(len(src)), (src, dst)),
                shape=[dataset.n_entities, dataset.n_entities])
            g = dgl.DGLGraph(coo, readonly=True, sort_csr=True)
            g.ndata['id'] = F.arange(0, g.number_of_nodes())
            g.edata['id'] = F.tensor(etype_id, F.int64)
            if args.pickle_graph:
                with open(
                        os.path.join(args.data_path, args.dataset,
                                     pickle_name), 'wb') as graph_file:
                    pickle.dump(g, graph_file)
        self.g = g

        self.num_train = len(dataset.train[0])
        self.num_valid = len(dataset.valid[0])
        self.num_test = len(dataset.test[0])

        if args.eval_percent < 1:
            self.valid = np.random.randint(
                0,
                self.num_valid,
                size=(int(
                    self.num_valid * args.eval_percent), )) + self.num_train
        else:
            self.valid = np.arange(self.num_train,
                                   self.num_train + self.num_valid)
        print('|valid|:', len(self.valid))

        if args.eval_percent < 1:
            self.test = np.random.randint(
                0,
                self.num_test,
                size=(int(self.num_test * args.eval_percent),))
            self.test += self.num_train + self.num_valid
        else:
            self.test = np.arange(self.num_train + self.num_valid,
                                  self.g.number_of_edges())
        print('|test|:', len(self.test))

        self.num_valid = len(self.valid)
        self.num_test = len(self.test)
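The eval_percent branch samples edge ids with replacement and then offsets them into the global edge-id space, where ids 0..num_train-1 are training edges; a small numpy-only check of the offsetting:

import numpy as np

num_train, num_valid, eval_percent = 100, 50, 0.2
valid = np.random.randint(
    0, num_valid, size=(int(num_valid * eval_percent),)) + num_train
print(len(valid), valid.min() >= 100, valid.max() < 150)  # 10 True True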
Example #21
    def create_sampler(
        self,
        batch_size,
        neg_sample_size=2,
        neg_chunk_size=None,
        mode="head",
        num_workers=32,
        shuffle=True,
        exclude_positive=False,
        rank=0,
    ):
        """Create sampler for training

        Parameters
        ----------
        batch_size : int
            Batch size of each mini batch.
        neg_sample_size : int
            How many negative edges are sampled for each node.
        neg_chunk_size : int
            How many edges are in one chunk. We split one batch into chunks.
        mode : str
            Sampling mode.
        num_workers : int
            Number of workers used in parallel for this sampler.
        shuffle : bool
            If True, shuffle the seed edges.
            If False, do not shuffle the seed edges.
            Default: True
        exclude_positive : bool
            If True, exclude true positive edges from the sampled negatives.
            If False, return all sampled negative edges, even positive ones.
            Default: False
        rank : int
            Which partition to sample.

        Returns
        -------
        dgl.contrib.sampling.EdgeSampler
            Edge sampler
        """
        EdgeSampler = getattr(dgl.contrib.sampling, "EdgeSampler")
        assert batch_size % neg_sample_size == 0, \
            "batch_size should be divisible by neg_sample_size"
        return EdgeSampler(
            self.g,
            seed_edges=F.tensor(self.edge_parts[rank]),
            batch_size=batch_size,
            neg_sample_size=int(neg_sample_size / neg_chunk_size),
            chunk_size=neg_chunk_size,
            negative_mode=mode,
            num_workers=num_workers,
            shuffle=shuffle,
            exclude_positive=exclude_positive,
            return_false_neg=False,
        )
Example #22
    def __init__(self, dataset, args):
        triples = dataset.train + dataset.valid + dataset.test
        pickle_name = "graph_all.pickle"
        if args.pickle_graph and os.path.exists(
                os.path.join(args.data_path, args.dataset, pickle_name)):
            with open(os.path.join(args.data_path, args.dataset, pickle_name),
                      "rb") as graph_file:
                g = pickle.load(graph_file)
                print("Load pickled graph.")
        else:
            src = [t[0] for t in triples]
            etype_id = [t[1] for t in triples]
            dst = [t[2] for t in triples]
            coo = sp.sparse.coo_matrix(
                (np.ones(len(src)), (src, dst)),
                shape=[dataset.n_entities, dataset.n_entities])
            g = dgl.DGLGraph(coo, readonly=True, sort_csr=True)
            g.ndata["id"] = F.arange(0, g.number_of_nodes())
            g.edata["id"] = F.tensor(etype_id, F.int64)
            if args.pickle_graph:
                with open(
                        os.path.join(args.data_path, args.dataset,
                                     pickle_name), "wb") as graph_file:
                    pickle.dump(g, graph_file)
        self.g = g

        self.num_train = len(dataset.train)
        self.num_valid = len(dataset.valid)
        self.num_test = len(dataset.test)

        if args.eval_percent < 1:
            self.valid = (np.random.randint(
                0,
                self.num_valid,
                size=(int(self.num_valid * args.eval_percent), )) +
                          self.num_train)
        else:
            self.valid = np.arange(self.num_train,
                                   self.num_train + self.num_valid)
        print("|valid|:", len(self.valid))

        if args.eval_percent < 1:
            self.test = np.random.randint(
                0,
                self.num_test,
                size=(int(self.num_test * args.eval_percent),),
            )
            self.test += self.num_train + self.num_valid
        else:
            self.test = np.arange(self.num_train + self.num_valid,
                                  self.g.number_of_edges())
        print("|test|:", len(self.test))

        self.num_valid = len(self.valid)
        self.num_test = len(self.test)
Example #23
    def _load(self, mol_to_graph, node_featurizer, edge_featurizer):
        if self.load:
            self.graphs, label_dict = load_graphs(
                osp.join(self.file_dir, "{}_graphs.bin".format(self.mode)))
            self.labels = label_dict['labels']
            with open(
                    osp.join(self.file_dir, "{}_smiles.txt".format(self.mode)),
                    'r') as f:
                smiles_ = f.readlines()
                self.smiles = [s.strip() for s in smiles_]
        else:
            print('Start preprocessing dataset...')
            target_file = pathlib.Path(self.file_dir,
                                       "{}_target.csv".format(self.mode))
            self.target = pd.read_csv(
                target_file,
                index_col=0,
                usecols=[
                    'gdb_idx',
                ] + ['property_{:d}'.format(x) for x in range(12)])
            self.target = self.target[[
                'property_{:d}'.format(x) for x in range(12)
            ]]
            self.graphs, self.labels, self.smiles = [], [], []

            supp = Chem.SDMolSupplier(
                osp.join(self.file_dir, self.mode + ".sdf"))
            cnt = 0
            dataset_size = len(self.target)
            for mol, label in zip(supp, self.target.iterrows()):
                cnt += 1
                print('Processing molecule {:d}/{:d}'.format(
                    cnt, dataset_size))
                graph = mol_to_graph(mol,
                                     node_featurizer=node_featurizer,
                                     edge_featurizer=edge_featurizer)
                smiles = Chem.MolToSmiles(mol)
                self.smiles.append(smiles)
                self.graphs.append(graph)
                label = F.tensor(
                    np.array(label[1].tolist()).astype(np.float32))
                self.labels.append(label)

            save_graphs(osp.join(self.file_dir,
                                 "{}_graphs.bin".format(self.mode)),
                        self.graphs,
                        labels={'labels': F.stack(self.labels, dim=0)})
            with open(
                    osp.join(self.file_dir, "{}_smiles.txt".format(self.mode)),
                    'w') as f:
                for s in self.smiles:
                    f.write(s + '\n')

        self.set_mean_and_std()
        print(len(self.graphs), "loaded!")
Example #24
def main():
    parser = argparse.ArgumentParser(description='Partition a knowledge graph')
    parser.add_argument('--data_path', type=str, default='data',
                        help='root path of all dataset')
    parser.add_argument('--dataset', type=str, default='FB15k',
                        help='dataset name, under data_path')
    parser.add_argument('--data_files', type=str, default=None, nargs='+',
                        help='a list of data files, e.g. entity relation train valid test')
    parser.add_argument('--format', type=str, default='built_in',
                        help='the format of the dataset, it can be built_in, '\
                                'raw_udd_{htr} or udd_{htr}')
    parser.add_argument('-k', '--num-parts', required=True, type=int,
                        help='The number of partitions')
    args = parser.parse_args()
    num_parts = args.num_parts

    print('load dataset..')

    # load dataset and samplers
    dataset = get_dataset(args.data_path, args.dataset, args.format, args.data_files)

    print('construct graph...')

    src, etype_id, dst = dataset.train
    coo = sp.sparse.coo_matrix((np.ones(len(src)), (src, dst)),
            shape=[dataset.n_entities, dataset.n_entities])
    g = dgl.DGLGraph(coo, readonly=True, multigraph=True, sort_csr=True)
    g.edata['tid'] = F.tensor(etype_id, F.int64)

    print('partition graph...')

    part_dict = dgl.transform.metis_partition(g, num_parts, 1)

    tot_num_inner_edges = 0
    for part_id in part_dict:
        part = part_dict[part_id]

        num_inner_nodes = len(np.nonzero(F.asnumpy(part.ndata['inner_node']))[0])
        num_inner_edges = len(np.nonzero(F.asnumpy(part.edata['inner_edge']))[0])
        print('part {} has {} nodes and {} edges. {} nodes and {} edges are inside the partition'.format(
              part_id, part.number_of_nodes(), part.number_of_edges(),
              num_inner_nodes, num_inner_edges))
        tot_num_inner_edges += num_inner_edges

        part.copy_from_parent()

    print('write graph to txt file...')

    txt_file_graph = os.path.join(args.data_path, args.dataset)
    txt_file_graph = os.path.join(txt_file_graph, 'partition_')
    write_txt_graph(txt_file_graph, 'train.txt', part_dict, g.number_of_nodes(), dataset.n_relations)

    print('there are {} edges in the graph and {} edge cuts for {} partitions.'.format(
        g.number_of_edges(), g.number_of_edges() - tot_num_inner_edges, len(part_dict)))
Example #25
    def __init__(self, dataset, args):
        src = [dataset.train[0]]
        etype_id = [dataset.train[1]]
        dst = [dataset.train[2]]
        self.num_train = len(dataset.train[0])
        if dataset.valid is not None:
            src.append(dataset.valid[0])
            etype_id.append(dataset.valid[1])
            dst.append(dataset.valid[2])
            self.num_valid = len(dataset.valid[0])
        else:
            self.num_valid = 0
        if dataset.test is not None:
            src.append(dataset.test[0])
            etype_id.append(dataset.test[1])
            dst.append(dataset.test[2])
            self.num_test = len(dataset.test[0])
        else:
            self.num_test = 0
        assert len(
            src) > 1, "we need at least a validation set or a test set."
        src = np.concatenate(src)
        etype_id = np.concatenate(etype_id)
        dst = np.concatenate(dst)

        coo = sp.sparse.coo_matrix(
            (np.ones(len(src)), (src, dst)),
            shape=[dataset.n_entities, dataset.n_entities])
        g = dgl.DGLGraph(coo, readonly=True, multigraph=True, sort_csr=True)
        g.edata['tid'] = F.tensor(etype_id, F.int64)
        self.g = g

        if args.eval_percent < 1:
            self.valid = np.random.randint(
                0,
                self.num_valid,
                size=(int(
                    self.num_valid * args.eval_percent), )) + self.num_train
        else:
            self.valid = np.arange(self.num_train,
                                   self.num_train + self.num_valid)
        print('|valid|:', len(self.valid))

        if args.eval_percent < 1:
            self.test = np.random.randint(
                0,
                self.num_test,
                size=(int(self.num_test * args.eval_percent),))
            self.test += self.num_train + self.num_valid
        else:
            self.test = np.arange(self.num_train + self.num_valid,
                                  self.g.number_of_edges())
        print('|test|:', len(self.test))
Example #26
    def _split_labels(self, device, valid_ratio=0.1, test_ratio=0.2):
        if len(self._labels) == 1 and self._labels[0].is_homo:
            ids, labels = self._labels[0].id_labels
            ids = F.tensor(ids).to(device)
            labels = F.tensor(labels).to(device)
            num_labels = ids.shape[0]
            idx = np.arange(num_labels)
            np.random.shuffle(idx)
            train_cnt = int((1 - test_ratio) * num_labels)
            train_idx = idx[:train_cnt]
            test_idx = idx[train_cnt:]
            valid_cnt = int(valid_ratio * num_labels)
            valid_idx = train_idx[:valid_cnt]
            train_idx = train_idx[valid_cnt:]

            self._test_set = (ids[test_idx], labels[test_idx])
            self._valid_set = (ids[valid_idx], labels[valid_idx])
            self._train_set = (ids[train_idx], labels[train_idx])
        else:
            # (TODO xiangsx) hetero graph
            assert False
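Note the split order above: test_ratio of all labels is held out first, then valid_ratio of the total is carved out of the remaining training indices. A numpy-only sketch of the resulting sizes:

import numpy as np

num_labels, valid_ratio, test_ratio = 1000, 0.1, 0.2
idx = np.random.permutation(num_labels)
train_cnt = int((1 - test_ratio) * num_labels)
train_idx, test_idx = idx[:train_cnt], idx[train_cnt:]
valid_cnt = int(valid_ratio * num_labels)
valid_idx, train_idx = train_idx[:valid_cnt], train_idx[valid_cnt:]
print(len(train_idx), len(valid_idx), len(test_idx))  # 700 100 200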
Example #27
    def __next__(self):
        """Get next batch

        Returns
        -------
        tuple of Tensor
            (query, correct_index, candidates) tensors for the current
            evaluation mode ('t,r->h' or 'h,r->t')
        """
        if self.cnt == self.num_edges:
            raise StopIteration
        beg = self.cnt
        if self.cnt + self.batch_size > self.num_edges:
            self.cnt = self.num_edges
        else:
            self.cnt += self.batch_size
        if self.mode == 't,r->h':
            return (
                F.tensor(self.edges['t,r->h']['tr'][beg:self.cnt], F.int64),
                F.tensor(self.edges['t,r->h']['h_correct_index'][beg:self.cnt],
                         F.int64),
                F.tensor(self.edges['t,r->h']['h_candidate'][beg:self.cnt],
                         F.int64),
            )
        elif self.mode == 'h,r->t':
            return (
                F.tensor(self.edges['h,r->t']['hr'][beg:self.cnt], F.int64),
                F.tensor(self.edges['h,r->t']['t_correct_index'][beg:self.cnt],
                         F.int64),
                F.tensor(self.edges['h,r->t']['t_candidate'][beg:self.cnt],
                         F.int64),
            )
Example #28
    def process(self):
        # graph
        coo_adj = sp.load_npz(os.path.join(self._raw_dir, "amazon_graph.npz"))
        self._graph = from_scipy(coo_adj)
        # features and labels
        amazon_data = np.load(os.path.join(self._raw_dir, "amazon_data.npz"))
        features = amazon_data["feature"]
        labels = amazon_data["label"]
        # train/val/test indices
        node_types = amazon_data["node_types"]
        train_mask = (node_types == 1)
        val_mask = (node_types == 2)
        test_mask = (node_types == 3)
        self._graph.ndata['train_mask'] = generate_mask_tensor(train_mask)
        self._graph.ndata['val_mask'] = generate_mask_tensor(val_mask)
        self._graph.ndata['test_mask'] = generate_mask_tensor(test_mask)
        self._graph.ndata['feat'] = F.tensor(
            features, dtype=F.data_type_dict['float32'])
        self._graph.ndata['label'] = F.tensor(
            labels, dtype=F.data_type_dict['int64'])
        self._print_info()
Example #29
    def create_sampler(self, batch_size, neg_sample_size=2, mode='head',
                       num_workers=5, shuffle=True, exclude_positive=False,
                       rank=0):
        EdgeSampler = getattr(dgl.contrib.sampling, 'EdgeSampler')
        return EdgeSampler(self.g,
                           seed_edges=F.tensor(self.edge_parts[rank]),
                           batch_size=batch_size,
                           neg_sample_size=neg_sample_size,
                           negative_mode=mode,
                           num_workers=num_workers,
                           shuffle=shuffle,
                           exclude_positive=exclude_positive,
                           return_false_neg=False)
Example #30
    def __next__(self):
        """Get next batch

        Returns
        -------
        tuple of Tensor
            (query, correct_index, candidates) tensors for the current
            evaluation mode ('t,r->h' or 'h,r->t')
        """
        if self.cnt == self.num_edges:
            raise StopIteration
        beg = self.cnt
        if self.cnt + self.batch_size > self.num_edges:
            self.cnt = self.num_edges
        else:
            self.cnt += self.batch_size
        if self.mode == "t,r->h":
            return (
                F.tensor(self.edges["t,r->h"]["tr"][beg:self.cnt], F.int64),
                F.tensor(self.edges["t,r->h"]["h_correct_index"][beg:self.cnt],
                         F.int64),
                F.tensor(self.edges["t,r->h"]["h_candidate"][beg:self.cnt],
                         F.int64),
            )
        elif self.mode == "h,r->t":
            return (
                F.tensor(self.edges["h,r->t"]["hr"][beg:self.cnt], F.int64),
                F.tensor(self.edges["h,r->t"]["t_correct_index"][beg:self.cnt],
                         F.int64),
                F.tensor(self.edges["h,r->t"]["t_candidate"][beg:self.cnt],
                         F.int64),
            )
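The batching logic above (advance cnt by batch_size, clamp at num_edges, slice [beg:cnt]) is independent of DGL; a self-contained sketch of the same pattern:

class BatchSlicer:
    # minimal stand-in reproducing the cnt/batch_size bookkeeping above
    def __init__(self, data, batch_size):
        self.data, self.batch_size = data, batch_size
        self.cnt, self.num_edges = 0, len(data)

    def __iter__(self):
        return self

    def __next__(self):
        if self.cnt == self.num_edges:
            raise StopIteration
        beg = self.cnt
        self.cnt = min(self.cnt + self.batch_size, self.num_edges)
        return self.data[beg:self.cnt]

print(list(BatchSlicer(list(range(7)), 3)))  # [[0, 1, 2], [3, 4, 5], [6]]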