def build_graph(self, mg, src, dst, ntid, etid, ntypes, etypes):
    # create homogeneous graph
    print('Creating one whole graph ...')
    g = dgl.graph((src, dst))
    g.ndata[dgl.NTYPE] = F.tensor(ntid)
    g.edata[dgl.ETYPE] = F.tensor(etid)
    print('Total #nodes:', g.number_of_nodes())
    print('Total #edges:', g.number_of_edges())

    # rename names such as 'type' so that they can be used as keys
    # to nn.ModuleDict
    etypes = [RENAME_DICT.get(ty, ty) for ty in etypes]
    mg_edges = mg.edges(keys=True)
    mg = nx.MultiDiGraph()
    for sty, dty, ety in mg_edges:
        mg.add_edge(sty, dty, key=RENAME_DICT.get(ety, ety))

    # convert to heterograph
    print('Convert to heterograph ...')
    hg = dgl.to_hetero(g, ntypes, etypes, metagraph=mg)
    print('#Node types:', len(hg.ntypes))
    print('#Canonical edge types:', len(hg.etypes))
    print('#Unique edge type names:', len(set(hg.etypes)))
    self.graph = hg
def create_partition_policy(args):
    """Create GraphPartitionBook and PartitionPolicy
    """
    g = dgl.DGLGraph()
    g.add_nodes(args.graph_size)
    g.add_edge(0, 1)  # we don't use edge data in our benchmark
    global_nid = F.tensor(np.arange(args.graph_size) + args.machine_id * args.graph_size)
    global_eid = F.tensor([args.machine_id])
    node_map = np.zeros((args.graph_size * 2,), np.int64)
    node_map[args.graph_size:] = 1
    node_map = F.tensor(node_map)
    edge_map = F.tensor([0, 1])
    g.ndata[dgl.NID] = global_nid
    g.edata[dgl.EID] = global_eid
    gpb = dgl.distributed.GraphPartitionBook(part_id=args.machine_id,
                                             num_parts=args.num_machine,
                                             node_map=node_map,
                                             edge_map=edge_map,
                                             part_graph=g)
    policy = dgl.distributed.PartitionPolicy(policy_str='node',
                                             part_id=args.machine_id,
                                             partition_book=gpb)
    return policy, gpb
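# Usage sketch, assumption-labeled: a minimal driver for
# create_partition_policy above. The `args` fields (graph_size,
# machine_id, num_machine) are inferred from the function body; the
# argparse.Namespace built here is hypothetical, not part of the source.
import argparse

args = argparse.Namespace(graph_size=1000, machine_id=0, num_machine=2)
policy, gpb = create_partition_policy(args)
print(policy, gpb)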
def init_data(self, name, shape, init_type='zero', low=0.0, high=0.0):
    """Initialize kvstore tensor

    Parameters
    ----------
    name : str
        data name
    shape : list of int
        shape of tensor
    init_type : str
        initialization method, either 'zero' or 'uniform'
    low : float
        lower bound, used when init_type is 'uniform'
    high : float
        upper bound, used when init_type is 'uniform'
    """
    self._data_size[name] = shape[0]
    count = math.ceil(shape[0] / self._server_count)
    # We hack the msg format here
    init_type = 0.0 if init_type == 'zero' else 1.0
    threshold = F.tensor([[init_type, init_type], [low, high]])
    # partition shape on server
    for server_id in range(self._server_count):
        par_shape = shape.copy()
        if shape[0] - server_id * count >= count:
            par_shape[0] = count
        else:
            par_shape[0] = shape[0] - server_id * count
        tensor_shape = F.tensor(par_shape)
        msg = KVStoreMsg(type=KVMsgType.INIT,
                         rank=self._client_id,
                         name=name,
                         id=tensor_shape,
                         data=threshold)
        _send_kv_msg(self._sender, msg, server_id)
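# Usage sketch, assumption-labeled: how a client would call init_data
# above. The rows of a (100, 16) tensor are split across servers in
# `count`-sized chunks, and the (init_type, low, high) settings travel
# in the hacked message payload. `client` is a hypothetical,
# already-connected KVStore client instance.
client.init_data(name='entity_embed', shape=[100, 16],
                 init_type='uniform', low=-1.0, high=1.0)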
def build_graph(self, mg, src, dst, ntid, etid, ntypes, etypes):
    """Build the graphs

    Parameters
    ----------
    mg : MultiDiGraph
        Input graph
    src : Numpy array
        Source nodes
    dst : Numpy array
        Destination nodes
    ntid : Numpy array
        Node type for each node
    etid : Numpy array
        Edge type for each edge
    ntypes : list
        Node types
    etypes : list
        Edge types

    Returns
    -------
    g : DGLGraph
    """
    # create homogeneous graph
    if self.verbose:
        print('Creating one whole graph ...')
    g = dgl.graph((src, dst))
    g.ndata[dgl.NTYPE] = F.tensor(ntid)
    g.edata[dgl.ETYPE] = F.tensor(etid)
    if self.verbose:
        print('Total #nodes:', g.number_of_nodes())
        print('Total #edges:', g.number_of_edges())

    # rename names such as 'type' so that they can be used as keys
    # to nn.ModuleDict
    etypes = [RENAME_DICT.get(ty, ty) for ty in etypes]
    mg_edges = mg.edges(keys=True)
    mg = nx.MultiDiGraph()
    for sty, dty, ety in mg_edges:
        mg.add_edge(sty, dty, key=RENAME_DICT.get(ety, ety))

    # convert to heterograph
    if self.verbose:
        print('Convert to heterograph ...')
    hg = dgl.to_hetero(g, ntypes, etypes, metagraph=mg)
    if self.verbose:
        print('#Node types:', len(hg.ntypes))
        print('#Canonical edge types:', len(hg.etypes))
        print('#Unique edge type names:', len(set(hg.etypes)))
    return hg
def alchemy_edges(mol, self_loop=False):
    """Featurization for all bonds in a molecule. The bond indices
    will be preserved.

    Parameters
    ----------
    mol : rdkit.Chem.rdchem.Mol
        RDKit molecule object
    self_loop : bool
        Whether to add self loops. Default to be False.

    Returns
    -------
    bond_feats_dict : dict
        Dictionary for bond features
    """
    bond_feats_dict = defaultdict(list)

    mol_conformers = mol.GetConformers()
    assert len(mol_conformers) == 1
    geom = mol_conformers[0].GetPositions()

    num_atoms = mol.GetNumAtoms()
    for u in range(num_atoms):
        for v in range(num_atoms):
            if u == v and not self_loop:
                continue

            e_uv = mol.GetBondBetweenAtoms(u, v)
            if e_uv is None:
                bond_type = None
            else:
                bond_type = e_uv.GetBondType()
            bond_feats_dict["e_feat"].append(
                [float(bond_type == x)
                 for x in (Chem.rdchem.BondType.SINGLE,
                           Chem.rdchem.BondType.DOUBLE,
                           Chem.rdchem.BondType.TRIPLE,
                           Chem.rdchem.BondType.AROMATIC,
                           None)])
            bond_feats_dict["distance"].append(
                np.linalg.norm(geom[u] - geom[v]))

    bond_feats_dict["e_feat"] = FF.tensor(
        np.array(bond_feats_dict["e_feat"]).astype(np.float32))
    bond_feats_dict["distance"] = FF.tensor(
        np.array(bond_feats_dict["distance"]).astype(np.float32)).reshape(-1, 1)

    return bond_feats_dict
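# Usage sketch, assumption-labeled: featurizing one molecule with
# alchemy_edges above. The function asserts exactly one conformer, so we
# embed 3D coordinates first with RDKit; the SMILES string is an
# arbitrary example.
from rdkit import Chem
from rdkit.Chem import AllChem

mol = Chem.AddHs(Chem.MolFromSmiles('CCO'))
AllChem.EmbedMolecule(mol)  # generates the single required conformer
feats = alchemy_edges(mol, self_loop=False)
# e_feat has one row per ordered atom pair (u != v); distance is a column vector
print(feats['e_feat'].shape, feats['distance'].shape)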
def load_data_split(self, ent2id):
    label_dict = {}
    labels = np.zeros((self.graph.number_of_nodes(self.predict_category),)) - 1
    train_idx = self.parse_idx_file(
        os.path.join(self._dir, 'trainingSet.tsv'),
        ent2id, label_dict, labels)
    test_idx = self.parse_idx_file(
        os.path.join(self._dir, 'testSet.tsv'),
        ent2id, label_dict, labels)
    self.train_idx = F.tensor(train_idx)
    self.test_idx = F.tensor(test_idx)
    self.labels = F.tensor(labels).long()
    self.num_classes = len(label_dict)
def generate_samplers(self):
    # Generate train samplers
    train_samplers = []
    for i in range(self.num_proc):
        print("Building training sampler for proc %d" % i)
        t1 = time.time()
        # for each GPU, allocate num_proc // num_GPU processes
        train_sampler_head = dgl.contrib.sampling.EdgeSampler(
            self.train_data,
            seed_edges=F.tensor(self.train_data.edge_parts[i]),
            batch_size=self.batch_size,
            neg_sample_size=self.neg_sample_size,
            chunk_size=self.neg_sample_size,
            negative_mode='head',
            num_workers=self.num_workers,
            shuffle=True,
            exclude_positive=False,
            return_false_neg=False)
        train_sampler_tail = dgl.contrib.sampling.EdgeSampler(
            self.train_data,
            seed_edges=F.tensor(self.train_data.edge_parts[i]),
            batch_size=self.batch_size,
            neg_sample_size=self.neg_sample_size,
            chunk_size=self.neg_sample_size,
            negative_mode='tail',
            num_workers=self.num_workers,
            shuffle=True,
            exclude_positive=False,
            return_false_neg=False)
        print(train_sampler_head)
        print(train_sampler_tail)
        train_samplers.append(NewBidirectionalOneShotIterator(
            dataloader_head=train_sampler_head,
            dataloader_tail=train_sampler_tail,
            neg_chunk_size=self.neg_sample_size,
            neg_sample_size=self.neg_sample_size,
            is_chunked=True,
            num_nodes=self.n_entities,
            has_edge_importance=self.has_edge_importance))
        print("Training sampler for proc {} created, it takes {} seconds".format(
            i, time.time() - t1))
    # keep the samplers on the instance so the training loop can reach them
    self.train_samplers = train_samplers
def create_range_partition_policy(args):
    """Create RangePartitionBook and PartitionPolicy
    """
    node_map = F.tensor(np.array([args.graph_size, 2 * args.graph_size], np.int64))
    edge_map = F.tensor([1, 2])
    gpb = dgl.distributed.RangePartitionBook(part_id=args.machine_id,
                                             num_parts=args.num_machine,
                                             node_map=node_map,
                                             edge_map=edge_map)
    policy = dgl.distributed.PartitionPolicy(policy_str='node',
                                             partition_book=gpb)
    return policy, gpb
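# Usage sketch, assumption-labeled: the range-based variant is driven
# the same way as the hash-based policy above; it only needs the same
# hypothetical graph_size, machine_id and num_machine fields on `args`.
policy, gpb = create_range_partition_policy(args)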
def load_cache(self):
    mg = nx.read_gpickle(os.path.join(self._dir, 'cached_mg.gpickle'))
    src = np.load(os.path.join(self._dir, 'cached_src.npy'))
    dst = np.load(os.path.join(self._dir, 'cached_dst.npy'))
    ntid = np.load(os.path.join(self._dir, 'cached_ntid.npy'))
    etid = np.load(os.path.join(self._dir, 'cached_etid.npy'))
    ntypes = load_strlist(os.path.join(self._dir, 'cached_ntypes.txt'))
    etypes = load_strlist(os.path.join(self._dir, 'cached_etypes.txt'))
    self.train_idx = F.tensor(np.load(os.path.join(self._dir, 'cached_train_idx.npy')))
    self.test_idx = F.tensor(np.load(os.path.join(self._dir, 'cached_test_idx.npy')))
    labels = np.load(os.path.join(self._dir, 'cached_labels.npy'))
    self.num_classes = labels.max() + 1
    self.labels = F.tensor(labels)
    self.build_graph(mg, src, dst, ntid, etid, ntypes, etypes)
def pull_model(self, client, pos_g, neg_g):
    with th.no_grad():
        entity_id = F.cat(seq=[pos_g.ndata["id"], neg_g.ndata["id"]], dim=0)
        relation_id = pos_g.edata["id"]
        entity_id = F.tensor(np.unique(F.asnumpy(entity_id)))
        relation_id = F.tensor(np.unique(F.asnumpy(relation_id)))

        l2g = client.get_local2global()
        global_entity_id = l2g[entity_id]

        entity_data = client.pull(name="entity_emb", id_tensor=global_entity_id)
        relation_data = client.pull(name="relation_emb", id_tensor=relation_id)

        self.entity_emb.emb[entity_id] = entity_data
        self.relation_emb.emb[relation_id] = relation_data
def build_training_dataset(self):
    # construct the training graph
    print("To build training dataset")
    t1 = time.time()
    # extract source, edge type and target from the triplets
    src, etype_id, dst = self.train
    coo = sp.sparse.coo_matrix((np.ones(len(src)), (src, dst)),
                               shape=[self.n_entities, self.n_entities])
    self.train_data = dgl.DGLGraph(coo, readonly=True, multigraph=True, sort_csr=True)
    self.train_data.edata['tid'] = F.tensor(etype_id, F.int64)
    self.train_data.edge_parts = CustomPartition(
        edges=self.train,
        n=self.num_proc,
        p=0.1,
        has_importance=self.has_edge_importance)
    # self.train_data.edge_parts = RandomPartition(edges=self.train, n=self.num_proc, has_importance=self.has_edge_importance)
    print(self.train_data.edge_parts[0])
    print(len(self.train_data.edge_parts[0]))
    self.train_data.cross_part = True
    print("Training dataset built, it takes %d seconds" % (time.time() - t1))
def build_graph(self, mg, src, dst, ntid, etid, ntypes, etypes):
    # create homogeneous graph
    print('Creating one whole graph ...')
    g = dgl.graph((src, dst))
    g.ndata[dgl.NTYPE] = F.tensor(ntid)
    g.edata[dgl.ETYPE] = F.tensor(etid)
    print('Total #nodes:', g.number_of_nodes())
    print('Total #edges:', g.number_of_edges())

    # convert to heterograph
    print('Convert to heterograph ...')
    hg = dgl.to_hetero(g, ntypes, etypes, metagraph=mg)
    print('#Node types:', len(hg.ntypes))
    print('#Canonical edge types:', len(hg.etypes))
    print('#Unique edge type names:', len(set(hg.etypes)))
    self.graph = hg
def _load(self):
    # load vocab file
    self.vocab = OrderedDict()
    with open(self.vocab_file, encoding='utf-8') as vf:
        for line in vf.readlines():
            line = line.strip()
            self.vocab[line] = len(self.vocab)

    # filter glove
    if self.pretrained_file != '' and os.path.exists(self.pretrained_file):
        glove_emb = {}
        with open(self.pretrained_file, 'r', encoding='utf-8') as pf:
            for line in pf.readlines():
                sp = line.split(' ')
                if sp[0].lower() in self.vocab:
                    glove_emb[sp[0].lower()] = np.array([float(x) for x in sp[1:]])

    files = ['{}.txt'.format(self.mode)]
    corpus = BracketParseCorpusReader('{}/sst'.format(self.dir), files)
    sents = corpus.parsed_sents(files[0])

    # initialize with glove
    pretrained_emb = []
    fail_cnt = 0
    for line in self.vocab.keys():
        if self.pretrained_file != '' and os.path.exists(self.pretrained_file):
            if not line.lower() in glove_emb:
                fail_cnt += 1
            pretrained_emb.append(glove_emb.get(line.lower(),
                                                np.random.uniform(-0.05, 0.05, 300)))

    if self.pretrained_file != '' and os.path.exists(self.pretrained_file):
        self.pretrained_emb = F.tensor(np.stack(pretrained_emb, 0))
        print('Miss word in GloVe {0:.4f}'.format(1.0 * fail_cnt / len(self.pretrained_emb)))

    # build trees
    for sent in sents:
        self.trees.append(self._build_tree(sent))
def ConstructGraph(edges, n_entities, args):
    """Construct Graph for training

    Parameters
    ----------
    edges : (heads, rels, tails) triple
        Edge list
    n_entities : int
        number of entities
    args :
        Global configs.
    """
    pickle_name = 'graph_train.pickle'
    if args.pickle_graph and os.path.exists(
            os.path.join(args.data_path, args.dataset, pickle_name)):
        with open(os.path.join(args.data_path, args.dataset, pickle_name), 'rb') as graph_file:
            g = pickle.load(graph_file)
            print('Load pickled graph.')
    else:
        src, etype_id, dst = edges
        coo = sp.sparse.coo_matrix((np.ones(len(src)), (src, dst)),
                                   shape=[n_entities, n_entities])
        g = dgl.DGLGraph(coo, readonly=True, multigraph=True, sort_csr=True)
        g.edata['tid'] = F.tensor(etype_id, F.int64)
        if args.pickle_graph:
            with open(os.path.join(args.data_path, args.dataset, pickle_name), 'wb') as graph_file:
                pickle.dump(g, graph_file)
    return g
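# Usage sketch, assumption-labeled: building the training graph from raw
# triples with ConstructGraph above. `dataset` is assumed to expose train
# as a (heads, rels, tails) triple plus an n_entities count, and `args`
# to carry the pickle_graph/data_path/dataset fields the function reads.
heads, rels, tails = dataset.train
g = ConstructGraph((heads, rels, tails), dataset.n_entities, args)
print(g.number_of_nodes(), g.number_of_edges())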
def generate_rand_graph(n, func_name):
    arr = (sp.sparse.random(n, n, density=0.1, format='coo') != 0).astype(np.int64)
    g = dgl.DGLGraph(arr, readonly=True)
    num_rels = 10
    entity_emb = F.uniform((g.number_of_nodes(), 10), F.float32, F.cpu(), 0, 1)
    if func_name == 'RotatE':
        entity_emb = F.uniform((g.number_of_nodes(), 20), F.float32, F.cpu(), 0, 1)
    rel_emb = F.uniform((num_rels, 10), F.float32, F.cpu(), -1, 1)
    if func_name == 'RESCAL':
        rel_emb = F.uniform((num_rels, 10 * 10), F.float32, F.cpu(), 0, 1)
    g.ndata['id'] = F.arange(0, g.number_of_nodes())
    rel_ids = np.random.randint(0, num_rels, g.number_of_edges(), dtype=np.int64)
    g.edata['id'] = F.tensor(rel_ids, F.int64)
    # TransR has an additional projection_emb
    if func_name == 'TransR':
        args = {'gpu': -1, 'lr': 0.1}
        args = dotdict(args)
        projection_emb = ExternalEmbedding(args, 10, 10 * 10, F.cpu())
        return g, entity_emb, rel_emb, (12.0, projection_emb, 10, 10)
    elif func_name == 'TransE':
        return g, entity_emb, rel_emb, (12.0,)  # one-element tuple, note the trailing comma
    elif func_name == 'RESCAL':
        return g, entity_emb, rel_emb, (10, 10)
    elif func_name == 'RotatE':
        return g, entity_emb, rel_emb, (12.0, 1.0)
    else:
        return g, entity_emb, rel_emb, None
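# Usage sketch, assumption-labeled: drawing a random test fixture for
# the TransE score function; the embedding widths follow the sizes
# hard-coded in generate_rand_graph above.
g, entity_emb, rel_emb, extra = generate_rand_graph(100, 'TransE')
print(g.number_of_edges(), extra)  # extra is the (gamma,) hyper-parameter tuple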
def start_server(args):
    """Start kvstore service
    """
    server_namebook = dgl.contrib.read_ip_config(filename=args.ip_config)
    my_server = KVServer(server_id=args.server_id,
                         server_namebook=server_namebook,
                         num_client=args.num_client)
    # num_entries is assumed to be defined at module level: the size of
    # the per-machine embedding shard.
    data = F.zeros((num_entries, args.dim_size), F.float32, F.cpu())
    g2l = F.zeros(num_entries * args.num_servers, F.int64, F.cpu())
    start = num_entries * my_server.get_machine_id()
    end = num_entries * (my_server.get_machine_id() + 1)
    g2l[start:end] = F.arange(0, num_entries)
    partition = np.arange(args.num_servers)
    partition = F.tensor(np.repeat(partition, num_entries))
    if my_server.get_id() % my_server.get_group_count() == 0:  # master server
        my_server.set_global2local(name='entity_embed', global2local=g2l)
        my_server.init_data(name='entity_embed', data_tensor=data)
        my_server.set_partition_book(name='entity_embed', partition_book=partition)
    else:
        my_server.set_global2local(name='entity_embed')
        my_server.init_data(name='entity_embed')
        my_server.set_partition_book(name='entity_embed')
    my_server.print()
    my_server.start()
def split_dataset(self, dataset_pairs, label_dict, glb2lcl):
    total = len(dataset_pairs)
    train_set_size = int(total * 0.8)
    entities, truths = zip(*dataset_pairs)
    local_entities = []
    labels = np.zeros((self.graph.number_of_nodes(self.predict_category),)) - 1
    for i, entity in enumerate(entities):
        local_id = glb2lcl[entity]
        local_entities.append(local_id)
        labels[local_id] = truths[i]
    train_entities = local_entities[:train_set_size]
    test_entities = local_entities[train_set_size:]
    self.train_idx = F.tensor(train_entities)
    self.test_idx = F.tensor(test_entities)
    self.labels = F.tensor(labels).long()
    self.num_classes = len(label_dict)
def process(self):
    DS_edge_list = self._idx_from_zero(
        loadtxt(self._file_path("A"), delimiter=",").astype(int))
    DS_indicator = self._idx_from_zero(
        loadtxt(self._file_path("graph_indicator"), delimiter=",").astype(int))
    DS_graph_labels = self._idx_reset(
        loadtxt(self._file_path("graph_labels"), delimiter=",").astype(int))

    g = dgl_graph(([], []))
    g.add_nodes(int(DS_edge_list.max()) + 1)
    g.add_edges(DS_edge_list[:, 0], DS_edge_list[:, 1])

    node_idx_list = []
    self.max_num_node = 0
    for idx in range(np.max(DS_indicator) + 1):
        node_idx = np.where(DS_indicator == idx)
        node_idx_list.append(node_idx[0])
        if len(node_idx[0]) > self.max_num_node:
            self.max_num_node = len(node_idx[0])

    self.num_labels = max(DS_graph_labels) + 1
    self.graph_labels = F.tensor(DS_graph_labels)

    self.attr_dict = {
        'node_labels': ('ndata', 'node_labels'),
        'node_attributes': ('ndata', 'node_attr'),
        'edge_labels': ('edata', 'edge_labels'),
        'edge_attributes': ('edata', 'edge_attr'),
    }

    for filename, field_name in self.attr_dict.items():
        try:
            data = loadtxt(self._file_path(filename), delimiter=',').astype(float)
            if 'label' in filename:
                data = F.tensor(self._idx_from_zero(data))
            else:
                data = F.tensor(data)
            getattr(g, field_name[0])[field_name[1]] = data
        except IOError:
            pass

    self.graph_lists = [g.subgraph(node_idx) for node_idx in node_idx_list]
def _load_node_feature(self, device):
    if len(self._features) == 1 and self._features[0].is_homo:
        features = self._features[0]
        ft = F.tensor(features.features)
        ft = F.copy_to(ft, device)
        self._g.ndata['homo_f'] = ft
    else:
        # (TODO xiangsx) hetero graph
        assert False
def __init__(self, dataset, args):
    pickle_name = 'graph_all.pickle'
    if args.pickle_graph and os.path.exists(
            os.path.join(args.data_path, args.dataset, pickle_name)):
        with open(os.path.join(args.data_path, args.dataset, pickle_name), 'rb') as graph_file:
            g = pickle.load(graph_file)
            print('Load pickled graph.')
    else:
        src = np.concatenate((dataset.train[0], dataset.valid[0], dataset.test[0]))
        etype_id = np.concatenate((dataset.train[1], dataset.valid[1], dataset.test[1]))
        dst = np.concatenate((dataset.train[2], dataset.valid[2], dataset.test[2]))
        coo = sp.sparse.coo_matrix((np.ones(len(src)), (src, dst)),
                                   shape=[dataset.n_entities, dataset.n_entities])
        g = dgl.DGLGraph(coo, readonly=True, sort_csr=True)
        g.ndata['id'] = F.arange(0, g.number_of_nodes())
        g.edata['id'] = F.tensor(etype_id, F.int64)
        if args.pickle_graph:
            with open(os.path.join(args.data_path, args.dataset, pickle_name), 'wb') as graph_file:
                pickle.dump(g, graph_file)
    self.g = g
    self.num_train = len(dataset.train[0])
    self.num_valid = len(dataset.valid[0])
    self.num_test = len(dataset.test[0])

    if args.eval_percent < 1:
        self.valid = np.random.randint(
            0, self.num_valid,
            size=(int(self.num_valid * args.eval_percent),)) + self.num_train
    else:
        self.valid = np.arange(self.num_train, self.num_train + self.num_valid)
    print('|valid|:', len(self.valid))

    if args.eval_percent < 1:
        self.test = np.random.randint(
            0, self.num_test,
            size=(int(self.num_test * args.eval_percent),))
        self.test += self.num_train + self.num_valid
    else:
        self.test = np.arange(self.num_train + self.num_valid,
                              self.g.number_of_edges())
    print('|test|:', len(self.test))
    self.num_valid = len(self.valid)
    self.num_test = len(self.test)
def create_sampler(self, batch_size, neg_sample_size=2, neg_chunk_size=None,
                   mode="head", num_workers=32, shuffle=True,
                   exclude_positive=False, rank=0):
    """Create sampler for training

    Parameters
    ----------
    batch_size : int
        Batch size of each mini batch.
    neg_sample_size : int
        How many negative edges sampled for each node.
    neg_chunk_size : int
        How many edges in one chunk. We split one batch into chunks.
    mode : str
        Sampling mode.
    num_workers : int
        Number of workers used in parallel for this sampler
    shuffle : bool
        If True, shuffle the seed edges.
        If False, do not shuffle the seed edges.
        Default: True
    exclude_positive : bool
        If True, exclude true positive edges in sampled negative edges.
        If False, return all sampled negative edges even if there are positive edges.
        Default: False
    rank : int
        Which partition to sample.

    Returns
    -------
    dgl.contrib.sampling.EdgeSampler
        Edge sampler
    """
    EdgeSampler = getattr(dgl.contrib.sampling, "EdgeSampler")
    assert batch_size % neg_sample_size == 0, \
        "batch_size should be divisible by neg_sample_size"
    return EdgeSampler(self.g,
                       seed_edges=F.tensor(self.edge_parts[rank]),
                       batch_size=batch_size,
                       neg_sample_size=int(neg_sample_size / neg_chunk_size),
                       chunk_size=neg_chunk_size,
                       negative_mode=mode,
                       num_workers=num_workers,
                       shuffle=shuffle,
                       exclude_positive=exclude_positive,
                       return_false_neg=False)
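# Usage sketch, assumption-labeled: consuming the sampler built by
# create_sampler above. Per the EdgeSampler contract this code relies
# on, each iteration yields a positive subgraph and its chunked
# negative counterpart; `dataset` is a hypothetical instance of the
# surrounding class.
sampler = dataset.create_sampler(batch_size=1024, neg_sample_size=16,
                                 neg_chunk_size=16, mode='head', rank=0)
for pos_g, neg_g in sampler:
    print(pos_g.number_of_edges(), neg_g.number_of_edges())
    break  # inspect a single batch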
def __init__(self, dataset, args):
    triples = dataset.train + dataset.valid + dataset.test
    pickle_name = "graph_all.pickle"
    if args.pickle_graph and os.path.exists(
            os.path.join(args.data_path, args.dataset, pickle_name)):
        with open(os.path.join(args.data_path, args.dataset, pickle_name), "rb") as graph_file:
            g = pickle.load(graph_file)
            print("Load pickled graph.")
    else:
        src = [t[0] for t in triples]
        etype_id = [t[1] for t in triples]
        dst = [t[2] for t in triples]
        coo = sp.sparse.coo_matrix((np.ones(len(src)), (src, dst)),
                                   shape=[dataset.n_entities, dataset.n_entities])
        g = dgl.DGLGraph(coo, readonly=True, sort_csr=True)
        g.ndata["id"] = F.arange(0, g.number_of_nodes())
        g.edata["id"] = F.tensor(etype_id, F.int64)
        if args.pickle_graph:
            with open(os.path.join(args.data_path, args.dataset, pickle_name), "wb") as graph_file:
                pickle.dump(g, graph_file)
    self.g = g
    self.num_train = len(dataset.train)
    self.num_valid = len(dataset.valid)
    self.num_test = len(dataset.test)

    if args.eval_percent < 1:
        self.valid = (np.random.randint(
            0, self.num_valid,
            size=(int(self.num_valid * args.eval_percent),)) + self.num_train)
    else:
        self.valid = np.arange(self.num_train, self.num_train + self.num_valid)
    print("|valid|:", len(self.valid))

    if args.eval_percent < 1:
        self.test = np.random.randint(
            0, self.num_test,
            size=(int(self.num_test * args.eval_percent),))
        self.test += self.num_train + self.num_valid
    else:
        self.test = np.arange(self.num_train + self.num_valid,
                              self.g.number_of_edges())
    print("|test|:", len(self.test))
    self.num_valid = len(self.valid)
    self.num_test = len(self.test)
def _load(self, mol_to_graph, node_featurizer, edge_featurizer):
    if self.load:
        self.graphs, label_dict = load_graphs(
            osp.join(self.file_dir, "{}_graphs.bin".format(self.mode)))
        self.labels = label_dict['labels']
        with open(osp.join(self.file_dir, "{}_smiles.txt".format(self.mode)), 'r') as f:
            smiles_ = f.readlines()
            self.smiles = [s.strip() for s in smiles_]
    else:
        print('Start preprocessing dataset...')
        target_file = pathlib.Path(self.file_dir, "{}_target.csv".format(self.mode))
        self.target = pd.read_csv(
            target_file, index_col=0,
            usecols=['gdb_idx'] + ['property_{:d}'.format(x) for x in range(12)])
        self.target = self.target[['property_{:d}'.format(x) for x in range(12)]]
        self.graphs, self.labels, self.smiles = [], [], []
        supp = Chem.SDMolSupplier(osp.join(self.file_dir, self.mode + ".sdf"))
        cnt = 0
        dataset_size = len(self.target)
        for mol, label in zip(supp, self.target.iterrows()):
            cnt += 1
            print('Processing molecule {:d}/{:d}'.format(cnt, dataset_size))
            graph = mol_to_graph(mol, node_featurizer=node_featurizer,
                                 edge_featurizer=edge_featurizer)
            smiles = Chem.MolToSmiles(mol)
            self.smiles.append(smiles)
            self.graphs.append(graph)
            label = F.tensor(np.array(label[1].tolist()).astype(np.float32))
            self.labels.append(label)
        save_graphs(osp.join(self.file_dir, "{}_graphs.bin".format(self.mode)),
                    self.graphs,
                    labels={'labels': F.stack(self.labels, dim=0)})
        with open(osp.join(self.file_dir, "{}_smiles.txt".format(self.mode)), 'w') as f:
            for s in self.smiles:
                f.write(s + '\n')
    self.set_mean_and_std()
    print(len(self.graphs), "loaded!")
def main():
    parser = argparse.ArgumentParser(description='Partition a knowledge graph')
    parser.add_argument('--data_path', type=str, default='data',
                        help='root path of all dataset')
    parser.add_argument('--dataset', type=str, default='FB15k',
                        help='dataset name, under data_path')
    parser.add_argument('--data_files', type=str, default=None, nargs='+',
                        help='a list of data files, e.g. entity relation train valid test')
    parser.add_argument('--format', type=str, default='built_in',
                        help='the format of the dataset, it can be built_in, '
                             'raw_udd_{htr} and udd_{htr}')
    parser.add_argument('-k', '--num-parts', required=True, type=int,
                        help='The number of partitions')
    args = parser.parse_args()
    num_parts = args.num_parts

    print('load dataset..')
    # load dataset and samplers
    dataset = get_dataset(args.data_path, args.dataset, args.format, args.data_files)

    print('construct graph...')
    src, etype_id, dst = dataset.train
    coo = sp.sparse.coo_matrix((np.ones(len(src)), (src, dst)),
                               shape=[dataset.n_entities, dataset.n_entities])
    g = dgl.DGLGraph(coo, readonly=True, multigraph=True, sort_csr=True)
    g.edata['tid'] = F.tensor(etype_id, F.int64)

    print('partition graph...')
    part_dict = dgl.transform.metis_partition(g, num_parts, 1)
    tot_num_inner_edges = 0
    for part_id in part_dict:
        part = part_dict[part_id]
        num_inner_nodes = len(np.nonzero(F.asnumpy(part.ndata['inner_node']))[0])
        num_inner_edges = len(np.nonzero(F.asnumpy(part.edata['inner_edge']))[0])
        print('part {} has {} nodes and {} edges. '
              '{} nodes and {} edges are inside the partition'.format(
                  part_id, part.number_of_nodes(), part.number_of_edges(),
                  num_inner_nodes, num_inner_edges))
        tot_num_inner_edges += num_inner_edges
        part.copy_from_parent()

    print('write graph to txt file...')
    txt_file_graph = os.path.join(args.data_path, args.dataset)
    txt_file_graph = os.path.join(txt_file_graph, 'partition_')
    write_txt_graph(txt_file_graph, 'train.txt', part_dict,
                    g.number_of_nodes(), dataset.n_relations)
    print('there are {} edges in the graph and {} edge cuts for {} partitions.'.format(
        g.number_of_edges(), g.number_of_edges() - tot_num_inner_edges, len(part_dict)))
def __init__(self, dataset, args):
    src = [dataset.train[0]]
    etype_id = [dataset.train[1]]
    dst = [dataset.train[2]]
    self.num_train = len(dataset.train[0])
    if dataset.valid is not None:
        src.append(dataset.valid[0])
        etype_id.append(dataset.valid[1])
        dst.append(dataset.valid[2])
        self.num_valid = len(dataset.valid[0])
    else:
        self.num_valid = 0
    if dataset.test is not None:
        src.append(dataset.test[0])
        etype_id.append(dataset.test[1])
        dst.append(dataset.test[2])
        self.num_test = len(dataset.test[0])
    else:
        self.num_test = 0
    assert len(src) > 1, "we need to have at least validation set or test set."
    src = np.concatenate(src)
    etype_id = np.concatenate(etype_id)
    dst = np.concatenate(dst)

    coo = sp.sparse.coo_matrix((np.ones(len(src)), (src, dst)),
                               shape=[dataset.n_entities, dataset.n_entities])
    g = dgl.DGLGraph(coo, readonly=True, multigraph=True, sort_csr=True)
    g.edata['tid'] = F.tensor(etype_id, F.int64)
    self.g = g

    if args.eval_percent < 1:
        self.valid = np.random.randint(
            0, self.num_valid,
            size=(int(self.num_valid * args.eval_percent),)) + self.num_train
    else:
        self.valid = np.arange(self.num_train, self.num_train + self.num_valid)
    print('|valid|:', len(self.valid))

    if args.eval_percent < 1:
        self.test = np.random.randint(
            0, self.num_test,
            size=(int(self.num_test * args.eval_percent),))
        self.test += self.num_train + self.num_valid
    else:
        self.test = np.arange(self.num_train + self.num_valid,
                              self.g.number_of_edges())
    print('|test|:', len(self.test))
def _split_labels(self, device, valid_ratio=0.1, test_ratio=0.2):
    if len(self._labels) == 1 and self._labels[0].is_homo:
        ids, labels = self._labels[0].id_labels
        ids = F.tensor(ids).to(device)
        labels = F.tensor(labels).to(device)
        num_labels = ids.shape[0]
        idx = np.arange(num_labels)
        np.random.shuffle(idx)
        train_cnt = int((1 - test_ratio) * num_labels)
        train_idx = idx[:train_cnt]
        test_idx = idx[train_cnt:]
        valid_cnt = int(valid_ratio * num_labels)
        valid_idx = train_idx[:valid_cnt]
        train_idx = train_idx[valid_cnt:]
        self._test_set = (ids[test_idx], labels[test_idx])
        self._valid_set = (ids[valid_idx], labels[valid_idx])
        self._train_set = (ids[train_idx], labels[train_idx])
    else:
        # (TODO xiangsx) hetero graph
        assert False
def __next__(self):
    """Get next batch

    Returns
    -------
    Tensor
        (entity, relation) queries for this batch
    Tensor
        Index of the correct answer within each candidate list
    Tensor
        Candidate entities for each query
    """
    if self.cnt == self.num_edges:
        raise StopIteration
    beg = self.cnt
    if self.cnt + self.batch_size > self.num_edges:
        self.cnt = self.num_edges
    else:
        self.cnt += self.batch_size
    if self.mode == 't,r->h':
        return F.tensor(self.edges['t,r->h']['tr'][beg:self.cnt], F.int64), \
               F.tensor(self.edges['t,r->h']['h_correct_index'][beg:self.cnt], F.int64), \
               F.tensor(self.edges['t,r->h']['h_candidate'][beg:self.cnt], F.int64)
    elif self.mode == 'h,r->t':
        return F.tensor(self.edges['h,r->t']['hr'][beg:self.cnt], F.int64), \
               F.tensor(self.edges['h,r->t']['t_correct_index'][beg:self.cnt], F.int64), \
               F.tensor(self.edges['h,r->t']['t_candidate'][beg:self.cnt], F.int64)
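# Usage sketch, assumption-labeled: consuming the iterator defined by
# __next__ above in 'h,r->t' mode. The owning class and its constructor
# are not shown here, so `sampler` is assumed to be an already-built
# instance.
for hr, t_correct_index, t_candidate in sampler:
    # hr holds the (head, relation) queries; t_candidate the entities to rank
    print(hr.shape, t_correct_index.shape, t_candidate.shape)
    break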
def process(self):
    # graph
    coo_adj = sp.load_npz(os.path.join(self._raw_dir, "amazon_graph.npz"))
    self._graph = from_scipy(coo_adj)
    # features and labels
    amazon_data = np.load(os.path.join(self._raw_dir, "amazon_data.npz"))
    features = amazon_data["feature"]
    labels = amazon_data["label"]
    # train/val/test indices
    node_types = amazon_data["node_types"]
    train_mask = (node_types == 1)
    val_mask = (node_types == 2)
    test_mask = (node_types == 3)
    self._graph.ndata['train_mask'] = generate_mask_tensor(train_mask)
    self._graph.ndata['val_mask'] = generate_mask_tensor(val_mask)
    self._graph.ndata['test_mask'] = generate_mask_tensor(test_mask)
    self._graph.ndata['feat'] = F.tensor(features, dtype=F.data_type_dict['float32'])
    self._graph.ndata['label'] = F.tensor(labels, dtype=F.data_type_dict['int64'])
    self._print_info()
def create_sampler(self, batch_size, neg_sample_size=2, mode='head',
                   num_workers=5, shuffle=True, exclude_positive=False, rank=0):
    EdgeSampler = getattr(dgl.contrib.sampling, 'EdgeSampler')
    return EdgeSampler(self.g,
                       seed_edges=F.tensor(self.edge_parts[rank]),
                       batch_size=batch_size,
                       neg_sample_size=neg_sample_size,
                       negative_mode=mode,
                       num_workers=num_workers,
                       shuffle=shuffle,
                       exclude_positive=exclude_positive,
                       return_false_neg=False)
def __next__(self):
    """Get next batch

    Returns
    -------
    Tensor
        (entity, relation) queries for this batch
    Tensor
        Index of the correct answer within each candidate list
    Tensor
        Candidate entities for each query
    """
    if self.cnt == self.num_edges:
        raise StopIteration
    beg = self.cnt
    if self.cnt + self.batch_size > self.num_edges:
        self.cnt = self.num_edges
    else:
        self.cnt += self.batch_size
    if self.mode == "t,r->h":
        return (
            F.tensor(self.edges["t,r->h"]["tr"][beg:self.cnt], F.int64),
            F.tensor(self.edges["t,r->h"]["h_correct_index"][beg:self.cnt], F.int64),
            F.tensor(self.edges["t,r->h"]["h_candidate"][beg:self.cnt], F.int64),
        )
    elif self.mode == "h,r->t":
        return (
            F.tensor(self.edges["h,r->t"]["hr"][beg:self.cnt], F.int64),
            F.tensor(self.edges["h,r->t"]["t_correct_index"][beg:self.cnt], F.int64),
            F.tensor(self.edges["h,r->t"]["t_candidate"][beg:self.cnt], F.int64),
        )