def fast_train_mp(self):
    """Train with multiple CPU cores, or a mix of CPU and multiple GPUs.

    Spawns one `fast_train_sp` worker per entry in ``self.args.gpus``,
    waits for all of them, then saves the learned embeddings.
    """
    self.init_device_emb()
    # Put the embedding model into shared memory so all workers update it.
    self.emb_model.share_memory()
    sum_up_params(self.emb_model)

    t_begin = time.time()
    workers = []
    for rank, gpu_id in enumerate(self.args.gpus):
        worker = mp.Process(target=self.fast_train_sp, args=(rank, gpu_id))
        worker.start()
        workers.append(worker)
    for worker in workers:
        worker.join()
    print("Used time: %.2fs" % (time.time() - t_begin))

    # Persist the trained embeddings in the requested on-disk format.
    save = (self.emb_model.save_embedding_pt
            if self.args.save_in_pt
            else self.emb_model.save_embedding)
    save(self.dataset, self.args.output_emb_file)
def create_async_update(self):
    """Start the background subprocess that applies asynchronous updates."""
    # Capacity-1 queue: the producer blocks rather than piling up work.
    queue = Queue(1)
    worker = mp.Process(target=async_update,
                        args=(self.num_threads, self, queue))
    self.async_q = queue
    self.async_p = worker
    worker.start()
def main(args, devices):
    """Load Reddit and train on CPU, a single GPU, or multiple GPUs."""
    # Load the Reddit node-classification dataset.
    dataset = RedditDataset(self_loop=False)
    n_classes = dataset.num_classes
    g = dataset[0]
    train_mask = g.ndata['train_mask']
    val_mask = g.ndata['val_mask']
    test_mask = g.ndata['test_mask']

    # Create csr/coo/csc formats before launching training processes with
    # multi-gpu. This avoids creating certain formats in each sub-process,
    # which saves memory and CPU.
    g.create_formats_()

    # Pack data
    data = train_mask, val_mask, test_mask, n_classes, g
    n_gpus = len(devices)

    if devices[0] == -1:
        # CPU-only run.
        run(0, 0, args, ['cpu'], data)
    elif n_gpus == 1:
        run(0, n_gpus, args, devices, data)
    else:
        # One training process per GPU; wait for all of them.
        children = []
        for rank in range(n_gpus):
            child = mp.Process(target=run,
                               args=(rank, n_gpus, args, devices, data))
            child.start()
            children.append(child)
        for child in children:
            child.join()
def main(args):
    """Parse the GPU list, load the dataset, and dispatch training.

    Runs single-process training on CPU (``args.gpu == '-1'``) or one GPU,
    and spawns one subprocess per device for multi-GPU training.
    """
    devices = list(map(int, args.gpu.split(',')))
    n_gpus = len(devices)

    # load dataset
    if args.dataset == 'reddit':
        g, n_classes = load_reddit(self_loop=False)
    elif args.dataset == 'ogbn-products':
        g, n_classes = load_ogb('ogbn-products')
    else:
        raise Exception('unknown dataset')

    train_nid = g.ndata.pop('train_mask').nonzero().squeeze()
    val_nid = g.ndata.pop('val_mask').nonzero().squeeze()
    test_nid = g.ndata.pop('test_mask').nonzero().squeeze()
    nfeat = g.ndata.pop('features')
    labels = g.ndata.pop('labels')

    # Create csr/coo/csc formats before launching training processes with multi-gpu.
    # This avoids creating certain formats in each sub-process, which saves memory and CPU.
    g.create_formats_()

    # This avoids thread-contention overhead on machines with many cores.
    # Change it to a proper number on your machine, especially for multi-GPU training.
    # Clamp to at least 1: with many GPUs and few cores the integer division
    # would otherwise yield 0, which is an invalid OMP_NUM_THREADS value.
    os.environ['OMP_NUM_THREADS'] = str(max(1, mp.cpu_count() // 2 // n_gpus))

    if n_gpus > 1:
        # Copy the graph to shared memory explicitly before pinning.
        # In other cases, we can just rely on fork's copy-on-write.
        # TODO: the original graph g is not freed.
        if args.graph_device == 'uva':
            g = g.shared_memory('g')
        if args.data_device == 'uva':
            nfeat = nfeat.share_memory_()
            labels = labels.share_memory_()

    # Pack data
    data = train_nid, val_nid, test_nid, n_classes, g, nfeat, labels

    if devices[0] == -1:
        # CPU-only training: UVA sampling/feature storage require a GPU.
        assert args.graph_device == 'cpu', \
            f"Must have GPUs to enable {args.graph_device} sampling."
        assert args.data_device == 'cpu', \
            f"Must have GPUs to enable {args.data_device} feature storage."
        run(0, 0, args, ['cpu'], data)
    elif n_gpus == 1:
        run(0, n_gpus, args, devices, data)
    else:
        # One training process per GPU; wait for all of them.
        procs = []
        for proc_id in range(n_gpus):
            p = mp.Process(target=run,
                           args=(proc_id, n_gpus, args, devices, data))
            p.start()
            procs.append(p)
        for p in procs:
            p.join()
def main(args, devices):
    """Load the dataset and launch one training process per device."""
    g, num_rels, num_classes, labels, train_idx, test_idx, target_idx, inv_target = load_data(
        args.dataset, inv_target=True)

    # Build csr/coo/csc formats once in the parent so the subprocesses do not
    # each construct their own copy (saves memory and CPU).
    g.create_formats_()

    n_gpus = len(devices)
    n_cpus = mp.cpu_count()
    queue = mp.Queue(n_gpus)

    # Payload handed to every worker; the distributed data parallel
    # dataloader inside `run` takes care of splitting the data.
    payload = (g, num_classes, num_rels, target_idx, inv_target,
               train_idx, test_idx, labels)

    workers = []
    for rank in range(n_gpus):
        w = mp.Process(target=run,
                       args=(rank, n_gpus, n_cpus // n_gpus, args, devices,
                             payload, queue))
        w.start()
        workers.append(w)
    for w in workers:
        w.join()
def train_model(network_data):
    """Sample random walks per edge type, build training pairs, and launch
    (multi-)GPU training."""
    index2word, vocab, type_nodes = generate_vocab(network_data)
    edge_types = list(network_data.keys())
    num_nodes = len(index2word)
    edge_type_count = len(edge_types)

    devices = list(map(int, args.gpu.split(",")))
    n_gpus = len(devices)
    neighbor_samples = args.neighbor_samples
    num_workers = args.workers

    g = get_graph(network_data, vocab)

    # One batch of random-walk traces per edge type; a walk that visits
    # `neighbor_samples` nodes needs a metapath of `neighbor_samples - 1` hops.
    all_walks = []
    for i, etype in enumerate(edge_types):
        seeds = torch.LongTensor(type_nodes[i] * args.num_walks)
        traces, _ = dgl.sampling.random_walk(
            g, seeds, metapath=[etype] * (neighbor_samples - 1))
        all_walks.append(traces)

    train_pairs = generate_pairs(all_walks, args.window_size, num_workers)
    data = g, train_pairs, index2word, edge_types, num_nodes, edge_type_count

    if n_gpus == 1:
        run(0, n_gpus, args, devices, data)
    else:
        # One training process per GPU; wait for all of them.
        workers = []
        for rank in range(n_gpus):
            w = mp.Process(target=run,
                           args=(rank, n_gpus, args, devices, data))
            w.start()
            workers.append(w)
        for w in workers:
            w.join()
# NOTE(review): fragment — the enclosing function's `def` line and the
# matching `if`/`elif` dataset-selection chain are outside this view.
    else:
        raise Exception('unknown dataset')

    # Construct graph
    g = dgl.as_heterograph(g)

    # Inductive setting: hold validation/test nodes out of the training graph;
    # otherwise train/val/test all share the full graph.
    if args.inductive:
        train_g, val_g, test_g = inductive_split(g)
    else:
        train_g = val_g = test_g = g

    # Create csr/coo/csc formats before launching training processes with multi-gpu.
    # This avoids creating certain formats in each sub-process, which saves memory and CPU.
    train_g.create_formats_()
    val_g.create_formats_()
    test_g.create_formats_()

    # Pack data
    data = n_classes, train_g, val_g, test_g

    if n_gpus == 1:
        run(0, n_gpus, args, devices, data)
    else:
        # One training process per GPU; wait for all of them.
        procs = []
        for proc_id in range(n_gpus):
            p = mp.Process(target=run, args=(proc_id, n_gpus, args, devices, data))
            p.start()
            procs.append(p)
        for p in procs:
            p.join()
# NOTE(review): fragment — the first three statements are the tail of a
# per-process training function whose `def` line is outside this view.
    test_acc = evaluate(model, test_loader, device)
    print('Test acc: {:.4f}'.format(test_acc))
    # Tear down the distributed process group before the worker exits.
    dist.destroy_process_group()

###############################################################################
# Finally we load the dataset and launch the processes.
#
# .. note::
#
#    You will need to use ``dgl.multiprocessing`` instead of the Python
#    ``multiprocessing`` package. ``dgl.multiprocessing`` is identical to
#    Python’s built-in ``multiprocessing`` except that it handles the
#    subtleties between forking and multithreading in Python.
#

if __name__ == '__main__':
    import dgl.multiprocessing as mp
    from dgl.data import GINDataset

    num_gpus = 4
    procs = []
    dataset = GINDataset(name='IMDBBINARY', self_loop=False)
    # Spawn one `main` process per GPU rank and wait for all of them.
    for rank in range(num_gpus):
        p = mp.Process(target=main, args=(rank, num_gpus, dataset))
        p.start()
        procs.append(p)
    for p in procs:
        p.join()
# NOTE(review): fragment — the first two statements close an argument-check
# branch; the guard condition (presumably n_gpus <= 1) is outside this view.
    print("make sure the number of gpus greater than 1!")
    sys.exit()

dataset = MAG240MDataset(root=args.rootdir)

print('Loading graph')
(g, ), _ = dgl.load_graphs(args.graph_path)
# Keep only the CSC sparse format for the graph.
g = g.formats(['csc'])

print('Loading features')
# Author and institution nodes precede papers: papers occupy the id range
# [paper_offset, num_nodes).
paper_offset = dataset.num_authors + dataset.num_institutions
num_nodes = paper_offset + dataset.num_papers
num_features = dataset.num_paper_features
# Memory-map the full feature matrix read-only so every worker process
# shares one on-disk copy instead of loading it into RAM.
feats = np.memmap(args.full_feature_path, mode='r', dtype='float16',
                  shape=(num_nodes, num_features))

# One training process per GPU; wait for all of them, then test once.
procs = []
for proc_id in range(n_gpus):
    p = mp.Process(target=train,
                   args=(proc_id, n_gpus, args, dataset, g, feats, paper_offset))
    p.start()
    procs.append(p)
for p in procs:
    p.join()

test(args, dataset, g, feats, paper_offset)
# method: # graph.create_formats_() ###################################################################### # Then you can spawn the subprocesses to train with multiple GPUs. # # .. note:: # # You will need to use ``dgl.multiprocessing`` instead of the Python # ``multiprocessing`` package. ``dgl.multiprocessing`` is identical to # Python’s built-in ``multiprocessing`` except that it handles the # subtleties between forking and multithreading in Python. # # Say you have four GPUs. num_gpus = 4 import dgl.multiprocessing as mp devices = list(range(num_gpus)) procs = [] for proc_id in range(num_gpus): p = mp.Process(target=run, args=(proc_id, devices)) p.start() procs.append(p) for p in procs: p.join() # Thumbnail credits: Stanford CS224W Notes # sphinx_gallery_thumbnail_path = '_static/blitz_1_introduction.png'
# NOTE(review): fragment — the epoch-timing tail of a `train` function whose
# `def` line (and the epoch loop these statements sit in) is outside this view.
        tt = time.time()
        if rank == 0:
            print(tt - t0)
        durations.append(tt - t0)
    if rank == 0:
        # Report mean/std epoch time, skipping the first 4 warm-up epochs.
        print(np.mean(durations[4:]), np.std(durations[4:]))


if __name__ == '__main__':
    dataset = DglNodePropPredDataset('ogbn-products')
    graph, labels = dataset[0]
    graph.ndata['label'] = labels
    # Build sparse formats up front so the child processes do not rebuild them.
    graph.create_formats_()
    split_idx = dataset.get_idx_split()
    num_classes = dataset.num_classes
    n_procs = 4

    # Tested with mp.spawn and fork. Both worked and got 4s per epoch with 4 GPUs
    # and 3.86s per epoch with 8 GPUs on p2.8x, compared to 5.2s from official examples.
    #import torch.multiprocessing as mp
    #mp.spawn(train, args=(n_procs, graph, num_classes, split_idx), nprocs=n_procs)
    import dgl.multiprocessing as mp
    procs = []
    for i in range(n_procs):
        p = mp.Process(target=train,
                       args=(i, n_procs, graph, num_classes, split_idx))
        p.start()
        procs.append(p)
    for p in procs:
        p.join()
def main(args, devices):
    """Load an RGCN dataset, flatten it to a homogeneous graph, and launch
    CPU / single-GPU / multi-GPU training.
    """
    # load graph data
    ogb_dataset = False
    if args.dataset == 'aifb':
        dataset = AIFBDataset()
    elif args.dataset == 'mutag':
        dataset = MUTAGDataset()
    elif args.dataset == 'bgs':
        dataset = BGSDataset()
    elif args.dataset == 'am':
        dataset = AMDataset()
    elif args.dataset == 'ogbn-mag':
        dataset = DglNodePropPredDataset(name=args.dataset)
        ogb_dataset = True
    else:
        raise ValueError()

    if ogb_dataset is True:
        # OGB datasets ship their own split; rebuild the heterograph with a
        # reverse edge type added for every canonical edge type.
        split_idx = dataset.get_idx_split()
        train_idx = split_idx["train"]['paper']
        val_idx = split_idx["valid"]['paper']
        test_idx = split_idx["test"]['paper']
        hg_orig, labels = dataset[0]
        subgs = {}
        for etype in hg_orig.canonical_etypes:
            u, v = hg_orig.all_edges(etype=etype)
            subgs[etype] = (u, v)
            subgs[(etype[2], 'rev-'+etype[1], etype[0])] = (v, u)
        hg = dgl.heterograph(subgs)
        hg.nodes['paper'].data['feat'] = hg_orig.nodes['paper'].data['feat']
        labels = labels['paper'].squeeze()

        num_rels = len(hg.canonical_etypes)
        num_of_ntype = len(hg.ntypes)
        num_classes = dataset.num_classes

        if args.dataset == 'ogbn-mag':
            category = 'paper'
        print('Number of relations: {}'.format(num_rels))
        print('Number of class: {}'.format(num_classes))
        print('Number of train: {}'.format(len(train_idx)))
        print('Number of valid: {}'.format(len(val_idx)))
        print('Number of test: {}'.format(len(test_idx)))
    else:
        # Load from hetero-graph
        hg = dataset[0]

        num_rels = len(hg.canonical_etypes)
        num_of_ntype = len(hg.ntypes)
        category = dataset.predict_category
        num_classes = dataset.num_classes
        train_mask = hg.nodes[category].data.pop('train_mask')
        test_mask = hg.nodes[category].data.pop('test_mask')
        labels = hg.nodes[category].data.pop('labels')
        train_idx = th.nonzero(train_mask, as_tuple=False).squeeze()
        test_idx = th.nonzero(test_mask, as_tuple=False).squeeze()

        # AIFB, MUTAG, BGS and AM datasets do not provide validation set split.
        # Split train set into train and validation if args.validation is set
        # otherwise use train set as the validation set.
        if args.validation:
            val_idx = train_idx[:len(train_idx) // 5]
            train_idx = train_idx[len(train_idx) // 5:]
        else:
            val_idx = train_idx

    node_feats = []
    for ntype in hg.ntypes:
        # A featureless node type contributes its node count (presumably used
        # to size an embedding table downstream — TODO confirm in `run`);
        # real feature tensors are moved to shared memory.
        if len(hg.nodes[ntype].data) == 0 or args.node_feats is False:
            node_feats.append(hg.number_of_nodes(ntype))
        else:
            assert len(hg.nodes[ntype].data) == 1
            feat = hg.nodes[ntype].data.pop('feat')
            node_feats.append(feat.share_memory_())

    # get target category id
    category_id = len(hg.ntypes)
    for i, ntype in enumerate(hg.ntypes):
        if ntype == category:
            category_id = i
        print('{}:{}'.format(i, ntype))

    g = dgl.to_homogeneous(hg)
    # Keep the per-node/edge type and original-id bookkeeping in shared
    # memory so forked worker processes reuse a single copy.
    g.ndata['ntype'] = g.ndata[dgl.NTYPE]
    g.ndata['ntype'].share_memory_()
    g.edata['etype'] = g.edata[dgl.ETYPE]
    g.edata['etype'].share_memory_()
    g.ndata['type_id'] = g.ndata[dgl.NID]
    g.ndata['type_id'].share_memory_()
    node_ids = th.arange(g.number_of_nodes())

    # find out the target node ids
    node_tids = g.ndata[dgl.NTYPE]
    loc = (node_tids == category_id)
    target_idx = node_ids[loc]
    target_idx.share_memory_()
    train_idx.share_memory_()
    val_idx.share_memory_()
    test_idx.share_memory_()

    # Create csr/coo/csc formats before launching training processes with multi-gpu.
    # This avoids creating certain formats in each sub-process, which saves memory and CPU.
    g.create_formats_()
    n_gpus = len(devices)
    n_cpus = mp.cpu_count()
    # cpu
    if devices[0] == -1:
        run(0, 0, n_cpus, args, ['cpu'],
            (g, node_feats, num_of_ntype, num_classes, num_rels, target_idx,
             train_idx, val_idx, test_idx, labels), None, None)
    # gpu
    elif n_gpus == 1:
        run(0, n_gpus, n_cpus, args, devices,
            (g, node_feats, num_of_ntype, num_classes, num_rels, target_idx,
             train_idx, val_idx, test_idx, labels), None, None)
    # multi gpu
    else:
        queue = mp.Queue(n_gpus)
        procs = []
        num_train_seeds = train_idx.shape[0]
        num_valid_seeds = val_idx.shape[0]
        num_test_seeds = test_idx.shape[0]
        # Shuffle once in the parent; each worker gets a contiguous slice.
        train_seeds = th.randperm(num_train_seeds)
        valid_seeds = th.randperm(num_valid_seeds)
        test_seeds = th.randperm(num_test_seeds)
        tseeds_per_proc = num_train_seeds // n_gpus
        vseeds_per_proc = num_valid_seeds // n_gpus
        tstseeds_per_proc = num_test_seeds // n_gpus
        # NOTE(review): with floor division the `else num_*_seeds` arms below
        # can only trigger when the split is exact, so up to n_gpus-1 remainder
        # seeds are silently dropped — presumably to keep per-process batch
        # counts balanced across workers; confirm this is intended.
        for proc_id in range(n_gpus):
            # we have multi-gpu for training, evaluation and testing
            # so split train set, valid set and test set into num-of-gpu parts.
            proc_train_seeds = train_seeds[proc_id * tseeds_per_proc :
                                           (proc_id + 1) * tseeds_per_proc
                                           if (proc_id + 1) * tseeds_per_proc < num_train_seeds
                                           else num_train_seeds]
            proc_valid_seeds = valid_seeds[proc_id * vseeds_per_proc :
                                           (proc_id + 1) * vseeds_per_proc
                                           if (proc_id + 1) * vseeds_per_proc < num_valid_seeds
                                           else num_valid_seeds]
            proc_test_seeds = test_seeds[proc_id * tstseeds_per_proc :
                                         (proc_id + 1) * tstseeds_per_proc
                                         if (proc_id + 1) * tstseeds_per_proc < num_test_seeds
                                         else num_test_seeds]
            p = mp.Process(target=run,
                           args=(proc_id, n_gpus, n_cpus // n_gpus, args, devices,
                                 (g, node_feats, num_of_ntype, num_classes, num_rels,
                                  target_idx, train_idx, val_idx, test_idx, labels),
                                 (proc_train_seeds, proc_valid_seeds, proc_test_seeds),
                                 queue))
            p.start()
            procs.append(p)
        for p in procs:
            p.join()
def main(args, devices):
    """Load an RGCN dataset, flatten it to a homogeneous graph (with an
    inverse target-id mapping), and launch CPU / single-GPU / multi-GPU
    training.
    """
    # load graph data
    ogb_dataset = False
    if args.dataset == 'aifb':
        dataset = AIFBDataset()
    elif args.dataset == 'mutag':
        dataset = MUTAGDataset()
    elif args.dataset == 'bgs':
        dataset = BGSDataset()
    elif args.dataset == 'am':
        dataset = AMDataset()
    elif args.dataset == 'ogbn-mag':
        dataset = DglNodePropPredDataset(name=args.dataset)
        ogb_dataset = True
    else:
        raise ValueError()

    if ogb_dataset is True:
        # OGB datasets ship their own split; rebuild the heterograph with a
        # reverse edge type added for every canonical edge type.
        split_idx = dataset.get_idx_split()
        train_idx = split_idx["train"]['paper']
        val_idx = split_idx["valid"]['paper']
        test_idx = split_idx["test"]['paper']
        hg_orig, labels = dataset[0]
        subgs = {}
        for etype in hg_orig.canonical_etypes:
            u, v = hg_orig.all_edges(etype=etype)
            subgs[etype] = (u, v)
            subgs[(etype[2], 'rev-' + etype[1], etype[0])] = (v, u)
        hg = dgl.heterograph(subgs)
        hg.nodes['paper'].data['feat'] = hg_orig.nodes['paper'].data['feat']
        labels = labels['paper'].squeeze()

        num_rels = len(hg.canonical_etypes)
        num_of_ntype = len(hg.ntypes)
        num_classes = dataset.num_classes

        if args.dataset == 'ogbn-mag':
            category = 'paper'
        print('Number of relations: {}'.format(num_rels))
        print('Number of class: {}'.format(num_classes))
        print('Number of train: {}'.format(len(train_idx)))
        print('Number of valid: {}'.format(len(val_idx)))
        print('Number of test: {}'.format(len(test_idx)))
    else:
        # Load from hetero-graph
        hg = dataset[0]

        num_rels = len(hg.canonical_etypes)
        num_of_ntype = len(hg.ntypes)
        category = dataset.predict_category
        num_classes = dataset.num_classes
        train_mask = hg.nodes[category].data.pop('train_mask')
        test_mask = hg.nodes[category].data.pop('test_mask')
        labels = hg.nodes[category].data.pop('labels')
        train_idx = th.nonzero(train_mask, as_tuple=False).squeeze()
        test_idx = th.nonzero(test_mask, as_tuple=False).squeeze()

        # AIFB, MUTAG, BGS and AM datasets do not provide validation set split.
        # Split train set into train and validation if args.validation is set
        # otherwise use train set as the validation set.
        if args.validation:
            val_idx = train_idx[:len(train_idx) // 5]
            train_idx = train_idx[len(train_idx) // 5:]
        else:
            val_idx = train_idx

    node_feats = []
    for ntype in hg.ntypes:
        # A featureless node type contributes its node count (presumably used
        # to size an embedding table downstream — TODO confirm in `run`);
        # real feature tensors are moved to shared memory.
        if len(hg.nodes[ntype].data) == 0 or args.node_feats is False:
            node_feats.append(hg.number_of_nodes(ntype))
        else:
            assert len(hg.nodes[ntype].data) == 1
            feat = hg.nodes[ntype].data.pop('feat')
            node_feats.append(feat.share_memory_())

    # get target category id
    category_id = len(hg.ntypes)
    for i, ntype in enumerate(hg.ntypes):
        if ntype == category:
            category_id = i
        print('{}:{}'.format(i, ntype))

    g = dgl.to_homogeneous(hg)
    # Keep the per-node/edge type and original-id bookkeeping in shared
    # memory so forked worker processes reuse a single copy.
    g.ndata['ntype'] = g.ndata[dgl.NTYPE]
    g.ndata['ntype'].share_memory_()
    g.edata['etype'] = g.edata[dgl.ETYPE]
    g.edata['etype'].share_memory_()
    g.ndata['type_id'] = g.ndata[dgl.NID]
    g.ndata['type_id'].share_memory_()
    node_ids = th.arange(g.number_of_nodes())

    # find out the target node ids
    node_tids = g.ndata[dgl.NTYPE]
    loc = (node_tids == category_id)
    target_idx = node_ids[loc]
    target_idx.share_memory_()
    train_idx.share_memory_()
    val_idx.share_memory_()
    test_idx.share_memory_()

    # This is a graph with multiple node types, so we want a way to map
    # our target node from their global node numberings, back to their
    # numberings within their type. This is used when taking the nodes in a
    # mini-batch, and looking up their type-specific labels
    inv_target = th.empty(node_ids.shape, dtype=node_ids.dtype)
    inv_target.share_memory_()
    inv_target[target_idx] = th.arange(0, target_idx.shape[0],
                                       dtype=inv_target.dtype)

    # Create csr/coo/csc formats before launching training processes with multi-gpu.
    # This avoids creating certain formats in each sub-process, which saves memory and CPU.
    g.create_formats_()
    n_gpus = len(devices)
    n_cpus = mp.cpu_count()
    # cpu
    if devices[0] == -1:
        run(0, 0, n_cpus, args, ['cpu'],
            (g, node_feats, num_of_ntype, num_classes, num_rels, target_idx,
             inv_target, train_idx, val_idx, test_idx, labels), None)
    # gpu
    elif n_gpus == 1:
        run(0, n_gpus, n_cpus, args, devices,
            (g, node_feats, num_of_ntype, num_classes, num_rels, target_idx,
             inv_target, train_idx, val_idx, test_idx, labels), None)
    # multi gpu
    else:
        queue = mp.Queue(n_gpus)
        procs = []
        for proc_id in range(n_gpus):
            # We use distributed data parallel dataloader to handle the data
            # splitting
            p = mp.Process(target=run,
                           args=(proc_id, n_gpus, n_cpus // n_gpus, args, devices,
                                 (g, node_feats, num_of_ntype, num_classes, num_rels,
                                  target_idx, inv_target, train_idx, val_idx,
                                  test_idx, labels),
                                 queue))
            p.start()
            procs.append(p)
        for p in procs:
            p.join()