import os
import time

import numpy as np
import torch
import torch.nn.functional as F
import dgl
from dgl import DGLGraph

# `data`, `storage`, `GCNSampling`, `GCNInfer`, `SampleLoader`, and
# `init_process` are project-local helpers; their import paths depend on the
# surrounding repository layout and are assumed to be available here.

# Two variants of the distributed trainer follow. This first (baseline)
# variant pulls every mini-batch's features from the shared-memory graph
# store; the second adds a per-GPU feature cache over a per-rank partition.


def trainer(rank, world_size, args, backend='nccl'):
    # init multi-process training
    init_process(rank, world_size, backend)

    # load the full graph from the shared-memory graph store
    dataname = os.path.basename(args.dataset)
    g = dgl.contrib.graph_store.create_graph_from_store(dataname, "shared_mem")
    labels = data.get_labels(args.dataset)
    n_classes = len(np.unique(labels))

    # masks for semi-supervised learning; each rank takes a contiguous,
    # equal-sized chunk of the training nodes
    train_mask, val_mask, test_mask = data.get_masks(args.dataset)
    train_nid = np.nonzero(train_mask)[0].astype(np.int64)
    chunk_size = int(train_nid.shape[0] / world_size) - 1
    train_nid = train_nid[chunk_size * rank:chunk_size * (rank + 1)]
    test_nid = np.nonzero(test_mask)[0].astype(np.int64)

    # to torch tensors
    labels = torch.LongTensor(labels)
    train_mask = torch.ByteTensor(train_mask)
    val_mask = torch.ByteTensor(val_mask)
    test_mask = torch.ByteTensor(test_mask)

    # prepare model; with preprocessing the first aggregation is computed
    # ahead of time, so the sampler needs one hop less
    num_hops = args.n_layers if args.preprocess else args.n_layers + 1
    model = GCNSampling(args.feat_size, args.n_hidden, n_classes,
                        args.n_layers, F.relu, args.dropout, args.preprocess)
    loss_fcn = torch.nn.CrossEntropyLoss()
    optimizer = torch.optim.Adam(model.parameters(), lr=args.lr,
                                 weight_decay=args.weight_decay)
    model.cuda(rank)
    model = torch.nn.parallel.DistributedDataParallel(model, device_ids=[rank])
    ctx = torch.device(rank)

    # start training
    epoch_dur = []
    if args.remote_sample:
        sampler = SampleLoader(g, rank, one2all=False)
    else:
        sampler = dgl.contrib.sampling.NeighborSampler(
            g, args.batch_size, args.num_neighbors,
            neighbor_type='in', shuffle=True,
            num_workers=args.num_workers,
            num_hops=num_hops, seed_nodes=train_nid,
            prefetch=True)
    profile_begin = time.time()
    with torch.autograd.profiler.profile(enabled=(rank == 0), use_cuda=True) as prof:
        for epoch in range(args.n_epochs):
            model.train()
            epoch_start_time = time.time()
            step = 0
            for nf in sampler:
                with torch.autograd.profiler.record_function('gpu-load'):
                    # copy the sampled NodeFlow's features to this rank's GPU
                    nf.copy_from_parent(ctx=ctx)
                    batch_nids = nf.layer_parent_nid(-1)
                    label = labels[batch_nids]
                    label = label.cuda(rank, non_blocking=True)
                with torch.autograd.profiler.record_function('gpu-compute'):
                    pred = model(nf)
                    loss = loss_fcn(pred, label)
                    optimizer.zero_grad()
                    loss.backward()
                    optimizer.step()
                step += 1
                if rank == 0 and step % 20 == 0:
                    print('epoch [{}] step [{}]. Loss: {:.4f}'.format(
                        epoch + 1, step, loss.item()))
            if rank == 0:
                epoch_dur.append(time.time() - epoch_start_time)
                # the first two epochs are treated as warm-up
                print('Epoch average time: {:.4f}'.format(
                    np.mean(np.array(epoch_dur[2:]))))
    print('Total Time: {:.4f}s'.format(time.time() - profile_begin))
    if rank == 0:
        print(prof.key_averages().table(sort_by='cuda_time_total'))
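# ---------------------------------------------------------------------------
# Hedged sketch: `init_process` is called by both trainers but not defined in
# this section. A minimal version, assuming the standard torch.distributed
# setup with a TCP rendezvous; the MASTER_ADDR/MASTER_PORT defaults below are
# illustrative, not values taken from the original code.
# ---------------------------------------------------------------------------
def init_process(rank, world_size, backend='nccl'):
    """Join the process group and bind this worker to its own GPU."""
    os.environ.setdefault('MASTER_ADDR', '127.0.0.1')  # assumed default
    os.environ.setdefault('MASTER_PORT', '29500')      # assumed default
    torch.distributed.init_process_group(
        backend=backend, rank=rank, world_size=world_size)
    torch.cuda.set_device(rank)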
# Second variant: cache-enabled trainer. Each rank trains on its own subgraph
# partition and serves features from a GPU cache backed by the remote store.


def trainer(rank, world_size, args, backend='nccl'):
    # init multi-process training
    init_process(rank, world_size, backend)

    # load data: the full graph lives in the shared-memory store, while each
    # rank builds a local read-only DGLGraph over its training subgraph;
    # t2fid maps local (subgraph) node ids back to full-graph feature ids
    dataname = os.path.basename(args.dataset)
    remote_g = dgl.contrib.graph_store.create_graph_from_store(
        dataname, "shared_mem")
    adj, t2fid = data.get_sub_train_graph(args.dataset, rank, world_size)
    g = DGLGraph(adj, readonly=True)
    n_classes = args.n_classes
    train_nid = data.get_sub_train_nid(args.dataset, rank, world_size)
    sub_labels = data.get_sub_train_labels(args.dataset, rank, world_size)
    # np.int was removed from modern NumPy; use an explicit dtype instead
    labels = np.zeros(np.max(train_nid) + 1, dtype=np.int64)
    labels[train_nid] = sub_labels

    # to torch tensors
    t2fid = torch.LongTensor(t2fid)
    labels = torch.LongTensor(labels)

    # GPU feature-cache server over the remote graph store
    embed_names = ['features', 'norm']
    cacher = storage.GraphCacheServer(remote_g, adj.shape[0], t2fid, rank)
    cacher.init_field(embed_names)
    cacher.log = False

    # prepare model
    num_hops = args.n_layers if args.preprocess else args.n_layers + 1
    model = GCNSampling(args.feat_size, args.n_hidden, n_classes,
                        args.n_layers, F.relu, args.dropout, args.preprocess)
    infer_model = GCNInfer(args.feat_size, args.n_hidden, n_classes,
                           args.n_layers, F.relu)
    loss_fcn = torch.nn.CrossEntropyLoss()
    optimizer = torch.optim.Adam(model.parameters(), lr=args.lr,
                                 weight_decay=args.weight_decay)
    model.cuda(rank)
    infer_model.cuda(rank)
    model = torch.nn.parallel.DistributedDataParallel(model, device_ids=[rank])
    ctx = torch.device(rank)

    # start training
    epoch_dur = []
    sampler = SampleLoader(g, rank, one2all=False)
    tic = time.time()
    with torch.autograd.profiler.profile(enabled=(rank == 0), use_cuda=True) as prof:
        for epoch in range(args.n_epochs):
            model.train()
            epoch_start_time = time.time()
            step = 0
            for nf in sampler:
                with torch.autograd.profiler.record_function('gpu-load'):
                    # serve features from the GPU cache, falling back to the
                    # remote store on a miss
                    cacher.fetch_data(nf)
                    batch_nids = nf.layer_parent_nid(-1)
                    label = labels[batch_nids]
                    label = label.cuda(rank, non_blocking=True)
                with torch.autograd.profiler.record_function('gpu-compute'):
                    pred = model(nf)
                    loss = loss_fcn(pred, label)
                    optimizer.zero_grad()
                    loss.backward()
                    optimizer.step()
                step += 1
                if epoch == 0 and step == 1:
                    # after the first step the model's GPU footprint is known,
                    # so the cache can fill the remaining memory
                    cacher.auto_cache(g, embed_names)
                if rank == 0 and step % 20 == 0:
                    print('epoch [{}] step [{}]. Loss: {:.4f}'.format(
                        epoch + 1, step, loss.item()))
            if rank == 0:
                epoch_dur.append(time.time() - epoch_start_time)
                print('Epoch average time: {:.4f}'.format(
                    np.mean(np.array(epoch_dur[2:]))))
                if cacher.log:
                    miss_rate = cacher.get_miss_rate()
                    print('Epoch average miss rate: {:.4f}'.format(miss_rate))
    toc = time.time()
    if rank == 0:
        print(prof.key_averages().table(sort_by='cuda_time_total'))
    print('Total Time: {:.4f}s'.format(toc - tic))
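# ---------------------------------------------------------------------------
# Hedged sketch: how either `trainer` variant above would typically be
# launched, one process per GPU. `parse_args` is a hypothetical stand-in for
# the scripts' actual argparse setup (dataset, batch_size, n_layers, ...).
# ---------------------------------------------------------------------------
import torch.multiprocessing as mp

if __name__ == '__main__':
    args = parse_args()                     # hypothetical argparse helper
    world_size = torch.cuda.device_count()  # one trainer process per GPU
    # mp.spawn calls trainer(rank, world_size, args) for rank 0..world_size-1
    mp.spawn(trainer, args=(world_size, args), nprocs=world_size, join=True)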