import time

import numpy as np
import torch as th
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim

import dgl
from dgl.distributed import DistDataLoader

# NeighborSampler, DistEmb, TransDistSAGE and load_embs are defined elsewhere in this example.

def evaluate(standalone, model, emb_layer, g, labels, val_nid, test_nid, batch_size, device):
    """
    Evaluate the model on the validation and test sets specified by ``val_nid`` and ``test_nid``.

    g : The entire graph.
    labels : The labels of all the nodes.
    val_nid : The node IDs for validation.
    test_nid : The node IDs for testing.
    batch_size : Number of nodes to compute at the same time.
    device : The GPU device to evaluate on.
    """
    model.eval()
    emb_layer.eval()
    with th.no_grad():
        # Materialize the learnable embeddings of all nodes, then run
        # layer-wise inference over the full graph.
        inputs = load_embs(standalone, emb_layer, g)
        pred = model.inference(standalone, g, inputs, batch_size, device)
    model.train()
    emb_layer.train()
    return compute_acc(pred[val_nid], labels[val_nid]), \
        compute_acc(pred[test_nid], labels[test_nid])
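# ``evaluate`` relies on the ``load_embs`` and ``compute_acc`` helpers mentioned above.
# As a minimal sketch (an assumption for illustration, not necessarily the exact helper
# used in this example), ``compute_acc`` can be plain argmax accuracy:
def compute_acc(pred, labels):
    """Fraction of predictions whose argmax matches the ground-truth label."""
    labels = labels.long()
    return (th.argmax(pred, dim=1) == labels).float().sum() / len(pred)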
def run(args, device, data):
    # Unpack data
    train_nid, val_nid, test_nid, n_classes, g = data
    # Create sampler
    sampler = NeighborSampler(g, [int(fanout) for fanout in args.fan_out.split(',')],
                              dgl.distributed.sample_neighbors, device, load_feat=False)

    # Create DataLoader for constructing blocks
    dataloader = DistDataLoader(
        dataset=train_nid.numpy(),
        batch_size=args.batch_size,
        collate_fn=sampler.sample_blocks,
        shuffle=True,
        drop_last=False)

    # Define model and optimizer
    emb_layer = DistEmb(g.num_nodes(), args.num_hidden,
                        dgl_sparse_emb=args.dgl_sparse, dev_id=device)
    model = TransDistSAGE(args.num_hidden, args.num_hidden, n_classes,
                          args.num_layers, F.relu, args.dropout)
    model = model.to(device)
    if not args.standalone:
        if args.num_gpus == -1:
            model = th.nn.parallel.DistributedDataParallel(model)
        else:
            dev_id = g.rank() % args.num_gpus
            model = th.nn.parallel.DistributedDataParallel(model, device_ids=[dev_id],
                                                           output_device=dev_id)
        if not args.dgl_sparse:
            emb_layer = th.nn.parallel.DistributedDataParallel(emb_layer)
    loss_fcn = nn.CrossEntropyLoss()
    loss_fcn = loss_fcn.to(device)
    optimizer = optim.Adam(model.parameters(), lr=args.lr)
    if args.dgl_sparse:
        emb_optimizer = dgl.distributed.optim.SparseAdam([emb_layer.sparse_emb], lr=args.sparse_lr)
        print('optimize DGL sparse embedding:', emb_layer.sparse_emb)
    elif args.standalone:
        emb_optimizer = th.optim.SparseAdam(list(emb_layer.sparse_emb.parameters()), lr=args.sparse_lr)
        print('optimize PyTorch sparse embedding:', emb_layer.sparse_emb)
    else:
        emb_optimizer = th.optim.SparseAdam(list(emb_layer.module.sparse_emb.parameters()), lr=args.sparse_lr)
        print('optimize PyTorch sparse embedding:', emb_layer.module.sparse_emb)

    train_size = th.sum(g.ndata['train_mask'][0:g.number_of_nodes()])

    # Training loop
    iter_tput = []
    epoch = 0
    for epoch in range(args.num_epochs):
        tic = time.time()

        sample_time = 0
        forward_time = 0
        backward_time = 0
        update_time = 0
        num_seeds = 0
        num_inputs = 0
        start = time.time()
        # Loop over the dataloader to sample the computation dependency graph as a list of
        # blocks.
        step_time = []
        for step, blocks in enumerate(dataloader):
            tic_step = time.time()
            sample_time += tic_step - start

            # The input nodes lie on the LHS (source) side of the first block.
            # The output nodes lie on the RHS (destination) side of the last block.
            batch_inputs = blocks[0].srcdata[dgl.NID]
            batch_labels = blocks[-1].dstdata['labels']
            batch_labels = batch_labels.long()

            num_seeds += len(blocks[-1].dstdata[dgl.NID])
            num_inputs += len(blocks[0].srcdata[dgl.NID])
            blocks = [block.to(device) for block in blocks]
            batch_labels = batch_labels.to(device)

            # Compute loss and prediction
            start = time.time()
            # Look up the learnable embeddings of the input nodes, then run the model.
            batch_inputs = emb_layer(batch_inputs)
            batch_pred = model(blocks, batch_inputs)
            loss = loss_fcn(batch_pred, batch_labels)
            forward_end = time.time()
            emb_optimizer.zero_grad()
            optimizer.zero_grad()
            loss.backward()
            compute_end = time.time()
            forward_time += forward_end - start
            backward_time += compute_end - forward_end

            emb_optimizer.step()
            optimizer.step()
            update_time += time.time() - compute_end

            step_t = time.time() - tic_step
            step_time.append(step_t)
            iter_tput.append(len(blocks[-1].dstdata[dgl.NID]) / step_t)
            if step % args.log_every == 0:
                acc = compute_acc(batch_pred, batch_labels)
                gpu_mem_alloc = th.cuda.max_memory_allocated() / 1000000 if th.cuda.is_available() else 0
                print('Part {} | Epoch {:05d} | Step {:05d} | Loss {:.4f} | Train Acc {:.4f} | '
                      'Speed (samples/sec) {:.4f} | GPU {:.1f} MB | time {:.3f} s'.format(
                          g.rank(), epoch, step, loss.item(), acc.item(), np.mean(iter_tput[3:]),
                          gpu_mem_alloc, np.sum(step_time[-args.log_every:])))
            start = time.time()

        toc = time.time()
        print('Part {}, Epoch Time(s): {:.4f}, sample+data_copy: {:.4f}, forward: {:.4f}, '
              'backward: {:.4f}, update: {:.4f}, #seeds: {}, #inputs: {}'.format(
                  g.rank(), toc - tic, sample_time, forward_time, backward_time, update_time,
                  num_seeds, num_inputs))
        epoch += 1

        if epoch % args.eval_every == 0 and epoch != 0:
            start = time.time()
            # In standalone mode the model is not wrapped in DistributedDataParallel,
            # so there is no ``.module`` attribute to unwrap.
            val_acc, test_acc = evaluate(args.standalone,
                                         model if args.standalone else model.module,
                                         emb_layer, g, g.ndata['labels'], val_nid, test_nid,
                                         args.batch_size_eval, device)
            print('Part {}, Val Acc {:.4f}, Test Acc {:.4f}, time: {:.4f}'.format(
                g.rank(), val_acc, test_acc, time.time() - start))
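# A minimal sketch of the launcher that typically surrounds ``run`` in a DistDGL training
# script. This is an assumption about the surrounding code, not part of the example above;
# argument names such as ``ip_config``, ``graph_name`` and ``part_config`` are illustrative.
def main(args):
    # Connect to the DistDGL servers and, in distributed mode, set up the
    # process group used by DistributedDataParallel.
    dgl.distributed.initialize(args.ip_config)
    if not args.standalone:
        th.distributed.init_process_group(backend='gloo')
    g = dgl.distributed.DistGraph(args.graph_name, part_config=args.part_config)
    pb = g.get_partition_book()
    # Give each trainer its own roughly even share of the labeled nodes.
    train_nid = dgl.distributed.node_split(g.ndata['train_mask'], pb, force_even=True)
    val_nid = dgl.distributed.node_split(g.ndata['val_mask'], pb, force_even=True)
    test_nid = dgl.distributed.node_split(g.ndata['test_mask'], pb, force_even=True)
    # Number of classes, assuming every node carries a valid integer label.
    all_labels = g.ndata['labels'][np.arange(g.num_nodes())]
    n_classes = len(th.unique(all_labels))
    device = th.device('cpu') if args.num_gpus == -1 else \
        th.device('cuda:%d' % (g.rank() % args.num_gpus))
    run(args, device, (train_nid, val_nid, test_nid, n_classes, g))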