Example #1
def trainer(rank, world_size, args, backend='nccl'):
    # initialize the distributed process group
    init_process(rank, world_size, backend)

    # load data
    dataname = os.path.basename(args.dataset)
    g = dgl.contrib.graph_store.create_graph_from_store(dataname, "shared_mem")
    labels = data.get_labels(args.dataset)
    n_classes = len(np.unique(labels))
    # masks for semi-supervised learning
    train_mask, val_mask, test_mask = data.get_masks(args.dataset)
    train_nid = np.nonzero(train_mask)[0].astype(np.int64)
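    # evenly split the training node IDs across workers; each rank trains on its own contiguous slice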
    chunk_size = int(train_nid.shape[0] / world_size) - 1
    train_nid = train_nid[chunk_size * rank:chunk_size * (rank + 1)]
    test_nid = np.nonzero(test_mask)[0].astype(np.int64)
    # to torch tensor
    labels = torch.LongTensor(labels)
    train_mask = torch.ByteTensor(train_mask)
    val_mask = torch.ByteTensor(val_mask)
    test_mask = torch.ByteTensor(test_mask)

    # prepare model
    num_hops = args.n_layers if args.preprocess else args.n_layers + 1
    model = GCNSampling(args.feat_size, args.n_hidden, n_classes,
                        args.n_layers, F.relu, args.dropout, args.preprocess)
    loss_fcn = torch.nn.CrossEntropyLoss()
    optimizer = torch.optim.Adam(model.parameters(),
                                 lr=args.lr,
                                 weight_decay=args.weight_decay)
    model.cuda(rank)
    model = torch.nn.parallel.DistributedDataParallel(model, device_ids=[rank])
    ctx = torch.device(rank)

    # start training
    epoch_dur = []
    if args.remote_sample:
        sampler = SampleLoader(g, rank, one2all=False)
    else:
        sampler = dgl.contrib.sampling.NeighborSampler(
            g,
            args.batch_size,
            args.num_neighbors,
            neighbor_type='in',
            shuffle=True,
            num_workers=args.num_workers,
            num_hops=num_hops,
            seed_nodes=train_nid,
            prefetch=True)
    profile_begin = time.time()
    with torch.autograd.profiler.profile(enabled=(rank == 0),
                                         use_cuda=True) as prof:
        for epoch in range(args.n_epochs):
            model.train()
            epoch_start_time = time.time()
            step = 0
            for nf in sampler:
                with torch.autograd.profiler.record_function('gpu-load'):
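                    # copy the sampled NodeFlow's node data from the shared-memory parent graph to this GPU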
                    nf.copy_from_parent(ctx=ctx)
                    batch_nids = nf.layer_parent_nid(-1)
                    label = labels[batch_nids]
                    label = label.cuda(rank, non_blocking=True)
                with torch.autograd.profiler.record_function('gpu-compute'):
                    pred = model(nf)
                    loss = loss_fcn(pred, label)
                    optimizer.zero_grad()
                    loss.backward()
                    optimizer.step()
                step += 1
                if rank == 0 and step % 20 == 0:
                    print('epoch [{}] step [{}]. Loss: {:.4f}'.format(
                        epoch + 1, step, loss.item()))
            if rank == 0:
                epoch_dur.append(time.time() - epoch_start_time)
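                # average epoch time, skipping the first two warm-up epochs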
                print('Epoch average time: {:.4f}'.format(
                    np.mean(np.array(epoch_dur[2:]))))
    print('Total Time: {:.4f}s'.format(time.time() - profile_begin))
    if rank == 0:
        print(prof.key_averages().table(sort_by='cuda_time_total'))
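
Both examples call an init_process(rank, world_size, backend) helper whose body is not shown here. A minimal sketch of what such a helper typically does, assuming the master address and port come from environment variables (the defaults below are placeholders, not part of the original code):

import os
import torch
import torch.distributed as dist

def init_process(rank, world_size, backend='nccl'):
    # assume the launcher provides MASTER_ADDR / MASTER_PORT; the defaults are hypothetical
    os.environ.setdefault('MASTER_ADDR', '127.0.0.1')
    os.environ.setdefault('MASTER_PORT', '29500')
    dist.init_process_group(backend, rank=rank, world_size=world_size)
    # pin this process to its GPU so the later .cuda(rank) and device_ids=[rank] calls match
    torch.cuda.set_device(rank)
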
Example #2
def trainer(rank, world_size, args, backend='nccl'):
    # initialize the distributed process group
    init_process(rank, world_size, backend)

    # load data
    dataname = os.path.basename(args.dataset)
    remote_g = dgl.contrib.graph_store.create_graph_from_store(
        dataname, "shared_mem")

    adj, t2fid = data.get_sub_train_graph(args.dataset, rank, world_size)
    g = DGLGraph(adj, readonly=True)
    n_classes = args.n_classes
    train_nid = data.get_sub_train_nid(args.dataset, rank, world_size)
    sub_labels = data.get_sub_train_labels(args.dataset, rank, world_size)
    labels = np.zeros(np.max(train_nid) + 1, dtype=np.int64)
    labels[train_nid] = sub_labels

    # to torch tensor
    t2fid = torch.LongTensor(t2fid)
    labels = torch.LongTensor(labels)
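    # per-GPU cache for the 'features' and 'norm' node fields served by the shared-memory graph store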
    embed_names = ['features', 'norm']
    cacher = storage.GraphCacheServer(remote_g, adj.shape[0], t2fid, rank)
    cacher.init_field(embed_names)
    cacher.log = False

    # prepare model
    num_hops = args.n_layers if args.preprocess else args.n_layers + 1
    model = GCNSampling(args.feat_size, args.n_hidden, n_classes,
                        args.n_layers, F.relu, args.dropout, args.preprocess)
    infer_model = GCNInfer(args.feat_size, args.n_hidden, n_classes,
                           args.n_layers, F.relu)
    loss_fcn = torch.nn.CrossEntropyLoss()
    optimizer = torch.optim.Adam(model.parameters(),
                                 lr=args.lr,
                                 weight_decay=args.weight_decay)
    model.cuda(rank)
    infer_model.cuda(rank)
    model = torch.nn.parallel.DistributedDataParallel(model, device_ids=[rank])
    ctx = torch.device(rank)

    # start training
    epoch_dur = []
    sampler = SampleLoader(g, rank, one2all=False)
    tic = time.time()
    with torch.autograd.profiler.profile(enabled=(rank == 0),
                                         use_cuda=True) as prof:
        for epoch in range(args.n_epochs):
            model.train()
            epoch_start_time = time.time()
            step = 0
            for nf in sampler:
                with torch.autograd.profiler.record_function('gpu-load'):
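                    # fetch the input features this NodeFlow needs, served from the GPU cache when possible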
                    cacher.fetch_data(nf)
                    batch_nids = nf.layer_parent_nid(-1)
                    label = labels[batch_nids]
                    label = label.cuda(rank, non_blocking=True)
                with torch.autograd.profiler.record_function('gpu-compute'):
                    pred = model(nf)
                    loss = loss_fcn(pred, label)
                    optimizer.zero_grad()
                    loss.backward()
                    optimizer.step()
                step += 1
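                # after the first mini-batch, let the cache server decide which node data to keep resident in GPU memory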
                if epoch == 0 and step == 1:
                    cacher.auto_cache(g, embed_names)
                if rank == 0 and step % 20 == 0:
                    print('epoch [{}] step [{}]. Loss: {:.4f}'.format(
                        epoch + 1, step, loss.item()))
            if rank == 0:
                epoch_dur.append(time.time() - epoch_start_time)
                print('Epoch average time: {:.4f}'.format(
                    np.mean(np.array(epoch_dur[2:]))))
            if cacher.log:
                miss_rate = cacher.get_miss_rate()
                print('Epoch average miss rate: {:.4f}'.format(miss_rate))
        toc = time.time()
    if rank == 0:
        print(prof.key_averages().table(sort_by='cuda_time_total'))
    print('Total Time: {:.4f}s'.format(toc - tic))
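
Neither listing shows how trainer is launched. Below is a minimal sketch of the usual one-process-per-GPU launch with torch.multiprocessing.spawn; the --n-gpus flag and the remaining argument fields are assumptions, not part of the original code:

import argparse
import torch.multiprocessing as mp

if __name__ == '__main__':
    parser = argparse.ArgumentParser()
    parser.add_argument('--n-gpus', type=int, default=2)  # hypothetical: number of worker processes / GPUs
    # ... plus the dataset, model, and optimizer flags the trainer reads (dataset, n_epochs, lr, ...)
    args, _ = parser.parse_known_args()
    # mp.spawn calls trainer(rank, args.n_gpus, args) once per rank
    mp.spawn(trainer, args=(args.n_gpus, args), nprocs=args.n_gpus, join=True)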