Example #1
def train_with_single(dnn, dataset, data_dir, nworkers, lr, batch_size, nsteps_update, max_epochs, num_steps=1):
    torch.cuda.set_device(0)
    trainer = DLTrainer(0, nworkers, dist=False, batch_size=batch_size, 
        is_weak_scaling=True, ngpus=1, data_dir=data_dir, dataset=dataset, 
        dnn=dnn, lr=lr, nworkers=nworkers, prefix='singlegpu', num_steps=num_steps)
    iters_per_epoch = trainer.get_num_of_training_samples() // (nworkers * batch_size * nsteps_update)
    seq_layernames, layerwise_times, layerwise_sizes = benchmark(trainer)
    logger.info('Benchmarked backward time: %f', np.sum(layerwise_times))
    logger.info('Model size: %d', np.sum(layerwise_sizes))

    times = []
    display = 100 if iters_per_epoch > 100 else iters_per_epoch - 1
    for epoch in range(max_epochs):
        if dnn == 'lstm':
            hidden = trainer.net.init_hidden()
        for i in range(iters_per_epoch):
            s = time.time()
            trainer.optimizer.zero_grad()
            for j in range(nsteps_update):
                if dnn == 'lstm':
                    _, hidden = trainer.train(1, hidden=hidden)
                else:
                    trainer.train(1)
            trainer.update_model()
            times.append(time.time()-s)
            if i % display == 0 and i > 0: 
                time_per_iter = np.mean(times)
                logger.info('Time per iteration including communication: %f. Speed: %f images/s', time_per_iter, batch_size * nsteps_update / time_per_iter)
                times = []
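
# Usage sketch (not part of the original example): a minimal argparse driver that
# calls train_with_single. The flag names and defaults below are assumptions, not
# the original script's CLI.
if __name__ == '__main__':
    import argparse
    parser = argparse.ArgumentParser(description='single-GPU weak-scaling benchmark')
    parser.add_argument('--dnn', default='resnet20')
    parser.add_argument('--dataset', default='cifar10')
    parser.add_argument('--data-dir', default='./data')
    parser.add_argument('--nworkers', type=int, default=1)
    parser.add_argument('--lr', type=float, default=0.1)
    parser.add_argument('--batch-size', type=int, default=32)
    parser.add_argument('--nsteps-update', type=int, default=1)
    parser.add_argument('--max-epochs', type=int, default=1)
    args = parser.parse_args()
    train_with_single(args.dnn, args.dataset, args.data_dir, args.nworkers, args.lr,
                      args.batch_size, args.nsteps_update, args.max_epochs)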
Example #2
def ssgd(dnn,
         dataset,
         data_dir,
         nworkers,
         lr,
         batch_size,
         nsteps_update,
         max_epochs,
         nwpernode,
         pretrain,
         num_steps,
         compressor,
         density,
         threshold,
         gradient_path=None):
    rank = hvd.rank()
    torch.cuda.set_device(rank % nwpernode)
    if rank != 0:
        pretrain = None
    trainer = DLTrainer(rank,
                        nworkers,
                        dist=False,
                        batch_size=batch_size,
                        is_weak_scaling=True,
                        ngpus=1,
                        data_dir=data_dir,
                        dataset=dataset,
                        dnn=dnn,
                        lr=lr,
                        nworkers=nworkers,
                        prefix='allreduce',
                        pretrain=pretrain,
                        num_steps=num_steps,
                        tb_writer=writer)

    init_epoch = torch.ones(1) * trainer.get_train_epoch()
    init_iter = torch.ones(1) * trainer.get_train_iter()
    trainer.set_train_epoch(int(hvd.broadcast(init_epoch, root_rank=0)[0]))
    trainer.set_train_iter(int(hvd.broadcast(init_iter, root_rank=0)[0]))
    is_sparse = density < 1
    #if not is_sparse:
    #    compressor = None

    if settings.ADAPTIVE_MERGE or settings.ADAPTIVE_SPARSE:
        seq_layernames, layerwise_times, layerwise_sizes = benchmark(trainer)
        layerwise_times = comm.bcast(layerwise_times, root=0)
        if rank == 0:
            logger.info('layerwise backward times: %s', list(layerwise_times))
            logger.info('layerwise backward sizes: %s', list(layerwise_sizes))
        logger.info('Benchmarked backward time: %f', np.sum(layerwise_times))
        logger.info('Model size: %d', np.sum(layerwise_sizes))
    else:
        seq_layernames, layerwise_times, layerwise_sizes = None, None, None

    norm_clip = None
    if dnn == 'lstm':
        norm_clip = 0.25
    elif dnn == 'lstman4':
        norm_clip = 400

    optimizer = hvd.DistributedOptimizer(
        trainer.optimizer,
        named_parameters=trainer.net.named_parameters(),
        compression=compressors[compressor],
        is_sparse=is_sparse,
        density=density,
        seq_layernames=seq_layernames,
        layerwise_times=layerwise_times,
        norm_clip=norm_clip,
        threshold=threshold,
        writer=writer,
        gradient_path=gradient_path)
    hvd.broadcast_parameters(trainer.net.state_dict(), root_rank=0)
    trainer.update_optimizer(optimizer)
    iters_per_epoch = trainer.get_num_of_training_samples() // (
        nworkers * batch_size * nsteps_update)

    times = []
    logger.info('max_epochs: %d', max_epochs)
    display = 40 if iters_per_epoch > 40 else iters_per_epoch - 1
    for epoch in range(max_epochs):
        hidden = None
        if dnn == 'lstm':
            hidden = trainer.net.init_hidden()
        for i in range(iters_per_epoch):
            s = time.time()
            optimizer.zero_grad()
            for j in range(nsteps_update):
                if j < nsteps_update - 1 and nsteps_update > 1:
                    optimizer.local = True
                else:
                    optimizer.local = False
                if dnn == 'lstm':
                    _, hidden = trainer.train(1, hidden=hidden)
                else:
                    trainer.train(1)
            if dnn == 'lstm':
                optimizer.synchronize()
                torch.nn.utils.clip_grad_norm_(trainer.net.parameters(), 0.25)
            elif dnn == 'lstman4':
                optimizer.synchronize()
                torch.nn.utils.clip_grad_norm_(trainer.net.parameters(), 400)
            trainer.update_model()
            times.append(time.time() - s)
            if i % display == 0 and i > 0:
                time_per_iter = np.mean(times)
                logger.warning(
                    'Time per iteration including communication: %f, Speed: %f images/s',
                    time_per_iter, batch_size * nsteps_update / time_per_iter)
                times = []
        optimizer.increase_one_epoch()
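
# Reference sketch (not from this code base): the same accumulate-then-allreduce step
# expressed with stock Horovod. backward_passes_per_step plays the role of the custom
# `optimizer.local` flag above, deferring the allreduce until the micro-batches have
# accumulated; `model`, `micro_batches` and `loss_fn` are placeholders.
import torch
import horovod.torch as hvd

def horovod_accumulation_step(model, opt, micro_batches, loss_fn, clip=None):
    # `opt` is assumed to be hvd.DistributedOptimizer(..., backward_passes_per_step=len(micro_batches))
    opt.zero_grad()
    for data, target in micro_batches:
        loss_fn(model(data), target).backward()   # gradients accumulate locally
    opt.synchronize()                             # one allreduce for the accumulated gradients
    if clip is not None:                          # e.g. 0.25 for the LSTM case above
        torch.nn.utils.clip_grad_norm_(model.parameters(), clip)
    with opt.skip_synchronize():
        opt.step()                                # apply the update without a second allreduce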
Example #3
# If using GPU Adasum allreduce, scale learning rate by local_size.
if args.use_adasum and hvd.nccl_built():
    lr_scaler = hvd.local_size()

optimizer = optim.SGD(model.parameters(), lr=0.01 * lr_scaler)

# Set up fixed fake data
data = torch.randn(args.batch_size, 3, 224, 224)
target = torch.LongTensor(args.batch_size).random_() % 1000
if args.cuda:
    data, target = data.cuda(), target.cuda()



if args.mgwfbp:
    seq_layernames, layerwise_times, _ = benchmark(model, (data, target), F.cross_entropy, task='imagenet')
    layerwise_times = comm.bcast(layerwise_times, root=0)
else:
    seq_layernames, layerwise_times = None, None

optimizer = hvd.DistributedOptimizer(
    optimizer,
    named_parameters=model.named_parameters(),
    compression=compressors[args.compressor](),
    is_sparse=args.density < 1,
    density=args.density,
    seq_layernames=seq_layernames,
    layerwise_times=layerwise_times,
    norm_clip=None,
    threshold=args.threshold,
    writer=None,
    gradient_path='./',
    momentum_correction=False,
    fp16=args.fp16,
    mgwfbp=args.mgwfbp,
    rdma=args.rdma)


# Horovod: broadcast parameters & optimizer state.
hvd.broadcast_parameters(model.state_dict(), root_rank=0)
hvd.broadcast_optimizer_state(optimizer, root_rank=0)



def benchmark_step():
    optimizer.zero_grad()
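
# The benchmark_step definition above is cut off in this listing. For reference, a
# typical step in this style of Horovod synthetic benchmark completes the
# forward/backward/update cycle as sketched below (sketch only; it reuses the fake
# `data`/`target` and the `model`, `optimizer` and `F` names from this snippet's
# surrounding context, which are not shown in full here).
def benchmark_step_sketch():
    optimizer.zero_grad()
    output = model(data)
    loss = F.cross_entropy(output, target)
    loss.backward()
    optimizer.step()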
Example #4
def ssgd(dnn,
         dataset,
         data_dir,
         nworkers,
         lr,
         batch_size,
         nsteps_update,
         max_epochs,
         nwpernode,
         pretrain,
         num_steps,
         compressor,
         density,
         threshold,
         gradient_path=None,
         tb=None,
         iratio=0.1,
         stages=1,
         partitions=0,
         ec_gradw=1.0,
         ec_memw=0.0,
         optimizer='nesterov',
         totaltime=0):
    global SPEED
    rank = hvd.rank() if nworkers > 1 else 0
    if not settings.USE_CPU:
        # pin this process to one GPU on its node (was: rank % nwpernode)
        torch.cuda.set_device(hvd.local_rank() if nworkers > 1 else rank)
    if rank != 0:
        pretrain = None

    #### CHECK whether to use GPU or CPU
    trainer = DLTrainer(rank,
                        nworkers,
                        dist=False,
                        batch_size=batch_size,
                        is_weak_scaling=True,
                        ngpus=0 if settings.USE_CPU else 1,
                        data_dir=data_dir,
                        dataset=dataset,
                        dnn=dnn,
                        lr=lr,
                        nworkers=nworkers,
                        prefix='allreduce',
                        pretrain=pretrain,
                        num_steps=num_steps,
                        tb_writer=writer,
                        tb=tb,
                        optimizer_str=optimizer)

    init_epoch = torch.ones(1) * trainer.get_train_epoch()
    init_iter = torch.ones(1) * trainer.get_train_iter()
    trainer.set_train_epoch(int(hvd.broadcast(init_epoch, root_rank=0)[0]))
    trainer.set_train_iter(int(hvd.broadcast(init_iter, root_rank=0)[0]))
    is_sparse = True  #density < 1
    if not is_sparse:
        compressor = None

    if settings.ADAPTIVE_MERGE or settings.ADAPTIVE_SPARSE:
        seq_layernames, layerwise_times, layerwise_sizes = benchmark(trainer)
        layerwise_times = comm.bcast(layerwise_times, root=0)
        if rank == 0:
            logger.info('layerwise backward times: %s', list(layerwise_times))
            logger.info('layerwise backward sizes: %s', list(layerwise_sizes))
        logger.info('Benchmarked backward time: %f', np.sum(layerwise_times))
        logger.info('Model size: %d', np.sum(layerwise_sizes))
    else:
        seq_layernames, layerwise_times, layerwise_sizes = None, None, None

    norm_clip = None
    if dnn == 'lstm':
        norm_clip = 0.25
    elif dnn == 'lstman4':
        norm_clip = 400

    optimizer = hvd.DistributedOptimizer(
        trainer.optimizer,
        named_parameters=trainer.net.named_parameters(),
        compression=compressors[compressor],
        is_sparse=is_sparse,
        density=density,
        seq_layernames=seq_layernames,
        layerwise_times=layerwise_times,
        norm_clip=norm_clip,
        threshold=threshold,
        writer=writer,
        gradient_path=gradient_path,
        tb=tb,
        iratio=iratio,
        stages=stages,
        partitions=partitions,
        ec_gradw=ec_gradw,
        ec_memw=ec_memw)
    hvd.SPEED
    hvd.broadcast_parameters(trainer.net.state_dict(), root_rank=0)
    trainer.update_optimizer(optimizer)
    iters_per_epoch = trainer.get_num_of_training_samples() // (
        nworkers * batch_size * nsteps_update)

    start = time.time()
    times = []
    noupdate_times = []
    logger.info('max_epochs: %d', max_epochs)
    display = settings.DISPLAY if iters_per_epoch > settings.DISPLAY else iters_per_epoch - 1

    for epoch in range(max_epochs):
        hidden = None
        if dnn == 'lstm':
            hidden = trainer.net.init_hidden()
        for i in range(iters_per_epoch):
            s = time.time()
            optimizer.zero_grad()
            for j in range(nsteps_update):
                if j < nsteps_update - 1 and nsteps_update > 1:
                    optimizer.local = True
                else:
                    optimizer.local = False
                if dnn == 'lstm':
                    _, hidden = trainer.train(1, hidden=hidden)
                else:
                    trainer.train(1)
            if dnn == 'lstm':
                optimizer.synchronize()
                torch.nn.utils.clip_grad_norm_(trainer.net.parameters(), 0.25)
            elif dnn == 'lstman4':
                optimizer.synchronize()
                torch.nn.utils.clip_grad_norm_(trainer.net.parameters(), 400)
            noupdate_times.append(time.time() - s)

            trainer.update_model()
            torch.cuda.synchronize()
            times.append(time.time() - s)

            if i % display == 0 and i > 0:
                time_per_iter = np.mean(times)
                update_per_iter = time_per_iter - np.mean(noupdate_times)
                throughput = batch_size * nsteps_update / time_per_iter
                trainer.log_info(time_per_iter, throughput, update_per_iter)
                logger.warning(
                    'Time per iteration: %f, communication: %f, Speed: %f images/s',
                    time_per_iter, update_per_iter, throughput)
                times = []
                noupdate_times = []

        optimizer.increase_one_epoch()

        if totaltime > 0 and time.time() - start > totaltime:
            trainer.test(trainer.get_train_epoch())
            break
    if dataset != 'cifar10':
        trainer.test(trainer.get_train_epoch())
class BertPretrainingCriterion(torch.nn.Module):
    def __init__(self, vocab_size):
        super(BertPretrainingCriterion, self).__init__()
        self.loss_fn = torch.nn.CrossEntropyLoss(ignore_index=-1)
        self.vocab_size = vocab_size

    def forward(self, prediction_scores, seq_relationship_score, masked_lm_labels, next_sentence_labels):
        masked_lm_loss = self.loss_fn(prediction_scores.view(-1, self.vocab_size), masked_lm_labels.view(-1))
        next_sentence_loss = self.loss_fn(seq_relationship_score.view(-1, 2), next_sentence_labels.view(-1))
        total_loss = masked_lm_loss + next_sentence_loss
        return total_loss
criterion = BertPretrainingCriterion(vocab_size)
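
# Shape-check sketch with dummy tensors (not original data): prediction_scores is
# [batch, seq_len, vocab_size], seq_relationship_score is [batch, 2], and masked LM
# labels use -1 at unmasked positions so CrossEntropyLoss(ignore_index=-1) skips them.
_batch, _seq_len, _vocab = 2, 8, 30522            # 30522 = BERT-base vocabulary size
_pred = torch.randn(_batch, _seq_len, _vocab)
_nsp_scores = torch.randn(_batch, 2)
_mlm_labels = torch.full((_batch, _seq_len), -1, dtype=torch.long)
_mlm_labels[:, 3] = 42                            # pretend one masked token per sequence
_nsp_labels = torch.randint(0, 2, (_batch,))
_loss = BertPretrainingCriterion(_vocab)(_pred, _nsp_scores, _mlm_labels, _nsp_labels)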

if args.mgwfbp:
    seq_layernames, layerwise_times, _ = benchmark(model, batch, criterion, task='bert')
    layerwise_times = comm.bcast(layerwise_times, root=0)
else:
    seq_layernames, layerwise_times = None, None



optimizer = AdamW(model.parameters(),
        lr = 2e-5, # args.learning_rate - default is 5e-5, our notebook had 2e-5
        eps = 1e-8 # args.adam_epsilon  - default is 1e-8.
        )
#optimizer = optim.SGD(model.parameters(), lr=2e-5)

#compression = hvd.Compression.fp16 if args.fp16_allreduce else hvd.Compression.none
# Horovod: wrap optimizer with DistributedOptimizer.
#optimizer = hvd.DistributedOptimizer(optimizer,