Example no. 1
def ssgd_with_horovod(dnn, dataset, data_dir, nworkers, lr, batch_size, nsteps_update, max_epochs, nwpernode, pretrain, num_steps=1):
    rank = hvd.rank()
    torch.cuda.set_device(rank%nwpernode)
    if rank != 0:
        pretrain = None
    trainer = DLTrainer(rank, nworkers, dist=False, batch_size=batch_size, is_weak_scaling=True, ngpus=1, data_dir=data_dir, dataset=dataset, dnn=dnn, lr=lr, nworkers=nworkers, prefix='allreduce', pretrain=pretrain, num_steps=num_steps, tb_writer=writer)

    init_epoch = torch.ones(1) * trainer.get_train_epoch()
    init_iter = torch.ones(1) * trainer.get_train_iter()
    trainer.set_train_epoch(int(hvd.broadcast(init_epoch, root_rank=0)[0]))
    trainer.set_train_iter(int(hvd.broadcast(init_iter, root_rank=0)[0]))

    optimizer = hvd.DistributedOptimizer(trainer.optimizer, named_parameters=trainer.net.named_parameters())
    hvd.broadcast_parameters(trainer.net.state_dict(), root_rank=0)
    trainer.update_optimizer(optimizer)
    iters_per_epoch = trainer.get_num_of_training_samples() // (nworkers * batch_size * nsteps_update)

    times = []
    display = 20 if iters_per_epoch > 20 else iters_per_epoch - 1
    for epoch in range(max_epochs):
        hidden = None
        if dnn == 'lstm':
            hidden = trainer.net.init_hidden()
        for i in range(iters_per_epoch):
            s = time.time()
            optimizer.zero_grad()
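            # Accumulate gradients over nsteps_update micro-batches; optimizer.local=True
            # presumably keeps the intermediate steps local so only the final micro-step
            # triggers the allreduce performed by the distributed optimizer.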
            for j in range(nsteps_update):
                if j < nsteps_update - 1 and nsteps_update > 1:
                    optimizer.local = True
                else:
                    optimizer.local = False
                if dnn == 'lstm':
                    _, hidden = trainer.train(1, hidden=hidden)
                else:
                    trainer.train(1)
            if dnn == 'lstm':
                optimizer.synchronize()
                torch.nn.utils.clip_grad_norm_(trainer.net.parameters(), 0.25)
            elif dnn == 'lstman4':
                optimizer.synchronize()
                torch.nn.utils.clip_grad_norm_(trainer.net.parameters(), 400)
            trainer.update_model()
            times.append(time.time() - s)
            if i % display == 0 and i > 0:
                time_per_iter = np.mean(times)
                logger.info('Time per iteration including communication: %f. Speed: %f images/s', time_per_iter, batch_size * nsteps_update / time_per_iter)
                times = []
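
The function above assumes Horovod is already initialized and that module-level objects such as writer, logger, DLTrainer, and hvd exist in the surrounding script. A minimal, hypothetical launcher sketch (the argument values are illustrative and not taken from the original listing):

import horovod.torch as hvd

if __name__ == '__main__':
    hvd.init()  # one process per GPU, e.g. launched with: horovodrun -np 4 python train.py
    ssgd_with_horovod(dnn='resnet20', dataset='cifar10', data_dir='./data',
                      nworkers=hvd.size(), lr=0.1, batch_size=32,
                      nsteps_update=1, max_epochs=90, nwpernode=4,
                      pretrain=None)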
Example no. 2
def ssgd(dnn,
         dataset,
         data_dir,
         nworkers,
         lr,
         batch_size,
         nsteps_update,
         max_epochs,
         nwpernode,
         pretrain,
         num_steps,
         compressor,
         density,
         threshold,
         gradient_path=None):
    rank = hvd.rank()
    torch.cuda.set_device(rank % nwpernode)
    if rank != 0:
        pretrain = None
    trainer = DLTrainer(rank,
                        nworkers,
                        dist=False,
                        batch_size=batch_size,
                        is_weak_scaling=True,
                        ngpus=1,
                        data_dir=data_dir,
                        dataset=dataset,
                        dnn=dnn,
                        lr=lr,
                        nworkers=nworkers,
                        prefix='allreduce',
                        pretrain=pretrain,
                        num_steps=num_steps,
                        tb_writer=writer)

    init_epoch = torch.ones(1) * trainer.get_train_epoch()
    init_iter = torch.ones(1) * trainer.get_train_iter()
    trainer.set_train_epoch(int(hvd.broadcast(init_epoch, root_rank=0)[0]))
    trainer.set_train_iter(int(hvd.broadcast(init_iter, root_rank=0)[0]))
    is_sparse = density < 1
    #if not is_sparse:
    #    compressor = None

    if settings.ADAPTIVE_MERGE or settings.ADAPTIVE_SPARSE:
        seq_layernames, layerwise_times, layerwise_sizes = benchmark(trainer)
        layerwise_times = comm.bcast(layerwise_times, root=0)
        if rank == 0:
            logger.info('layerwise backward times: %s', list(layerwise_times))
            logger.info('layerwise backward sizes: %s', list(layerwise_sizes))
        logger.info('Benchmarked backward time: %f', np.sum(layerwise_times))
        logger.info('Model size: %d', np.sum(layerwise_sizes))
    else:
        seq_layernames, layerwise_times, layerwise_sizes = None, None, None

    norm_clip = None
    if dnn == 'lstm':
        norm_clip = 0.25
    elif dnn == 'lstman4':
        norm_clip = 400

    optimizer = hvd.DistributedOptimizer(
        trainer.optimizer,
        named_parameters=trainer.net.named_parameters(),
        compression=compressors[compressor],
        is_sparse=is_sparse,
        density=density,
        seq_layernames=seq_layernames,
        layerwise_times=layerwise_times,
        norm_clip=norm_clip,
        threshold=threshold,
        writer=writer,
        gradient_path=gradient_path)
    hvd.broadcast_parameters(trainer.net.state_dict(), root_rank=0)
    trainer.update_optimizer(optimizer)
    iters_per_epoch = trainer.get_num_of_training_samples() // (
        nworkers * batch_size * nsteps_update)

    times = []
    logger.info('max_epochs: %d', max_epochs)
    display = 40 if iters_per_epoch > 40 else iters_per_epoch - 1
    for epoch in range(max_epochs):
        hidden = None
        if dnn == 'lstm':
            hidden = trainer.net.init_hidden()
        for i in range(iters_per_epoch):
            s = time.time()
            optimizer.zero_grad()
            for j in range(nsteps_update):
                if j < nsteps_update - 1 and nsteps_update > 1:
                    optimizer.local = True
                else:
                    optimizer.local = False
                if dnn == 'lstm':
                    _, hidden = trainer.train(1, hidden=hidden)
                else:
                    trainer.train(1)
            if dnn == 'lstm':
                optimizer.synchronize()
                torch.nn.utils.clip_grad_norm_(trainer.net.parameters(), 0.25)
            elif dnn == 'lstman4':
                optimizer.synchronize()
                torch.nn.utils.clip_grad_norm_(trainer.net.parameters(), 400)
            trainer.update_model()
            times.append(time.time() - s)
            if i % display == 0 and i > 0:
                time_per_iter = np.mean(times)
                logger.warning(
                    'Time per iteration including communication: %f, Speed: %f images/s',
                    time_per_iter, batch_size * nsteps_update / time_per_iter)
                times = []
        optimizer.increase_one_epoch()
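
Example 2 extends Example 1 with gradient sparsification: compressors[compressor] and density select a compression scheme defined outside this listing. A minimal, hypothetical top-k sketch of what such a compressor typically does (the real compressor classes are not shown here):

import torch

def topk_compress(tensor, density=0.01):
    # Keep only the fraction `density` of entries with the largest magnitude.
    flat = tensor.view(-1)
    k = max(1, int(flat.numel() * density))
    _, indices = torch.topk(flat.abs(), k)
    return flat[indices], indices  # sparse representation: (values, indices)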
Example no. 3
def robust_ssgd(dnn, dataset, data_dir, nworkers, lr, batch_size, nsteps_update, max_epochs, compression=False, compressor='topk', nwpernode=1, sigma_scale=2.5, pretrain=None, density=0.01, prefix=None):
    global relative_path

    torch.cuda.set_device(dopt.rank() % nwpernode)
    rank = dopt.rank()
    if rank != 0:
        pretrain = None

    trainer = DLTrainer(rank, nworkers, dist=False, batch_size=batch_size, is_weak_scaling=True, ngpus=1, data_dir=data_dir, dataset=dataset, dnn=dnn, lr=lr, nworkers=nworkers, prefix=prefix+'-ds%s'%str(density), pretrain=pretrain, tb_writer=writer)

    init_epoch = trainer.get_train_epoch()
    init_iter = trainer.get_train_iter()

    trainer.set_train_epoch(comm.bcast(init_epoch))
    trainer.set_train_iter(comm.bcast(init_iter))

    def _error_handler(new_num_workers, new_rank):
        logger.info('Error info caught by trainer')
        trainer.update_nworker(new_num_workers, new_rank)

    compressor = compressor if compression else 'none'
    compressor = compressors[compressor]
    is_sparse = compression

    logger.info('Broadcast parameters....')
    hvd.broadcast_parameters(trainer.net.state_dict(), root_rank=0)
    logger.info('Broadcast parameters finished....')

    norm_clip = None
    optimizer = dopt.DistributedOptimizer(trainer.optimizer, trainer.net.named_parameters(), compression=compressor, is_sparse=is_sparse, err_handler=_error_handler, layerwise_times=None, sigma_scale=sigma_scale, density=density, norm_clip=norm_clip, writer=writer)

    trainer.update_optimizer(optimizer)

    iters_per_epoch = trainer.get_num_of_training_samples() // (nworkers * batch_size * nsteps_update)

    times = []
    NUM_OF_DISPLAY = 40
    display = NUM_OF_DISPLAY if iters_per_epoch > NUM_OF_DISPLAY else iters_per_epoch - 1
    logger.info('Start training ....')
    for epoch in range(max_epochs):
        hidden = None
        if dnn == 'lstm':
            hidden = trainer.net.init_hidden()
        for i in range(iters_per_epoch):
            s = time.time()
            optimizer.zero_grad()
            for j in range(nsteps_update):
                if j < nsteps_update - 1 and nsteps_update > 1:
                    optimizer.local = True
                else:
                    optimizer.local = False
                if dnn == 'lstm':
                    _, hidden = trainer.train(1, hidden=hidden)
                else:
                    trainer.train(1)
            if dnn == 'lstm':
                optimizer.synchronize()
                torch.nn.utils.clip_grad_norm_(trainer.net.parameters(), 0.25)
            elif dnn == 'lstman4':
                optimizer.synchronize()
                torch.nn.utils.clip_grad_norm_(trainer.net.parameters(), 400)
            trainer.update_model()
            times.append(time.time() - s)
            if i % display == 0 and i > 0:
                time_per_iter = np.mean(times)
                logger.info('Time per iteration including communication: %f. Speed: %f images/s, current density: %f', time_per_iter, batch_size * nsteps_update / time_per_iter, optimizer.get_current_density())
                times = []
        optimizer.add_train_epoch()
        if settings.PROFILING_INDEX and rank == 0 and epoch % 10 == 0:
            fn = os.path.join(relative_path, 'index-rank%d-epoch%d.npy' % (rank, epoch))
            key = list(optimizer._allreducer._update_index_counter.keys())[0]
            np.save(fn, optimizer._allreducer._update_index_counter[key].int().cpu().numpy())

        if settings.PROFILING_NORM:
            # For comparison purpose ===>
            fn = os.path.join(relative_path, 'gtopknorm-rank%d-epoch%d.npy' % (rank, epoch))
            fn2 = os.path.join(relative_path, 'randknorm-rank%d-epoch%d.npy' % (rank, epoch))
            fn3 = os.path.join(relative_path, 'upbound-rank%d-epoch%d.npy' % (rank, epoch))
            fn5 = os.path.join(relative_path, 'densestd-rank%d-epoch%d.npy' % (rank, epoch))
            arr = []
            arr2 = []
            arr3 = []
            arr4 = []
            arr5 = []
            for gtopk_norm, randk_norm, upbound, xnorm, dense_std in optimizer._allreducer._profiling_norms:
                arr.append(gtopk_norm)
                arr2.append(randk_norm)
                arr3.append(upbound)
                arr4.append(xnorm)
                arr5.append(dense_std)
            arr = np.array(arr)
            arr2 = np.array(arr2)
            arr3 = np.array(arr3)
            arr4 = np.array(arr4)
            arr5 = np.array(arr5)
            logger.info('[rank:%d][%d] gtopk norm mean: %f, std: %f', rank, epoch, np.mean(arr), np.std(arr))
            logger.info('[rank:%d][%d] randk norm mean: %f, std: %f', rank, epoch, np.mean(arr2), np.std(arr2))
            logger.info('[rank:%d][%d] upbound norm mean: %f, std: %f', rank, epoch, np.mean(arr3), np.std(arr3))
            logger.info('[rank:%d][%d] x norm mean: %f, std: %f', rank, epoch, np.mean(arr4), np.std(arr4))
            logger.info('[rank:%d][%d] dense std mean: %f, std: %f', rank, epoch, np.mean(arr5), np.std(arr5))
            np.save(fn, arr)
            np.save(fn2, arr2)
            np.save(fn3, arr3)
            np.save(fn5, arr5)
            # For comparison purpose <=== End
        optimizer._allreducer._profiling_norms = []
    optimizer.stop()
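
When settings.PROFILING_NORM is enabled, the loop above writes one .npy file per rank and epoch. A small offline-analysis sketch that follows the same naming convention (the path, rank, and epoch values are illustrative):

import os
import numpy as np

def load_norm_profile(relative_path, rank, epoch):
    # Load the gtopk and randk norm arrays saved by robust_ssgd above.
    gtopk = np.load(os.path.join(relative_path, 'gtopknorm-rank%d-epoch%d.npy' % (rank, epoch)))
    randk = np.load(os.path.join(relative_path, 'randknorm-rank%d-epoch%d.npy' % (rank, epoch)))
    print('mean gtopk norm: %f, mean randk norm: %f' % (gtopk.mean(), randk.mean()))
    return gtopk, randk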
Example no. 4
def ssgd(dnn,
         dataset,
         data_dir,
         nworkers,
         lr,
         batch_size,
         nsteps_update,
         max_epochs,
         nwpernode,
         pretrain,
         num_steps,
         compressor,
         density,
         threshold,
         gradient_path=None,
         tb=None,
         iratio=0.1,
         stages=1,
         partitions=0,
         ec_gradw=1.0,
         ec_memw=0.0,
         optimizer='nesterov',
         totaltime=0):
    global SPEED
    if not settings.USE_CPU:
        if nworkers > 1:
            rank = hvd.rank()
            torch.cuda.set_device(hvd.local_rank())
        else:
            rank = 0
            torch.cuda.set_device(rank)
    else:
        # CPU-only run: rank was previously left undefined on this path; derive it
        # so the pretrain/broadcast logic below still works.
        rank = hvd.rank() if nworkers > 1 else 0
    if rank != 0:
        pretrain = None

    #### CHECK whether to use GPU or CPU
    if settings.USE_CPU:
        trainer = DLTrainer(rank,
                            nworkers,
                            dist=False,
                            batch_size=batch_size,
                            is_weak_scaling=True,
                            ngpus=0,
                            data_dir=data_dir,
                            dataset=dataset,
                            dnn=dnn,
                            lr=lr,
                            nworkers=nworkers,
                            prefix='allreduce',
                            pretrain=pretrain,
                            num_steps=num_steps,
                            tb_writer=writer,
                            tb=tb,
                            optimizer_str=optimizer)
    else:
        trainer = DLTrainer(rank,
                            nworkers,
                            dist=False,
                            batch_size=batch_size,
                            is_weak_scaling=True,
                            ngpus=1,
                            data_dir=data_dir,
                            dataset=dataset,
                            dnn=dnn,
                            lr=lr,
                            nworkers=nworkers,
                            prefix='allreduce',
                            pretrain=pretrain,
                            num_steps=num_steps,
                            tb_writer=writer,
                            tb=tb,
                            optimizer_str=optimizer)

    init_epoch = torch.ones(1) * trainer.get_train_epoch()
    init_iter = torch.ones(1) * trainer.get_train_iter()
    trainer.set_train_epoch(int(hvd.broadcast(init_epoch, root_rank=0)[0]))
    trainer.set_train_iter(int(hvd.broadcast(init_iter, root_rank=0)[0]))
    is_sparse = True  #density < 1
    if not is_sparse:
        compressor = None

    if settings.ADAPTIVE_MERGE or settings.ADAPTIVE_SPARSE:
        seq_layernames, layerwise_times, layerwise_sizes = benchmark(trainer)
        layerwise_times = comm.bcast(layerwise_times, root=0)
        if rank == 0:
            logger.info('layerwise backward times: %s', list(layerwise_times))
            logger.info('layerwise backward sizes: %s', list(layerwise_sizes))
        logger.info('Benchmarked backward time: %f', np.sum(layerwise_times))
        logger.info('Model size: %d', np.sum(layerwise_sizes))
    else:
        seq_layernames, layerwise_times, layerwise_sizes = None, None, None

    norm_clip = None
    if dnn == 'lstm':
        norm_clip = 0.25
    elif dnn == 'lstman4':
        norm_clip = 400

    optimizer = hvd.DistributedOptimizer(
        trainer.optimizer,
        named_parameters=trainer.net.named_parameters(),
        compression=compressors[compressor],
        is_sparse=is_sparse,
        density=density,
        seq_layernames=seq_layernames,
        layerwise_times=layerwise_times,
        norm_clip=norm_clip,
        threshold=threshold,
        writer=writer,
        gradient_path=gradient_path,
        tb=tb,
        iratio=iratio,
        stages=stages,
        partitions=partitions,
        ec_gradw=ec_gradw,
        ec_memw=ec_memw)
    hvd.SPEED
    hvd.broadcast_parameters(trainer.net.state_dict(), root_rank=0)
    trainer.update_optimizer(optimizer)
    iters_per_epoch = trainer.get_num_of_training_samples() // (
        nworkers * batch_size * nsteps_update)

    start = time.time()
    times = []
    noupdate_times = []
    logger.info('max_epochs: %d', max_epochs)
    display = settings.DISPLAY if iters_per_epoch > settings.DISPLAY else iters_per_epoch - 1

    for epoch in range(max_epochs):
        hidden = None
        if dnn == 'lstm':
            hidden = trainer.net.init_hidden()
        for i in range(iters_per_epoch):
            s = time.time()
            optimizer.zero_grad()
            for j in range(nsteps_update):
                if j < nsteps_update - 1 and nsteps_update > 1:
                    optimizer.local = True
                else:
                    optimizer.local = False
                if dnn == 'lstm':
                    _, hidden = trainer.train(1, hidden=hidden)
                else:
                    trainer.train(1)
            if dnn == 'lstm':
                optimizer.synchronize()
                torch.nn.utils.clip_grad_norm_(trainer.net.parameters(), 0.25)
            elif dnn == 'lstman4':
                optimizer.synchronize()
                torch.nn.utils.clip_grad_norm_(trainer.net.parameters(), 400)
            noupdate_times.append(time.time() - s)

            trainer.update_model()
            torch.cuda.synchronize()
            times.append(time.time() - s)

            if i % display == 0 and i > 0:
                time_per_iter = np.mean(times)
                update_per_iter = time_per_iter - np.mean(noupdate_times)
                throughput = batch_size * nsteps_update / time_per_iter
                trainer.log_info(time_per_iter, throughput, update_per_iter)
                logger.warning(
                    'Time per iteration: %f, communication: %f, Speed: %f images/s',
                    time_per_iter, update_per_iter, throughput)
                times = []
                noupdate_times = []

        optimizer.increase_one_epoch()

        if totaltime > 0 and time.time() - start > totaltime:
            trainer.test(trainer.get_train_epoch())
            break
    if dataset != 'cifar10':
        trainer.test(trainer.get_train_epoch())
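
All four examples report throughput with the same arithmetic: each logged iteration covers batch_size * nsteps_update samples per worker. A small worked sketch of that calculation (the numbers are made up for illustration):

batch_size, nsteps_update, nworkers = 32, 4, 8
samples_per_iter = batch_size * nsteps_update   # per worker, per optimizer step
global_batch = samples_per_iter * nworkers      # samples folded into one allreduced update
time_per_iter = 0.5                             # seconds, illustrative value
throughput = samples_per_iter / time_per_iter   # images/s, as logged in the loops above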