Example 1
def ssgd(dnn,
         dataset,
         data_dir,
         nworkers,
         lr,
         batch_size,
         nsteps_update,
         max_epochs,
         nwpernode,
         pretrain,
         num_steps,
         compressor,
         density,
         threshold,
         gradient_path=None):
    rank = hvd.rank()
    torch.cuda.set_device(rank % nwpernode)
    if rank != 0:
        pretrain = None
    trainer = DLTrainer(rank,
                        nworkers,
                        dist=False,
                        batch_size=batch_size,
                        is_weak_scaling=True,
                        ngpus=1,
                        data_dir=data_dir,
                        dataset=dataset,
                        dnn=dnn,
                        lr=lr,
                        nworkers=nworkers,
                        prefix='allreduce',
                        pretrain=pretrain,
                        num_steps=num_steps,
                        tb_writer=writer)

    init_epoch = torch.ones(1) * trainer.get_train_epoch()
    init_iter = torch.ones(1) * trainer.get_train_iter()
    trainer.set_train_epoch(int(hvd.broadcast(init_epoch, root_rank=0)[0]))
    trainer.set_train_iter(int(hvd.broadcast(init_iter, root_rank=0)[0]))
    is_sparse = density < 1
    #if not is_sparse:
    #    compressor = None

    if settings.ADAPTIVE_MERGE or settings.ADAPTIVE_SPARSE:
        seq_layernames, layerwise_times, layerwise_sizes = benchmark(trainer)
        layerwise_times = comm.bcast(layerwise_times, root=0)
        if rank == 0:
            logger.info('layerwise backward times: %s', list(layerwise_times))
            logger.info('layerwise backward sizes: %s', list(layerwise_sizes))
        logger.info('Benchmarked backward time: %f', np.sum(layerwise_times))
        logger.info('Model size: %d', np.sum(layerwise_sizes))
    else:
        seq_layernames, layerwise_times, layerwise_sizes = None, None, None

    norm_clip = None
    if dnn == 'lstm':
        norm_clip = 0.25
    elif dnn == 'lstman4':
        norm_clip = 400

    optimizer = hvd.DistributedOptimizer(
        trainer.optimizer,
        named_parameters=trainer.net.named_parameters(),
        compression=compressors[compressor],
        is_sparse=is_sparse,
        density=density,
        seq_layernames=seq_layernames,
        layerwise_times=layerwise_times,
        norm_clip=norm_clip,
        threshold=threshold,
        writer=writer,
        gradient_path=gradient_path)
    hvd.broadcast_parameters(trainer.net.state_dict(), root_rank=0)
    trainer.update_optimizer(optimizer)
    iters_per_epoch = trainer.get_num_of_training_samples() // (
        nworkers * batch_size * nsteps_update)

    times = []
    logger.info('max_epochs: %d', max_epochs)
    display = 40 if iters_per_epoch > 40 else iters_per_epoch - 1
    for epoch in range(max_epochs):
        hidden = None
        if dnn == 'lstm':
            hidden = trainer.net.init_hidden()
        for i in range(iters_per_epoch):
            s = time.time()
            optimizer.zero_grad()
            for j in range(nsteps_update):
                if j < nsteps_update - 1 and nsteps_update > 1:
                    optimizer.local = True
                else:
                    optimizer.local = False
                if dnn == 'lstm':
                    _, hidden = trainer.train(1, hidden=hidden)
                else:
                    trainer.train(1)
            if dnn == 'lstm':
                optimizer.synchronize()
                torch.nn.utils.clip_grad_norm_(trainer.net.parameters(), 0.25)
            elif dnn == 'lstman4':
                optimizer.synchronize()
                torch.nn.utils.clip_grad_norm_(trainer.net.parameters(), 400)
            trainer.update_model()
            times.append(time.time() - s)
            if i % display == 0 and i > 0:
                time_per_iter = np.mean(times)
                logger.warning(
                    'Time per iteration including communication: %f, Speed: %f images/s',
                    time_per_iter, batch_size * nsteps_update / time_per_iter)
                times = []
        optimizer.increase_one_epoch()
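
Example 1 selects its compression scheme with compressors[compressor] when constructing the distributed optimizer. As a rough, self-contained sketch of what one entry in such a registry might look like (the class name, method signatures, and registry layout below are assumptions for illustration, not this project's actual API), a minimal top-k gradient compressor could be:

import torch


class TopKCompressor(object):
    """Hypothetical sketch: keep only the largest-magnitude fraction
    `density` of gradient values; all other entries are treated as zero."""

    @staticmethod
    def compress(tensor, density=0.01):
        flat = tensor.view(-1)
        k = max(1, int(flat.numel() * density))
        # Indices of the k entries with the largest magnitude.
        _, indices = torch.topk(flat.abs(), k)
        values = flat[indices]
        return values, indices

    @staticmethod
    def decompress(values, indices, numel):
        # Rebuild a dense flat tensor, zero outside the selected indices.
        flat = torch.zeros(numel, dtype=values.dtype, device=values.device)
        flat[indices] = values
        return flat


# Hypothetical registry of the kind indexed by compressors[compressor] above.
compressors = {'none': None, 'topk': TopKCompressor}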
Example 2
    batch_size = args.batch_size * args.nsteps_update
    prefix = settings.PREFIX
    if args.density < 1:
        prefix = 'comp-' + args.compressor + '-' + prefix
    logdir = 'allreduce-%s-thres-%dkbytes/%s-n%d-bs%d-lr%.4f-ns%d-ds%s' % (
        prefix, args.threshold / 1024, args.dnn, args.nworkers, batch_size,
        args.lr, args.nsteps_update, str(args.density))
    relative_path = './logs/%s' % logdir
    gradient_relative_path = None
    utils.create_path(relative_path)
    if settings.LOGGING_GRADIENTS:
        gradient_relative_path = '%s/gradients/%s' % (args.saved_dir, logdir)
        utils.create_path(gradient_relative_path)
    rank = 0
    if args.nworkers > 1:
        hvd.init()
        rank = hvd.rank()
    if rank == 0:
        tb_runs = './runs/%s' % logdir
        writer = None  #SummaryWriter(tb_runs)
    logfile = os.path.join(relative_path,
                           settings.hostname + '-' + str(rank) + '.log')
    hdlr = logging.FileHandler(logfile)
    hdlr.setFormatter(formatter)
    logger.addHandler(hdlr)
    logger.info('Configurations: %s', args)
    ssgd(args.dnn, args.dataset, args.data_dir, args.nworkers, args.lr,
         args.batch_size, args.nsteps_update, args.max_epochs, args.nwpernode,
         args.pretrain, args.num_steps, args.compressor, args.density,
         args.threshold, gradient_relative_path)
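
To make the directory naming above concrete, here is how the logdir template expands for one made-up configuration; the values, including the assumed settings.PREFIX of 'baseline', are illustrative only:

# Illustration with made-up values: top-k compressor at 1% density, a 512 KB
# threshold, ResNet-50, 4 workers, batch size 32, lr 0.1, one local step,
# and an assumed settings.PREFIX of 'baseline'.
prefix = 'comp-' + 'topk' + '-' + 'baseline'
logdir = 'allreduce-%s-thres-%dkbytes/%s-n%d-bs%d-lr%.4f-ns%d-ds%s' % (
    prefix, 524288 / 1024, 'resnet50', 4, 32, 0.1, 1, str(0.01))
print(logdir)
# allreduce-comp-topk-baseline-thres-512kbytes/resnet50-n4-bs32-lr0.1000-ns1-ds0.01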
Example 3
def robust_ssgd(dnn, dataset, data_dir, nworkers, lr, batch_size,
                nsteps_update, max_epochs, compression=False,
                compressor='topk', nwpernode=1, sigma_scale=2.5,
                pretrain=None, density=0.01, prefix=None):
    global relative_path

    torch.cuda.set_device(dopt.rank()%nwpernode)
    rank = dopt.rank()
    if rank != 0:
        pretrain = None

    trainer = DLTrainer(rank, nworkers, dist=False, batch_size=batch_size,
                        is_weak_scaling=True, ngpus=1, data_dir=data_dir,
                        dataset=dataset, dnn=dnn, lr=lr, nworkers=nworkers,
                        prefix=prefix + '-ds%s' % str(density),
                        pretrain=pretrain, tb_writer=writer)

    init_epoch = trainer.get_train_epoch()
    init_iter = trainer.get_train_iter()

    trainer.set_train_epoch(comm.bcast(init_epoch))
    trainer.set_train_iter(comm.bcast(init_iter))

    def _error_handler(new_num_workers, new_rank):
        logger.info('Error info caught by trainer')
        trainer.update_nworker(new_num_workers, new_rank)

    compressor = compressor if compression else 'none'
    compressor = compressors[compressor]
    is_sparse = compression

    logger.info('Broadcast parameters....')
    hvd.broadcast_parameters(trainer.net.state_dict(), root_rank=0)
    logger.info('Broadcast parameters finished....')

    norm_clip = None
    optimizer = dopt.DistributedOptimizer(
        trainer.optimizer, trainer.net.named_parameters(),
        compression=compressor, is_sparse=is_sparse,
        err_handler=_error_handler, layerwise_times=None,
        sigma_scale=sigma_scale, density=density,
        norm_clip=norm_clip, writer=writer)

    trainer.update_optimizer(optimizer)

    iters_per_epoch = trainer.get_num_of_training_samples() // (nworkers * batch_size * nsteps_update)

    times = []
    NUM_OF_DISPLAY = 40
    display = NUM_OF_DISPLAY if iters_per_epoch > NUM_OF_DISPLAY else iters_per_epoch - 1
    logger.info('Start training ....')
    for epoch in range(max_epochs):
        hidden = None
        if dnn == 'lstm':
            hidden = trainer.net.init_hidden()
        for i in range(iters_per_epoch):
            s = time.time()
            optimizer.zero_grad()
            for j in range(nsteps_update):
                if j < nsteps_update - 1 and nsteps_update > 1:
                    optimizer.local = True
                else:
                    optimizer.local = False
                if dnn == 'lstm':
                    _, hidden = trainer.train(1, hidden=hidden)
                else:
                    trainer.train(1)
            if dnn == 'lstm':
                optimizer.synchronize()
                torch.nn.utils.clip_grad_norm_(trainer.net.parameters(), 0.25)
            elif dnn == 'lstman4':
                optimizer.synchronize()
                torch.nn.utils.clip_grad_norm_(trainer.net.parameters(), 400)
            trainer.update_model()
            times.append(time.time()-s)
            if i % display == 0 and i > 0: 
                time_per_iter = np.mean(times)
                logger.info(
                    'Time per iteration including communication: %f. Speed: %f images/s, current density: %f',
                    time_per_iter, batch_size * nsteps_update / time_per_iter,
                    optimizer.get_current_density())
                times = []
        optimizer.add_train_epoch()
        if settings.PROFILING_INDEX and rank == 0 and epoch % 10 == 0:
            fn = os.path.join(relative_path, 'index-rank%d-epoch%d.npy' % (rank, epoch))
            key = list(optimizer._allreducer._update_index_counter.keys())[0]
            np.save(fn, optimizer._allreducer._update_index_counter[key].int().cpu().numpy())

        if settings.PROFILING_NORM:
            # For comparison purpose ===>
            fn = os.path.join(relative_path, 'gtopknorm-rank%d-epoch%d.npy' % (rank, epoch))
            fn2 = os.path.join(relative_path, 'randknorm-rank%d-epoch%d.npy' % (rank, epoch))
            fn3 = os.path.join(relative_path, 'upbound-rank%d-epoch%d.npy' % (rank, epoch))
            fn5 = os.path.join(relative_path, 'densestd-rank%d-epoch%d.npy' % (rank, epoch))
            arr = [] 
            arr2 = [] 
            arr3 = [] 
            arr4 = [] 
            arr5 = [] 
            for gtopk_norm, randk_norm, upbound, xnorm, dense_std in optimizer._allreducer._profiling_norms:
                arr.append(gtopk_norm)
                arr2.append(randk_norm)
                arr3.append(upbound)
                arr4.append(xnorm)
                arr5.append(dense_std)
            arr = np.array(arr)
            arr2 = np.array(arr2)
            arr3 = np.array(arr3)
            arr4 = np.array(arr4)
            arr5 = np.array(arr5)
            logger.info('[rank:%d][%d] gtopk norm mean: %f, std: %f', rank, epoch, np.mean(arr), np.std(arr))
            logger.info('[rank:%d][%d] randk norm mean: %f, std: %f', rank, epoch, np.mean(arr2), np.std(arr2))
            logger.info('[rank:%d][%d] upbound norm mean: %f, std: %f', rank, epoch, np.mean(arr3), np.std(arr3))
            logger.info('[rank:%d][%d] x norm mean: %f, std: %f', rank, epoch, np.mean(arr4), np.std(arr4))
            logger.info('[rank:%d][%d] dense std mean: %f, std: %f', rank, epoch, np.mean(arr5), np.std(arr5))
            np.save(fn, arr)
            np.save(fn2, arr2)
            np.save(fn3, arr3)
            np.save(fn5, arr5)
            # For comparison purpose <=== End
        optimizer._allreducer._profiling_norms = []
    optimizer.stop()
Example 4
    parser.add_argument('--dnn', type=str, default='resnet50',
                        choices=_support_dnns,
                        help='Specify the neural network for training')
    parser.add_argument('--data-dir', type=str, default='./data',
                        help='Specify the data root path')
    parser.add_argument('--lr', type=float, default=0.1,
                        help='Default learning rate')
    parser.add_argument('--max-epochs', type=int, default=90,
                        help='Default maximum epochs to train')
    parser.add_argument('--pretrain', type=str, default=None,
                        help='Specify the pretrain path')
    parser.set_defaults(compression=False)
    args = parser.parse_args()
    batch_size = args.batch_size * args.nsteps_update
    prefix = settings.PREFIX
    if args.compression:
        prefix = 'comp-' + args.compressor + '-' + prefix
    logdir = 'allreduce-%s/%s-n%d-bs%d-lr%.4f-ns%d-sg%.2f-ds%s' % (
        prefix, args.dnn, args.nworkers, batch_size, args.lr,
        args.nsteps_update, args.sigma_scale, str(args.density))
    relative_path = './logs/%s' % logdir
    utils.create_path(relative_path)
    rank = 0
    rank = dopt.rank()
    hvd.init()
    if rank == 0:
        tb_runs = './runs/%s' % logdir
        writer = SummaryWriter(tb_runs)
    logfile = os.path.join(relative_path,
                           settings.hostname + '-' + str(rank) + '.log')
    hdlr = logging.FileHandler(logfile)
    hdlr.setFormatter(formatter)
    logger.addHandler(hdlr)
    logger.info('Configurations: %s', args)

    logger.info('Interpreter: %s', sys.version)
    try:
        robust_ssgd(args.dnn, args.dataset, args.data_dir, args.nworkers,
                    args.lr, args.batch_size, args.nsteps_update,
                    args.max_epochs, args.compression, args.compressor,
                    args.nwpernode, args.sigma_scale, args.pretrain,
                    args.density, prefix)
    except Exception as e:
        logger.info('Main thread exception: %s', e)
Example 5
def ssgd(dnn,
         dataset,
         data_dir,
         nworkers,
         lr,
         batch_size,
         nsteps_update,
         max_epochs,
         nwpernode,
         pretrain,
         num_steps,
         compressor,
         density,
         threshold,
         gradient_path=None,
         tb=None,
         iratio=0.1,
         stages=1,
         partitions=0,
         ec_gradw=1.0,
         ec_memw=0.0,
         optimizer='nesterov',
         totaltime=0):
    global SPEED
    rank = 0  # default so the CPU-only path below does not hit an undefined name
    if not settings.USE_CPU:
        if nworkers > 1:
            rank = hvd.rank()
            torch.cuda.set_device(hvd.local_rank())  #%rank%nwpernode)
        else:
            torch.cuda.set_device(rank)
    if rank != 0:
        pretrain = None

    #### CHECK whether to use GPU or CPU
    if settings.USE_CPU:
        trainer = DLTrainer(rank,
                            nworkers,
                            dist=False,
                            batch_size=batch_size,
                            is_weak_scaling=True,
                            ngpus=0,
                            data_dir=data_dir,
                            dataset=dataset,
                            dnn=dnn,
                            lr=lr,
                            nworkers=nworkers,
                            prefix='allreduce',
                            pretrain=pretrain,
                            num_steps=num_steps,
                            tb_writer=writer,
                            tb=tb,
                            optimizer_str=optimizer)
    else:
        trainer = DLTrainer(rank,
                            nworkers,
                            dist=False,
                            batch_size=batch_size,
                            is_weak_scaling=True,
                            ngpus=1,
                            data_dir=data_dir,
                            dataset=dataset,
                            dnn=dnn,
                            lr=lr,
                            nworkers=nworkers,
                            prefix='allreduce',
                            pretrain=pretrain,
                            num_steps=num_steps,
                            tb_writer=writer,
                            tb=tb,
                            optimizer_str=optimizer)

    init_epoch = torch.ones(1) * trainer.get_train_epoch()
    init_iter = torch.ones(1) * trainer.get_train_iter()
    trainer.set_train_epoch(int(hvd.broadcast(init_epoch, root_rank=0)[0]))
    trainer.set_train_iter(int(hvd.broadcast(init_iter, root_rank=0)[0]))
    is_sparse = True  #density < 1
    if not is_sparse:
        compressor = None

    if settings.ADAPTIVE_MERGE or settings.ADAPTIVE_SPARSE:
        seq_layernames, layerwise_times, layerwise_sizes = benchmark(trainer)
        layerwise_times = comm.bcast(layerwise_times, root=0)
        if rank == 0:
            logger.info('layerwise backward times: %s', list(layerwise_times))
            logger.info('layerwise backward sizes: %s', list(layerwise_sizes))
        logger.info('Benchmarked backward time: %f', np.sum(layerwise_times))
        logger.info('Model size: %d', np.sum(layerwise_sizes))
    else:
        seq_layernames, layerwise_times, layerwise_sizes = None, None, None

    norm_clip = None
    if dnn == 'lstm':
        norm_clip = 0.25
    elif dnn == 'lstman4':
        norm_clip = 400

    optimizer = hvd.DistributedOptimizer(
        trainer.optimizer,
        named_parameters=trainer.net.named_parameters(),
        compression=compressors[compressor],
        is_sparse=is_sparse,
        density=density,
        seq_layernames=seq_layernames,
        layerwise_times=layerwise_times,
        norm_clip=norm_clip,
        threshold=threshold,
        writer=writer,
        gradient_path=gradient_path,
        tb=tb,
        iratio=iratio,
        stages=stages,
        partitions=partitions,
        ec_gradw=ec_gradw,
        ec_memw=ec_memw)
    hvd.SPEED
    hvd.broadcast_parameters(trainer.net.state_dict(), root_rank=0)
    trainer.update_optimizer(optimizer)
    iters_per_epoch = trainer.get_num_of_training_samples() // (
        nworkers * batch_size * nsteps_update)

    start = time.time()
    times = []
    noupdate_times = []
    logger.info('max_epochs: %d', max_epochs)
    display = settings.DISPLAY if iters_per_epoch > settings.DISPLAY else iters_per_epoch - 1

    for epoch in range(max_epochs):
        hidden = None
        if dnn == 'lstm':
            hidden = trainer.net.init_hidden()
        for i in range(iters_per_epoch):
            s = time.time()
            optimizer.zero_grad()
            for j in range(nsteps_update):
                if j < nsteps_update - 1 and nsteps_update > 1:
                    optimizer.local = True
                else:
                    optimizer.local = False
                if dnn == 'lstm':
                    _, hidden = trainer.train(1, hidden=hidden)
                else:
                    trainer.train(1)
            if dnn == 'lstm':
                optimizer.synchronize()
                torch.nn.utils.clip_grad_norm_(trainer.net.parameters(), 0.25)
            elif dnn == 'lstman4':
                optimizer.synchronize()
                torch.nn.utils.clip_grad_norm_(trainer.net.parameters(), 400)
            noupdate_times.append(time.time() - s)

            trainer.update_model()
            torch.cuda.synchronize()
            times.append(time.time() - s)

            if i % display == 0 and i > 0:
                time_per_iter = np.mean(times)
                update_per_iter = time_per_iter - np.mean(noupdate_times)
                throughput = batch_size * nsteps_update / time_per_iter
                trainer.log_info(time_per_iter, throughput, update_per_iter)
                logger.warning(
                    'Time per iteration: %f, communication: %f, Speed: %f images/s',
                    time_per_iter, update_per_iter, throughput)
                times = []
                noupdate_times = []

        optimizer.increase_one_epoch()

        if totaltime > 0 and time.time() - start > totaltime:
            trainer.test(trainer.get_train_epoch())
            break
    if dataset != 'cifar10':
        trainer.test(trainer.get_train_epoch())
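
The display block in Example 5 reports time per iteration, an estimate of the update/communication cost (the difference between the full-iteration time and the time measured before trainer.update_model()), and throughput. A tiny worked example with made-up timings shows how those three numbers relate:

import numpy as np

# Made-up measurements for one display window (seconds).
batch_size, nsteps_update = 32, 1
times = [0.26, 0.24, 0.25]           # full iteration, including update_model()
noupdate_times = [0.20, 0.19, 0.21]  # forward/backward and synchronize only

time_per_iter = np.mean(times)                             # ~0.25 s
update_per_iter = time_per_iter - np.mean(noupdate_times)  # ~0.05 s
throughput = batch_size * nsteps_update / time_per_iter    # ~128 images/s
print(time_per_iter, update_per_iter, throughput)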
Example 6
        prefix = 'comp-' + args.compressor + '-' + prefix
        if momentum_correction:
            prefix = 'mc-' + prefix
    logdir = 'allreduce-%s-thres-%dkbytes/%s-n%d-bs%d-lr%.4f-ns%d-ds%s' % (
        prefix, args.threshold / 1024, args.dnn, args.nworkers, batch_size,
        args.lr, args.nsteps_update, str(args.density))
    relative_path = './logs/%s' % logdir
    gradient_relative_path = None
    utils.create_path(relative_path)
    if settings.LOGGING_GRADIENTS:
        gradient_relative_path = '%s/gradients/%s' % (args.saved_dir, logdir)
        utils.create_path(gradient_relative_path)
    rank = 0
    if args.nworkers > 1:
        hvd.init()
        rank = hvd.rank()
    if rank == 0:
        tb_runs = './runs/%s' % logdir
        writer = None  #SummaryWriter(tb_runs)
    logfile = os.path.join(relative_path,
                           settings.hostname + '-' + str(rank) + '.log')
    hdlr = logging.FileHandler(logfile)
    hdlr.setFormatter(formatter)
    logger.addHandler(hdlr)
    logger.info('Configurations: %s', args)

    if hvd.rank() == 0:
        wandb.init(project='gtopk',
                   entity='shyhuai',
                   name=logfile,
                   config=args)