Example 1
def evaluate(model_path, dnn, dataset, data_dir, nepochs, allreduce=False):
    items = model_path.split('/')[-1].split('-')
    dnn = items[0]
    lr = float(items[-1][2:])
    batch_size = int(items[2][2:])
    #batch_size = 1 #int(items[2][2:])
    rank = 0
    nworkers = 1

    trainer = DLTrainer(rank,
                        1,
                        dist=False,
                        ngpus=1,
                        batch_size=batch_size,
                        is_weak_scaling=True,
                        dataset=dataset,
                        dnn=dnn,
                        data_dir=data_dir,
                        lr=lr,
                        nworkers=nworkers)
    best_acc = 0.0
    start_epoch = 1
    for i in range(start_epoch, nepochs + 1):
        filename = '%s-rank%d-epoch%d.pth' % (dnn, rank, i)
        fn = os.path.join(model_path, filename)
        if i == nepochs and not allreduce and False:  # 'and False' keeps this model-averaging branch disabled
            trainers = []
            for j in range(nworkers):
                filename = '%s-rank%d-epoch%d.pth' % (dnn, j, i)
                fn = os.path.join(model_path, filename)
                tr = DLTrainer(rank,
                               1,
                               dist=False,
                               ngpus=1,
                               batch_size=batch_size,
                               is_weak_scaling=True,
                               dataset=dataset,
                               dnn=dnn,
                               data_dir=data_dir,
                               lr=lr,
                               nworkers=nworkers)
                tr.load_model_from_file(fn)
                trainers.append(tr)
            model_average(trainers)
            trainer = trainers[0]
        else:
            trainer.load_model_from_file(fn)
        acc = trainer.test(i)
        if i == start_epoch:
            best_acc = acc
        else:
            if dnn in ['lstm', 'lstman4']:  # the lower the better
                if best_acc > acc:
                    best_acc = acc
            else:
                if best_acc < acc:
                    best_acc = acc
        logger.info('Best validation accuracy or perplexity: %f', best_acc)
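The function above recovers its hyper-parameters from the checkpoint directory name rather than from its arguments. Below is a minimal standalone sketch of that parsing with a hypothetical directory name; only the field order and the 'bs'/'lr' prefixes are inferred from the code itself.

# Hypothetical checkpoint directory name; the parsing mirrors evaluate() above.
model_path = '/tmp/checkpoints/resnet20-n4-bs32-lr0.1'

items = model_path.split('/')[-1].split('-')
dnn = items[0]                  # 'resnet20'
batch_size = int(items[2][2:])  # 32  (strip the 'bs' prefix)
lr = float(items[-1][2:])       # 0.1 (strip the 'lr' prefix)
print(dnn, batch_size, lr)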
Example 2
def ssgd(dnn,
         dataset,
         data_dir,
         nworkers,
         lr,
         batch_size,
         nsteps_update,
         max_epochs,
         nwpernode,
         pretrain,
         num_steps,
         compressor,
         density,
         threshold,
         gradient_path=None):
    rank = hvd.rank()
    torch.cuda.set_device(rank % nwpernode)
    if rank != 0:
        pretrain = None
    trainer = DLTrainer(rank,
                        nworkers,
                        dist=False,
                        batch_size=batch_size,
                        is_weak_scaling=True,
                        ngpus=1,
                        data_dir=data_dir,
                        dataset=dataset,
                        dnn=dnn,
                        lr=lr,
                        nworkers=nworkers,
                        prefix='allreduce',
                        pretrain=pretrain,
                        num_steps=num_steps,
                        tb_writer=writer)

    init_epoch = torch.ones(1) * trainer.get_train_epoch()
    init_iter = torch.ones(1) * trainer.get_train_iter()
    trainer.set_train_epoch(int(hvd.broadcast(init_epoch, root_rank=0)[0]))
    trainer.set_train_iter(int(hvd.broadcast(init_iter, root_rank=0)[0]))
    is_sparse = density < 1
    #if not is_sparse:
    #    compressor = None

    if settings.ADAPTIVE_MERGE or settings.ADAPTIVE_SPARSE:
        seq_layernames, layerwise_times, layerwise_sizes = benchmark(trainer)
        layerwise_times = comm.bcast(layerwise_times, root=0)
        if rank == 0:
            logger.info('layerwise backward times: %s', list(layerwise_times))
            logger.info('layerwise backward sizes: %s', list(layerwise_sizes))
        logger.info('Benchmarked backward time: %f', np.sum(layerwise_times))
        logger.info('Model size: %d', np.sum(layerwise_sizes))
    else:
        seq_layernames, layerwise_times, layerwise_sizes = None, None, None

    norm_clip = None
    if dnn == 'lstm':
        norm_clip = 0.25
    elif dnn == 'lstman4':
        norm_clip = 400

    optimizer = hvd.DistributedOptimizer(
        trainer.optimizer,
        named_parameters=trainer.net.named_parameters(),
        compression=compressors[compressor],
        is_sparse=is_sparse,
        density=density,
        seq_layernames=seq_layernames,
        layerwise_times=layerwise_times,
        norm_clip=norm_clip,
        threshold=threshold,
        writer=writer,
        gradient_path=gradient_path)
    hvd.broadcast_parameters(trainer.net.state_dict(), root_rank=0)
    trainer.update_optimizer(optimizer)
    iters_per_epoch = trainer.get_num_of_training_samples() // (
        nworkers * batch_size * nsteps_update)

    times = []
    logger.info('max_epochs: %d', max_epochs)
    display = 40 if iters_per_epoch > 40 else iters_per_epoch - 1
    for epoch in range(max_epochs):
        hidden = None
        if dnn == 'lstm':
            hidden = trainer.net.init_hidden()
        for i in range(iters_per_epoch):
            s = time.time()
            optimizer.zero_grad()
            for j in range(nsteps_update):
                if j < nsteps_update - 1 and nsteps_update > 1:
                    optimizer.local = True
                else:
                    optimizer.local = False
                if dnn == 'lstm':
                    _, hidden = trainer.train(1, hidden=hidden)
                else:
                    trainer.train(1)
            if dnn == 'lstm':
                optimizer.synchronize()
                torch.nn.utils.clip_grad_norm_(trainer.net.parameters(), 0.25)
            elif dnn == 'lstman4':
                optimizer.synchronize()
                torch.nn.utils.clip_grad_norm_(trainer.net.parameters(), 400)
            trainer.update_model()
            times.append(time.time() - s)
            if i % display == 0 and i > 0:
                time_per_iter = np.mean(times)
                logger.warning(
                    'Time per iteration including communication: %f, Speed: %f images/s',
                    time_per_iter, batch_size * nsteps_update / time_per_iter)
                times = []
        optimizer.increase_one_epoch()
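In the inner loop, optimizer.local is set on all but the last of the nsteps_update micro-batches, presumably so the repo's modified DistributedOptimizer skips communication for those local steps. Here is a plain-PyTorch sketch of the underlying gradient-accumulation pattern; the toy model and random data are placeholders, not part of the repo.

import torch

# Gradients from nsteps_update micro-batches are summed locally before a single
# optimizer step, so the effective batch size is batch_size * nsteps_update.
model = torch.nn.Linear(10, 2)
optimizer = torch.optim.SGD(model.parameters(), lr=0.1)
loss_fn = torch.nn.CrossEntropyLoss()
batch_size, nsteps_update = 32, 4

optimizer.zero_grad()
for j in range(nsteps_update):
    x = torch.randn(batch_size, 10)
    y = torch.randint(0, 2, (batch_size,))
    loss = loss_fn(model(x), y)
    loss.backward()   # gradients accumulate across micro-batches
optimizer.step()      # one update for the whole effective batch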
Example 3
def ssgd_with_horovod(dnn, dataset, data_dir, nworkers, lr, batch_size, nsteps_update, max_epochs, nwpernode, pretrain, num_steps=1):
    rank = hvd.rank()
    torch.cuda.set_device(rank%nwpernode)
    if rank != 0:
        pretrain = None
    trainer = DLTrainer(rank, nworkers, dist=False, batch_size=batch_size, is_weak_scaling=True, ngpus=1, data_dir=data_dir, dataset=dataset, dnn=dnn, lr=lr, nworkers=nworkers, prefix='allreduce', pretrain=pretrain, num_steps=num_steps, tb_writer=writer)

    init_epoch = torch.ones(1) * trainer.get_train_epoch()
    init_iter = torch.ones(1) * trainer.get_train_iter()
    trainer.set_train_epoch(int(hvd.broadcast(init_epoch, root_rank=0)[0]))
    trainer.set_train_iter(int(hvd.broadcast(init_iter, root_rank=0)[0]))

    optimizer = hvd.DistributedOptimizer(trainer.optimizer, named_parameters=trainer.net.named_parameters())
    hvd.broadcast_parameters(trainer.net.state_dict(), root_rank=0)
    trainer.update_optimizer(optimizer)
    iters_per_epoch = trainer.get_num_of_training_samples() // (nworkers * batch_size * nsteps_update)

    times = []
    display = 20 if iters_per_epoch > 20 else iters_per_epoch-1
    for epoch in range(max_epochs):
        hidden = None
        if dnn == 'lstm':
            hidden = trainer.net.init_hidden()
        for i in range(iters_per_epoch):
            s = time.time()
            optimizer.zero_grad()
            for j in range(nsteps_update):
                if j < nsteps_update - 1 and nsteps_update > 1:
                    optimizer.local = True
                else:
                    optimizer.local = False
                if dnn == 'lstm':
                    _, hidden = trainer.train(1, hidden=hidden)
                else:
                    trainer.train(1)
            if dnn == 'lstm':
                optimizer.synchronize()
                torch.nn.utils.clip_grad_norm_(trainer.net.parameters(), 0.25)
            elif dnn == 'lstman4':
                optimizer.synchronize()
                torch.nn.utils.clip_grad_norm_(trainer.net.parameters(), 400)
            trainer.update_model()
            times.append(time.time()-s)
            if i % display == 0 and i > 0: 
                time_per_iter = np.mean(times)
                logger.info('Time per iteration including communication: %f. Speed: %f images/s', time_per_iter, batch_size * nsteps_update / time_per_iter)
                times = []
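The arithmetic behind iters_per_epoch in both Horovod variants: under weak scaling each of the nworkers processes consumes its own batch_size * nsteps_update samples per iteration, so the per-epoch iteration count divides the dataset size by the global batch. A quick worked example, using 50,000 samples (CIFAR-10) purely as an illustration:

num_samples = 50000  # e.g. CIFAR-10 training set (illustrative)
nworkers, batch_size, nsteps_update = 4, 32, 1
iters_per_epoch = num_samples // (nworkers * batch_size * nsteps_update)
print(iters_per_epoch)  # 390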
Example 4
def robust_ssgd(dnn, dataset, data_dir, nworkers, lr, batch_size, nsteps_update, max_epochs, compression=False, compressor='topk', nwpernode=1, sigma_scale=2.5, pretrain=None, density=0.01, prefix=None):
    global relative_path

    torch.cuda.set_device(dopt.rank()%nwpernode)
    rank = dopt.rank()
    if rank != 0:
        pretrain = None

    trainer = DLTrainer(rank, nworkers, dist=False, batch_size=batch_size, is_weak_scaling=True, ngpus=1, data_dir=data_dir, dataset=dataset, dnn=dnn, lr=lr, nworkers=nworkers, prefix=prefix+'-ds%s'%str(density), pretrain=pretrain, tb_writer=writer)

    init_epoch = trainer.get_train_epoch()
    init_iter = trainer.get_train_iter()

    trainer.set_train_epoch(comm.bcast(init_epoch))
    trainer.set_train_iter(comm.bcast(init_iter))

    def _error_handler(new_num_workers, new_rank):
        logger.info('Error info caught by trainer')
        trainer.update_nworker(new_num_workers, new_rank)

    compressor = compressor if compression else 'none'
    compressor = compressors[compressor]
    is_sparse = compression

    logger.info('Broadcast parameters....')
    hvd.broadcast_parameters(trainer.net.state_dict(), root_rank=0)
    logger.info('Broadcast parameters finished....')

    norm_clip = None
    optimizer = dopt.DistributedOptimizer(trainer.optimizer, trainer.net.named_parameters(), compression=compressor, is_sparse=is_sparse, err_handler=_error_handler, layerwise_times=None, sigma_scale=sigma_scale, density=density, norm_clip=norm_clip, writer=writer)

    trainer.update_optimizer(optimizer)

    iters_per_epoch = trainer.get_num_of_training_samples() // (nworkers * batch_size * nsteps_update)

    times = []
    NUM_OF_DISPLAY = 40
    display = NUM_OF_DISPLAY if iters_per_epoch > NUM_OF_DISPLAY else iters_per_epoch-1
    logger.info('Start training ....')
    for epoch in range(max_epochs):
        hidden = None
        if dnn == 'lstm':
            hidden = trainer.net.init_hidden()
        for i in range(iters_per_epoch):
            s = time.time()
            optimizer.zero_grad()
            for j in range(nsteps_update):
                if j < nsteps_update - 1 and nsteps_update > 1:
                    optimizer.local = True
                else:
                    optimizer.local = False
                if dnn == 'lstm':
                    _, hidden = trainer.train(1, hidden=hidden)
                else:
                    trainer.train(1)
            if dnn == 'lstm':
                optimizer.synchronize()
                torch.nn.utils.clip_grad_norm_(trainer.net.parameters(), 0.25)
            elif dnn == 'lstman4':
                optimizer.synchronize()
                torch.nn.utils.clip_grad_norm_(trainer.net.parameters(), 400)
            trainer.update_model()
            times.append(time.time()-s)
            if i % display == 0 and i > 0: 
                time_per_iter = np.mean(times)
                logger.info('Time per iteration including communication: %f. Speed: %f images/s, current density: %f', time_per_iter, batch_size * nsteps_update / time_per_iter, optimizer.get_current_density())
                times = []
        optimizer.add_train_epoch()
        if settings.PROFILING_INDEX and rank == 0 and epoch % 10 == 0:
            fn = os.path.join(relative_path, 'index-rank%d-epoch%d.npy' % (rank, epoch))
            key = list(optimizer._allreducer._update_index_counter.keys())[0]
            np.save(fn, optimizer._allreducer._update_index_counter[key].int().cpu().numpy())

        if settings.PROFILING_NORM:
            # For comparison purpose ===>
            fn = os.path.join(relative_path, 'gtopknorm-rank%d-epoch%d.npy' % (rank, epoch))
            fn2 = os.path.join(relative_path, 'randknorm-rank%d-epoch%d.npy' % (rank, epoch))
            fn3 = os.path.join(relative_path, 'upbound-rank%d-epoch%d.npy' % (rank, epoch))
            fn5 = os.path.join(relative_path, 'densestd-rank%d-epoch%d.npy' % (rank, epoch))
            arr = [] 
            arr2 = [] 
            arr3 = [] 
            arr4 = [] 
            arr5 = [] 
            for gtopk_norm, randk_norm, upbound, xnorm, dense_std in optimizer._allreducer._profiling_norms:
                arr.append(gtopk_norm)
                arr2.append(randk_norm)
                arr3.append(upbound)
                arr4.append(xnorm)
                arr5.append(dense_std)
            arr = np.array(arr)
            arr2 = np.array(arr2)
            arr3 = np.array(arr3)
            arr4 = np.array(arr4)
            arr5 = np.array(arr5)
            logger.info('[rank:%d][%d] gtopk norm mean: %f, std: %f', rank, epoch, np.mean(arr), np.std(arr))
            logger.info('[rank:%d][%d] randk norm mean: %f, std: %f', rank, epoch, np.mean(arr2), np.std(arr2))
            logger.info('[rank:%d][%d] upbound norm mean: %f, std: %f', rank, epoch, np.mean(arr3), np.std(arr3))
            logger.info('[rank:%d][%d] x norm mean: %f, std: %f', rank, epoch, np.mean(arr4), np.std(arr4))
            logger.info('[rank:%d][%d] dense std mean: %f, std: %f', rank, epoch, np.mean(arr5), np.std(arr5))
            np.save(fn, arr)
            np.save(fn2, arr2)
            np.save(fn3, arr3)
            np.save(fn5, arr5)
            # For comparison purpose <=== End
        optimizer._allreducer._profiling_norms = []
    optimizer.stop()
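The density argument controls how aggressively gradients are sparsified by the 'topk'-style compressor. The sketch below only illustrates that idea in plain PyTorch; it is not the repo's compressors['topk'] implementation.

import torch

def topk_sparsify(grad, density=0.01):
    # Keep the density * numel largest-magnitude entries, zero the rest.
    k = max(1, int(grad.numel() * density))
    flat = grad.flatten()
    _, idx = torch.topk(flat.abs(), k)
    sparse = torch.zeros_like(flat)
    sparse[idx] = flat[idx]
    return sparse.view_as(grad)

g = torch.randn(1000)
print((topk_sparsify(g, density=0.01) != 0).sum().item())  # 10 non-zero entries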
Example 5
def ssgd(dnn,
         dataset,
         data_dir,
         nworkers,
         lr,
         batch_size,
         nsteps_update,
         max_epochs,
         nwpernode,
         pretrain,
         num_steps,
         compressor,
         density,
         threshold,
         gradient_path=None,
         tb=None,
         iratio=0.1,
         stages=1,
         partitions=0,
         ec_gradw=1.0,
         ec_memw=0.0,
         optimizer='nesterov',
         totaltime=0):
    global SPEED
    rank = 0  # default rank; avoids a NameError below when settings.USE_CPU is set
    if not settings.USE_CPU:
        if nworkers > 1:
            rank = hvd.rank()
            torch.cuda.set_device(hvd.local_rank())  #%rank%nwpernode)
        else:
            torch.cuda.set_device(rank)
    if rank != 0:
        pretrain = None

    #### CHECK whether to use GPU or CPU
    if settings.USE_CPU:
        trainer = DLTrainer(rank,
                            nworkers,
                            dist=False,
                            batch_size=batch_size,
                            is_weak_scaling=True,
                            ngpus=0,
                            data_dir=data_dir,
                            dataset=dataset,
                            dnn=dnn,
                            lr=lr,
                            nworkers=nworkers,
                            prefix='allreduce',
                            pretrain=pretrain,
                            num_steps=num_steps,
                            tb_writer=writer,
                            tb=tb,
                            optimizer_str=optimizer)
    else:
        trainer = DLTrainer(rank,
                            nworkers,
                            dist=False,
                            batch_size=batch_size,
                            is_weak_scaling=True,
                            ngpus=1,
                            data_dir=data_dir,
                            dataset=dataset,
                            dnn=dnn,
                            lr=lr,
                            nworkers=nworkers,
                            prefix='allreduce',
                            pretrain=pretrain,
                            num_steps=num_steps,
                            tb_writer=writer,
                            tb=tb,
                            optimizer_str=optimizer)

    init_epoch = torch.ones(1) * trainer.get_train_epoch()
    init_iter = torch.ones(1) * trainer.get_train_iter()
    trainer.set_train_epoch(int(hvd.broadcast(init_epoch, root_rank=0)[0]))
    trainer.set_train_iter(int(hvd.broadcast(init_iter, root_rank=0)[0]))
    is_sparse = True  #density < 1
    if not is_sparse:
        compressor = None

    if settings.ADAPTIVE_MERGE or settings.ADAPTIVE_SPARSE:
        seq_layernames, layerwise_times, layerwise_sizes = benchmark(trainer)
        layerwise_times = comm.bcast(layerwise_times, root=0)
        if rank == 0:
            logger.info('layerwise backward times: %s', list(layerwise_times))
            logger.info('layerwise backward sizes: %s', list(layerwise_sizes))
        logger.info('Benchmarked backward time: %f', np.sum(layerwise_times))
        logger.info('Model size: %d', np.sum(layerwise_sizes))
    else:
        seq_layernames, layerwise_times, layerwise_sizes = None, None, None

    norm_clip = None
    if dnn == 'lstm':
        norm_clip = 0.25
    elif dnn == 'lstman4':
        norm_clip = 400

    optimizer = hvd.DistributedOptimizer(
        trainer.optimizer,
        named_parameters=trainer.net.named_parameters(),
        compression=compressors[compressor],
        is_sparse=is_sparse,
        density=density,
        seq_layernames=seq_layernames,
        layerwise_times=layerwise_times,
        norm_clip=norm_clip,
        threshold=threshold,
        writer=writer,
        gradient_path=gradient_path,
        tb=tb,
        iratio=iratio,
        stages=stages,
        partitions=partitions,
        ec_gradw=ec_gradw,
        ec_memw=ec_memw)
    hvd.SPEED
    hvd.broadcast_parameters(trainer.net.state_dict(), root_rank=0)
    trainer.update_optimizer(optimizer)
    iters_per_epoch = trainer.get_num_of_training_samples() // (
        nworkers * batch_size * nsteps_update)

    start = time.time()
    times = []
    noupdate_times = []
    logger.info('max_epochs: %d', max_epochs)
    display = settings.DISPLAY if iters_per_epoch > settings.DISPLAY else iters_per_epoch - 1

    for epoch in range(max_epochs):
        hidden = None
        if dnn == 'lstm':
            hidden = trainer.net.init_hidden()
        for i in range(iters_per_epoch):
            s = time.time()
            optimizer.zero_grad()
            for j in range(nsteps_update):
                if j < nsteps_update - 1 and nsteps_update > 1:
                    optimizer.local = True
                else:
                    optimizer.local = False
                if dnn == 'lstm':
                    _, hidden = trainer.train(1, hidden=hidden)
                else:
                    trainer.train(1)
            if dnn == 'lstm':
                optimizer.synchronize()
                torch.nn.utils.clip_grad_norm_(trainer.net.parameters(), 0.25)
            elif dnn == 'lstman4':
                optimizer.synchronize()
                torch.nn.utils.clip_grad_norm_(trainer.net.parameters(), 400)
            noupdate_times.append(time.time() - s)

            trainer.update_model()
            torch.cuda.synchronize()
            times.append(time.time() - s)

            if i % display == 0 and i > 0:
                time_per_iter = np.mean(times)
                update_per_iter = time_per_iter - np.mean(noupdate_times)
                throughput = batch_size * nsteps_update / time_per_iter
                trainer.log_info(time_per_iter, throughput, update_per_iter)
                logger.warning(
                    'Time per iteration: %f, communication: %f, Speed: %f images/s',
                    time_per_iter, update_per_iter, throughput)
                times = []
                noupdate_times = []

        optimizer.increase_one_epoch()

        if totaltime > 0 and time.time() - start > totaltime:
            trainer.test(trainer.get_train_epoch())
            break
    if dataset != 'cifar10':
        trainer.test(trainer.get_train_epoch())
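This variant times forward+backward (noupdate_times) and the full iteration (times) separately, so their difference approximates the communication and update cost reported via trainer.log_info. A tiny worked example with made-up timings:

import numpy as np

batch_size, nsteps_update = 32, 1
times = [0.25, 0.25, 0.25]              # full iteration times (s), illustrative
noupdate_times = [0.125, 0.125, 0.125]  # forward+backward only (s)

time_per_iter = np.mean(times)                              # 0.25
update_per_iter = time_per_iter - np.mean(noupdate_times)   # 0.125
throughput = batch_size * nsteps_update / time_per_iter     # 128.0 images/s
print(time_per_iter, update_per_iter, throughput)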
Example 6
def ssgd_with_horovod(job, mode, start_dt):

    rank = hvd.rank()
    gpu_id = -1
    if job.cuda:
        ngpus = 1
        gpu_id = job.device_ids[rank]
        torch.cuda.set_device(gpu_id)
    else:
        ngpus = -1

    if rank != 0:
        pretrain = None

    trainer = DLTrainer(rank,
                        dist=False,
                        batch_size=job.batch_size,
                        is_weak_scaling=True,
                        ngpus=ngpus,
                        data_dir=job.data_dir,
                        dataset=job.dataset,
                        dnn=job.dnn,
                        lr=job.lr,
                        nworkers=job.nworkers,
                        prefix='allreduce')

    if mode == 'simulate':
        # integer division: torch.rand needs an int size; the tensor holds
        # job.model_size MiB of float32 values (4 rows x model_size*2**20/16 elements)
        synt_model = torch.rand(4, job.model_size * (2**20) // 4 // 4)

    init_epoch = torch.ones(1) * trainer.get_train_epoch()
    init_iter = torch.ones(1) * trainer.get_train_iter()
    trainer.set_train_epoch(int(hvd.broadcast(init_epoch, root_rank=0)[0]))
    trainer.set_train_iter(int(hvd.broadcast(init_iter, root_rank=0)[0]))

    optimizer = hvd.DistributedOptimizer(
        trainer.optimizer, named_parameters=trainer.net.named_parameters())
    hvd.broadcast_parameters(trainer.net.state_dict(), root_rank=0)
    trainer.update_optimizer(optimizer)
    #iters_per_epoch = trainer.get_num_of_training_samples() // (nworkers * batch_size * nsteps_update)

    times = []

    #display = 20 if iters_per_epoch > 20 else iters_per_epoch-1
    display = 1
    nsteps_update = job.nsteps_update

    # to integrate in LSTM code
    hidden = None
    dnn = job.dnn
    if dnn == 'lstm':
        hidden = trainer.net.init_hidden()

    logger.info(
        "(Server %s, Job %d, Rank %d, GPU %d) Wait until start time: %s." %
        (settings.hostname, job.job_id, rank, gpu_id, start_dt))
    pause.until(start_dt)

    start_slot = time.time()
    for i in range(job.iters):

        s = time.time()
        optimizer.zero_grad()
        optimizer.local = False
        # forward
        time.sleep(job.get_forward_schedule(rank, i) * 0.001)
        fw_start_slot = int((time.time() - start_slot) * 1000)
        if mode == 'simulate':
            time.sleep((job.fw_time) * 0.001)
        else:
            #optimizer.local = True
            if dnn == 'lstm':
                #print(hidden)
                #print(" =========== j : %d ===========", j)
                _, hidden = trainer.train_forward(1, hidden=hidden)
            else:
                trainer.train_forward(1)
                #trainer.train(1)
        fw_end_slot = int((time.time() - start_slot) * 1000)
        logger.info(
            "(Server %s, Job %d, Rank %d, GPU %d) Forward task %d started at slot=%d, ended at slot=%d, duration=%d."
            % (settings.hostname, job.job_id, rank, gpu_id, i, fw_start_slot,
               fw_end_slot, fw_end_slot - fw_start_slot))

        # backward
        time.sleep(job.get_backward_schedule(rank, i) * 0.001)
        bw_start_slot = int((time.time() - start_slot) * 1000)
        if mode == 'simulate':
            time.sleep((job.bw_time) * 0.001)
        else:
            trainer.train_backward(1)
            #trainer.train(1)
            pass
        bw_end_slot = int((time.time() - start_slot) * 1000)
        logger.info(
            "(Server %s, Job %d, Rank %d, GPU %d) Backward task %d started at slot=%d, ended at slot=%d, duration=%d."
            % (settings.hostname, job.job_id, rank, gpu_id, i, bw_start_slot,
               bw_end_slot, bw_end_slot - bw_start_slot))

        # communication
        time.sleep(job.get_communication_schedule(rank, i) * 0.001)
        comm_start_slot = int((time.time() - start_slot) * 1000)
        if mode == 'simulate':
            hvd.allreduce(synt_model)
            pass
        else:
            trainer.update_model()
        comm_end_slot = int((time.time() - start_slot) * 1000)
        logger.info(
            "(Server %s, Job %d, Rank %d, GPU %d) Comm task %d started at slot=%d, ended at slot=%d, duration=%d."
            % (settings.hostname, job.job_id, rank, gpu_id, i, comm_start_slot,
               comm_end_slot, comm_end_slot - comm_start_slot))

        times.append(time.time() - s)
        #if i % display == 0 and i > 0:
        #    time_per_iter = np.mean(times)
        #    logger.info('Time per iteration including communication: %f. Speed: %f images/s', time_per_iter, job.batch_size * nsteps_update / time_per_iter)
        #    times = []

    end_slot = time.time()
    logger.info(
        "(Server %s, Job %d, Rank %d, GPU %d) Job ended. Total time is %s ms." %
        (settings.hostname, job.job_id, rank, gpu_id,
         int((end_slot - start_slot) * 1000)))
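In 'simulate' mode the all-reduce traffic of a real model is mimicked with a random tensor sized from job.model_size (in MiB). A small sizing check, using an arbitrary model_size chosen for illustration:

import torch

model_size = 8  # MiB, arbitrary illustration
# A float32 tensor of shape (4, model_size * 2**20 // 16) occupies exactly model_size MiB.
synt_model = torch.rand(4, model_size * (2**20) // 4 // 4)
print(synt_model.numel() * 4 / 2**20)  # 8.0 (MiB of float32 data)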