Code example #1
def init(args):
    log.set_rank(args.rank)
    if args.output_dir is not None:
        log.set_directory(args.output_dir)
    log.set_level(args.verbosity)

    args = validate_args(args)

    if args.apex:
        # Importing here makes a missing Apex installation fail early when requested
        from apex import amp

    log.info('Configurations:\n' + pformat(args.__dict__))

    log.info('world_size = %d, batch_size = %d, device = %s, backend = %s',
             args.world_size, args.batch_size, args.device, args.backend)

    if not args.cpu:
        torch.cuda.set_device(args.local_rank)
        torch.backends.cudnn.benchmark = True

    if args.deterministic:
        # Seed every RNG with the process rank so runs are reproducible.
        # Note: cudnn.benchmark enabled above can still introduce non-determinism.
        torch.manual_seed(args.rank)
        np.random.seed(args.rank)
        torch.backends.cudnn.deterministic = True
        torch.cuda.manual_seed(args.rank)

    dist.init_process_group(backend=args.backend)
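
A minimal sketch of calling init() in a single-process run. The argument names simply mirror the fields the function reads, the environment variables are what torch.distributed's default env:// rendezvous expects, and validate_args may require further fields, so treat this as an assumption-laden illustration rather than the project's actual launcher:

import os
from argparse import Namespace

# Rendezvous info normally provided by the launcher (e.g. torchrun)
os.environ.setdefault('MASTER_ADDR', '127.0.0.1')
os.environ.setdefault('MASTER_PORT', '29500')
os.environ.setdefault('RANK', '0')
os.environ.setdefault('WORLD_SIZE', '1')

# Hypothetical argument values covering the fields init() touches
args = Namespace(rank=0, local_rank=0, world_size=1, batch_size=32,
                 device='cuda:0', backend='nccl', verbosity='info',
                 output_dir=None, cpu=False, deterministic=True, apex=False)
init(args)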
Code example #2
def save_data(train_res, val_res, fname, output_dir='data'):

    # Normalize the output directory and make sure it exists
    output_dir = output_dir.rstrip('/')
    os.makedirs(output_dir, exist_ok=True)

    header = 'iterations train_time run_time loss'
    if len(train_res[0]) == 5:
        header += ' accuracy'

    def _save(res, name):
        res = np.array(res)
        np.savetxt(output_dir + '/full_' + fname + name,
                   res,
                   header=header,
                   comments='')

        # Downsample if needed: keep the first 50 rows, every 5th row up to
        # row 500, then roughly 100 evenly spaced rows from the rest
        if len(res) > 500:
            idx = np.r_[:50, 50:500:5, 500:len(res):int(len(res) / 100)]
            res = res[idx]

        np.savetxt(output_dir + '/' + fname + name,
                   res,
                   header=header,
                   comments='')

    _save(train_res, '_train.txt')
    _save(val_res, '_val.txt')

    log.info('Data saved to %s/[full_]%s_[train/val].txt', output_dir, fname)
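
For reference, each row of train_res / val_res is [iteration, train_time, run_time, loss] with an optional trailing accuracy (see train() in code example #8), so a toy call might look like the sketch below; the file name 'demo' and the numeric values are made up:

train_res = [[1, 0.10, 0.12, 2.3026, 0.11],
             [2, 0.21, 0.25, 2.1514, 0.19]]
val_res = [[2, 0.21, 0.25, 2.2010, 0.15]]

# Writes data/full_demo_train.txt, data/demo_train.txt and the *_val.txt pair
save_data(train_res, val_res, 'demo', output_dir='data')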
Code example #3
File: gradient_parallel.py  Project: liboyue/ceddl
    def __init__(self, *args, sync_freq=1, fp16_grads=False, **kwargs):
        r"""Init function.

        Args:
            module:
                The module to be wrapped.

            sync_freq:
                Number of steps between communications.

            fp16_grads:
                Whether to use fp16 gradients.

            kwargs:
                Other arguments that torch.nn.parallel.DistributedDataParallel requires.
        """
        log.info('Using %s', self.__class__.__name__)

        # Check the PyTorch version. Note this is a plain string comparison,
        # which mis-orders versions such as '1.10.0'.
        if torch.__version__ < '1.7.0':
            log.FATAL(
                "Please install PyTorch v1.7.0-rc1 to use DistributedGradientParallel!"
            )

        if dist.get_backend() != 'nccl':
            log.warn('DistributedGradientParallel performs better with NCCL')

        super().__init__(*args, **kwargs)

        self.sync_freq = sync_freq
        self.fp16_grads = fp16_grads
        self._iter_counter = 0

        if self.fp16_grads:
            log.info('Using fp16 gradients')
            if dist.get_backend() != 'nccl':
                self._register_comm_hook(state=None,
                                         hook=fp16_compress_hook_gloo)
            else:
                self._register_comm_hook(state=None,
                                         hook=fp16_compress_hook_nccl)

        def _forward_pre_hook(*args, **kwargs):
            if self.training:
                # Update iteration counter
                self._iter_counter += 1
                self._iter_counter %= self.sync_freq

                log.debug('_forward_pre_hook called on %s, _iter_counter %d',
                          self.device, self._iter_counter)

                if self._iter_counter == 0:
                    self.require_backward_grad_sync = True
                else:
                    self.require_backward_grad_sync = False

        self.register_forward_pre_hook(_forward_pre_hook)
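
A hedged usage sketch, assuming the process group is already initialized (for example via init() in code example #1). With sync_freq=4, the forward pre-hook leaves require_backward_grad_sync False for three steps and enables it on every fourth, so gradients are only all-reduced once per four local steps; the model and local_rank here are purely illustrative:

import torch
from ceddl.parallel import DistributedGradientParallel

local_rank = 0  # in practice this comes from the launcher
model = torch.nn.Linear(10, 1).cuda(local_rank)
model = DistributedGradientParallel(model,
                                    sync_freq=4,      # all-reduce every 4th step
                                    fp16_grads=True,  # compress gradients to fp16
                                    device_ids=[local_rank])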
Code example #4
def wrap_model(model, args, optimizer=None):

    if args.apex:
        log.info('Apex wrapping')
        from apex import amp
        model, optimizer = amp.initialize(
            model,
            optimizer,
            opt_level=args.opt_level,
            keep_batchnorm_fp32=args.keep_batchnorm_fp32,
            loss_scale=args.loss_scale)

    if args.ddp == 'DistributedDataParallel':
        model = ceddl.parallel.DistributedDataParallel(model, **args.__dict__)

    elif args.ddp == 'SparseDistributedDataParallel':
        model = ceddl.parallel.SparseDistributedDataParallel(
            model, **args.__dict__)

    elif args.ddp == 'NetworkDataParallel':
        model = ceddl.parallel.NetworkDataParallel(model, **args.__dict__)

    else:
        if args.cpu:
            device_ids = None
        else:
            device_ids = [args.rank]

        if args.ddp == 'pytorch':
            model = torch.nn.parallel.DistributedDataParallel(
                model, device_ids=device_ids)
        elif args.ddp == 'DistributedGradientParallel':
            model = ceddl.parallel.DistributedGradientParallel(
                model,
                fp16_grads=args.fp16_grads,
                sync_freq=args.sync_freq,
                device_ids=device_ids)

    return model, optimizer
Code example #5
File: mnist.py  Project: liboyue/ceddl
                                            args.batch_size)

model = Net().to(args.device)

args.use_ref_module = True
args.tracking_loader = train_loader
args.criterion = nn.CrossEntropyLoss()
model, optimizer = utils.wrap_model(model, args)
optimizer = ceddl.optim.NetworkSVRG(model, lr=args.lr)

# optimizer = torch.optim.SGD(model.parameters(),
#                             lr=args.lr,
#                             weight_decay=args.weight_decay,
#                             momentum=args.momentum)

log.info('Model is on %s', next(model.parameters()).device)

classes = [int(i) for i in range(10)]
criterion = nn.CrossEntropyLoss()

train_res, val_res = utils.train(model,
                                 criterion,
                                 optimizer,
                                 train_loader,
                                 args,
                                 exp_name='mnist',
                                 val_loader=val_loader,
                                 classes=classes)

log.info('Process %d exited', args.rank)
Code example #6
utils.init(args)
# log.set_allowed_ranks(list(range(args.world_size)))

# Local data
local_n_samples = args.n_samples // args.world_size  # assumes n_samples divides evenly
X = torch.zeros(local_n_samples, args.dim)
Y = torch.zeros(local_n_samples, 1)

if args.rank == 0:
    # Set the random seed so the data is the same in every run
    np.random.seed(0)

    # Generate random data at node 0,
    X_total, Y_total, w_0, loss_0 = generate_data(args.n_samples, args.dim,
                                                  args.condition_number,
                                                  args.noise_variance)
    log.info('Data generated, the best loss is %.7f', loss_0)

    # then scatter equal shards to all other processes
    dist.scatter(X, list(X_total.split(local_n_samples)))
    dist.scatter(Y, list(Y_total.split(local_n_samples)))

else:
    dist.scatter(X)
    dist.scatter(Y)


dataset = torch.utils.data.TensorDataset(X, Y)
train_loader = torch.utils.data.DataLoader(dataset, batch_size=args.batch_size, shuffle=False)
val_loader = torch.utils.data.DataLoader(dataset, batch_size=local_n_samples, shuffle=False)

Code example #7
def validate(model, val_loader, criterion, classes=None, device=None):
    log.info('Validating model')

    losses = []

    if classes is not None:
        confusion_matrix = np.zeros((len(classes), len(classes)))

    # Validation only needs forward passes, so disable autograd
    with torch.no_grad():
        for data, target in val_loader:
            target = target.to(device=device, non_blocking=True)
            data = data.to(device=device, non_blocking=True)
            output = model(data)

            loss = criterion(output, target)
            losses.append(loss.cpu().item())

            if classes is not None:
                _, predicted = torch.max(output, 1)
                for t, p in zip(target, predicted):
                    confusion_matrix[t.item()][p.item()] += 1

    # Each worker contributes its local mean divided by world_size,
    # so summing with all_reduce yields the average loss across workers
    loss = np.mean(losses) / dist.get_world_size()
    loss = torch.Tensor([loss]).to(device)

    dist.all_reduce(loss)

    loss = loss.cpu().item()

    if classes is not None:
        confusion_matrix = torch.from_numpy(confusion_matrix).to(device)
        dist.all_reduce(confusion_matrix)
        confusion_matrix = confusion_matrix.cpu().numpy()

    log.debug('Synchronized with other workers')

    if classes is not None:

        acc = np.diag(confusion_matrix).sum() / confusion_matrix.sum()

        # Normalize each row (true class) so the diagonal holds per-class accuracy
        confusion_matrix /= confusion_matrix.sum(axis=1, keepdims=True)
        # log.debug(confusion_matrix)

        max_len = str(max([len(str(c)) for c in classes]))
        if len(classes) > 10:
            log.info('Accuracy of first 5 classes')
            for i in range(5):
                log.info('%-' + max_len + 's: %8.5f%%', classes[i],
                         100 * confusion_matrix[i, i])

            log.info('Accuracy of last 5 classes')
            for i in range(len(classes) - 5, len(classes)):
                log.info('%-' + max_len + 's: %8.5f%%', classes[i],
                         100 * confusion_matrix[i, i])
        else:
            log.info('Accuracy of each class')
            for i in range(len(classes)):
                log.info('%-' + max_len + 's: %8.5f%%', classes[i],
                         100 * confusion_matrix[i, i])

        log.info('Validation loss %.5f, accuracy %.5f%%', loss, acc * 100)

        return loss, acc

    else:
        log.info('Validation loss %.5f', loss)
        return [loss]
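
A short usage sketch mirroring _val() in train() (code example #8); names such as model, val_loader, nn and args.device come from the surrounding examples and the caller is responsible for switching to eval mode:

model.eval()
results = validate(model, val_loader, nn.CrossEntropyLoss(),
                   classes=list(range(10)), device=args.device)
model.train()
# With classes given, results is (loss, acc); otherwise it is [loss]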
Code example #8
def train(model,
          criterion,
          optimizer,
          train_loader,
          args,
          val_loader=None,
          exp_name=None,
          classes=None,
          scheduler=None):
    if args.apex:
        from apex import amp

    def _val():
        if args.val_interval is not None:
            val_start = time()
            model.eval()
            val_res.append([
                i, train_time, run_time, *validate(model,
                                                   val_loader,
                                                   criterion,
                                                   classes=classes,
                                                   device=args.device)
            ])
            model.train()
            val_end = time()
            return val_end - val_start
        else:
            return 0

    def _save():
        if args.rank == 0:
            fname = get_fname(args, exp_name=exp_name)
            save_data(train_res, val_res, fname, output_dir=args.output_dir)
            log.debug('Data saved to %s', fname)

    def _eta():
        _time = train_time / i * (total_batches - i)
        if args.val_interval is not None:
            _time += val_time / (i // args.val_interval + 1) * (
                (total_batches - i) // args.val_interval + 1)

        h = _time / 3600
        if h > 1:
            return "%.2fh" % h

        m = _time / 60
        if m > 1:
            return "%.2fm" % m

        return "%.2fs" % _time

    total_batches = len(train_loader) * args.epochs
    train_res = []
    val_res = []
    running_loss = []
    running_acc = []
    i = 0
    val_time = run_time = train_time = 0
    train_start = time()
    printed = False

    val_time += _val()

    log.info('Training started')
    model.train()
    optimizer.zero_grad()

    if args.gradient_accumulation and args.ddp == 'pytorch':
        model.require_backward_grad_sync = False

    for epoch in range(1, args.epochs + 1):

        for _, (data, target) in enumerate(train_loader):

            i += 1

            target = target.to(device=args.device, non_blocking=True)
            data = data.to(device=args.device, non_blocking=True)

            if args.ddp == 'pytorch':
                if args.gradient_accumulation and i % args.sync_freq != 0:
                    model.require_backward_grad_sync = False
                else:
                    model.require_backward_grad_sync = True

            # ==== Step begin ====
            output = model(data)
            loss = criterion(output, target)

            if args.gradient_accumulation:
                loss /= args.sync_freq

            if args.apex:
                with amp.scale_loss(loss, optimizer) as scaled_loss:
                    scaled_loss.backward()
            else:
                loss.backward()

            if args.ddp == 'DistributedGradientParallel' and not printed:
                for n, p in model.named_parameters():
                    log.warn(
                        '%s.grad.dtype = %s, max difference between original grad and half precision grad is %f',
                        n, p.grad.dtype,
                        (p.grad - p.grad.clone().half()).abs().max())
                printed = True

            if not args.gradient_accumulation or i % args.sync_freq == 0:
                log.debug('[%d/%d, %5d/%d] optimizer step', epoch, args.epochs,
                          i, total_batches)
                optimizer.step()
                optimizer.zero_grad()

            loss = loss.item()
            running_loss.append(loss)
            if classes is not None:
                acc = accuracy(output, target).item()
                running_acc.append(acc)

            # ==== Step done ====

            current_time = time()
            run_time = current_time - train_start
            train_time = run_time - val_time

            if args.gradient_accumulation:
                tmp_res = [i, train_time, run_time, loss * args.sync_freq]
            else:
                tmp_res = [i, train_time, run_time, loss]
            if classes is not None:
                tmp_res += [acc]

            train_res.append(tmp_res)

            if i % args.disp_interval == 0:
                log.info(
                    '[%d/%d, %5d/%d] local running loss %.5f, local running acc %.5f%%, average train time %.4f seconds per batch, eta %s',
                    epoch, args.epochs, i, total_batches,
                    np.mean(running_loss),
                    np.mean(running_acc) * 100, train_time / i, _eta())
                running_loss = []
                running_acc = []

            if args.val_interval is not None and i % args.val_interval == 0:
                val_time += _val()
                # Update saved data after every validation
                _save()

            # end for

        current_time = time()
        run_time = current_time - train_start
        train_time = run_time - val_time

        log.info(
            'Training epoch %d ends, total run time %.4f seconds, average train time %.4f seconds per batch',
            epoch, run_time, train_time / i)

        if scheduler is not None:
            log.debug('schedule.step() called')
            scheduler.step()

    if args.val_interval is not None and i % args.val_interval != 0:
        val_time += _val()

    current_time = time()
    run_time = current_time - train_start
    train_time = run_time - val_time

    _save()

    if classes is not None:
        best_acc = max([x[-1] for x in val_res])
        log.info(
            'Training finished, %d epochs, final val loss %.5f, final val acc %.5f%%, best val acc %.5f%%',
            epoch, val_res[-1][-2], val_res[-1][-1] * 100, best_acc * 100)
    else:
        log.info('Training finished, %d epochs, final val loss %.5f', epoch,
                 val_res[-1][-1])

    return train_res, val_res
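
A tiny standalone check of why train() divides the loss by args.sync_freq when gradient accumulation is on: summing gradients of loss/K over K micro-batches reproduces the gradient of the mean loss over the same samples. The data and model here are made up for illustration:

import torch

w = torch.tensor(1.0, requires_grad=True)
data = [torch.tensor(2.0), torch.tensor(4.0)]
K = len(data)

# Accumulate gradients of loss_k / K over K micro-batches
for x in data:
    ((w * x - 1.0) ** 2 / K).backward()
accumulated = w.grad.clone()

# Gradient of the mean loss computed in a single pass
w.grad = None
(sum((w * x - 1.0) ** 2 for x in data) / K).backward()
assert torch.allclose(accumulated, w.grad)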
Code example #9
    def __init__(self,
                 module,
                 world_local_size=None,
                 node_rank=None,
                 local_rank=None,
                 sync_freq=1,
                 num_streams=1,
                 premultiplier=None,
                 **kwargs):
        r"""Init function.

        Args:
            module:
                The module to be wrapped.

            sync_freq:
                Number of steps between communications.

            num_streams:
                Number of CUDA streams to use for communication.

            premultiplier:
                The multiplier applied before communication. If not None,
                parameters are multiplied by the premultiplier before
                communication and divided by it again afterwards.
        """

        super().__init__()

        log.info('Using %s', self.__class__.__name__)

        self.module = module
        self.device = next(self.module.parameters()).device

        # Assume torch.dist is initialized
        self.rank = dist.get_rank()
        self.world_size = dist.get_world_size()
        self.local_rank = local_rank if local_rank is not None else self.rank
        self.node_rank = node_rank if node_rank is not None else 0
        self.world_local_size = world_local_size if world_local_size is not None else 1

        # When the counter reaches sync_freq, perform communication and reset it
        self.premultiplier = premultiplier
        self.sync_freq = sync_freq
        self._iter_counter = 0

        self.param_info = [{
            'numel': param.numel(),
            'shape': param.shape
        } for param in self.parameters()]
        self.flat_parameters, self.flat_indexes = self.flatten_tensors(
            list(self.parameters()))
        self.assign_unflattened_tensors(self.parameters(),
                                        self.flat_parameters)

        log.debug('Broadcasting init params')
        for param in self.flat_parameters:
            dist.broadcast(param, 0)
        log.debug('Broadcasting init params done')

        self.num_streams = num_streams
        if self.device.type == 'cuda':
            self.streams = [
                torch.cuda.Stream() for _ in range(self.num_streams)
            ]
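
flatten_tensors and assign_unflattened_tensors are ceddl helpers; the sketch below only illustrates the general idea (pack parameters into one contiguous buffer so a single broadcast or all_reduce moves them, then view slices of that buffer back in the original shapes) and is not the ceddl implementation:

import torch

def flatten(tensors):
    # One contiguous buffer holding all parameters
    return torch.cat([t.detach().reshape(-1) for t in tensors])

def unflatten(flat, tensors):
    # Views into the flat buffer with the original shapes
    views, offset = [], 0
    for t in tensors:
        n = t.numel()
        views.append(flat[offset:offset + n].view_as(t))
        offset += n
    return views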
Code example #10
    def __init__(self, world_size, **kwargs):
        cycle = int(np.log(world_size - 1) / np.log(2))
        super().__init__(world_size, cycle=cycle, **kwargs)

        log.info('Exponential graph initialized with cycle %d', self.cycle)
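
The expression is floor(log2(world_size - 1)), so the presumed intent is that the time-varying exponential graph cycles through that many power-of-two hop distances before repeating; a quick check of the expression itself:

import numpy as np

world_size = 8
cycle = int(np.log(world_size - 1) / np.log(2))  # floor(log2(7)) = 2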
Code example #11
File: complete_graph.py  Project: liboyue/ceddl
    def __init__(self, world_size, **kwargs):
        super().__init__(world_size, cycle=1, **kwargs)

        log.info('Complete graph initialized')
Code example #12
File: complete_graph.py  Project: liboyue/ceddl
import numpy as np

from .communication_graph import CommunicationGraph
from ceddl import log


class CompleteGraph(CommunicationGraph):
    def __init__(self, world_size, **kwargs):
        super().__init__(world_size, cycle=1, **kwargs)

        log.info('Complete graph initialized')

    def update_graph(self):
        # Don't update
        pass

    def generate_adjacency_matrix(self, t):
        return np.ones((self.world_size, self.world_size))


if __name__ == '__main__':
    a = CompleteGraph(5, n_peers=1)
    log.info(str(a.adjacency_matrix))
    log.info(a.cycle)