    def restore_model(self):
        # Restore from file on the root rank only; the restored state is then
        # broadcast to the other ranks below.
        if self._rank == 0:
            state = trainer_eventID.restore_model(self)
        else:
            state = None


        if self.args.distributed_mode == "horovod":

            # Broadcast the global step:
            self._global_step = hvd.broadcast_object(self._global_step, root_rank = 0)

            # Broadcast the state of the model:
            hvd.broadcast_parameters(self._net.state_dict(), root_rank = 0)

            # Broadcast the optimizer state:
            hvd.broadcast_optimizer_state(self._opt, root_rank = 0)

            # Horovod doesn't actually move the optimizer onto a GPU:
            if self.args.compute_mode == "GPU":
                for state in self._opt.state.values():
                    for k, v in state.items():
                        if torch.is_tensor(v):
                            state[k] = v.cuda()



            # Broadcast the LR Schedule state and apply it on every rank:
            state_dict = hvd.broadcast_object(self.lr_scheduler.state_dict(), root_rank = 0)
            self.lr_scheduler.load_state_dict(state_dict)

        elif self.args.distributed_mode == "DDP":

            if self.args.compute_mode == "GPU":
                self._net.cuda()

            self._net = torch.nn.parallel.DistributedDataParallel(self._net)
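            # DistributedDataParallel broadcasts the module's parameters from rank 0
            # at construction time, so only the global step and the LR schedule state
            # still need an explicit broadcast below.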



            self._global_step = MPI.COMM_WORLD.bcast(self._global_step, root=0)
            state_dict = MPI.COMM_WORLD.bcast(self.lr_scheduler.state_dict(), root=0)
            self.lr_scheduler.load_state_dict(state_dict)
Example #2
def train_loop_per_worker(config):
    import torch
    import horovod.torch as hvd

    hvd.init()
    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
    mode = config["mode"]
    net = Net(mode).to(device)
    optimizer = torch.optim.SGD(
        net.parameters(),
        lr=config["lr"],
    )
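    # Horovod: wrap the optimizer so that gradients are averaged across workers.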
    optimizer = hvd.DistributedOptimizer(optimizer)

    num_steps = 5
    print(hvd.size())
    np.random.seed(1 + hvd.rank())
    torch.manual_seed(1234)
    # To ensure consistent initialization, broadcast the model and optimizer state from rank 0 to all workers.
    hvd.broadcast_parameters(net.state_dict(), root_rank=0)
    hvd.broadcast_optimizer_state(optimizer, root_rank=0)

    start = time.time()
    x_max = config["x_max"]
    for step in range(1, num_steps + 1):
        features = torch.Tensor(np.random.rand(1) * 2 * x_max -
                                x_max).to(device)
        if mode == "square":
            labels = sq(features)
        else:
            labels = qu(features)
        optimizer.zero_grad()
        outputs = net(features)
        loss = torch.nn.MSELoss()(outputs, labels)
        loss.backward()

        optimizer.step()
        time.sleep(0.1)
        train.report(loss=loss.item())
    total = time.time() - start
    print(f"Took {total:0.3f} s. Avg: {total / num_steps:0.3f} s.")
Example #3
def train(args):
    train_loader, val_loader, train_sampler, _ = prepare_data(args)
    assert (cuda.is_available() and cuda.device_count() > 0)
    net = models.__dict__[args.arch]()
    net = net.cuda()
    optimizer = optim.SGD(net.parameters(),
                          lr=args.lr,
                          momentum=args.momentum,
                          weight_decay=args.weight_decay)
    optimizer = hvd.DistributedOptimizer(
        optimizer, named_parameters=net.named_parameters())
    lr_scheduler = optim.lr_scheduler.MultiStepLR(
        optimizer,
        milestones=[int(args.epoch * 0.5),
                    int(args.epoch * 0.75)],
        gamma=0.1)

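    # Horovod: broadcast the initial parameters and optimizer state from rank 0 so that
    # every worker starts from the same weights.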
    hvd.broadcast_parameters(net.state_dict(), root_rank=0)
    hvd.broadcast_optimizer_state(optimizer, root_rank=0)

    best_acc = 0
    checkpoint = {}
    for epochid in range(args.epoch):
        # Horovod: set epoch to sampler for shuffling.
        train_sampler.set_epoch(epochid)
        print("==> Training Epoch %d, Learning Rate %.4f" %
              (epochid, lr_scheduler.get_lr()[0]))
        train_epoch(net, train_loader, optimizer, args)
        print('==> Validating ')
        acc = validate(net, val_loader, args)
        lr_scheduler.step()
        if acc > best_acc:
            best_acc = acc
            checkpoint = net.state_dict()

    fname = args.arch + '_' + str(best_acc) + '.pth.tar'
    os.makedirs(args.outdir, exist_ok=True)
    fname = os.path.join(args.outdir, fname)
    if hvd.rank() == 0:
        torch.save(checkpoint, fname)
    print('Best Accuracy: ', best_acc)
Example #4
    def setup(self, trainer: "pl.Trainer") -> None:
        self.model_to_device()

        super().setup(trainer)

        self._exit_stack = ExitStack()
        self._exit_stack.__enter__()
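        # The exit stack keeps the optimizer.skip_synchronize() contexts entered at the
        # end of this method open for the whole run; gradients are then synchronized
        # explicitly after backward().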

        if not self.lightning_module.trainer.training:
            # no need to setup optimizers
            return

        def _unpack_lightning_optimizer(opt):
            return opt._optimizer if isinstance(opt, LightningOptimizer) else opt

        optimizers = self.optimizers
        optimizers = [_unpack_lightning_optimizer(opt) for opt in optimizers]

        # Horovod: scale the learning rate by the number of workers to account for
        # increased total batch size
        for optimizer in optimizers:
            for param_group in optimizer.param_groups:
                param_group["lr"] *= self.world_size

        # Horovod: adjust base LR used by schedulers to match scaled optimizer initial LR
        lr_schedulers = self.lightning_module.trainer.lr_schedulers
        for scheduler in lr_schedulers:
            scheduler = scheduler["scheduler"]
            if isinstance(scheduler, _LRScheduler):
                scheduler.base_lrs = [lr * self.world_size for lr in scheduler.base_lrs]

        # Horovod: broadcast parameters & optimizer state to ensure consistent initialization
        hvd.broadcast_parameters(self.lightning_module.state_dict(), root_rank=0)
        for optimizer in optimizers:
            hvd.broadcast_optimizer_state(optimizer, root_rank=0)

        self.optimizers = self._wrap_optimizers(optimizers)
        for optimizer in self.optimizers:
            # Synchronization will be performed explicitly following backward()
            self._exit_stack.enter_context(optimizer.skip_synchronize())
Example #5
    def __init__(self, wf=None, sampler=None, optimizer=None,
                 scheduler=None, output=None, rank=0):
        """Distributed QMC solver

        Args:
            wf (qmctorch.WaveFunction, optional): wave function. Defaults to None.
            sampler (qmctorch.sampler, optional): Sampler. Defaults to None.
            optimizer (torch.optim, optional): optimizer. Defaults to None.
            scheduler (torch.optim, optional): scheduler. Defaults to None.
            output (str, optional): hdf5 filename. Defaults to None.
            rank (int, optional): rank of the process. Defaults to 0.
        """

        SolverOrbital.__init__(self, wf, sampler,
                               optimizer, scheduler, output, rank)

        hvd.broadcast_optimizer_state(self.opt, root_rank=0)
        self.opt = hvd.DistributedOptimizer(
            self.opt, named_parameters=self.wf.named_parameters())

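        # Split the walkers evenly across the Horovod processes.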
        self.sampler.nwalkers //= hvd.size()
        self.sampler.walkers.nwalkers //= hvd.size()
Example #6
def _broadcast_state(graph,
                     optimizer,
                     step=None,
                     epoch=None,
                     graph_ema=None):
    # broadcast parameters state.
    hvd.broadcast_parameters(graph.state_dict(), root_rank=0)
    if graph_ema is not None:
        hvd.broadcast_parameters(graph_ema.state_dict(), root_rank=0)
    # broadcast optimizer state.
    if optimizer is not None:
        hvd.broadcast_optimizer_state(optimizer, root_rank=0)
    # broadcast resume_from_epoch from rank 0 (which will have
    # checkpoints) to other ranks.
    if step is not None:
        step = hvd.broadcast(torch.tensor(step),
                             root_rank=0,
                             name='resume_from_step').item()
    if epoch is not None:
        epoch = hvd.broadcast(torch.tensor(epoch),
                              root_rank=0,
                              name='resume_from_epoch').item()
    return step, epoch
Example #7
    def pre_dispatch(self):

        def _unpack_lightning_optimizer(opt):
            return opt._optimizer if isinstance(opt, LightningOptimizer) else opt

        optimizers = self.lightning_module.trainer.optimizers
        optimizers = [_unpack_lightning_optimizer(opt) for opt in optimizers]

        # Horovod: scale the learning rate by the number of workers to account for
        # increased total batch size
        for optimizer in optimizers:
            for param_group in optimizer.param_groups:
                param_group["lr"] *= self.world_size

        # Horovod: adjust base LR used by schedulers to match scaled optimizer initial LR
        lr_schedulers = self.lightning_module.trainer.lr_schedulers
        for scheduler in lr_schedulers:
            scheduler = scheduler["scheduler"]
            if isinstance(scheduler, _LRScheduler):
                scheduler.base_lrs = [lr * self.world_size for lr in scheduler.base_lrs]

        # Horovod: broadcast parameters & optimizer state to ensure consistent initialization
        hvd.broadcast_parameters(self.lightning_module.state_dict(), root_rank=0)
        for optimizer in optimizers:
            hvd.broadcast_optimizer_state(optimizer, root_rank=0)

        def _filter_named_parameters(model, optimizer):
            opt_params = set([p for group in optimizer.param_groups for p in group.get("params", [])])
            return [(name, p) for name, p in model.named_parameters() if p in opt_params]

        # Horovod: wrap optimizers to perform gradient aggregation via allreduce
        optimizers = [
            hvd.DistributedOptimizer(
                optimizer, named_parameters=_filter_named_parameters(self.lightning_module, optimizer)
            ) for optimizer in optimizers
        ]
        self.lightning_module.trainer.accelerator.optimizers = optimizers
Example #8
def get_model(args):
    model = models.RNNModel(args.model, args.ntokens, args.emsize, args.nhid, 
                            args.nlayers, args.dropout, args.tied).to(args.device)

    # Horovod: scale learning rate by the number of GPUs.
    args.base_lr = args.base_lr * hvd.size()
    optimizer = optim.SGD(model.parameters(), lr=args.base_lr,
                          momentum=args.momentum, weight_decay=args.wd)

    if args.kfac_update_freq > 0:
        preconditioner = kfac.KFAC(
                model, lr=args.base_lr, stat_decay=args.stat_decay,
                damping=args.damping, kl_clip=args.kl_clip,
                TCov=args.kfac_cov_update_freq,
                TInv=args.kfac_update_freq,
                diag_blocks=args.diag_blocks,
                diag_warmup=args.diag_warmup)
    else:
        preconditioner = None

    optimizer = hvd.DistributedOptimizer(
            optimizer, named_parameters=model.named_parameters(),
            compression=hvd.Compression.none, op=hvd.Average)

    # Horovod: broadcast parameters & optimizer state.
    hvd.broadcast_parameters(model.state_dict(), root_rank=0)
    hvd.broadcast_optimizer_state(optimizer, root_rank=0)

    lrs = create_lr_schedule(hvd.size(), args.warmup_epochs, args.lr_decay, alpha=0.25)
    lr_schedules = [LambdaLR(optimizer, lrs)]
    if preconditioner is not None:
        lr_schedules.append(LambdaLR(preconditioner, lrs))

    criterion = nn.NLLLoss()

    return model, optimizer, preconditioner, lr_schedules, lrs, criterion
Example #9
    def pre_dispatch(self):

        if not self.lightning_module.trainer.training:
            # no need to setup optimizers
            return

        def _unpack_lightning_optimizer(opt):
            return opt._optimizer if isinstance(opt,
                                                LightningOptimizer) else opt

        optimizers = self.lightning_module.trainer.optimizers
        optimizers = [_unpack_lightning_optimizer(opt) for opt in optimizers]

        # Horovod: scale the learning rate by the number of workers to account for
        # increased total batch size
        for optimizer in optimizers:
            for param_group in optimizer.param_groups:
                param_group["lr"] *= self.world_size

        # Horovod: adjust base LR used by schedulers to match scaled optimizer initial LR
        lr_schedulers = self.lightning_module.trainer.lr_schedulers
        for scheduler in lr_schedulers:
            scheduler = scheduler["scheduler"]
            if isinstance(scheduler, _LRScheduler):
                scheduler.base_lrs = [
                    lr * self.world_size for lr in scheduler.base_lrs
                ]

        # Horovod: broadcast parameters & optimizer state to ensure consistent initialization
        hvd.broadcast_parameters(self.lightning_module.state_dict(),
                                 root_rank=0)
        for optimizer in optimizers:
            hvd.broadcast_optimizer_state(optimizer, root_rank=0)

        self.lightning_module.trainer.accelerator.optimizers = self._wrap_optimizers(
            optimizers)
Example #10
    def _init_horovod_setting(self):
        """Init horovod setting."""
        self.is_chief = True
        # SR
        hvd.broadcast_parameters(self.model.netSR.state_dict(), root_rank=0)
        hvd.broadcast_optimizer_state(self.model.optimizer_SR, root_rank=0)
        # G F
        hvd.broadcast_parameters(self.model.netG.state_dict(), root_rank=0)
        hvd.broadcast_parameters(self.model.netF.state_dict(), root_rank=0)
        hvd.broadcast_optimizer_state(self.model.optimizer_G, root_rank=0)
        # D_X
        hvd.broadcast_parameters(self.model.netD_X.state_dict(), root_rank=0)
        hvd.broadcast_optimizer_state(self.model.optimizer_D_X, root_rank=0)
        # D_Y
        hvd.broadcast_parameters(self.model.netD_Y.state_dict(), root_rank=0)
        hvd.broadcast_optimizer_state(self.model.optimizer_D_Y, root_rank=0)
        if hvd.rank() != 0:
            self.is_chief = False
        else:
            self.is_chief = True
Example #11
def train(args, train_dataset, model, tokenizer):
    """ Train the model """
    if args.local_rank in [-1, 0]:
        tb_writer = SummaryWriter()

    args.train_batch_size = args.per_gpu_train_batch_size * max(1, args.n_gpu)
    # Use a DistributedSampler (num_replicas=hvd.size()) when running with more than one Horovod worker
    train_sampler = RandomSampler(
        train_dataset) if hvd.size() == 1 else DistributedSampler(
            train_dataset, num_replicas=hvd.size(), rank=hvd.rank())
    train_dataloader = DataLoader(train_dataset,
                                  sampler=train_sampler,
                                  batch_size=args.train_batch_size)

    if args.max_steps > 0:
        t_total = args.max_steps
        args.num_train_epochs = args.max_steps // (
            len(train_dataloader) // args.gradient_accumulation_steps) + 1
        logger.info("max_steps given: t_total = %d", t_total)
    else:
        t_total = len(train_dataloader) // args.gradient_accumulation_steps * \
            args.num_train_epochs
        logger.info("len(train_dataloader) = %d", len(train_dataloader))
        logger.info("args.gradient_accumulation_steps = %d",
                    args.gradient_accumulation_steps)
        logger.info("args.num_train_epochs = %d", args.num_train_epochs)
        logger.info("t_total = %d", t_total)

    # Prepare optimizer and schedule (linear warmup and decay)

    no_decay = ['bias', 'LayerNorm.weight']
    optimizer_grouped_parameters = [{
        'params': [
            p for n, p in model.named_parameters()
            if not any(nd in n for nd in no_decay)
        ],
        'weight_decay':
        args.weight_decay
    }, {
        'params': [
            p for n, p in model.named_parameters()
            if any(nd in n for nd in no_decay)
        ],
        'weight_decay':
        0.0
    }]
    # Multiply the learning rate by hvd.size()
    optimizer = AdamW(optimizer_grouped_parameters,
                      lr=args.learning_rate * hvd.size(),
                      eps=args.adam_epsilon)
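    # Horovod: wrap the optimizer so gradients are averaged across workers via allreduce.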
    optimizer = hvd.DistributedOptimizer(
        optimizer, named_parameters=model.named_parameters())

    scheduler = WarmupLinearSchedule(optimizer,
                                     warmup_steps=args.warmup_steps,
                                     t_total=t_total)
    if args.fp16:
        try:
            from apex import amp
        except ImportError:
            raise ImportError(
                "Please install apex from https://www.github.com/nvidia/apex to use fp16 training."
            )
        model, optimizer = amp.initialize(model,
                                          optimizer,
                                          opt_level=args.fp16_opt_level)

    # multi-gpu training (should be after apex fp16 initialization)
    if args.n_gpu > 1:
        model = torch.nn.DataParallel(model)

    # Distributed training (should be after apex fp16 initialization)
    if args.local_rank != -1:
        model = torch.nn.parallel.DistributedDataParallel(
            model,
            device_ids=[args.local_rank],
            output_device=args.local_rank,
            find_unused_parameters=True)

    logger.info("hvd.rank() = %d", hvd.rank())

    #In case of using GPU, Horovod: (optional) compression algorithm.
    #compression = hvd.Compression.fp16 if args.fp16_allreduce else hvd.Compression.none

    # In case of using GPU, Horovod: wrap optimizer with DistributedOptimizer.
    #optimizer = hvd.DistributedOptimizer(optimizer, named_parameters=model.named_parameters())

    # Horovod: broadcast parameters & optimizer state.
    hvd.broadcast_parameters(model.state_dict(), root_rank=0)
    hvd.broadcast_optimizer_state(optimizer, root_rank=0)

    # Train!
    logger.info("***** Running training -- hvd.rank() = %d *****", hvd.rank())
    logger.info("  Num examples = %d", len(train_dataset))
    logger.info("  Num Epochs = %d", args.num_train_epochs)
    logger.info("  Instantaneous batch size per GPU = %d",
                args.per_gpu_train_batch_size)
    logger.info(
        "  Total train batch size (w. parallel, distributed & accumulation) = %d",
        args.train_batch_size * args.gradient_accumulation_steps *
        (torch.distributed.get_world_size() if args.local_rank != -1 else 1))
    logger.info("  Gradient Accumulation steps = %d",
                args.gradient_accumulation_steps)
    logger.info("  Total optimization steps = %d", t_total)
    logger.info("  len(train_dataloader) = %d (hvd.rank() = %d)",
                len(train_dataloader), hvd.rank())
    logger.info("  args.gradient_accumulation_steps = %d",
                args.gradient_accumulation_steps)
    logger.info("  args.num_train_epochs = %d", args.num_train_epochs)

    global_step = 0
    tr_loss, logging_loss = 0.0, 0.0
    model.zero_grad()
    train_iterator = trange(int(args.num_train_epochs),
                            desc="Epoch",
                            disable=args.local_rank not in [-1, 0])
    set_seed(
        args)  # Added here for reproducibility (even between python 2 and 3)
    for _ in train_iterator:
        epoch_iterator = tqdm(train_dataloader,
                              desc="Iteration",
                              disable=args.local_rank not in [-1, 0])
        for step, batch in enumerate(epoch_iterator):
            #if step % hvd.size() == 0:
            model.train()
            batch = tuple(t.to(args.device) for t in batch)
            inputs = {
                'input_ids':
                batch[0],
                'attention_mask':
                batch[1],
                'token_type_ids':
                batch[2] if args.model_type in ['bert', 'xlnet'] else
                None,  # XLM doesn't use segment_ids
                'labels':
                batch[3]
            }
            outputs = model(**inputs)
            loss = outputs[
                0]  # model outputs are always tuple in pytorch-transformers (see doc)

            if args.n_gpu > 1:
                loss = loss.mean(
                )  # mean() to average on multi-gpu parallel training
            if args.gradient_accumulation_steps > 1:
                loss = loss / args.gradient_accumulation_steps

            if args.fp16:
                with amp.scale_loss(loss, optimizer) as scaled_loss:
                    scaled_loss.backward()
                torch.nn.utils.clip_grad_norm_(amp.master_params(optimizer),
                                               args.max_grad_norm)
            else:
                loss.backward()
                torch.nn.utils.clip_grad_norm_(model.parameters(),
                                               args.max_grad_norm)

            tr_loss += loss.item()
            if (step + 1) % args.gradient_accumulation_steps == 0:
                optimizer.step()
                scheduler.step()  # Update learning rate schedule
                model.zero_grad()
                global_step += 1

                if args.local_rank in [
                        -1, 0
                ] and args.logging_steps > 0 and global_step % args.logging_steps == 0:
                    # Log metrics
                    if args.local_rank == -1 and args.evaluate_during_training:  # Only evaluate when single GPU otherwise metrics may not average well
                        results = evaluate(args, model, tokenizer)
                        for key, value in results.items():
                            tb_writer.add_scalar('eval_{}'.format(key), value,
                                                 global_step)
                    tb_writer.add_scalar('lr',
                                         scheduler.get_lr()[0], global_step)
                    tb_writer.add_scalar('loss', (tr_loss - logging_loss) /
                                         args.logging_steps, global_step)
                    logging_loss = tr_loss

                if args.local_rank in [
                        -1, 0
                ] and args.save_steps > 0 and global_step % args.save_steps == 0:
                    # Save model checkpoint
                    if hvd.rank() == 0:
                        output_dir = os.path.join(
                            args.output_dir,
                            'checkpoint-{}'.format(global_step))
                        if not os.path.exists(output_dir):
                            os.makedirs(output_dir)
                        model_to_save = model.module if hasattr(
                            model, 'module'
                        ) else model  # Take care of distributed/parallel training
                        model_to_save.save_pretrained(output_dir)
                        torch.save(
                            args, os.path.join(output_dir,
                                               'training_args.bin'))
                        logger.info("Saving model checkpoint to %s",
                                    output_dir)
            # Stop early once max_steps has been reached (the step count is per Horovod worker)
            if (args.max_steps > 0 and global_step > args.max_steps):
                epoch_iterator.close()
                break
        if (args.max_steps > 0 and global_step > args.max_steps):
            train_iterator.close()
            break

    if args.local_rank in [-1, 0]:
        tb_writer.close()

    return global_step, tr_loss / global_step
Example #12
    def train(serialized_model, optimizer_cls, model_opt_state_serialized,
              train_rows, val_rows, avg_row_size):
        from petastorm import make_batch_reader
        from petastorm.pytorch import DataLoader
        import torch
        import horovod.torch as hvd

        # Deserializing objects
        model_opt_state = torch.load(model_opt_state_serialized)
        model = deserialize(serialized_model)

        if loss_fns_pre_train:
            loss_fns = loss_fns_pre_train
        if loss_constructors:
            local_vars = locals()
            loss_fns = [
                loss_constructor(**local_vars)
                for loss_constructor in loss_constructors
            ]

        # Horovod: initialize library.
        hvd.init()

        if not user_shuffle_buffer_size:
            shuffle_buffer_size = \
                calculate_shuffle_buffer_size(hvd, avg_row_size, train_rows / hvd.size())
        else:
            shuffle_buffer_size = user_shuffle_buffer_size

        cuda_available = torch.cuda.is_available()
        if cuda_available:
            # Horovod: pin GPU to local rank.
            torch.cuda.set_device(hvd.local_rank())
            # Move model to GPU.
            model.cuda()

        # Optimizer object needs to be re-instantiated. Internally, it uses memory addresses of
        # objects as their identity and therefore it cannot be serialized and then
        # deserialized. The deserialized optimizer object stores the names of the parameters
        # with their old memory addresses but in reality those are different than the
        # reconstructed deserialized object and that creates problem.
        # Learning rate is a required parameters in SGD optimizer. It will be overridden with
        # load_state_dict.
        optimizer = optimizer_cls(model.parameters(), lr=1)
        optimizer_state = model_opt_state['optimizer']

        if last_checkpoint_state is not None:
            model.load_state_dict(last_checkpoint_state['model'])
            optimizer.load_state_dict(last_checkpoint_state['optimizer'])
        else:
            # scale the learning rate with the number of horovod workers
            for i in range(len(optimizer_state['param_groups'])):
                optimizer_state['param_groups'][i]['lr'] = \
                    optimizer_state['param_groups'][i]['lr'] * hvd.size()

            optimizer.load_state_dict(optimizer_state)

        # Horovod: broadcast parameters & optimizer state.
        hvd.broadcast_parameters(model.state_dict(), root_rank=0)

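        # Give every parameter a zero gradient and take one optimizer step so that the
        # optimizer state (e.g. momentum buffers) exists before it is broadcast below.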
        for group in optimizer.param_groups:
            for p in group['params']:
                if id(p) not in optimizer.state_dict()['state']:
                    p.grad = p.data.new(p.size()).zero_()
        optimizer.step()
        hvd.broadcast_optimizer_state(optimizer, root_rank=0)

        dist_optimizer_args = dict(optimizer=optimizer,
                                   named_parameters=model.named_parameters())
        if gradient_compression:
            # Pass the compression arg only if it is specified by the user.
            dist_optimizer_args['compression'] = gradient_compression
        # Horovod: wrap optimizer with DistributedOptimizer.
        optimizer = hvd.DistributedOptimizer(**dist_optimizer_args)

        # get_optimizer_with_unscaled_lr (used in save_checkpoint below) takes the current
        # optimizer and constructs a new optimizer with the same state, except that the
        # learning rate is scaled back down by the number of horovod workers. This is
        # important for retraining the model: the user may retrain with a different number
        # of workers, so we need the raw learning rate to adjust to the new worker count.

        schema_fields = feature_columns + label_columns
        if sample_weight_col:
            schema_fields.append(sample_weight_col)

        if train_steps_per_epoch is None:
            steps_per_epoch = int(
                math.ceil(float(train_rows) / batch_size / hvd.size()))
        else:
            steps_per_epoch = train_steps_per_epoch

        with remote_store.get_local_output_dir() as run_output_dir:
            logs_dir = os.path.join(run_output_dir, remote_store.logs_subdir)
            log_writer = SummaryWriter(logs_dir) if hvd.rank() == 0 else None
            ckpt_file = os.path.join(run_output_dir,
                                     remote_store.checkpoint_filename)

            def save_checkpoint():
                model.cpu()
                optimizer_with_scaled_down_lr = \
                    get_optimizer_with_unscaled_lr(hvd, optimizer, optimizer_cls, model)
                state = {
                    'model': model.state_dict(),
                    'optimizer': optimizer_with_scaled_down_lr.state_dict(),
                }
                torch.save(state, ckpt_file)
                if cuda_available:
                    model.cuda()

            # Petastorm: read data from the store with the correct shard for this rank
            # setting num_epochs=None will cause an infinite iterator
            # and enables ranks to perform training and validation with
            # unequal number of samples
            with make_batch_reader(
                    remote_store.train_data_path,
                    num_epochs=None,
                    cur_shard=hvd.rank(),
                    shard_count=hvd.size(),
                    hdfs_driver=PETASTORM_HDFS_DRIVER,
                    schema_fields=schema_fields) as train_reader:
                with make_batch_reader(remote_store.val_data_path,
                                       num_epochs=None,
                                       cur_shard=hvd.rank(),
                                       shard_count=hvd.size(),
                                       hdfs_driver=PETASTORM_HDFS_DRIVER,
                                       schema_fields=schema_fields) \
                        if should_validate else empty_batch_reader() as val_reader:

                    train_loader = DataLoader(
                        train_reader,
                        batch_size=batch_size,
                        shuffling_queue_capacity=shuffle_buffer_size)
                    train_loader_iter = iter(train_loader)

                    def prepare_batch(row):
                        inputs = [
                            prepare_np_data(row[col].float(), col,
                                            metadata).reshape(shape) for col,
                            shape in zip(feature_columns, input_shapes)
                        ]
                        labels = [
                            prepare_np_data(row[col].float(), col, metadata)
                            for col in label_columns
                        ]

                        sample_weights = row.get(sample_weight_col, None)
                        if cuda_available:
                            inputs = [input.cuda() for input in inputs]
                            labels = [label.cuda() for label in labels]
                        return inputs, labels, sample_weights

                    def transform_outputs(outputs, labels):
                        if type(outputs) != tuple and type(outputs) != list:
                            outputs = [outputs]

                        # reshape labels to match the output shape of the model
                        if hasattr(outputs[0], 'shape'):
                            labels = [
                                label.reshape(output.shape)
                                if output.shape.numel() == label.shape.numel()
                                else label
                                for label, output in zip(labels, outputs)
                            ]
                        return outputs, labels

                    def aggregate_metrics(stage, epoch, loss,
                                          metric_value_groups):
                        all_metric_groups_values = get_metric_avgs(
                            metric_value_groups)
                        if remote_store.saving_runs:
                            write_metrics_summary(stage, epoch, loss,
                                                  all_metric_groups_values,
                                                  log_writer)
                        return {
                            loss.name: loss.avg.item(),
                            'all_metrics': all_metric_groups_values
                        }

                    def loss_fn(outputs, labels, sample_weights):
                        loss = calculate_loss(outputs, labels, loss_weights,
                                              loss_fns, sample_weights)
                        return loss

                    def print_metrics(batch_idx, loss, metric_value_groups,
                                      phase):
                        if user_verbose > 0 and hvd.rank() == 0 and \
                                batch_idx % METRIC_PRINT_FREQUENCY == 0:
                            print(
                                "epoch:\t{epoch}\tstep\t{batch_idx}:\t{metrics}"
                                .format(epoch=epoch,
                                        batch_idx=batch_idx,
                                        metrics=aggregate_metrics(
                                            phase, epoch, loss,
                                            metric_value_groups)))

                    def _train(epoch):
                        model.train()
                        train_loss = metric_cls('loss', hvd)
                        metric_value_groups = construct_metric_value_holders(
                            metric_cls, metric_fn_groups, label_columns, hvd)

                        # iterate on one epoch
                        for batch_idx in range(steps_per_epoch):
                            row = next(train_loader_iter)
                            inputs, labels, sample_weights = prepare_batch(row)
                            outputs, loss = train_minibatch(
                                model, optimizer, transform_outputs, loss_fn,
                                inputs, labels, sample_weights)
                            update_metrics(metric_value_groups, outputs,
                                           labels)
                            train_loss.update(loss)
                            print_metrics(batch_idx, train_loss,
                                          metric_value_groups, 'train')

                        return aggregate_metrics('train', epoch, train_loss,
                                                 metric_value_groups)

                    if should_validate:
                        val_loader = DataLoader(val_reader,
                                                batch_size=batch_size)
                        val_loader_iter = iter(val_loader)
                        if validation_steps_per_epoch is None:
                            validation_steps = int(
                                math.ceil(
                                    float(val_rows) / batch_size / hvd.size()))
                        else:
                            validation_steps = validation_steps_per_epoch

                        def _validate(epoch):
                            model.eval()
                            val_loss = metric_cls('loss', hvd)

                            metric_value_groups = construct_metric_value_holders(
                                metric_cls, metric_fn_groups, label_columns,
                                hvd)

                            # iterate on one epoch
                            for batch_idx in range(validation_steps):
                                row = next(val_loader_iter)
                                inputs, labels, sample_weights = prepare_batch(
                                    row)

                                outputs = model(*inputs)
                                outputs, labels = transform_outputs(
                                    outputs, labels)

                                loss = calculate_loss(outputs, labels,
                                                      loss_weights, loss_fns,
                                                      sample_weights)
                                val_loss.update(loss)
                                update_metrics(metric_value_groups, outputs,
                                               labels)
                                print_metrics(batch_idx, val_loss,
                                              metric_value_groups, 'val')
                            return aggregate_metrics('val', epoch, val_loss,
                                                     metric_value_groups)

                    history = []
                    for epoch in range(epochs):
                        epoch_metrics = {
                            'epoch': epoch,
                            'train': _train(epoch)
                        }

                        if should_validate:
                            epoch_metrics['validation'] = _validate(epoch)

                        if user_verbose > 0:
                            print(epoch_metrics)

                        history.append(epoch_metrics)
                        if hvd.rank() == 0:
                            # Save model after every epoch
                            save_checkpoint()
                            if remote_store.saving_runs:
                                remote_store.sync(run_output_dir)

            if hvd.rank() == 0:
                best_checkpoint = torch.load(ckpt_file)
                serialized_checkpoint = io.BytesIO()
                torch.save(best_checkpoint, serialized_checkpoint)
                serialized_checkpoint.seek(0)
                return history, serialized_checkpoint
Example #13
    def test_broadcast_state(self):
        hvd.init()

        N, D_in, H, D_out = 64, 100, 10, 10
        x = torch.autograd.Variable(torch.randn(N, D_in), requires_grad=True)
        y = torch.autograd.Variable(torch.randn(N, D_out), requires_grad=False)

        def create_model(create_opt):
            model = torch.nn.Sequential(
                torch.nn.Linear(D_in, H),
                torch.nn.ReLU(),
                torch.nn.Linear(H, D_out),
            )

            optimizer = create_opt(model)
            optimizer = hvd.DistributedOptimizer(
                optimizer, named_parameters=model.named_parameters())

            return model, optimizer

        def get_model_param_values(model):
            params = sorted(model.state_dict().items())
            return [(k, v.clone()) for k, v in params]

        def get_optimizer_param_values(optimizer):
            results = []
            state_dict = optimizer.state_dict()
            for group in state_dict['param_groups']:
                for param_id in group['params']:
                    params = sorted(state_dict['state'][param_id].items())
                    for k, v in params:
                        results.append(
                            (k, v.clone() if torch.is_tensor(v) else v))
            return results

        opt_params = dict(lr=0.2, momentum=0.9, weight_decay=0.1, centered=True)

        def new_optimizer(cls):
            p = {
                k: v for k, v in opt_params.items()
                if k in inspect.getargspec(cls.__init__).args
            }
            return lambda m: cls(m.parameters(), **p)

        # L-BFGS is currently unsupported, as are sparse tensors, which are
        # required by SparseAdam optimizer
        optimizers = [
            (subclass.__name__, new_optimizer(subclass))
            for subclass in torch.optim.Optimizer.__subclasses__()
            if subclass.__module__.startswith('torch.optim') and
               subclass != torch.optim.LBFGS and
               subclass != torch.optim.SparseAdam
        ]
        optimizers.sort()

        for opt_name, create_opt in optimizers:
            model, optimizer = create_model(create_opt)
            y_pred = model(x)
            loss = F.mse_loss(y_pred, y, size_average=False)
            optimizer.zero_grad()
            loss.backward()
            optimizer.step()

            model_param_values = get_model_param_values(model)
            for name, model_param_value in model_param_values:
                hvd.broadcast_(model_param_value, root_rank=0)

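            # Broadcast each optimizer state value from rank 0; non-tensor values are
            # temporarily wrapped in tensors for the broadcast and converted back afterwards.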
            opt_param_values_updated = []
            opt_param_values = get_optimizer_param_values(optimizer)
            for name, opt_param_value in opt_param_values:
                is_tensor = torch.is_tensor(opt_param_value)
                if not is_tensor:
                    t = type(opt_param_value)
                    opt_param_value = torch.Tensor([opt_param_value])
                hvd.broadcast_(opt_param_value, root_rank=0)
                if not is_tensor:
                    opt_param_value = t(opt_param_value.numpy()[0])
                opt_param_values_updated.append((name, opt_param_value))
            opt_param_values = opt_param_values_updated

            if hvd.rank() == 0:
                state = {
                    'model': model.state_dict(),
                    'optimizer': optimizer.state_dict(),
                }
                _, fname = tempfile.mkstemp('.pt')
                torch.save(state, fname)

            model, optimizer = create_model(create_opt)
            if hvd.rank() == 0:
                checkpoint = torch.load(fname)
                model.load_state_dict(checkpoint['model'])
                optimizer.load_state_dict(checkpoint['optimizer'])
                os.remove(fname)

            hvd.broadcast_parameters(model.state_dict(), root_rank=0)
            model_param_value_after = get_model_param_values(model)
            for before, after in zip(model_param_values,
                                     model_param_value_after):
                name, model_param_value = before
                name_after, model_param_value_after = after
                self.assertEqual(name, name_after)
                self.assertEqual(type(model_param_value),
                                 type(model_param_value_after))
                self.assertTrue(
                    (model_param_value == model_param_value_after).all())

            hvd.broadcast_optimizer_state(optimizer, root_rank=0)
            self.assertEqual(len(optimizer.state_dict()['state'].values()), 4)

            opt_param_values_after = get_optimizer_param_values(optimizer)
            for before, after in zip(opt_param_values, opt_param_values_after):
                name, opt_param_value = before
                name_after, opt_param_value_after = after
                self.assertEqual(name, name_after)
                self.assertEqual(type(opt_param_value),
                                 type(opt_param_value_after))
                if torch.is_tensor(opt_param_value):
                    self.assertTrue(
                        (opt_param_value == opt_param_value_after).all())
                else:
                    self.assertEqual(opt_param_value, opt_param_value_after)
Example #14
    def __init__(self, minigan_args):
        if (hvd.rank() == 0):
            print('Hello minigan init\n')

        self.minigan_args = minigan_args

        self.minigan_args.output_interimages_dir = \
                self.minigan_args.output_dir + "/inter_%ss" % (self.minigan_args.dataset)

        # Load datasets
        self.load_data()

        if (hvd.rank() == 0):
            print('\n---LOAD DATA DONE---\n')

        # 2D miniGAN
        if (self.minigan_args.dim_mode == 2):
            self.generator = Generator2d(minigan_args)
            self.discriminator = Discriminator2d(minigan_args)
        # 3D miniGAN
        elif (self.minigan_args.dim_mode == 3):
            self.generator = Generator3d(minigan_args)
            self.discriminator = Discriminator3d(minigan_args)
        else:
            raise ValueError("'dim_mode' must be 2 or 3.")

        self.generator = self.generator.float()
        self.discriminator = self.discriminator.float()

        self.generator.apply(weight_init)
        self.discriminator.apply(weight_init)

        if (hvd.rank() == 0):
            print(self.generator)
            print(self.discriminator)

        # Metrics and Loss
        self.loss_fn = nn.BCELoss()

        self.real_label = 1.0 - self.minigan_args.soft_label
        self.fake_label = self.minigan_args.soft_label

        self.gen_optim = optim.Adam( \
                self.generator.parameters(), \
                lr=self.minigan_args.gen_lr, \
                betas=(self.minigan_args.gen_beta1, .999))
        self.disc_optim = optim.Adam( \
                self.discriminator.parameters(), \
                lr=self.minigan_args.disc_lr, \
                betas=(self.minigan_args.disc_beta1, .999))

        if self.minigan_args.cuda:
            self.generator.cuda()
            self.discriminator.cuda()

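        # Broadcast the initial generator/discriminator weights and optimizer states from rank 0.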
        hvd.broadcast_parameters(self.generator.state_dict(), root_rank=0)
        hvd.broadcast_parameters(self.discriminator.state_dict(), root_rank=0)
        hvd.broadcast_optimizer_state(self.gen_optim, root_rank=0)
        hvd.broadcast_optimizer_state(self.disc_optim, root_rank=0)

        self.compression = \
                hvd.Compression.fp16 if self.minigan_args.fp16_allreduce else hvd.Compression.none

        self.gen_optim = hvd.DistributedOptimizer(self.gen_optim, \
                named_parameters=self.generator.named_parameters(), \
                compression = self.compression)

        self.disc_optim = hvd.DistributedOptimizer(self.disc_optim, \
                named_parameters=self.discriminator.named_parameters(), \
                compression = self.compression)

        #        self.tb_disc.set_model(self.discriminator)
        #        self.tb_comb_gan.set_model(self.combined_gan)

        if (self.minigan_args.profile):
            self.profile_layers(self.minigan_args.prof_steps, self.prof_images)

        if (hvd.rank() == 0):
            print('\n---NETWORK SETUP DONE---\n')
Example #15
def main(_):
  FLAGS.torch_only = True
  
  melt.init()
  #fit = melt.get_fit()

  FLAGS.eval_batch_size = 512 * FLAGS.valid_multiplier
  FLAGS.eval_batch_size = 512

  model_name = FLAGS.model
  model = getattr(base, model_name)() 

  model = model.cuda()

  loss_fn = nn.BCEWithLogitsLoss()

  td = text_dataset.Dataset()

  train_files = gezi.list_files('../input/train/*')
  train_ds = get_dataset(train_files, td)
  
  #kwargs = {'num_workers': 4, 'pin_memory': True, 'collate_fn': lele.DictPadCollate()}
  #num_workers = int(16 / hvd.size())  
  num_workers = 1  # with 1 worker it takes about 2 min to start; might just set to 0 to be safe
  num_workers = 0  # setting 0 is much slower than 1, and startup alone takes over a minute
  # pin_memory makes little difference: a slight speedup on a single GPU; with multiple GPUs,
  # num_workers is the main factor for resource usage and the job may even fail to start
  # with multiple GPUs, pin_memory = False is actually faster
  #kwargs = {'num_workers': num_workers, 'pin_memory': True, 'collate_fn': lele.DictPadCollate()}  
  kwargs = {'num_workers': 1, 'pin_memory': False, 'collate_fn': lele.DictPadCollate()}

  train_sampler = train_ds
  train_sampler = torch.utils.data.distributed.DistributedSampler(
    train_ds, num_replicas=hvd.size(), rank=hvd.rank())
  
  train_dl = DataLoader(train_ds, FLAGS.batch_size, sampler=train_sampler, **kwargs)
  
  valid_files = gezi.list_files('../input/valid/*')
  valid_ds = get_dataset(valid_files, td)

  # support shuffle=False from version 1.2
  valid_sampler = torch.utils.data.distributed.DistributedSampler(
      valid_ds, num_replicas=hvd.size(), rank=hvd.rank(), shuffle=False)

  # valid_sampler2 = torch.utils.data.distributed.DistributedSampler(
  #     valid_ds, num_replicas=hvd.size(), rank=hvd.rank(), shuffle=False)
  
  valid_dl = DataLoader(valid_ds, FLAGS.eval_batch_size, sampler=valid_sampler, **kwargs)
  
  #valid_dl2 = DataLoader(valid_ds, FLAGS.batch_size, sampler=valid_sampler2, **kwargs)


  optimizer = optim.Adamax(model.parameters(), lr=0.1)
  #optimizer = optim.SGD(model.parameters(), lr=0.1)
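  # Horovod: broadcast parameters & optimizer state from rank 0 so all workers start identically.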
  hvd.broadcast_parameters(model.state_dict(), root_rank=0)
  hvd.broadcast_optimizer_state(optimizer, root_rank=0)

  optimizer = hvd.DistributedOptimizer(optimizer,
                                       named_parameters=model.named_parameters())

  for epoch in range(2):
    train(epoch, model, loss_fn, train_dl, optimizer)
    test(model, loss_fn, valid_dl)
Example #16
def main():
    parser = argparse.ArgumentParser()
    parser.add_argument("-config")
    parser.add_argument("-data", help="data yaml file")
    parser.add_argument("-data_path",
                        default='',
                        type=str,
                        help="path of data files")
    parser.add_argument("-seed_model", help="the seed nerual network model")
    parser.add_argument("-exp_dir", help="the directory to save the outputs")
    parser.add_argument("-transform",
                        help="feature transformation matrix or mvn statistics")
    parser.add_argument("-criterion",
                        type=str,
                        choices=["mmi", "mpfe", "smbr"],
                        help="set the sequence training criterion")
    parser.add_argument(
        "-trans_model",
        help="the HMM transition model, used for lattice generation")
    parser.add_argument(
        "-prior_path",
        help="the prior for decoder, usually named as final.occs in kaldi setup"
    )
    parser.add_argument(
        "-den_dir",
        help="the decoding graph directory to find HCLG and words.txt files")
    parser.add_argument("-lr", type=float, help="set the learning rate")
    parser.add_argument("-ce_ratio",
                        default=0.1,
                        type=float,
                        help="the ratio for ce regularization")
    parser.add_argument("-momentum",
                        default=0,
                        type=float,
                        help="set the momentum")
    parser.add_argument("-batch_size",
                        default=32,
                        type=int,
                        help="Override the batch size in the config")
    parser.add_argument("-data_loader_threads",
                        default=0,
                        type=int,
                        help="number of workers for data loading")
    parser.add_argument("-max_grad_norm",
                        default=5,
                        type=float,
                        help="max_grad_norm for gradient clipping")
    parser.add_argument("-sweep_size",
                        default=100,
                        type=float,
                        help="process n hours of data per sweep (default: 100)")
    parser.add_argument("-num_epochs",
                        default=1,
                        type=int,
                        help="number of training epochs (default:1)")
    parser.add_argument('-print_freq',
                        default=10,
                        type=int,
                        metavar='N',
                        help='print frequency (default: 10)')
    parser.add_argument('-save_freq',
                        default=1000,
                        type=int,
                        metavar='N',
                        help='save model frequency (default: 1000)')

    args = parser.parse_args()

    with open(args.config) as f:
        config = yaml.safe_load(f)

    config['data_path'] = args.data_path

    config["sweep_size"] = args.sweep_size

    print("pytorch version:{}".format(th.__version__))

    with open(args.data) as f:
        data = yaml.safe_load(f)
        config["source_paths"] = [j for i, j in data['clean_source'].items()]

    print("Experiment starts with config {}".format(
        json.dumps(config, sort_keys=True, indent=4)))

    # Initialize Horovod
    hvd.init()

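    # Horovod: pin this process to a single GPU based on its local rank.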
    th.cuda.set_device(hvd.local_rank())

    print("Run experiments with world size {}".format(hvd.size()))

    dataset = SpeechDataset(config)
    transform = None
    if args.transform is not None and os.path.isfile(args.transform):
        with open(args.transform, 'rb') as f:
            transform = pickle.load(f)
            dataset.transform = transform

    train_dataloader = SeqDataloader(dataset,
                                     batch_size=args.batch_size,
                                     num_workers=args.data_loader_threads,
                                     distributed=True,
                                     test_only=False)

    print("Data loader set up successfully!")
    print("Number of minibatches: {}".format(len(train_dataloader)))

    if not os.path.isdir(args.exp_dir):
        os.makedirs(args.exp_dir)

    # create model
    model_config = config["model_config"]
    lstm = LSTMStack(model_config["feat_dim"], model_config["hidden_size"],
                     model_config["num_layers"], model_config["dropout"], True)
    model = NnetAM(lstm, model_config["hidden_size"] * 2,
                   model_config["label_size"])

    model.cuda()

    # setup the optimizer
    optimizer = th.optim.SGD(model.parameters(),
                             lr=args.lr,
                             momentum=args.momentum)

    # Broadcast parameters and optimizer state from rank 0 to all other processes.
    hvd.broadcast_parameters(model.state_dict(), root_rank=0)
    hvd.broadcast_optimizer_state(optimizer, root_rank=0)

    # Add Horovod Distributed Optimizer
    optimizer = hvd.DistributedOptimizer(
        optimizer, named_parameters=model.named_parameters())

    if os.path.isfile(args.seed_model):
        checkpoint = th.load(args.seed_model)
        state_dict = checkpoint['model']
        from collections import OrderedDict
        new_state_dict = OrderedDict()
        for k, v in state_dict.items():
            name = k[7:]  # remove 'module.' of dataparallel
            new_state_dict[name] = v
        model.load_state_dict(new_state_dict)
        print("=> loaded checkpoint '{}' ".format(args.seed_model))
    else:
        sys.stderr.write('ERROR: The model file %s does not exist!\n' %
                         (args.seed_model))
        sys.exit(0)

    HCLG = args.den_dir + "/HCLG.fst"
    words_txt = args.den_dir + "/words.txt"
    silence_phones = args.den_dir + "/phones/silence.csl"

    if not os.path.isfile(HCLG):
        sys.stderr.write('ERROR: The HCLG file %s does not exist!\n' % (HCLG))
        sys.exit(0)

    if not os.path.isfile(words_txt):
        sys.stderr.write('ERROR: The words.txt file %s does not exist!\n' %
                         (words_txt))
        sys.exit(0)

    if not os.path.isfile(silence_phones):
        sys.stderr.write('ERROR: The silence phone file %s does not exist!\n' %
                         (silence_phones))
        sys.exit(0)
    with open(silence_phones) as f:
        silence_ids = [int(i) for i in f.readline().strip().split(':')]
        f.close()

    if os.path.isfile(args.trans_model):
        trans_model = kaldi_hmm.TransitionModel()
        with kaldi_util.io.xopen(args.trans_model) as ki:
            trans_model.read(ki.stream(), ki.binary)
    else:
        sys.stderr.write('ERROR: The trans_model %s does not exist!\n' %
                         (args.trans_model))
        sys.exit(0)

    # now we can setup the decoder
    decoder_opts = LatticeFasterDecoderOptions()
    decoder_opts.beam = config["decoder_config"]["beam"]
    decoder_opts.lattice_beam = config["decoder_config"]["lattice_beam"]
    decoder_opts.max_active = config["decoder_config"]["max_active"]
    acoustic_scale = config["decoder_config"]["acoustic_scale"]
    decoder_opts.determinize_lattice = False  # To produce a raw state-level lattice instead of a compact lattice
    asr_decoder = MappedLatticeFasterRecognizer.from_files(
        args.trans_model,
        HCLG,
        words_txt,
        acoustic_scale=acoustic_scale,
        decoder_opts=decoder_opts)

    prior = kaldi_util.io.read_matrix(args.prior_path).numpy()
    log_prior = th.tensor(np.log(prior[0] / np.sum(prior[0])), dtype=th.float)

    model.train()
    for epoch in range(args.num_epochs):

        run_train_epoch(model, optimizer, log_prior.cuda(), train_dataloader,
                        epoch, asr_decoder, trans_model, silence_ids, args)

        # save model
        if hvd.rank() == 0:
            checkpoint = {}
            checkpoint['model'] = model.state_dict()
            checkpoint['optimizer'] = optimizer.state_dict()
            checkpoint['epoch'] = epoch
            output_file = args.exp_dir + '/model.se.' + str(epoch) + '.tar'
            th.save(checkpoint, output_file)
Example #17
def main(args):
    logfilename = 'convergence_cifar10_{}_kfac{}_gpu{}_bs{}_{}_lr{}_sr{}_wp{}.log'.format(
        args.model, args.kfac_update_freq, hvd.size(), args.batch_size,
        args.kfac_name, args.base_lr, args.sparse_ratio, args.warmup_epochs)
    if hvd.rank() == 0:
        wandb.init(project='kfac',
                   entity='hkust-distributedml',
                   name=logfilename,
                   config=args)

    logfile = './logs/' + logfilename
    #logfile = './logs/sparse_cifar10_{}_kfac{}_gpu{}_bs{}.log'.format(args.model, args.kfac_update_freq, hvd.size(), args.batch_size)
    #logfile = './logs/cifar10_{}_kfac{}_gpu{}_bs{}.log'.format(args.model, args.kfac_update_freq, hvd.size(), args.batch_size)
    hdlr = logging.FileHandler(logfile)
    hdlr.setFormatter(formatter)
    logger.addHandler(hdlr)
    logger.info(args)

    torch.manual_seed(args.seed)
    verbose = True if hvd.rank() == 0 else False
    args.verbose = 1 if hvd.rank() == 0 else 0

    if args.cuda:
        torch.cuda.set_device(hvd.local_rank())
        torch.cuda.manual_seed(args.seed)

    torch.backends.cudnn.benchmark = True

    args.log_dir = os.path.join(
        args.log_dir, "cifar10_{}_kfac{}_gpu_{}_{}".format(
            args.model, args.kfac_update_freq, hvd.size(),
            datetime.datetime.now().strftime('%Y-%m-%d_%H-%M-%S')))
    #os.makedirs(args.log_dir, exist_ok=True)
    #log_writer = SummaryWriter(args.log_dir) if verbose else None
    log_writer = None

    # Horovod: limit # of CPU threads to be used per worker.
    torch.set_num_threads(1)

    kwargs = {'num_workers': 8, 'pin_memory': True} if args.cuda else {}

    transform_train = transforms.Compose([
        transforms.RandomCrop(32, padding=4),
        transforms.RandomHorizontalFlip(),
        transforms.ToTensor(),
        transforms.Normalize((0.4914, 0.4822, 0.4465),
                             (0.2023, 0.1994, 0.2010))
    ])
    transform_test = transforms.Compose([
        transforms.ToTensor(),
        transforms.Normalize((0.4914, 0.4822, 0.4465),
                             (0.2023, 0.1994, 0.2010))
    ])

    download = True if hvd.local_rank() == 0 else False
    if not download: hvd.allreduce(torch.tensor(1), name="barrier")
    train_dataset = datasets.CIFAR10(root=args.dir,
                                     train=True,
                                     download=download,
                                     transform=transform_train)
    test_dataset = datasets.CIFAR10(root=args.dir,
                                    train=False,
                                    download=download,
                                    transform=transform_test)
    if download: hvd.allreduce(torch.tensor(1), name="barrier")

    # Horovod: use DistributedSampler to partition the training data.
    train_sampler = torch.utils.data.distributed.DistributedSampler(
        train_dataset, num_replicas=hvd.size(), rank=hvd.rank())
    #train_loader = torch.utils.data.DataLoader(train_dataset,
    train_loader = MultiEpochsDataLoader(train_dataset,
                                         batch_size=args.batch_size *
                                         args.batches_per_allreduce,
                                         sampler=train_sampler,
                                         **kwargs)

    # Horovod: use DistributedSampler to partition the test data.
    test_sampler = torch.utils.data.distributed.DistributedSampler(
        test_dataset, num_replicas=hvd.size(), rank=hvd.rank())
    test_loader = torch.utils.data.DataLoader(test_dataset,
                                              batch_size=args.test_batch_size,
                                              sampler=test_sampler,
                                              **kwargs)

    if args.model.lower() == "resnet20":
        model = resnet.resnet20()
    elif args.model.lower() == "resnet32":
        model = resnet.resnet32()
    elif args.model.lower() == "resnet44":
        model = resnet.resnet44()
    elif args.model.lower() == "resnet56":
        model = resnet.resnet56()
    elif args.model.lower() == "resnet110":
        model = resnet.resnet110()
    else:
        raise ValueError("Unsupported model: {}".format(args.model))

    if args.cuda:
        model.cuda()

    #if verbose:
    #    summary(model, (3, 32, 32))

    criterion = nn.CrossEntropyLoss()
    args.base_lr = args.base_lr * hvd.size()
    use_kfac = True if args.kfac_update_freq > 0 else False

    optimizer = optim.SGD(model.parameters(),
                          lr=args.base_lr,
                          momentum=args.momentum,
                          weight_decay=args.weight_decay)

    if use_kfac:
        KFAC = kfac.get_kfac_module(args.kfac_name)
        preconditioner = KFAC(
            model,
            lr=args.base_lr,
            factor_decay=args.stat_decay,
            damping=args.damping,
            kl_clip=args.kl_clip,
            fac_update_freq=args.kfac_cov_update_freq,
            kfac_update_freq=args.kfac_update_freq,
            diag_blocks=args.diag_blocks,
            diag_warmup=args.diag_warmup,
            distribute_layer_factors=args.distribute_layer_factors,
            sparse_ratio=args.sparse_ratio)
        kfac_param_scheduler = kfac.KFACParamScheduler(
            preconditioner,
            damping_alpha=args.damping_alpha,
            damping_schedule=args.damping_schedule,
            update_freq_alpha=args.kfac_update_freq_alpha,
            update_freq_schedule=args.kfac_update_freq_schedule)

    # KFAC guarantees grads are equal across ranks before opt.step() is called,
    # so if we do not use KFAC we need to wrap the optimizer with Horovod
    compression = hvd.Compression.fp16 if args.fp16_allreduce else hvd.Compression.none
    optimizer = hvd.DistributedOptimizer(
        optimizer,
        named_parameters=model.named_parameters(),
        compression=compression,
        op=hvd.Average,
        backward_passes_per_step=args.batches_per_allreduce)

    hvd.broadcast_optimizer_state(optimizer, root_rank=0)
    hvd.broadcast_parameters(model.state_dict(), root_rank=0)

    lrs = create_lr_schedule(hvd.size(), args.warmup_epochs, args.lr_decay)
    lr_scheduler = [LambdaLR(optimizer, lrs)]
    if use_kfac:
        lr_scheduler.append(LambdaLR(preconditioner, lrs))

    def train(epoch):
        model.train()
        train_sampler.set_epoch(epoch)
        train_loss = Metric('train_loss')
        train_accuracy = Metric('train_accuracy')

        if STEP_FIRST:
            for scheduler in lr_scheduler:
                scheduler.step()
            if use_kfac:
                kfac_param_scheduler.step(epoch)

    #    with tqdm(total=len(train_loader),
    #              desc='Epoch {:3d}/{:3d}'.format(epoch + 1, args.epochs),
    #              disable=not verbose) as t:
        display = 20
        avg_time = 0.0
        io_time = 0.0
        if True:
            for batch_idx, (data, target) in enumerate(train_loader):
                stime = time.time()
                if args.cuda:
                    data, target = data.cuda(non_blocking=True), target.cuda(
                        non_blocking=True)
                io_time += time.time() - stime
                optimizer.zero_grad()

                for i in range(0, len(data), args.batch_size):
                    data_batch = data[i:i + args.batch_size]
                    target_batch = target[i:i + args.batch_size]
                    output = model(data_batch)

                    loss = criterion(output, target_batch)
                    with torch.no_grad():
                        train_loss.update(loss)
                        train_accuracy.update(accuracy(output, target_batch))
                    loss.div_(math.ceil(float(len(data)) / args.batch_size))
                    loss.backward()

                optimizer.synchronize()
                if use_kfac:
                    preconditioner.step(epoch=epoch)
                with optimizer.skip_synchronize():
                    optimizer.step()

                #t.set_postfix_str("loss: {:.4f}, acc: {:.2f}%".format(
                #train_loss.avg.item(), 100*train_accuracy.avg.item()))
                #t.update(1)
                avg_time += (time.time() - stime)
                if batch_idx > 0 and batch_idx % display == 0:
                    if args.verbose:
                        logger.info(
                            "[%d][%d] train loss: %.4f, acc: %.3f, time: %.3f [io: %.3f], speed: %.3f images/s"
                            % (epoch, batch_idx, train_loss.avg.item(), 100 *
                               train_accuracy.avg.item(), avg_time / display,
                               io_time / display, args.batch_size /
                               (avg_time / display)))
                        avg_time = 0.0
                        io_time = 0.0
                if hvd.rank() == 0:
                    wandb.log({"loss": loss, "epoch": epoch})
            if args.verbose:
                logger.info("[%d] epoch train loss: %.4f, acc: %.3f" %
                            (epoch, train_loss.avg.item(),
                             100 * train_accuracy.avg.item()))

        if not STEP_FIRST:
            for scheduler in lr_scheduler:
                scheduler.step()
            if use_kfac:
                kfac_param_scheduler.step(epoch)

        if log_writer:
            log_writer.add_scalar('train/loss', train_loss.avg, epoch)
            log_writer.add_scalar('train/accuracy', train_accuracy.avg, epoch)

    def test(epoch):
        model.eval()
        test_loss = Metric('val_loss')
        test_accuracy = Metric('val_accuracy')

        #with tqdm(total=len(test_loader),
        #          bar_format='{l_bar}{bar}|{postfix}',
        #          desc='             '.format(epoch + 1, args.epochs),
        #          disable=not verbose) as t:
        if True:
            with torch.no_grad():
                for i, (data, target) in enumerate(test_loader):
                    if args.cuda:
                        data, target = data.cuda(), target.cuda()
                    output = model(data)
                    test_loss.update(criterion(output, target))
                    test_accuracy.update(accuracy(output, target))
                if args.verbose:
                    logger.info("[%d][0] evaluation loss: %.4f, acc: %.3f" %
                                (epoch, test_loss.avg.item(),
                                 100 * test_accuracy.avg.item()))
                    if hvd.rank() == 0:
                        wandb.log({
                            "val top-1 acc": test_accuracy.avg.item(),
                            "epoch": epoch
                        })

                    #t.update(1)
                    #if i + 1 == len(test_loader):
                    #    t.set_postfix_str("\b\b test_loss: {:.4f}, test_acc: {:.2f}%".format(
                    #            test_loss.avg.item(), 100*test_accuracy.avg.item()),
                    #            refresh=False)

        if log_writer:
            log_writer.add_scalar('test/loss', test_loss.avg, epoch)
            log_writer.add_scalar('test/accuracy', test_accuracy.avg, epoch)

    start = time.time()

    for epoch in range(args.epochs):
        if args.verbose:
            logger.info("[%d] epoch train starts" % (epoch))
        train(epoch)
        test(epoch)

    if verbose:
        logger.info("Training time: %s",
                    str(datetime.timedelta(seconds=time.time() - start)))

    pass
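The sub-batch loop in the training function above accumulates gradients locally and only triggers the allreduce once per fetched batch, via backward_passes_per_step plus an explicit synchronize/skip_synchronize pair. A minimal, hedged sketch of that pattern on a toy linear model with random data (the sub-batch size and batches_per_allreduce value are illustrative):

import math

import torch
import horovod.torch as hvd

hvd.init()
model = torch.nn.Linear(10, 2)
optimizer = torch.optim.SGD(model.parameters(), lr=0.01)

batches_per_allreduce = 4  # number of sub-batches accumulated per allreduce
optimizer = hvd.DistributedOptimizer(
    optimizer,
    named_parameters=model.named_parameters(),
    backward_passes_per_step=batches_per_allreduce)
hvd.broadcast_parameters(model.state_dict(), root_rank=0)

data = torch.randn(32, 10)
target = torch.randint(0, 2, (32,))
sub_batch = 8  # 32 samples split into 4 sub-batches

optimizer.zero_grad()
for i in range(0, len(data), sub_batch):
    output = model(data[i:i + sub_batch])
    loss = torch.nn.functional.cross_entropy(output, target[i:i + sub_batch])
    loss.div_(math.ceil(float(len(data)) / sub_batch))  # average gradients over sub-batches
    loss.backward()

optimizer.synchronize()           # wait for the delayed allreduce explicitly
with optimizer.skip_synchronize():
    optimizer.step()              # apply the already-synchronized gradients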
Example #18
0
    def __init__(
        self,
        env,
        env_params: dict,
        log_dir: str,
        ac_kwargs: dict = {},
        seed: int = 0,
        steps_per_epoch: int = 4000,
        epochs: int = 50,
        gamma: float = 0.99,
        clip_ratio: float = 0.2,
        pi_lr: float = 3e-4,
        vf_lr: float = 1e-3,
        train_iters: int = 100,
        entropy_coeff: float = 1e-2,
        lam: float = 0.97,
        target_kl: float = 0.01,
        save_freq: int = 10,
        load_path=None,
        render_train: bool = False,
        wandb_id: Optional[str] = None,
        **kwargs,
    ):
        self.log_dir = log_dir
        self.render_dir = os.path.join(log_dir, "renders")
        self.ckpt_dir = os.path.join(log_dir, "checkpoints")
        if hvd.rank() == 0:
            os.makedirs(self.log_dir, exist_ok=True)
            os.makedirs(self.render_dir, exist_ok=True)
            os.makedirs(self.ckpt_dir, exist_ok=True)
        self.softlink = os.path.abspath(
            os.path.join(self.ckpt_dir, f"ckpt_latest.pth"))
        self.ac_params_file = os.path.join(log_dir, "ac_params.json")
        hparams = convert_json(locals())
        self.logger = EpochLogger(output_dir=self.log_dir, exp_name=wandb_id)

        if torch.cuda.is_available():
            # Horovod: pin GPU to local rank.
            dev_id = int(torch.cuda.device_count() * hvd.local_rank() /
                         hvd.local_size())
            torch.cuda.set_device(dev_id)
            device = torch.device(f"cuda:{dev_id}")
            torch.cuda.manual_seed(seed)
        else:
            device = torch.device("cpu")

        #         env_params.update({"device": device})
        self.env = env(**env_params)
        self.ac_params = {k: v for k, v in ac_kwargs.items()}
        self.ac_params.update({
            "observation_space": self.env.observation_space,
            "action_space": self.env.action_space,
            "nagents": self.env.nagents,
        })

        self.entropy_coeff = entropy_coeff
        self.entropy_coeff_decay = entropy_coeff / epochs

        # Horovod: limit # of CPU threads to be used per worker.
        torch.set_num_threads(1)

        torch.save(self.ac_params, self.ac_params_file)

        if os.path.isfile(self.softlink):
            self.logger.log("Restarting from latest checkpoint", color="red")
            load_path = self.softlink

        # Random seed
        seed += 10000 * hvd.rank()
        torch.manual_seed(seed)
        np.random.seed(seed)

        self.nagents = self.env.nagents
        self.ac = PPOLidarActorCritic(
            self.env.observation_space,
            self.env.action_space,
            nagents=self.nagents,
            centralized=True,
            **ac_kwargs,
        )

        self.device = device

        self.pi_lr = pi_lr
        self.vf_lr = vf_lr

        self.load_path = load_path
        if load_path is not None:
            self.load_model(load_path)
        else:
            self.pi_optimizer = Adam(trainable_parameters(self.ac.pi),
                                     lr=self.pi_lr,
                                     eps=1e-8)
            self.vf_optimizer = Adam(trainable_parameters(self.ac.v),
                                     lr=self.vf_lr,
                                     eps=1e-8)

        # Sync params across processes
        hvd.broadcast_parameters(self.ac.state_dict(), root_rank=0)
        hvd.broadcast_optimizer_state(self.pi_optimizer, root_rank=0)
        hvd.broadcast_optimizer_state(self.vf_optimizer, root_rank=0)
        self.ac = self.ac.to(device)
        self.move_optimizer_to_device(self.pi_optimizer)
        self.move_optimizer_to_device(self.vf_optimizer)

        if hvd.rank() == 0:
            if wandb_id is None:
                eid = (log_dir.split("/")[-2]
                       if load_path is None else load_path.split("/")[-4])
            else:
                eid = wandb_id
            wandb.init(
                name=eid,
                id=eid,
                project="Social Driving",
                resume=load_path is not None,
            )
            wandb.watch_called = False

            if "self" in hparams:
                del hparams["self"]
            wandb.config.update(hparams, allow_val_change=True)

            wandb.watch(self.ac.pi, log="all")
            wandb.watch(self.ac.v, log="all")

        # Count variables
        var_counts = tuple(
            count_vars(module) for module in [self.ac.pi, self.ac.v])
        self.logger.log(
            "\nNumber of parameters: \t pi: %d, \t v: %d\n" % var_counts,
            color="green",
        )

        # Set up experience buffer
        self.steps_per_epoch = steps_per_epoch
        self.local_steps_per_epoch = int(steps_per_epoch / hvd.size())
        self.buf = CentralizedPPOBuffer(
            self.env.observation_space[0].shape,
            self.env.observation_space[1].shape,
            self.env.action_space.shape,
            self.local_steps_per_epoch,
            gamma,
            lam,
            self.env.nagents,
            device=self.device,
        )

        self.gamma = gamma
        self.clip_ratio = clip_ratio
        self.train_iters = train_iters
        self.target_kl = target_kl
        self.epochs = epochs
        self.save_freq = save_freq
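Two small Horovod conventions used in the constructor above: each worker offsets the random seed by its rank so experience collection is decorrelated, and the global steps-per-epoch budget is divided by the world size. A minimal sketch (the seed and step count are illustrative):

import numpy as np
import torch
import horovod.torch as hvd

hvd.init()

seed = 42
seed += 10000 * hvd.rank()   # decorrelate sampling across workers
torch.manual_seed(seed)
np.random.seed(seed)

steps_per_epoch = 4000
# each worker collects an equal share of the experience
local_steps_per_epoch = int(steps_per_epoch / hvd.size())
print(f"rank {hvd.rank()}: {local_steps_per_epoch} local steps per epoch")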
Example #19
0
    def horovod_train(self, model):
        if torch.cuda.is_available() and self.on_gpu:
            # Horovod: pin GPU to local rank
            assert self.root_gpu == hvd.local_rank()
            torch.cuda.set_device(self.root_gpu)
            model.cuda(self.root_gpu)

        # Only show progress bar from the first worker
        self.progress_bar_refresh_rate = (
            self.progress_bar_refresh_rate if hvd.rank() == 0 else 0)

        # CHOOSE OPTIMIZER
        # allow for lr schedulers as well
        self.optimizers, self.lr_schedulers, self.optimizer_frequencies = self.init_optimizers(
            model)

        # Horovod: scale the learning rate by the number of workers to account for
        # increased total batch size
        for optimizer in self.optimizers:
            for param_group in optimizer.param_groups:
                param_group['lr'] *= hvd.size()

        if self.use_amp:
            # An example
            model, optimizers = model.configure_apex(amp, model,
                                                     self.optimizers,
                                                     self.amp_level)
            self.optimizers = optimizers

        # Horovod: broadcast parameters & optimizer state to ensure consistent initialization
        hvd.broadcast_parameters(model.state_dict(), root_rank=0)
        for optimizer in self.optimizers:
            hvd.broadcast_optimizer_state(optimizer, root_rank=0)

        def filter_named_parameters(model, optimizer):
            opt_params = set([
                p for group in optimizer.param_groups
                for p in group.get('params', [])
            ])
            return [(name, p) for name, p in model.named_parameters()
                    if p in opt_params]

        # Horovod: wrap optimizers to perform gradient aggregation via allreduce
        self.optimizers = [
            hvd.DistributedOptimizer(optimizer,
                                     named_parameters=filter_named_parameters(
                                         model, optimizer))
            for optimizer in self.optimizers
        ]

        # Update logger rank info from Horovod to avoid race conditions from different ranks
        # creating directories / writing files in the same locations.
        self.proc_rank = hvd.rank()
        rank_zero_only.rank = self.proc_rank

        with ExitStack() as stack:
            for optimizer in self.optimizers:
                # Synchronization will be performed explicitly following backward()
                stack.enter_context(optimizer.skip_synchronize())

            self.run_pretrain_routine(model)
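With several optimizers, each hvd.DistributedOptimizer should only be handed the named parameters that optimizer actually owns, which is what filter_named_parameters above does. A hedged sketch with a hypothetical two-module model:

import torch
import horovod.torch as hvd

hvd.init()

backbone = torch.nn.Linear(8, 8)
head = torch.nn.Linear(8, 2)
model = torch.nn.Sequential(backbone, head)

optimizers = [
    torch.optim.SGD(backbone.parameters(), lr=0.1),
    torch.optim.Adam(head.parameters(), lr=1e-3),
]


def filter_named_parameters(model, optimizer):
    # keep only (name, parameter) pairs owned by this optimizer
    opt_params = set(p for group in optimizer.param_groups
                     for p in group.get('params', []))
    return [(name, p) for name, p in model.named_parameters() if p in opt_params]


hvd.broadcast_parameters(model.state_dict(), root_rank=0)
for optimizer in optimizers:
    hvd.broadcast_optimizer_state(optimizer, root_rank=0)

optimizers = [
    hvd.DistributedOptimizer(
        optimizer, named_parameters=filter_named_parameters(model, optimizer))
    for optimizer in optimizers
]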
Example #20
0
    def init_training(self):
        model = self.elements["model"]
        start_epoch = self.params["start_epoch"]
        exist_model = self.params["exist_model"]
        model_dir = self.params["model_dir"]
        model_blueprint = self.params["model_blueprint"]
        suffix = self.params["suffix"]

        if start_epoch <= 0 and utils.is_main_training():
            model_creation = model.get_model_creation()
            utils.write_nnet_config(model_blueprint, model_creation,
                                    "{0}/config/nnet.config".format(model_dir))

        ## Recover checkpoint | Transform learning | Initialize parameters
        if start_epoch > 0:
            # This train_stage is equal to number of completed epoch
            if utils.is_main_training():
                logger.info(
                    "Recover training from {0} epoch.".format(start_epoch))
            model.load_state_dict(
                torch.load('{0}/{1}.{2}'.format(model_dir, start_epoch,
                                                suffix),
                           map_location="cpu"))
        elif os.path.exists(exist_model):
            if utils.is_main_training():
                logger.info(
                    "Use {0} as the initial model to start transform-training."
                    .format(exist_model))
            model.load_transform_state_dict(
                torch.load(exist_model, map_location="cpu"))
        else:
            # Otherwise, just use the freshly initialized model (or plug in a custom
            # initialization function here).
            pass

        if utils.use_horovod():
            import horovod.torch as hvd

            # Broadcast parameters from rank 0 to all other processes.
            hvd.broadcast_parameters(self.elements["model"].state_dict(),
                                     root_rank=0)

            # For optimizer wrapper such as lookahead.
            if getattr(self.elements["optimizer"], "optimizer",
                       None) is not None:
                raise TypeError(
                    "Do not support using lookahead with horovod now.")
            else:
                # Broadcast optimizer state.
                hvd.broadcast_optimizer_state(self.elements["optimizer"],
                                              root_rank=0)
                self.elements["optimizer"] = hvd.DistributedOptimizer(
                    self.elements["optimizer"],
                    named_parameters=self.elements["model"].named_parameters())

        ## Select device
        model = self.select_device()

        # The original model is built in libs.nnet.framework.TopVirtualNnet and is no longer directly
        # accessible once wrapped by DistributedDataParallel. To keep calling TopVirtualNnet methods
        # conveniently, self.elements["model"] keeps the unwrapped module and
        # self.elements["model_forward"] refers to the DistributedDataParallel wrapper.
        if isinstance(model, torch.nn.parallel.DistributedDataParallel):
            self.elements["model"] = model.module
            self.elements["model_forward"] = model
Example #21
0
def train_and_eval(conf,
                   val_ratio,
                   val_fold,
                   save_path,
                   only_eval,
                   reporter=None,
                   metric='test'):

    writer = get_tb_writer()

    # region conf vars
    conf_dataset = conf['dataset']
    dataroot = conf_dataset['dataroot']
    horovod = conf['common']['horovod']
    checkpoint_freq = conf['common']['checkpoint']['freq']
    conf_loader = conf['autoaug']['loader']
    conf_model = conf['autoaug']['model']
    ds_name = conf_dataset['name']
    aug = conf_loader['aug']
    cutout = conf_loader['cutout']
    batch_size = conf_loader['batch']
    max_batches = conf_dataset['max_batches']
    epochs = conf_loader['epochs']
    conf_model = conf['autoaug']['model']
    conf_opt = conf['autoaug']['optimizer']
    conf_lr_sched = conf['autoaug']['lr_schedule']
    n_workers = conf_loader['n_workers']
    # endregion

    # initialize horovod
    # TODO: move to common init
    if horovod:
        import horovod.torch as hvd
        hvd.init()
        device = torch.device('cuda', hvd.local_rank())
        torch.cuda.set_device(device)

    if not reporter:
        reporter = lambda **kwargs: 0

    # get dataloaders with transformations and splits applied
    train_dl, valid_dl, test_dl = get_dataloaders(ds_name,
                                                  batch_size,
                                                  dataroot,
                                                  aug,
                                                  cutout,
                                                  load_train=True,
                                                  load_test=True,
                                                  val_ratio=val_ratio,
                                                  val_fold=val_fold,
                                                  horovod=horovod,
                                                  n_workers=n_workers,
                                                  max_batches=max_batches)

    # create a model & an optimizer
    model = get_model(conf_model,
                      num_class(ds_name),
                      data_parallel=(not horovod))

    # select loss function and optimizer
    lossfn = nn.CrossEntropyLoss()
    optimizer = ml_utils.create_optimizer(conf_opt, model.parameters())

    # distributed optimizer if horovod is used
    is_master = True
    if horovod:
        optimizer = hvd.DistributedOptimizer(
            optimizer, named_parameters=model.named_parameters())
        # issue : https://github.com/horovod/horovod/issues/1099
        optimizer._requires_update = set()
        hvd.broadcast_parameters(model.state_dict(), root_rank=0)
        hvd.broadcast_optimizer_state(optimizer, root_rank=0)
        if hvd.rank() != 0:
            is_master = False
    logger.debug('is_master=%s' % is_master)

    # select LR schedule
    scheduler = ml_utils.create_lr_scheduler(conf_lr_sched, epochs, optimizer,
                                             len(train_dl))

    result = OrderedDict()
    epoch_start = 1
    # if a model is available from a previous checkpoint then load it
    if save_path and os.path.exists(save_path):
        logger.info('%s checkpoint found. loading...' % save_path)
        data = torch.load(save_path)

        # when checkpointing we do add 'model' key so other cases are special cases
        if 'model' in data or 'state_dict' in data:
            key = 'model' if 'model' in data else 'state_dict'
            logger.info('checkpoint epoch@%d' % data['epoch'])

            # TODO: do we need change here?
            if not isinstance(model, DataParallel):
                # for non-dataparallel models, remove default 'module.' prefix
                model.load_state_dict({k.replace('module.', ''): \
                    v for k, v in data[key].items()})
            else:
                # for dataparallel models, make sure 'module.' prefix exist
                model.load_state_dict({k if 'module.' in k \
                    else 'module.'+k: v for k, v in data[key].items()})

            # load optimizer
            optimizer.load_state_dict(data['optimizer'])

            # restore epoch count
            if data['epoch'] < epochs:
                epoch_start = data['epoch']
            else:
                # epochs finished, so switch to eval-only mode
                only_eval = True

        else:
            model.load_state_dict({k: v for k, v in data.items()})
        del data
    else:
        logger.info('model checkpoint does not exist at "%s". '
                    'skipping to pretrained weights...' % save_path)
        only_eval = False  # we attempted to load a checkpoint, but it does not exist, so switch to train mode

    # if eval only then run model on train, test and val sets
    if only_eval:
        logger.info('evaluation only+')
        model.eval()
        rs = dict()  # stores metrics for each set
        rs['train'] = run_epoch(conf,
                                logger,
                                model,
                                train_dl,
                                lossfn,
                                None,
                                split_type='train',
                                epoch=0)
        if valid_dl:
            rs['valid'] = run_epoch(conf,
                                    logger,
                                    model,
                                    valid_dl,
                                    lossfn,
                                    None,
                                    split_type='valid',
                                    epoch=0)
        rs['test'] = run_epoch(conf,
                               logger,
                               model,
                               test_dl,
                               lossfn,
                               None,
                               split_type='test',
                               epoch=0)

        for key, setname in itertools.product(['loss', 'top1', 'top5'],
                                              ['train', 'valid', 'test']):
            result['%s_%s' % (key, setname)] = rs[setname][key]

        result['epoch'] = 0
        return result

    # train loop
    best_top1, best_valid_loss = 0, 10.0e10
    max_epoch = epochs
    for epoch in range(epoch_start, max_epoch + 1):
        if horovod:
            # set_epoch on the DistributedSampler attached to the train loader
            # (assumes get_dataloaders created one because horovod=True)
            train_dl.sampler.set_epoch(epoch)

        # run train epoch and update the model
        model.train()
        rs = dict()
        rs['train'] = run_epoch(conf,
                                logger,
                                model,
                                train_dl,
                                lossfn,
                                optimizer,
                                split_type='train',
                                epoch=epoch,
                                verbose=is_master,
                                scheduler=scheduler)
        if scheduler[0]:
            scheduler[0].step()

        model.eval()

        # check for nan loss
        if math.isnan(rs['train']['loss']):
            raise Exception('train loss is NaN.')

        # collect metrics on val and test set, checkpoint
        if epoch % checkpoint_freq == 0 or epoch == max_epoch:
            if valid_dl:
                rs['valid'] = run_epoch(conf,
                                        logger,
                                        model,
                                        valid_dl,
                                        lossfn,
                                        None,
                                        split_type='valid',
                                        epoch=epoch,
                                        verbose=is_master)
            rs['test'] = run_epoch(conf,
                                   logger,
                                   model,
                                   test_dl,
                                   lossfn,
                                   None,
                                   split_type='test',
                                   epoch=epoch,
                                   verbose=is_master)

            # TODO: is this good enough condition?
            if rs[metric]['loss'] < best_valid_loss or rs[metric][
                    'top1'] > best_top1:
                best_top1 = rs[metric]['top1']
                best_valid_loss = rs[metric]['loss']
                for key, setname in itertools.product(
                    ['loss', 'top1', 'top5'], ['train', 'valid', 'test']):
                    result['%s_%s' % (key, setname)] = rs[setname][key]
                result['epoch'] = epoch

                writer.add_scalar('best_top1/valid', rs['valid']['top1'],
                                  epoch)
                writer.add_scalar('best_top1/test', rs['test']['top1'], epoch)

                reporter(loss_valid=rs['valid']['loss'],
                         top1_valid=rs['valid']['top1'],
                         loss_test=rs['test']['loss'],
                         top1_test=rs['test']['top1'])

                # save checkpoint
                if is_master and save_path:
                    logger.info('save model@%d to %s' % (epoch, save_path))
                    torch.save(
                        {
                            'epoch': epoch,
                            'log': {
                                'train': rs['train'].get_dict(),
                                'valid': rs['valid'].get_dict(),
                                'test': rs['test'].get_dict(),
                            },
                            'optimizer': optimizer.state_dict(),
                            'model': model.state_dict()
                        }, save_path)

    del model

    result['top1_test'] = best_top1
    return result
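A self-contained sketch of the checkpoint round trip used above: save the epoch together with the model and optimizer state in one dict, then restore the starting epoch on resume (the path and toy model are placeholders):

import os
import tempfile

import torch

model = torch.nn.Linear(4, 2)
optimizer = torch.optim.SGD(model.parameters(), lr=0.1)
save_path = os.path.join(tempfile.gettempdir(), "ckpt_demo.pth")

# save: same keys as the checkpoint written above
torch.save({
    'epoch': 7,
    'model': model.state_dict(),
    'optimizer': optimizer.state_dict(),
}, save_path)

# resume
epoch_start = 1
if os.path.exists(save_path):
    data = torch.load(save_path)
    model.load_state_dict(data['model'])
    optimizer.load_state_dict(data['optimizer'])
    epoch_start = data['epoch']
print("resuming from epoch", epoch_start)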
Example #22
0
def _train_rnn(dataset,
               index,
               hidden_size,
               n_layers,
               bidirectional,
               classifier,
               n_epochs_max,
               batch_size,
               n_workers,
               file_name,
               root_dir=ROOT_DIR,
               lr=0.001,
               betas=(0.9, 0.999),
               opt_level="O0",
               seed=42,
               log_interval=10):
    '''constructs and trains neural networks given the dataset instance and network structure

    inputs
    ------
    dataset - instance of dataset class inherited from Dataset class
        data should contain keys "data" and "labels"
    index - dict with keys "train" and "val" whose values are list-like indices
    hidden_size - size of hidden state
    n_layers - number of recurrent layers
    bidirectional - if True, becomes a bidirectional LSTM
    classifier: boolean indicating whether it's a classifier or regressor.
    n_epochs_max - maximum number of epochs to run. can be terminated by KeyboardInterrupt
    batch_size - batch size for training
    n_workers - int indicating number of workers when creating DataLoader instance
    file_name - name of file that contains the empty meta data
    root_dir - root directory of the meta data file
    lr - float type learning rate
    betas - tuple of floats indicating betas arguments in Adam optimizer
    opt_level - optimization level
    seed - random seed
    log_interval - how many batches to wait before logging training status

    outputs
    -------
    saves data into given file
    '''
    #hvd.init() # initialize horovod
    torch.manual_seed(seed)  # FIXME: necessary here?
    #torch.cuda.set_device(hvd.local_rank()) # pin GPU to local rank
    # limit # of CPU threads to be used per worker : FIXME: why do this?
    torch.set_num_threads(1)
    file_name, _ = os.path.splitext(file_name)
    file_path = os.path.join(root_dir, file_name + '.pt')
    if hvd.rank() == 0:
        assert(len(dataset) == (len(index['train']) + len(index['val']))),\
            "Size mismatch between dataset and index"
    # Partition dataset among workers using DistributedSampler
    sampler = {
        phase: DistributedSampler(Subset(dataset, index[phase]),
                                  num_replicas=hvd.size(),
                                  rank=hvd.rank())
        for phase in ['train', 'val']
    }
    dataloader = {
        # the sampler yields indices into the Subset, so the loader must wrap the same Subset
        phase: DataLoader(Subset(dataset, index[phase]),
                          sampler=sampler[phase],
                          batch_size=batch_size,
                          num_workers=n_workers,
                          collate_fn=collate_fn,
                          pin_memory=True)
        for phase in ['train', 'val']
    }
    # pin_memory=True uses page-locked host memory, which speeds up host-to-GPU copies
    n_data = {phase: len(index[phase]) for phase in ['train', 'val']}

    input_size = dataset[0]['data'].shape[-2]
    meta = torch.load(file_path)
    output_size = len(meta['labels_lut']) if classifier else 1
    rnn = RNN(input_size,
              hidden_size=hidden_size,
              output_size=output_size,
              n_layers=n_layers,
              bidirectional=bidirectional).cuda()
    optimizer = optim.Adam(rnn.parameters(), lr=lr, betas=betas)
    # Add Horovod Distributed Optimizer
    optimizer = hvd.DistributedOptimizer(
        optimizer, named_parameters=rnn.named_parameters())
    # Broadcast parameters from rank 0 to all other processes.
    hvd.broadcast_parameters(rnn.state_dict(), root_rank=0)
    hvd.broadcast_optimizer_state(optimizer, root_rank=0)
    # apex
    rnn, optimizer = amp.initialize(rnn, optimizer, opt_level=opt_level)

    criterion = nn.CrossEntropyLoss(
        reduction='sum') if classifier else nn.MSELoss(reduction='sum')
    criterion = criterion.cuda()
    metric = 'cross_entropy_mean' if classifier else 'rmse'
    time_start = time.time()
    for epoch in range(n_epochs_max):
        loss_sum = {}
        loss_metric = {}
        for phase in ['train', 'val']:
            rnn.train(phase == 'train')
            loss_sum[phase] = 0.
            for batch in dataloader[phase]:
                # permute s.t. shape is (data_len, n_data_total, n_channels * (n_scat_nodes))
                batch_data = batch['data'].permute([2, 0, 1]).cuda()
                batch_labels = batch['labels'].cuda()
                input_lens = batch['input_lens'].cuda()
                output = rnn(batch_data, input_lens=input_lens)
                # for regression, output of rnn is shaped (batch_size, 1). drop dummy axis
                if classifier:
                    batch_labels = batch_labels.type(torch.cuda.LongTensor)
                else:
                    output = output[:, 0]
                loss = criterion(output, batch_labels)
                optimizer.zero_grad()
                if phase == 'train':
                    with amp.scale_loss(loss, optimizer) as scaled_loss:
                        scaled_loss.backward()
                        optimizer.synchronize()
                    with optimizer.skip_synchronize():
                        optimizer.step()
                loss_sum[phase] += loss.data.item()
            # classification: cross entropy mean, regression: RMSE loss per data point
            loss_metric[phase] = (loss_sum[phase] / n_data[phase] if classifier
                                  else np.sqrt(loss_sum[phase] / n_data[phase]))
        if epoch % log_interval == 0 and hvd.rank() == 0:
            time_curr = time.time()
            elapsed = time_curr - time_start
            loss_msg = (
                "\t{} out of {} epochs, {}_train:{:.15f}, {}_val:{:.15f}, elapsed seconds:{:.2f}"
                .format(epoch, n_epochs_max, metric, loss_metric['train'],
                        metric, loss_metric['val'], elapsed))
            print(loss_msg)
            meta = torch.load(file_path)
            meta['epoch'].append(epoch)
            meta['elapsed'].append(elapsed)
            #meta['weights'] = rnn.state_dict()
            for phase in ['train', 'val']:
                meta['loss'][phase].append(loss_metric[phase])
            torch.save(meta, file_path)
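A minimal sketch of the train/val partitioning used above: wrap each index Subset in a DistributedSampler so every Horovod worker sees a disjoint shard. The synthetic TensorDataset and the 80/20 split are illustrative:

import torch
import horovod.torch as hvd
from torch.utils.data import DataLoader, Subset, TensorDataset
from torch.utils.data.distributed import DistributedSampler

hvd.init()

dataset = TensorDataset(torch.randn(100, 8), torch.randint(0, 2, (100,)))
index = {'train': list(range(80)), 'val': list(range(80, 100))}

sampler = {
    phase: DistributedSampler(Subset(dataset, index[phase]),
                              num_replicas=hvd.size(),
                              rank=hvd.rank())
    for phase in ['train', 'val']
}
dataloader = {
    phase: DataLoader(Subset(dataset, index[phase]),
                      sampler=sampler[phase],
                      batch_size=16)
    for phase in ['train', 'val']
}
print({phase: len(dataloader[phase]) for phase in ['train', 'val']})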
Example #23
0
def start_training(cfg):
    set_random_seed(cfg.seed)

    n_gpu = hvd.size()
    cfg.n_gpu = n_gpu
    device = torch.device("cuda", hvd.local_rank())
    torch.cuda.set_device(hvd.local_rank())
    if hvd.rank() != 0:
        LOGGER.disabled = True
    LOGGER.info("device: {} n_gpu: {}, rank: {}, "
                "16-bits training: {}".format(device, n_gpu, hvd.rank(),
                                              bool(cfg.fp16)))

    model = setup_model(cfg, device=device)
    model.train()
    optimizer = setup_e2e_optimizer(model, cfg)

    # Horovod: (optional) gradient compression algorithm
    compression = hvd.Compression.none
    optimizer = hvd.DistributedOptimizer(
        optimizer,
        named_parameters=model.named_parameters(),
        compression=compression)

    #  Horovod: broadcast parameters & optimizer state.
    hvd.broadcast_parameters(model.state_dict(), root_rank=0)
    hvd.broadcast_optimizer_state(optimizer, root_rank=0)

    model, optimizer = amp.initialize(model,
                                      optimizer,
                                      enabled=cfg.fp16,
                                      opt_level='O2',
                                      keep_batchnorm_fp32=True)

    # prepare data
    tokenizer = BertTokenizerFast.from_pretrained(cfg.tokenizer_dir)
    train_loader, val_loader = setup_dataloaders(cfg, tokenizer)
    eval_loader = mk_video_ret_eval_dataloader(
        anno_path=cfg.val_datasets[0].txt,
        lmdb_dir=cfg.val_datasets[0].img,
        cfg=cfg,
        tokenizer=tokenizer,
    )

    # compute the number of steps and update cfg
    total_n_examples = len(train_loader.dataset) * cfg.max_n_example_per_group
    total_train_batch_size = int(n_gpu * cfg.train_batch_size *
                                 cfg.gradient_accumulation_steps *
                                 cfg.max_n_example_per_group)
    cfg.num_train_steps = int(
        math.ceil(1. * cfg.num_train_epochs * total_n_examples /
                  total_train_batch_size))

    cfg.valid_steps = int(
        math.ceil(1. * cfg.num_train_steps / cfg.num_valid /
                  cfg.min_valid_steps)) * cfg.min_valid_steps
    actual_num_valid = int(
        math.floor(1. * cfg.num_train_steps / cfg.valid_steps)) + 1

    # restore
    restorer = TrainingRestorer(cfg, model, optimizer)
    global_step = restorer.global_step
    TB_LOGGER.global_step = global_step
    if hvd.rank() == 0:
        LOGGER.info("Saving training meta...")
        save_training_meta(cfg)
        path = join(cfg.output_dir, 'log', "detectron2_model_cfg.yaml")
        with open(path, "w") as f:
            f.write(model.cnn.config_file)
        LOGGER.info("Saving training done...")
        TB_LOGGER.create(join(cfg.output_dir, 'log'))
        pbar = tqdm(total=cfg.num_train_steps)
        model_saver = ModelSaver(join(cfg.output_dir, "ckpt"))
        add_log_to_file(join(cfg.output_dir, "log", "log.txt"))
    else:
        LOGGER.disabled = True
        pbar = NoOp()
        model_saver = NoOp()
        restorer = NoOp()

    if global_step > 0:
        pbar.update(global_step)

    LOGGER.info(cfg)
    LOGGER.info("Starting training...")
    LOGGER.info(f"***** Running training with {n_gpu} GPUs *****")
    LOGGER.info(
        f"  Single-GPU Non-Accumulated batch size = {cfg.train_batch_size}")
    LOGGER.info(f"  max_n_example_per_group = {cfg.max_n_example_per_group}")
    LOGGER.info(f"  Accumulate steps = {cfg.gradient_accumulation_steps}")
    LOGGER.info(
        f"  Total batch size = #GPUs * Single-GPU batch size * "
        f"max_n_example_per_group * Accumulate steps [Image] = {total_train_batch_size}"
    )
    LOGGER.info(f"  Total #epochs = {cfg.num_train_epochs}")
    LOGGER.info(f"  Total #steps = {cfg.num_train_steps}")
    LOGGER.info(
        f"  Validate every {cfg.valid_steps} steps, in total {actual_num_valid} times"
    )

    # quick hack for amp delay_unscale bug
    with optimizer.skip_synchronize():
        optimizer.zero_grad()
        if global_step == 0:
            optimizer.step()
    debug_step = 3
    running_loss = RunningMeter('train_loss')

    for step, batch in enumerate(InfiniteIterator(train_loader)):
        # forward pass
        del batch["caption_ids"]
        mini_batch = dict()
        for k, v in batch.items():
            if k != "visual_inputs":
                mini_batch[k] = v

        pool_method = cfg.score_agg_func
        # could be 1, where only a single clip is used
        num_clips = cfg.train_n_clips
        num_frm = cfg.num_frm
        # (B, T=num_clips*num_frm, C, H, W) --> (B, num_clips, num_frm, C, H, W)
        bsz = batch["visual_inputs"].shape[0]
        new_visual_shape = (bsz, num_clips,
                            num_frm) + batch["visual_inputs"].shape[2:]
        visual_inputs = batch["visual_inputs"].view(*new_visual_shape)
        logits = []
        for clip_idx in range(num_clips):
            # (B, num_frm, C, H, W)
            mini_batch["visual_inputs"] = visual_inputs[:, clip_idx]
            mini_batch["n_examples_list"] = batch["n_examples_list"]
            outputs = forward_step(model, mini_batch, cfg)
            logits.append(outputs["logits"])
            # the losses are cross entropy and mse, no need to * num_labels

        logits = torch.stack(logits)  # (num_frm, B, 5)
        if pool_method == "mean":
            logits = logits.mean(0)  # (B, 5)
        elif pool_method == "max":
            logits = logits.max(0)[0]  # (B, 5)
        elif pool_method == "lse":
            logits = logits.permute(
                1, 0,
                2).contiguous()  # (B, num_frm, 5), pooling will be done in CE
        else:
            raise ValueError(
                f"Invalid value for pool_method, "
                f"got {pool_method}, expect one of [`mean`, `max`, `lse`]")

        if pool_method == "lse":
            out = torch.logsumexp(logits.view(logits.shape[0], -1), dim=-1, keepdim=True) \
                - torch.logsumexp(logits, dim=1)
            loss = torch.gather(out, -1, batch["labels"].view(-1, 1))
        else:
            _, loss = model.transformer.calc_loss(
                logits,
                batch["labels"],
                sample_size=len(batch["n_examples_list"]))
        loss = loss.mean()

        running_loss(loss.item())
        # backward pass
        delay_unscale = (step + 1) % cfg.gradient_accumulation_steps != 0
        with amp.scale_loss(loss, optimizer,
                            delay_unscale=delay_unscale) as scaled_loss:
            scaled_loss.backward()
            zero_none_grad(model)
            optimizer.synchronize()

        # optimizer
        if (step + 1) % cfg.gradient_accumulation_steps == 0:
            global_step += 1

            # learning rate scheduling
            n_epoch = int(1. * total_train_batch_size * global_step /
                          total_n_examples)
            # learning rate scheduling transformer
            lr_this_step_transformer = get_lr_sched(
                global_step,
                cfg.decay,
                cfg.learning_rate,
                cfg.num_train_steps,
                warmup_ratio=cfg.warmup_ratio,
                decay_epochs=cfg.step_decay_epochs,
                multi_step_epoch=n_epoch)

            # learning rate scheduling cnn
            lr_this_step_cnn = get_lr_sched(
                global_step,
                cfg.cnn_lr_decay,
                cfg.cnn_learning_rate,
                cfg.num_train_steps,
                warmup_ratio=cfg.warmup_ratio,
                decay_epochs=cfg.cnn_step_decay_epochs,
                multi_step_epoch=n_epoch)

            # Hardcoded param group length
            assert len(optimizer.param_groups) == 8
            for pg_n, param_group in enumerate(optimizer.param_groups):
                if pg_n in [0, 1]:
                    param_group['lr'] = (cfg.transformer_lr_mul *
                                         lr_this_step_transformer)
                elif pg_n in [2, 3]:
                    param_group['lr'] = lr_this_step_transformer
                elif pg_n in [4, 5]:
                    param_group['lr'] = (cfg.cnn_lr_mul * lr_this_step_cnn)
                else:
                    param_group['lr'] = lr_this_step_cnn
            TB_LOGGER.add_scalar("train/lr_transformer",
                                 lr_this_step_transformer, global_step)
            TB_LOGGER.add_scalar("train/lr_cnn", lr_this_step_cnn, global_step)

            TB_LOGGER.add_scalar('train/loss', running_loss.val, global_step)

            # update model params
            if cfg.grad_norm != -1:
                grad_norm = clip_grad_norm_(amp.master_params(optimizer),
                                            cfg.grad_norm)
                TB_LOGGER.add_scalar("train/grad_norm", grad_norm, global_step)
            TB_LOGGER.step()

            # Check if there is None grad
            none_grads = [
                p[0] for p in model.named_parameters()
                if p[1].requires_grad and p[1].grad is None
            ]

            assert len(none_grads) == 0, f"{none_grads}"

            with optimizer.skip_synchronize():
                optimizer.step()
                optimizer.zero_grad()
            restorer.step()
            pbar.update(1)

            # checkpoint
            if global_step % cfg.valid_steps == 0:
                LOGGER.info(f'Step {global_step}: start validation')
                validate(model,
                         val_loader,
                         eval_loader,
                         cfg,
                         global_step,
                         eval_filepath=cfg.val_datasets[0].txt)
                model_saver.save(step=global_step, model=model)
        if global_step >= cfg.num_train_steps:
            break

        if cfg.debug and global_step >= debug_step:
            break

    if global_step % cfg.valid_steps != 0:
        LOGGER.info(f'Step {global_step}: start validation')
        validate(model,
                 val_loader,
                 eval_loader,
                 cfg,
                 global_step,
                 eval_filepath=cfg.val_datasets[0].txt)
        model_saver.save(step=global_step, model=model)
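The clip-aggregation step above stacks per-clip logits and pools them with mean, max, or log-sum-exp before computing the loss. A hedged sketch of the three reductions on random tensors (shapes are illustrative; the real LSE variant above folds the pooling into the loss itself):

import torch

num_clips, batch_size, num_labels = 4, 2, 5
per_clip_logits = [torch.randn(batch_size, num_labels) for _ in range(num_clips)]

logits = torch.stack(per_clip_logits)        # (num_clips, B, num_labels)

mean_pooled = logits.mean(0)                 # (B, num_labels)
max_pooled = logits.max(0)[0]                # (B, num_labels)
# log-sum-exp over clips, computed per label
lse_pooled = torch.logsumexp(logits, dim=0)  # (B, num_labels)

print(mean_pooled.shape, max_pooled.shape, lse_pooled.shape)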
Example #24
0
def main():
   
    hvd.init()
    torch.manual_seed(args.seed)
    # Horovod: print logs on the first worker.
    verbose = 1 if hvd.rank() == 0 else 0

    # Horovod: write TensorBoard logs on first worker.
    log_writer = tensorboardX.SummaryWriter(args.log_dir) if hvd.rank() == 0 else None
    
    kwargs = {'num_workers': 0, 'pin_memory': True}

    print("======= START LOADING DATA =========")
    #train_n = len(os.listdir(args.train_dir))
    train_n = 30
    train_files = sorted(os.listdir(args.train_dir))[:train_n]
    #train_files = ["batch_train_{}.h5".format(i) for i in range(train_n)]
    #train_files = ["batch_train_0.h5", "batch_train_1.h5"] 
    train_dataset = HDF5Dataset(args.train_dir, train_files)
    '''
    train_loader = DataLoader(dataset=train_dataset,
                              batch_size=args.batch_size,
                              shuffle=True,
                              num_workers=2)
    '''
    
    val_n = 10
    val_files = sorted(os.listdir(args.val_dir))[:val_n]
    val_dataset = HDF5Dataset(args.val_dir, val_files)
    '''
    val_loader = DataLoader(dataset=val_dataset, 
                            batch_size=args.batch_size, 
                            shuffle=True)
    '''
    print("Training size")
    print(len(train_dataset))
    print("Dev size")
    print("Test size")
    print(len(val_dataset))
   
    # Horovod: use DistributedSampler to partition data among workers. Manually specify
    # `num_replicas=hvd.size()` and `rank=hvd.rank()`.
    train_sampler = torch.utils.data.distributed.DistributedSampler(
            train_dataset, num_replicas=hvd.size(), rank=hvd.rank())
    train_loader = torch.utils.data.DataLoader(
            train_dataset, batch_size=args.batch_size,
            sampler=train_sampler, **kwargs)
    val_sampler = torch.utils.data.distributed.DistributedSampler(
            val_dataset, num_replicas=hvd.size(), rank=hvd.rank())
    val_loader = torch.utils.data.DataLoader(
            val_dataset, batch_size=args.val_batch_size,
            sampler=val_sampler, **kwargs)

    # Set up standard ResNet-50 model.
    model = models.resnet50()
    model.cuda()

    # Horovod: scale learning rate by the number of GPUs.
    # Gradient Accumulation: scale learning rate by batches_per_allreduce
    optimizer = optim.SGD(model.parameters(), lr=0.01, 
                          momentum=args.momentum, weight_decay=args.wd)
    compression = hvd.Compression.fp16

    # Horovod: wrap optimizer with DistributedOptimizer.
    optimizer = hvd.DistributedOptimizer(
                optimizer, named_parameters=model.named_parameters(),
                compression=compression,
                backward_passes_per_step=args.batches_per_allreduce)

    # Horovod: broadcast parameters & optimizer state.
    hvd.broadcast_parameters(model.state_dict(), root_rank=0)
    hvd.broadcast_optimizer_state(optimizer, root_rank=0)

    def train(epoch):
        model.train()
        train_sampler.set_epoch(epoch)
        train_loss = Metric('train_loss')
        train_accuracy = Metric('train_accuracy')

        with tqdm(total=len(train_loader),
                  desc='Train Epoch     #{}'.format(epoch + 1),
                  disable=not verbose) as t:
            for batch_idx, (data, target) in enumerate(train_loader):
                # print(data, target)
                if args.cuda:
                    data, target = data.cuda(), target.cuda()
                optimizer.zero_grad()
                # Split data into sub-batches of size batch_size
                for i in range(0, len(data), args.batch_size):
                    data_batch = data[i:i + args.batch_size]
                    target_batch = target[i:i + args.batch_size]
                    output = model(data_batch)
                    train_accuracy.update(accuracy(output, target_batch))
                    loss = F.cross_entropy(output, target_batch)
                    train_loss.update(loss)
                    # Average gradients among sub-batches
                    loss.div_(math.ceil(float(len(data)) / args.batch_size))
                    loss.backward()
                # Gradient is applied across all ranks
                optimizer.step()
                t.set_postfix({'loss': train_loss.avg.item(),
                               'accuracy': 100. * train_accuracy.avg.item()})
                t.update(1)

        if log_writer:
            log_writer.add_scalar('train/loss', train_loss.avg, epoch)
            log_writer.add_scalar('train/accuracy', train_accuracy.avg, epoch)


    def validate(epoch):
        model.eval()
        val_loss = Metric('val_loss')
        val_accuracy = Metric('val_accuracy')

        with tqdm(total=len(val_loader),
                  desc='Validate Epoch  #{}'.format(epoch + 1),
                  disable=not verbose) as t:
            with torch.no_grad():
                for data, target in val_loader:
                    if args.cuda:
                        data, target = data.cuda(), target.cuda()
                    output = model(data)

                    val_loss.update(F.cross_entropy(output, target))
                    val_accuracy.update(accuracy(output, target))
                    t.set_postfix({'loss': val_loss.avg.item(),
                                   'accuracy': 100. * val_accuracy.avg.item()})
                    t.update(1)

        if log_writer:
            log_writer.add_scalar('val/loss', val_loss.avg, epoch)
            log_writer.add_scalar('val/accuracy', val_accuracy.avg, epoch)
    
    def accuracy(output, target):
        # get the index of the max log-probability
        pred = output.max(1, keepdim=True)[1]
        return pred.eq(target.view_as(pred)).cpu().float().mean()
        

    # Horovod: average metrics from distributed training.
    class Metric(object):
        def __init__(self, name):
            self.name = name
            self.sum = torch.tensor(0.)
            self.n = torch.tensor(0.)

        def update(self, val):
            self.sum += hvd.allreduce(val.detach().cpu(), name=self.name)
            self.n += 1

        @property
        def avg(self):
            return self.sum / self.n


    for epoch in range(resume_from_epoch, args.epochs):
        train(epoch)
        validate(epoch)
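The Metric class above relies on hvd.allreduce, which averages a tensor across all workers by default. A minimal usage sketch with a synthetic per-rank value:

import torch
import horovod.torch as hvd

hvd.init()

# each rank contributes a different value; allreduce averages them by default
local_value = torch.tensor(float(hvd.rank()))
averaged = hvd.allreduce(local_value, name="demo_metric")
if hvd.rank() == 0:
    print("average over ranks:", averaged.item())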
Example #25
0
epochCurrent = checkpoint['epoch']
lossesG = checkpoint['lossesG']
lossesD = checkpoint['lossesD']
num_vid = checkpoint['num_vid']
i_batch_current = checkpoint['i_batch'] + 1

G.train()
E.train()
D.train()

# Horovod broadcast parameters from rank 0 to all other processes.
hvd.broadcast_parameters(G.state_dict(), root_rank=0)
hvd.broadcast_parameters(E.state_dict(), root_rank=0)
hvd.broadcast_parameters(D.state_dict(), root_rank=0)

hvd.broadcast_optimizer_state(optimizerG, root_rank=0)
hvd.broadcast_optimizer_state(optimizerE, root_rank=0)
hvd.broadcast_optimizer_state(optimizerD, root_rank=0)

optimizerG = hvd.DistributedOptimizer(optimizerG,
                                      named_parameters=G.named_parameters())
optimizerE = hvd.DistributedOptimizer(optimizerE,
                                      named_parameters=E.named_parameters())
optimizerD = hvd.DistributedOptimizer(optimizerD,
                                      named_parameters=D.named_parameters())

print("Start training")

# Training
batch_start = datetime.now()
for epoch in range(epochCurrent, num_epochs):
Example #26
0
    def setup(self, model):
        # call setup after the ddp process has connected
        self.trainer.call_setup_hook(model)

        if torch.cuda.is_available() and self.trainer.on_gpu:
            # Horovod: pin GPU to local rank
            assert self.trainer.root_gpu == hvd.local_rank()
            torch.cuda.set_device(self.trainer.root_gpu)
            model.cuda(self.trainer.root_gpu)

        # avoid duplicating progress bar
        if hvd.rank() != 0 and self.trainer.progress_bar_callback is not None:
            self.trainer.progress_bar_callback.disable()

        # CHOOSE OPTIMIZER
        # allow for lr schedulers as well
        optimizers, lr_schedulers, optimizer_frequencies = self.trainer.init_optimizers(
            model)
        self.trainer.optimizers = optimizers
        self.trainer.lr_schedulers = lr_schedulers
        self.trainer.optimizer_frequencies = optimizer_frequencies

        # Horovod: scale the learning rate by the number of workers to account for
        # increased total batch size
        for optimizer in self.trainer.optimizers:
            for param_group in optimizer.param_groups:
                param_group['lr'] *= hvd.size()

        # Horovod: adjust base LR used by schedulers to match scaled optimizer initial LR
        for scheduler in self.trainer.lr_schedulers:
            scheduler = scheduler['scheduler']
            if isinstance(scheduler, _LRScheduler):
                scheduler.base_lrs = [
                    lr * hvd.size() for lr in scheduler.base_lrs
                ]

        if self.trainer.amp_backend:
            model, optimizers = model.configure_apex(amp, model,
                                                     self.trainer.optimizers,
                                                     self.trainer.amp_level)
            self.trainer.optimizers = optimizers
            self.trainer.reinit_scheduler_properties(
                self.trainer.optimizers, self.trainer.lr_schedulers)

        # Horovod: broadcast parameters & optimizer state to ensure consistent initialization
        hvd.broadcast_parameters(model.state_dict(), root_rank=0)
        for optimizer in self.trainer.optimizers:
            hvd.broadcast_optimizer_state(optimizer, root_rank=0)

        def filter_named_parameters(model, optimizer):
            opt_params = set([
                p for group in optimizer.param_groups
                for p in group.get('params', [])
            ])
            return [(name, p) for name, p in model.named_parameters()
                    if p in opt_params]

        # Horovod: wrap optimizers to perform gradient aggregation via allreduce
        self.trainer.optimizers = [
            hvd.DistributedOptimizer(optimizer,
                                     named_parameters=filter_named_parameters(
                                         model, optimizer))
            for optimizer in self.trainer.optimizers
        ]

        # Update logger rank info from Horovod to avoid race conditions from different ranks
        # creating directories / writing files in the same locations.
        self.trainer.global_rank = hvd.rank()
        rank_zero_only.rank = self.trainer.global_rank

        self.trainer.model = model
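
The setup above multiplies each parameter group's learning rate by hvd.size() (the linear scaling rule: more workers means a proportionally larger effective batch) and patches scheduler.base_lrs to match. A minimal sketch of the same idea outside Lightning, assuming a plain SGD optimizer and MultiStepLR scheduler; the model and milestones are illustrative:

import torch
import horovod.torch as hvd

hvd.init()

model = torch.nn.Linear(10, 2)
base_lr = 0.1

# Build the optimizer with the already-scaled LR so the scheduler's
# base_lrs reflect the scaled value from the start.
optimizer = torch.optim.SGD(model.parameters(), lr=base_lr * hvd.size())
scheduler = torch.optim.lr_scheduler.MultiStepLR(optimizer,
                                                 milestones=[30, 60],
                                                 gamma=0.1)

hvd.broadcast_parameters(model.state_dict(), root_rank=0)
hvd.broadcast_optimizer_state(optimizer, root_rank=0)
optimizer = hvd.DistributedOptimizer(optimizer,
                                     named_parameters=model.named_parameters())
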
Example #27
def main():
    if hvd.rank() == 0:
        logger.info("Logger is set - training start")

    # set default gpu device id
    # torch.cuda.set_device(config.gpus[0])

    # set seed
    np.random.seed(config.seed)
    torch.manual_seed(config.seed)
    # torch.cuda.manual_seed_all(config.seed)

    torch.backends.cudnn.benchmark = False

    # get data with meta info
    (train_X, train_y), (valid_X, valid_y) = load_data()
    in_dim = np.shape(train_X)[1]
    out_dim = np.shape(train_y)[1]

    train_X, train_y = (torch.tensor(train_X,
                                     dtype=torch.float), torch.tensor(train_y))
    train_data = torch.utils.data.TensorDataset(train_X, train_y)

    valid_X, valid_y = (torch.tensor(valid_X,
                                     dtype=torch.float), torch.tensor(valid_y))
    valid_data = torch.utils.data.TensorDataset(valid_X, valid_y)
    print("in_dim: ", in_dim)
    print("out_dim: ", out_dim)

    net_crit = nn.MSELoss().to(device)
    layers = 1
    n_nodes = 4
    model = SearchFCNNController(in_dim,
                                 out_dim,
                                 layers,
                                 net_crit,
                                 n_nodes=n_nodes,
                                 device_ids=config.gpus)
    model = model.to(device)

    # weights optimizer
    # Scale the LR by the number of workers, since op=hvd.Average is used when
    # wrapping w_optim below (with op=hvd.Adasum this scaling would not be needed).
    lr_scaler = hvd.size()
    # w_optim = torch.optim.SGD(
    #     model.weights(),
    #     config.w_lr * lr_scaler,
    #     momentum=config.w_momentum,
    #     weight_decay=config.w_weight_decay,
    # )
    w_optim = torch.optim.Adagrad(model.weights(),
                                  config.w_lr * lr_scaler,
                                  weight_decay=config.w_weight_decay)
    # w_optim = torch.optim.RMSprop(model.weights())

    # alphas optimizer
    alpha_lr = config.alpha_lr
    alpha_optim = torch.optim.Adam(
        model.alphas(),
        alpha_lr,
        betas=(0.5, 0.999),
        weight_decay=config.alpha_weight_decay,
    )

    # Horovod: partition the train/validation data across workers
    train_sampler = torch.utils.data.distributed.DistributedSampler(
        train_data, num_replicas=hvd.size(), rank=hvd.rank())
    # valid_sampler = torch.utils.data.sampler.SubsetRandomSampler(indices_valid)
    valid_sampler = torch.utils.data.distributed.DistributedSampler(
        valid_data, num_replicas=hvd.size(), rank=hvd.rank())
    train_loader = torch.utils.data.DataLoader(
        train_data,
        batch_size=config.batch_size,
        sampler=train_sampler,
        num_workers=config.workers,
        pin_memory=True,
    )
    # vis.
    # dataiter = iter(train_loader)
    # images, labels = dataiter.next()
    # writer.add_graph(model, [images[0]])
    # writer.close()

    valid_loader = torch.utils.data.DataLoader(
        valid_data,
        batch_size=config.batch_size,
        sampler=valid_sampler,
        num_workers=config.workers,
        pin_memory=True,
    )
    lr_scheduler = torch.optim.lr_scheduler.CosineAnnealingLR(
        w_optim, config.epochs, eta_min=config.w_lr_min)
    architect = Architect(model,
                          config.w_momentum,
                          config.w_weight_decay,
                          allow_unused=False)

    # Horovod: broadcast parameters & optimizer state.
    hvd.broadcast_parameters(model.state_dict(), root_rank=0)
    hvd.broadcast_optimizer_state(w_optim, root_rank=0)

    # Horovod: (optional) compression algorithm.
    # compression = hvd.Compression.fp16

    # Horovod: wrap optimizer with DistributedOptimizer.
    w_optim = hvd.DistributedOptimizer(
        w_optim,
        named_parameters=model.named_parameters(),
        #  compression=compression,
        # op=hvd.Adasum,
        op=hvd.Average,
    )

    # training loop
    best_top1 = None
    epochs = config.epochs
    for epoch in range(epochs):
        lr = lr_scheduler.get_last_lr()[0]

        if hvd.rank() == 0:
            model.print_alphas(logger)

        # training
        train(
            train_loader,
            valid_loader,
            model,
            architect,
            w_optim,
            alpha_optim,
            lr,
            epoch,
            train_sampler,
        )
        lr_scheduler.step()

        # validation
        cur_step = (epoch + 1) * len(train_loader)
        top1 = validate(valid_loader, model, epoch, cur_step)

        top1 = metric_average(top1, name="avg_val_top1")

        if hvd.rank() == 0:
            # log
            # genotype
            genotype = model.genotype()
            logger.info("genotype = {}".format(genotype))

            # genotype as a image
            plot_path = "." + os.path.join(config.plot_path,
                                           "EP{:02d}".format(epoch + 1))
            caption = "Epoch {}".format(epoch + 1)
            plot(genotype.normal, plot_path + "-normal", caption)

            # save
            if best_top1 is None or best_top1 < top1:
                best_top1 = top1
                best_genotype = genotype
                is_best = True
            else:
                is_best = False
            # utils.save_checkpoint(model, "." + config.path, is_best)
            print("")

    if hvd.rank() == 0:
        best_genotype = model.genotype()

        with open("." + config.path + "/best_genotype.txt", "w") as f:
            f.write(str(best_genotype))

        logger.info("Final best TopR2@1 = {:.3f}".format(best_top1))
        logger.info("Best Genotype = {}".format(best_genotype))
Example #28
def main():
    parser = argparse.ArgumentParser()
    parser.add_argument("-exp_dir")
    parser.add_argument("-dataPath",
                        default='',
                        type=str,
                        help="path of data files")
    parser.add_argument("-train_config")
    parser.add_argument("-data_config")
    parser.add_argument("-lr",
                        default=0.0001,
                        type=float,
                        help="Override the LR in the config")
    parser.add_argument("-batch_size",
                        default=32,
                        type=int,
                        help="Override the batch size in the config")
    parser.add_argument("-data_loader_threads",
                        default=0,
                        type=int,
                        help="number of workers for data loading")
    parser.add_argument("-max_grad_norm",
                        default=5,
                        type=float,
                        help="max_grad_norm for gradient clipping")
    parser.add_argument("-sweep_size",
                        default=200,
                        type=float,
                        help="process n hours of data per sweep (default:200)")
    parser.add_argument("-num_epochs",
                        default=1,
                        type=int,
                        help="number of training epochs (default:1)")
    parser.add_argument("-global_mvn",
                        default=False,
                        type=bool,
                        help="if apply global mean and variance normalization")
    parser.add_argument(
        "-resume_from_model",
        type=str,
        help="the model from which you want to resume training")
    parser.add_argument("-dropout", type=float, help="set the dropout ratio")
    parser.add_argument("-aneal_lr_epoch",
                        default=2,
                        type=int,
                        help="start to aneal the learning rate from this epoch"
                        )  # aneal -> anneal?
    parser.add_argument("-aneal_lr_ratio",
                        default=0.5,
                        type=float,
                        help="the ratio to aneal the learning rate")
    parser.add_argument('-p',
                        '--print-freq',
                        default=100,
                        type=int,
                        metavar='N',
                        help='print frequency (default: 100)')
    parser.add_argument('-hvd',
                        action='store_true',
                        help="use Horovod for distributed training")

    args = parser.parse_args()

    with open(args.train_config) as f:
        config = yaml.safe_load(f)

    config["sweep_size"] = args.sweep_size
    with open(args.data_config) as f:
        data = yaml.safe_load(f)
        config["source_paths"] = [j for i, j in data['clean_source'].items()]
        if 'dir_noise' in data:
            config["dir_noise_paths"] = [
                j for i, j in data['dir_noise'].items()
            ]
        if 'rir' in data:
            config["rir_paths"] = [j for i, j in data['rir'].items()]

    config['data_path'] = args.dataPath

    print("Experiment starts with config {}".format(
        json.dumps(config, sort_keys=True, indent=4)))

    # Initialize Horovod
    if args.hvd:
        import horovod.torch as hvd
        hvd.init()
        th.cuda.set_device(hvd.local_rank())
        print("Run experiments with world size {}".format(hvd.size()))

    if not os.path.isdir(args.exp_dir):
        os.makedirs(args.exp_dir)

    trainset = SpeechDataset(config)
    train_dataloader = ChunkDataloader(trainset,
                                       batch_size=args.batch_size,
                                       distributed=args.hvd,
                                       num_workers=args.data_loader_threads)

    if args.global_mvn:
        transform = GlobalMeanVarianceNormalization()
        print("Estimating global mean and variance of feature vectors...")
        transform.learn_mean_and_variance_from_train_loader(
            trainset, trainset.stream_idx_for_transform, n_sample_to_use=2000)
        trainset.transform = transform
        print("Global mean and variance transform trained successfully!")

        with open(args.exp_dir + "/transform.pkl", 'wb') as f:
            pickle.dump(transform, f, pickle.HIGHEST_PROTOCOL)

    print("Data loader set up successfully!")
    print("Number of minibatches: {}".format(len(train_dataloader)))

    # create model
    model_config = config["model_config"]
    lstm = LSTMStack(model_config["feat_dim"], model_config["hidden_size"],
                     model_config["num_layers"], model_config["dropout"], True)
    model = NnetAM(lstm, model_config["hidden_size"] * 2,
                   model_config["label_size"])

    # Start training
    th.backends.cudnn.enabled = True
    if th.cuda.is_available():
        model.cuda()

    # optimizer
    optimizer = th.optim.Adam(model.parameters(), lr=args.lr, amsgrad=True)

    if args.hvd:
        # Broadcast parameters and optimizer state from rank 0 to all other processes.
        hvd.broadcast_parameters(model.state_dict(), root_rank=0)
        hvd.broadcast_optimizer_state(optimizer, root_rank=0)

        # Add Horovod Distributed Optimizer
        optimizer = hvd.DistributedOptimizer(
            optimizer, named_parameters=model.named_parameters())

    # criterion
    criterion = nn.CrossEntropyLoss(ignore_index=-100)

    start_epoch = 0
    if args.resume_from_model:

        assert os.path.isfile(args.resume_from_model
                              ), "ERROR: model file {} does not exit!".format(
                                  args.resume_from_model)

        checkpoint = th.load(args.resume_from_model)
        state_dict = checkpoint['model']
        start_epoch = checkpoint['epoch']
        model.load_state_dict(state_dict)
        optimizer.load_state_dict(checkpoint['optimizer'])
        print("=> loaded checkpoint '{}' ".format(args.resume_from_model))

    model.train()
    for epoch in range(start_epoch, args.num_epochs):

        # anneal the learning rate
        if epoch > args.aneal_lr_epoch:
            for param_group in optimizer.param_groups:
                param_group['lr'] *= args.aneal_lr_ratio

        run_train_epoch(model, optimizer, criterion, train_dataloader, epoch,
                        args)

        # save model
        if not args.hvd or hvd.rank() == 0:
            checkpoint = {}
            checkpoint['model'] = model.state_dict()
            checkpoint['optimizer'] = optimizer.state_dict()
            checkpoint['epoch'] = epoch
            output_file = args.exp_dir + '/model.' + str(epoch) + '.tar'
            th.save(checkpoint, output_file)
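
run_train_epoch is not shown above, but the parser defines -max_grad_norm, and clipping gradients under hvd.DistributedOptimizer requires completing the allreduce first. A minimal sketch of a single training step under that assumption (the feature/label names are illustrative):

import torch as th

def train_step(model, optimizer, criterion, feats, labels, args):
    optimizer.zero_grad()
    loss = criterion(model(feats), labels)
    loss.backward()
    if args.hvd:
        # Finish the gradient allreduce before clipping, then step
        # without triggering a second synchronization.
        optimizer.synchronize()
        th.nn.utils.clip_grad_norm_(model.parameters(), args.max_grad_norm)
        with optimizer.skip_synchronize():
            optimizer.step()
    else:
        th.nn.utils.clip_grad_norm_(model.parameters(), args.max_grad_norm)
        optimizer.step()
    return loss.item()
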
Example #29
    def test_broadcast_state(self):
        hvd.init()

        N, D_in, H, D_out = 64, 100, 10, 10
        x = torch.randn(N, D_in).requires_grad_()
        y = torch.randn(N, D_out).requires_grad_()

        def new_optimizer(cls, opt_params, model):
            p = {
                k: v
                for k, v in opt_params.items()
                if k in inspect.getfullargspec(cls.__init__).args
            }
            return cls(model.parameters(), **p)

        def create_model(opt_class, opt_params):
            model = torch.nn.Sequential(
                torch.nn.Linear(D_in, H),
                torch.nn.ReLU(),
                torch.nn.Linear(H, D_out),
            )

            optimizer = new_optimizer(opt_class, opt_params, model)
            optimizer = hvd.DistributedOptimizer(
                optimizer, named_parameters=model.named_parameters())

            return model, optimizer

        def get_model_param_values(model):
            params = sorted(model.state_dict().items())
            return [(k, v.clone()) for k, v in params]

        def get_optimizer_param_values(optimizer):
            results = []
            state_dict = optimizer.state_dict()
            for group in state_dict['param_groups']:
                for param_id in group['params']:
                    if param_id not in state_dict['state']:
                        continue
                    params = sorted(state_dict['state'][param_id].items())
                    for k, v in params:
                        results.append(
                            (k, v.clone() if torch.is_tensor(v) else v))
            return results

        # L-BFGS is currently unsupported, as are sparse tensors, which are
        # required by SparseAdam optimizer
        optimizers = [
            (subclass.__name__, subclass)
            for subclass in torch.optim.Optimizer.__subclasses__()
            if subclass.__module__.startswith('torch.optim') and subclass !=
            torch.optim.LBFGS and subclass != torch.optim.SparseAdam
        ]
        optimizers.sort()

        opt_params_list = [
            dict(lr=0.2, momentum=0.9, weight_decay=0.1, centered=True),
            dict(lr=0.2)
        ]

        for (opt_name, opt_class), opt_params in itertools.product(
                optimizers, opt_params_list):
            model, optimizer = create_model(opt_class, opt_params)
            y_pred = model(x)
            loss = F.mse_loss(y_pred, y, reduction='sum')
            optimizer.zero_grad()
            loss.backward()
            optimizer.step()

            model_param_values = get_model_param_values(model)
            for name, model_param_value in model_param_values:
                hvd.broadcast_(model_param_value, root_rank=0)

            opt_param_values_updated = []
            opt_param_values = get_optimizer_param_values(optimizer)
            for name, opt_param_value in opt_param_values:
                is_tensor = torch.is_tensor(opt_param_value)
                if not is_tensor:
                    t = type(opt_param_value)
                    opt_param_value = torch.Tensor([opt_param_value])
                hvd.broadcast_(opt_param_value, root_rank=0)
                if not is_tensor:
                    opt_param_value = t(opt_param_value.cpu().numpy()[0])
                opt_param_values_updated.append((name, opt_param_value))
            opt_param_values = opt_param_values_updated

            if hvd.rank() == 0:
                state = {
                    'model': model.state_dict(),
                    'optimizer': optimizer.state_dict(),
                }
                _, fname = tempfile.mkstemp('.pt')
                torch.save(state, fname)

            model, optimizer = create_model(opt_class, opt_params)
            if hvd.rank() == 0:
                checkpoint = torch.load(fname)
                model.load_state_dict(checkpoint['model'])
                optimizer.load_state_dict(checkpoint['optimizer'])
                os.remove(fname)

            hvd.broadcast_parameters(model.state_dict(), root_rank=0)
            model_param_value_after = get_model_param_values(model)
            for before, after in zip(model_param_values,
                                     model_param_value_after):
                name, model_param_value = before
                name_after, model_param_value_after = after
                self.assertEqual(name, name_after)
                self.assertEqual(type(model_param_value),
                                 type(model_param_value_after))
                self.assertTrue(
                    (model_param_value == model_param_value_after).all())

            hvd.broadcast_optimizer_state(optimizer, root_rank=0)

            expected_tensors = 4
            if 'momentum' not in opt_params and opt_class == torch.optim.SGD:
                # SGD only maintains state when momentum is specified, otherwise
                # it does not populate the state dict, so it will contain no tensors.
                expected_tensors = 0
            self.assertEqual(len(optimizer.state_dict()['state'].values()),
                             expected_tensors)

            opt_param_values_after = get_optimizer_param_values(optimizer)
            for before, after in zip(opt_param_values, opt_param_values_after):
                name, opt_param_value = before
                name_after, opt_param_value_after = after
                self.assertEqual(name, name_after)
                self.assertEqual(type(opt_param_value),
                                 type(opt_param_value_after))
                if torch.is_tensor(opt_param_value):
                    self.assertTrue(
                        (opt_param_value == opt_param_value_after).all())
                else:
                    self.assertEqual(opt_param_value, opt_param_value_after)
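
The test above mirrors the usual resume-from-checkpoint flow: only rank 0 reads the file, then parameters and optimizer state are broadcast so every worker starts identically. A minimal sketch of that flow, assuming a checkpoint dict with 'model', 'optimizer', and 'epoch' keys:

import os
import torch
import horovod.torch as hvd

hvd.init()

model = torch.nn.Linear(100, 10)
optimizer = torch.optim.SGD(model.parameters(), lr=0.01)

start_epoch = 0
ckpt_path = 'checkpoint.pt'
if hvd.rank() == 0 and os.path.isfile(ckpt_path):
    checkpoint = torch.load(ckpt_path, map_location='cpu')
    model.load_state_dict(checkpoint['model'])
    optimizer.load_state_dict(checkpoint['optimizer'])
    start_epoch = checkpoint['epoch']

# Tensors go through the dedicated broadcast helpers; plain Python
# values such as the epoch counter go through broadcast_object.
hvd.broadcast_parameters(model.state_dict(), root_rank=0)
hvd.broadcast_optimizer_state(optimizer, root_rank=0)
start_epoch = hvd.broadcast_object(start_epoch, root_rank=0)
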
Example #30
    },
)
model = Model(
    embedding_table_shapes=EMBEDDING_TABLE_SHAPES_TUPLE,
    num_continuous=0,
    emb_dropout=0.0,
    layer_hidden_dims=[128, 128, 128],
    layer_dropout_rates=[0.0, 0.0, 0.0],
).cuda()

lr_scaler = hvd.size()

optimizer = torch.optim.Adam(model.parameters(), lr=0.01 * lr_scaler)

hvd.broadcast_parameters(model.state_dict(), root_rank=0)
hvd.broadcast_optimizer_state(optimizer, root_rank=0)

optimizer = hvd.DistributedOptimizer(optimizer,
                                     named_parameters=model.named_parameters())

for epoch in range(args.epochs):
    start = time()
    print(f"Training epoch {epoch}")
    train_loss, y_pred, y = process_epoch(train_loader,
                                          model,
                                          train=True,
                                          optimizer=optimizer)
    hvd.join(gpu_to_use)
    hvd.broadcast_parameters(model.state_dict(), root_rank=0)
    print(f"Epoch {epoch:02d}. Train loss: {train_loss:.4f}.")
    hvd.join(gpu_to_use)
def main_worker(args):
    global best_acc1

    if hvd.rank() == 0:
        print("=> creating model '{}'".format(args.arch))
    model = models.__dict__[args.arch]()

    # Move model to GPU.
    model.cuda()

    # Use CrossEntropyLoss with multi-class classification.
    criterion = nn.CrossEntropyLoss().cuda()

    # Default hyperparameters based on PyTorch Imagenet examples.
    optimizer = torch.optim.SGD(model.parameters(),
                                args.lr,
                                momentum=0.9,
                                weight_decay=1e-4)

    # Horovod: wrap optimizer with DistributedOptimizer.
    optimizer = hvd.DistributedOptimizer(
        optimizer,
        named_parameters=model.named_parameters(),
        compression=hvd.Compression.none)

    cudnn.benchmark = True
    traindir = os.path.join(args.data, 'train')
    valdir = os.path.join(args.data, 'val')

    # Default normalizations based on PyTorch Imagenet examples.
    normalize = transforms.Normalize(mean=[0.485, 0.456, 0.406],
                                     std=[0.229, 0.224, 0.225])

    # Default transforms based on PyTorch Imagenet examples.
    train_dataset = datasets.ImageFolder(
        traindir,
        transforms.Compose([
            transforms.RandomResizedCrop(224),
            transforms.RandomHorizontalFlip(),
            transforms.ToTensor(),
            normalize,
        ]))

    # The sampler defines the strategy to draw samples from the dataset. If specified, shuffle must be False.
    # Horovod: use DistributedSampler to partition data among workers. Manually specify
    # `num_replicas=hvd.size()` and `rank=hvd.rank()`.
    train_sampler = torch.utils.data.distributed.DistributedSampler(
        train_dataset, num_replicas=hvd.size(), rank=hvd.rank())

    train_loader = torch.utils.data.DataLoader(
        train_dataset,
        batch_size=args.bs,
        shuffle=(train_sampler is None),
        num_workers=args.num_loader_threads,
        pin_memory=True,
        sampler=train_sampler)

    val_loader = torch.utils.data.DataLoader(
        datasets.ImageFolder(
            valdir,
            transforms.Compose([
                transforms.Resize(256),
                transforms.CenterCrop(224),
                transforms.ToTensor(),
                normalize,
            ])),
        batch_size=args.bs,
        shuffle=False,
        num_workers=args.num_loader_threads,
        pin_memory=True)

    # Horovod: broadcast parameters & optimizer state.
    hvd.broadcast_parameters(model.state_dict(), root_rank=0)
    hvd.broadcast_optimizer_state(optimizer, root_rank=0)

    for epoch in range(0, args.epochs):

        train_sampler.set_epoch(epoch)
        adjust_learning_rate(optimizer, epoch, args)

        # Train the model.
        train(train_loader, model, criterion, optimizer, epoch, args)

        # Validate accuracy on the validation set.
        acc1 = validate(val_loader, model, criterion, args)

        # Remember best acc@1 and save best model.
        is_best = acc1 > best_acc1
        best_acc1 = max(acc1, best_acc1)

        # Horovod: only save model on rank 0
        if is_best and (hvd.rank() == 0):
            state = {
                'epoch': epoch + 1,
                'arch': args.arch,
                'state_dict': model.state_dict(),
                'best_acc1': best_acc1,
                'optimizer': optimizer.state_dict(),
            }
            torch.save(state, 'model_best.pth.tar')
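
adjust_learning_rate is not defined in this snippet; a minimal sketch of what such a helper commonly looks like (step decay every 30 epochs, as in the PyTorch ImageNet example). The decay schedule here is an assumption, not taken from the code above:

def adjust_learning_rate(optimizer, epoch, args):
    # Hypothetical helper: decay the base LR by 10x every 30 epochs.
    lr = args.lr * (0.1 ** (epoch // 30))
    for param_group in optimizer.param_groups:
        param_group['lr'] = lr
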