Example #1
def train_a2c(net, mb_obs, mb_rewards, mb_actions, mb_values, optimizer, tb_tracker, step_idx, device="cpu"):
    optimizer.zero_grad()
    mb_adv = mb_rewards - mb_values
    adv_v = torch.FloatTensor(mb_adv).to(device)
    obs_v = torch.FloatTensor(mb_obs).to(device)
    rewards_v = torch.FloatTensor(mb_rewards).to(device)
    actions_t = torch.LongTensor(mb_actions).to(device)
    logits_v, values_v = net(obs_v)
    log_prob_v = F.log_softmax(logits_v, dim=1)
    log_prob_actions_v = adv_v * log_prob_v[range(len(mb_actions)), actions_t]

    loss_policy_v = -log_prob_actions_v.mean()
    loss_value_v = F.mse_loss(values_v.squeeze(-1), rewards_v)

    prob_v = F.softmax(logits_v, dim=1)
    entropy_loss_v = (prob_v * log_prob_v).sum(dim=1).mean()
    loss_v = ENTROPY_BETA * entropy_loss_v + VALUE_LOSS_COEF * loss_value_v + loss_policy_v
    loss_v.backward()
    nn_utils.clip_grad_norm_(net.parameters(), CLIP_GRAD)
    optimizer.step()

    tb_tracker.track("advantage", mb_adv, step_idx)
    tb_tracker.track("values", values_v, step_idx)
    tb_tracker.track("batch_rewards", rewards_v, step_idx)
    tb_tracker.track("loss_entropy", entropy_loss_v, step_idx)
    tb_tracker.track("loss_policy", loss_policy_v, step_idx)
    tb_tracker.track("loss_value", loss_value_v, step_idx)
    tb_tracker.track("loss_total", loss_v, step_idx)
    return obs_v
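A minimal usage sketch for the function above; `net`, `tb_tracker`, the batch iterable, and the ENTROPY_BETA / VALUE_LOSS_COEF / CLIP_GRAD constants are assumed from the surrounding training script and are not part of the original example.

import torch

# Hedged usage sketch, not part of the original example.
optimizer = torch.optim.Adam(net.parameters(), lr=1e-3)
for step_idx, (mb_obs, mb_rewards, mb_actions, mb_values) in enumerate(batches):
    train_a2c(net, mb_obs, mb_rewards, mb_actions, mb_values,
              optimizer, tb_tracker, step_idx, device="cpu")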
Example #2
    def step(self, closure=None):
        """Performs a single optimization step.

        Arguments:
            closure (callable, optional): A closure that reevaluates the model
                and returns the loss.
        """
        loss = None
        if closure is not None:
            loss = closure()

        for group in self.param_groups:
            for p in group['params']:
                if p.grad is None:
                    continue
                grad = p.grad.data
                if grad.is_sparse:
                    raise RuntimeError('Adam does not support sparse gradients, please consider SparseAdam instead')

                state = self.state[p]

                # State initialization
                if len(state) == 0:
                    state['step'] = 0
                    # Exponential moving average of gradient values
                    state['exp_avg'] = torch.zeros_like(p.data)
                    # Exponential moving average of squared gradient values
                    state['exp_avg_sq'] = torch.zeros_like(p.data)

                exp_avg, exp_avg_sq = state['exp_avg'], state['exp_avg_sq']
                beta1, beta2 = group['b1'], group['b2']

                state['step'] += 1

                # Add grad clipping
                if group['max_grad_norm'] > 0:
                    clip_grad_norm_(p, group['max_grad_norm'])

                # Decay the first and second moment running average coefficient
                exp_avg.mul_(beta1).add_(1 - beta1, grad)
                exp_avg_sq.mul_(beta2).addcmul_(1 - beta2, grad, grad)
                denom = exp_avg_sq.sqrt().add_(group['e'])

                bias_correction1 = 1 - beta1 ** state['step']
                bias_correction2 = 1 - beta2 ** state['step']

                schedule_fct = SCHEDULES[group['schedule']]
                lr_scheduled = group['lr'] * schedule_fct(state['step']/group['t_total'], group['warmup'])
                step_size = lr_scheduled * math.sqrt(bias_correction2) / bias_correction1

                p.data.addcdiv_(-step_size, exp_avg, denom)

                # Add weight decay at the end (fixed version)
                if (len(p.size()) > 1 or group['vector_l2']) and group['l2'] > 0:
                    p.data.add_(-lr_scheduled * group['l2'], p.data)

        return loss
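For reference, a hedged sketch of the per-parameter-group hyperparameters this step() reads, plus an illustrative SCHEDULES mapping; the key names mirror the code above, but the defaults and schedule formulas are assumptions, not the library's actual values.

# Illustrative schedule functions, keyed the way step() looks them up.
SCHEDULES = {
    'none': lambda progress, warmup: 1.0,
    # linear warmup followed by linear decay over the remaining progress
    'warmup_linear': lambda progress, warmup: (
        progress / warmup if progress < warmup
        else max(0.0, (1.0 - progress) / (1.0 - warmup))
    ),
}

# Keys every param group must carry for the step() above to work.
defaults = dict(lr=6.25e-5, b1=0.9, b2=0.999, e=1e-8,
                schedule='warmup_linear', warmup=0.002, t_total=10000,
                l2=0.01, vector_l2=False, max_grad_norm=1.0)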
Example #3
    def on_grad_computed(self, _, named_parameters, **kwargs):
        if self.gradient_clip_value is None:
            return

        clip_grad_norm_(
            (p for _, p in named_parameters),
            max_norm=self.gradient_clip_value,
            norm_type=self.gradient_clip_norm_type,
        )
Example #4
def train(args, model, train_data_loader, dev_data_loader, accuracy, device):
    """
    Train the current model

    Keyword arguments:
    args: arguments 
    model: model to be trained
    train_data_loader: pytorch build-in data loader output for training examples
    dev_data_loader: pytorch build-in data loader output for dev examples
    accuracy: previous best accuracy
    device: cpu of gpu
    """

    model.train()
    optimizer = torch.optim.Adamax(model.parameters())
    criterion = nn.CrossEntropyLoss()
    print_loss_total = 0
    epoch_loss_total = 0
    start = time.time()

    #### modify the following code to complete the training function

    for idx, batch in enumerate(train_data_loader):
        question_text = batch['text'].to(device)
        question_len = batch['len']
        labels = batch['labels']

        #### Your code here
        # One possible completion (sketch): forward pass, loss, backward, step.
        # The model signature below is assumed from the batch fields.
        optimizer.zero_grad()
        logits = model(question_text, question_len)
        loss = criterion(logits, labels.to(device))
        loss.backward()

        clip_grad_norm_(model.parameters(), args.grad_clipping)
        optimizer.step()
        print_loss_total += loss.item()
        epoch_loss_total += loss.item()

        if idx % args.checkpoint == 0 and idx > 0:
            print_loss_avg = print_loss_total / args.checkpoint

            print('number of steps: %d, loss: %.5f time: %.5f' % (idx, print_loss_avg, time.time()- start))
            print_loss_total = 0
            curr_accuracy = evaluate(dev_data_loader, model, device)
            if accuracy < curr_accuracy:
                torch.save(model, args.save_model)
                accuracy = curr_accuracy
    return accuracy
Example #5
 def f():
     grad_norm = clip_grad_norm_(
         [p for p in net.parameters() if p.requires_grad], clip_grad)
     grad_norm = grad_norm.item()
     if max_grad is not None and grad_norm >= max_grad:
         print('WARNING: Exploding Gradients {:.2f}'.format(grad_norm))
         grad_norm = max_grad
     grad_log = {}
     grad_log['grad_norm'] = grad_norm
     return grad_log
Example #6
    def step(self):
        """Update the model parameters based on current gradients.

        Optionally, will employ gradient modification or update learning
        rate.
        """
        learning_rate = self.learning_rate()
        if self._with_fp16_wrapper:
            if hasattr(self._optimizer, "update_master_grads"):
                self._optimizer.update_master_grads()
            if hasattr(self._optimizer, "clip_master_grads") and \
               self._max_grad_norm > 0:
                self._optimizer.clip_master_grads(self._max_grad_norm)
        for group in self._optimizer.param_groups:
            group['lr'] = learning_rate
            if not self._with_fp16_wrapper and self._max_grad_norm > 0:
                clip_grad_norm_(group['params'], self._max_grad_norm)
        self._optimizer.step()
        self._decay_step += 1
        self._training_step += 1
Example #7
 def f():
     grad_log = {}
     for n, m in agent.named_children():
         tot_grad = 0
         for p in m.parameters():
             if p.grad is not None:
                 tot_grad += p.grad.norm(2) ** 2
         tot_grad = tot_grad ** (1/2)
         grad_log['grad_norm'+n] = tot_grad.item()
     grad_norm = clip_grad_norm_(
         [p for p in params if p.requires_grad], clip_grad)
     grad_norm = grad_norm.item()
     if max_grad is not None and grad_norm >= max_grad:
         print('WARNING: Exploding Gradients {:.2f}'.format(grad_norm))
         grad_norm = max_grad
     grad_log['grad_norm'] = grad_norm
     return grad_log
Example #8
    def train(self, model, optimizer, scheduler, data_loader, device, writer,
              args):
        ''' Train one epoch
        '''
        model.train()
        clip = args.get('grad_clip', 50.0)
        log_interval = args.get('log_interval', 10)
        rank = args.get('rank', 0)
        accum_grad = args.get('accum_grad', 1)
        is_distributed = args.get('is_distributed', True)
        logging.info('using accumulated grad, new batch size is {} times '
                     'larger than before'.format(accum_grad))
        num_seen_utts = 0
        num_total_batch = len(data_loader)
        for batch_idx, batch in enumerate(data_loader):
            key, feats, target, feats_lengths, target_lengths = batch
            feats = feats.to(device)
            target = target.to(device)
            feats_lengths = feats_lengths.to(device)
            target_lengths = target_lengths.to(device)
            num_utts = target_lengths.size(0)
            if num_utts == 0:
                continue
            context = None
            # Disable gradient synchronizations across DDP processes.
            # Within this context, gradients will be accumulated on module
            # variables, which will later be synchronized.
            if is_distributed and batch_idx % accum_grad != 0:
                context = model.no_sync  # DDP context manager that pauses gradient sync
            # Used for single gpu training and DDP gradient synchronization
            # processes.
            else:
                context = nullcontext
            with context():
                loss, loss_att, loss_ctc = model(feats,
                                                 feats_lengths,
                                                 target,
                                                 target_lengths)
                loss = loss / accum_grad
                loss.backward()

            num_seen_utts += num_utts
            if batch_idx % accum_grad == 0:
                if rank == 0 and writer is not None:
                    writer.add_scalar('train_loss', loss, self.step)
                grad_norm = clip_grad_norm_(model.parameters(), clip)
                if torch.isfinite(grad_norm):
                    optimizer.step()
                optimizer.zero_grad()
                scheduler.step()
                self.step += 1

            if batch_idx % log_interval == 0:
                lr = optimizer.param_groups[0]['lr']
                log_str = 'TRAIN Batch {}/{} loss {:.6f} '.format(
                    batch_idx, num_total_batch,
                    loss.item() * accum_grad)
                if loss_att is not None:
                    log_str += 'loss_att {:.6f} '.format(loss_att.item())
                if loss_ctc is not None:
                    log_str += 'loss_ctc {:.6f} '.format(loss_ctc.item())
                log_str += 'lr {:.8f} rank {}'.format(lr, rank)
                logging.debug(log_str)
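The no_sync pattern above can be distilled into a few lines; a minimal sketch, assuming `model` is wrapped in torch.nn.parallel.DistributedDataParallel and `accum` is the accumulation factor.

from contextlib import nullcontext

for i, (x, y) in enumerate(loader):
    # Skip the gradient all-reduce except on iterations that will step.
    ctx = model.no_sync() if (i + 1) % accum != 0 else nullcontext()
    with ctx:
        loss = criterion(model(x), y) / accum
        loss.backward()
    if (i + 1) % accum == 0:
        clip_grad_norm_(model.parameters(), max_norm=5.0)
        optimizer.step()
        optimizer.zero_grad()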
Example #9
    def step(self, closure=None):
        """Performs a single optimization step.

        Arguments:
            closure (callable, optional): A closure that reevaluates the model
                and returns the loss.
        """
        loss = None
        if closure is not None:
            loss = closure()

        for group in self.param_groups:  # group: dict with 11 keys: 'params' (list of 151 tensors), 'lr', 'schedule', ...
            for p in group['params']:  # p: Parameter, e.g. Size([40737, 768]) on one of the 151 iterations
                if p.grad is None:
                    continue
                grad = p.grad.data
                if grad.is_sparse:
                    raise RuntimeError(
                        'Adam does not support sparse gradients, please consider SparseAdam instead'
                    )

                state = self.state[p]  # state: 1st iteration: {},

                # State initialization
                if len(state) == 0:
                    state['step'] = 0
                    # Exponential moving average of gradient values
                    state['exp_avg'] = torch.zeros_like(p.data)
                    # Exponential moving average of squared gradient values
                    state['exp_avg_sq'] = torch.zeros_like(p.data)

                exp_avg, exp_avg_sq = state['exp_avg'], state['exp_avg_sq']
                beta1, beta2 = group['b1'], group['b2']

                state['step'] += 1

                # Add grad clipping
                if group['max_grad_norm'] > 0:
                    clip_grad_norm_(p, group['max_grad_norm'])

                # Decay the first and second moment running average coefficient
                exp_avg.mul_(beta1).add_(1 - beta1, grad)
                exp_avg_sq.mul_(beta2).addcmul_(1 - beta2, grad, grad)
                denom = exp_avg_sq.sqrt().add_(group['e'])

                bias_correction1 = 1 - beta1**state['step']
                bias_correction2 = 1 - beta2**state['step']

                schedule_fct = SCHEDULES[group['schedule']]
                lr_scheduled = group['lr'] * schedule_fct(
                    state['step'] / group['t_total'], group['warmup'])
                step_size = lr_scheduled * math.sqrt(
                    bias_correction2) / bias_correction1

                p.data.addcdiv_(-step_size, exp_avg, denom)

                # Add weight decay at the end (fixed version)
                if (len(p.size()) > 1
                        or group['vector_l2']) and group['l2'] > 0:
                    p.data.add_(-lr_scheduled * group['l2'], p.data)

        return loss
Example #10
    def step(self, closure=None):
        """Performs a single optimization step.

        Arguments:
            closure (callable, optional): A closure that reevaluates the model
                and returns the loss.
        """
        loss = None
        if closure is not None:
            loss = closure()

        warned_for_t_total = False

        for group in self.param_groups:
            for p in group["params"]:
                if p.grad is None:
                    continue
                grad = p.grad.data
                if grad.is_sparse:
                    raise RuntimeError(
                        "Adam does not support sparse gradients, please consider SparseAdam instead"
                    )

                state = self.state[p]

                # State initialization
                if len(state) == 0:
                    state["step"] = 0
                    # Exponential moving average of gradient values
                    state["next_m"] = torch.zeros_like(p.data)
                    # Exponential moving average of squared gradient values
                    state["next_v"] = torch.zeros_like(p.data)

                next_m, next_v = state["next_m"], state["next_v"]
                beta1, beta2 = group["b1"], group["b2"]

                # Add grad clipping
                if group["max_grad_norm"] > 0:
                    clip_grad_norm_(p, group["max_grad_norm"])

                # Decay the first and second moment running average coefficient
                # In-place operations to update the averages at the same time
                next_m.mul_(beta1).add_(1 - beta1, grad)
                next_v.mul_(beta2).addcmul_(1 - beta2, grad, grad)
                update = next_m / (next_v.sqrt() + group["e"])

                # Just adding the square of the weights to the loss function is *not*
                # the correct way of using L2 regularization/weight decay with Adam,
                # since that will interact with the m and v parameters in strange ways.
                #
                # Instead we want to decay the weights in a manner that doesn't interact
                # with the m/v parameters. This is equivalent to adding the square
                # of the weights to the loss with plain (non-momentum) SGD.
                if group["weight_decay"] > 0.0:
                    update += group["weight_decay"] * p.data

                if group["t_total"] != -1:
                    schedule_fct = SCHEDULES[group["schedule"]]
                    progress = state["step"] / group["t_total"]
                    lr_scheduled = group["lr"] * schedule_fct(
                        progress, group["warmup"])
                    # warning for exceeding t_total (only active with warmup_linear)
                    if (group["schedule"] == "warmup_linear" and progress > 1.0
                            and not warned_for_t_total):
                        logger.warning(
                            "Training beyond specified 't_total' steps with schedule '{}'. Learning rate set to {}. "
                            "Please set 't_total' of {} correctly.".format(
                                group["schedule"], lr_scheduled,
                                self.__class__.__name__))
                        warned_for_t_total = True
                    # end warning
                else:
                    lr_scheduled = group["lr"]

                update_with_lr = lr_scheduled * update
                p.data.add_(-update_with_lr)

                state["step"] += 1

                # step_size = lr_scheduled * math.sqrt(bias_correction2) / bias_correction1
                # No bias correction
                # bias_correction1 = 1 - beta1 ** state['step']
                # bias_correction2 = 1 - beta2 ** state['step']

        return loss
Example #11
    def update_model(self, experience: TensorTuple,
                     epsilon: float) -> TensorTuple:
        """Update PPO actor and critic networks"""
        states, actions, rewards, values, log_probs, next_state, masks = experience
        next_state = numpy2floattensor(next_state, self.device)
        with torch.no_grad():
            next_value = self.critic(next_state)

        returns = ppo_utils.compute_gae(
            next_value,
            rewards,
            masks,
            values,
            self.hyper_params.gamma,
            self.hyper_params.tau,
        )

        states = torch.cat(states)
        actions = torch.cat(actions)
        returns = torch.cat(returns).detach()
        values = torch.cat(values).detach()
        log_probs = torch.cat(log_probs).detach()
        advantages = (returns - values).detach()

        if self.hyper_params.standardize_advantage:
            advantages = (advantages - advantages.mean()) / (advantages.std() +
                                                             1e-7)

        actor_losses, critic_losses, total_losses = [], [], []

        for (
                state,
                action,
                old_value,
                old_log_prob,
                return_,
                adv,
                _,
        ) in ppo_utils.ppo_iter(
                self.hyper_params.epoch,
                self.hyper_params.batch_size,
                states,
                actions,
                values,
                log_probs,
                returns,
                advantages,
        ):
            gradient_clip_ac = self.hyper_params.gradient_clip_ac
            gradient_clip_cr = self.hyper_params.gradient_clip_cr
            w_value = self.hyper_params.w_value

            # critic_loss
            value = self.critic(state)
            if self.hyper_params.use_clipped_value_loss:
                value_pred_clipped = old_value + torch.clamp(
                    (value - old_value), -epsilon, epsilon)
                value_loss_clipped = (return_ - value_pred_clipped).pow(2)
                value_loss = (return_ - value).pow(2)
                critic_loss = 0.5 * torch.max(value_loss,
                                              value_loss_clipped).mean()
            else:
                critic_loss = 0.5 * (return_ - value).pow(2).mean()
            critic_loss_ = w_value * critic_loss

            # train critic
            self.critic_optim.zero_grad()
            critic_loss_.backward()
            clip_grad_norm_(self.critic.parameters(), gradient_clip_cr)
            self.critic_optim.step()

            # calculate ratios
            _, dist = self.actor(state)
            log_prob = dist.log_prob(action)
            ratio = (log_prob - old_log_prob).exp()

            # actor_loss
            surr_loss = ratio * adv
            clipped_surr_loss = torch.clamp(ratio, 1.0 - epsilon,
                                            1.0 + epsilon) * adv
            actor_loss = -torch.min(surr_loss, clipped_surr_loss).mean()

            # entropy
            entropy = dist.entropy().mean()
            w_entropy = self.hyper_params.w_entropy
            actor_loss_ = actor_loss - w_entropy * entropy

            # train actor
            self.actor_optim.zero_grad()
            actor_loss_.backward()
            clip_grad_norm_(self.actor.parameters(), gradient_clip_ac)
            self.actor_optim.step()

            # total_loss
            total_loss = critic_loss_ + actor_loss_

            actor_losses.append(actor_loss.item())
            critic_losses.append(critic_loss.item())
            total_losses.append(total_loss.item())

        actor_loss = sum(actor_losses) / len(actor_losses)
        critic_loss = sum(critic_losses) / len(critic_losses)
        total_loss = sum(total_losses) / len(total_losses)

        return actor_loss, critic_loss, total_loss
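ppo_utils.compute_gae is external to this snippet; below is a minimal sketch of generalized advantage estimation under the usual recursion, assuming per-step lists of rewards, masks, and values (the real helper may differ in tensor shapes and details).

def compute_gae(next_value, rewards, masks, values, gamma, tau):
    values = list(values) + [next_value]
    gae, returns = 0, []
    for t in reversed(range(len(rewards))):
        # TD residual: r_t + gamma * V(s_{t+1}) * mask_t - V(s_t)
        delta = rewards[t] + gamma * values[t + 1] * masks[t] - values[t]
        gae = delta + gamma * tau * masks[t] * gae
        returns.insert(0, gae + values[t])
    return returns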
Example #12
def train_one_epoch(model,
                    optimizer,
                    train_loader,
                    model_func,
                    lr_scheduler,
                    accumulated_iter,
                    optim_cfg,
                    rank,
                    tbar,
                    total_it_each_epoch,
                    dataloader_iter,
                    tb_log=None,
                    leave_pbar=False):
    if total_it_each_epoch == len(train_loader):
        dataloader_iter = iter(train_loader)

    if rank == 0:
        pbar = tqdm.tqdm(total=total_it_each_epoch,
                         leave=leave_pbar,
                         desc='train',
                         dynamic_ncols=True)

    for cur_it in range(total_it_each_epoch):
        try:
            batch = next(dataloader_iter)
        except StopIteration:
            dataloader_iter = iter(train_loader)
            batch = next(dataloader_iter)
            print('new iters')

        lr_scheduler.step(accumulated_iter)

        try:
            cur_lr = float(optimizer.lr)
        except AttributeError:
            cur_lr = optimizer.param_groups[0]['lr']

        if tb_log is not None:
            tb_log.add_scalar('learning_rate', cur_lr, accumulated_iter)

        model.train()
        optimizer.zero_grad()

        loss, tb_dict, disp_dict = model_func(model, batch)

        loss.backward()
        clip_grad_norm_(model.parameters(), optim_cfg.GRAD_NORM_CLIP)
        optimizer.step()

        accumulated_iter += 1
        disp_dict.update({'loss': loss.item(), 'lr': cur_lr})

        # log to console and tensorboard
        if rank == 0:
            pbar.update()
            pbar.set_postfix(dict(total_it=accumulated_iter))
            tbar.set_postfix(disp_dict)
            tbar.refresh()

            if tb_log is not None:
                tb_log.add_scalar('train_loss', loss, accumulated_iter)
                tb_log.add_scalar('learning_rate', cur_lr, accumulated_iter)
                for key, val in tb_dict.items():
                    tb_log.add_scalar('train_' + key, val, accumulated_iter)
    if rank == 0:
        pbar.close()
    return accumulated_iter
Example #13
def main(args):
    run_name = args.run_name
    gpus = args.gpu
    save_step = args.save_step
    epochs = args.epochs
    dataset_dir = args.dataset_dir
    checkpoint_dir = args.checkpoint_path
    grad_clip = args.gradient_clip
    resume = args.resume

    # optimizer related args
    learning_rate = args.learning_rate
    scheduler_step = args.scheduler_step
    scheduler_gamma = args.scheduler_gamma
    scheduler_end = args.scheduler_end

    writer = SummaryWriter(comment='/runs/{}'.format(run_name))

    latest_checkpoint_name = '{}-latest.ckpt'.format(run_name)
    latest_checkpoint_path = path.join(checkpoint_dir, latest_checkpoint_name)

    ##################################
    # -- setup dataloader / variables
    if gpus is not None:
        device = torch.device('cuda:{}'.format(gpus))
    else:
        device = torch.device('cpu')

    # Setup default values
    # TODO: setup model
    model = FooModel().to(device)
    model.train()
    optimizer = optim.Adam(model.parameters(), lr=learning_rate)
    scheduler = optim.lr_scheduler.StepLR(optimizer,
                                            step_size=scheduler_step,
                                            gamma=scheduler_gamma)
    total_step = 0
    epoch = 0

    # load from previous checkpoint if exists
    if path.exists(latest_checkpoint_path) and resume:
        checkpoint = CheckPoint.load(latest_checkpoint_path, device)
        model.load_state_dict(checkpoint['model'])
        optimizer.load_state_dict(checkpoint['optimizer'])
        scheduler.load_state_dict(checkpoint['scheduler'])
        epoch = checkpoint['epoch']
        total_step = checkpoint['total_step']

    #################################
    # -- setup datasets
    # TODO: setup dataset
    dataset = BarDataset()
    dataloader = DataLoader(dataset)

    #####################
    # -- Actual training
    for epoch in range(epochs):
        for i, data in enumerate(dataloader):
            if(total_step < scheduler_end):
                scheduler.step()

            # TODO: get loss somehow using model
            loss = model(data)  # placeholder: replace with the real loss computation

            message = '[Training] Step: {:06d}, Loss: {:.04f}'
            logging.info(message.format(total_step, loss.item()))

            # reset optimizer (and clear out gradients to be applied)
            optimizer.zero_grad()

            # compute gradient
            loss.backward()

            # clip gradient if grad_clip is given
            if(grad_clip):
                utils.clip_grad_norm_(model.parameters(), grad_clip)

            # update optimizer (and actually apply gradients)
            optimizer.step()
            total_step += 1

            # write to tensorboard
            writer.add_scalar('data/loss', loss, total_step)

            # -- save the run every some time
            if((total_step) % save_step == 0):
                checkpoint_name = '{}-{}.ckpt'.format(run_name, total_step)
                checkpoint_path = path.join(checkpoint_dir, checkpoint_name)
                CheckPoint.save(checkpoint_path, model, optimizer, scheduler, total_step, epoch)
                CheckPoint.save(latest_checkpoint_path, model, optimizer, scheduler, total_step, epoch)

                # write histogram (optional, NOT recommended)
                for name, param in model.named_parameters():
                    writer.add_histogram(name, param.clone().cpu().data.numpy(), total_step)
    writer.close()
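The CheckPoint helper is not shown in this example; here is a hedged sketch consistent with the save/load call sites above (the class in the source repository may differ).

import torch

class CheckPoint:
    @staticmethod
    def save(path, model, optimizer, scheduler, total_step, epoch):
        torch.save({'model': model.state_dict(),
                    'optimizer': optimizer.state_dict(),
                    'scheduler': scheduler.state_dict(),
                    'total_step': total_step,
                    'epoch': epoch}, path)

    @staticmethod
    def load(path, device):
        # returns the dict whose keys the caller above indexes into
        return torch.load(path, map_location=device)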
Example #14
    def train(self):
        start_t = time.time()
        print(f'Training started at {datetime.now()}')
        print(f'Total number of batches: {len(self.data_loader_train)}')

        best_valid_loss, best_train_epoch_loss, best_roc_auc = 10, 10, 0
        best_step_train_loss, best_step_valid_loss, best_step_valid_roc = 0, 0, 0
        drop_counter = 0
        loss_fn = self.model.loss()

        for epoch in range(self.config.num_epochs):
            epoch_loss = 0
            self.model.train()
            ctr = 0
            for ctr, (audio, target,
                      fname) in enumerate(self.data_loader_train):
                #ctr += 1
                drop_counter += 1
                audio = audio.to(self.device)
                target = target.to(self.device)

                # Time-frequency transform
                if self.transforms is not None:
                    audio = self.transforms(audio)

                # predict
                out = self.model(audio)
                loss = loss_fn(out, target)

                # back propagation
                self.optimizer.zero_grad()
                loss.backward()
                if self.config.clip_grad > 0:
                    clip_grad_norm_(self.model.parameters(),
                                    self.config.clip_grad)
                self.optimizer.step()

                epoch_loss += loss.item()

                # print log
                if (ctr) % self.config.print_every == 0:
                    print(
                        "[%s] Epoch [%d/%d] Iter [%d/%d] train loss: %.4f Elapsed: %s"
                        % (datetime.now().strftime('%Y-%m-%d %H:%M:%S'),
                           epoch + 1, self.config.num_epochs, ctr,
                           len(self.data_loader_train), loss.item(),
                           timedelta(seconds=time.time() - start_t)))

                if self.writer is not None:
                    step = epoch * len(self.data_loader_train) + ctr
                    self.writer.add_scalar('loss', loss.item(), step)
                    self.writer.add_scalar(
                        'learning_rate', self.optimizer.param_groups[0]['lr'],
                        step)
                    self.writer.add_scalar(
                        'grad_norm', utils.grad_norm(self.model.parameters()),
                        step)

            del audio, target
            epoch_loss = epoch_loss / len(self.data_loader_train)

            # validation
            valid_loss, scores, y_true, y_pred = self._validation(
                start_t, epoch)
            if self.scheduler is not None:
                if self.config.scheduler == 'plateau':
                    self.scheduler.step(valid_loss)
                else:
                    self.scheduler.step()

            # Log validation
            if self.writer is not None:
                step = epoch * len(self.data_loader_train) + ctr
                self.writer.add_scalar('valid_loss', valid_loss, step)
                self.writer.add_scalar('valid_roc_auc_macro',
                                       scores['roc_auc_macro'], step)
                if not self.config.debug_mode:
                    self.writer.add_figure(
                        'valid_class',
                        utils.compare_predictions(y_true,
                                                  y_pred,
                                                  filepath=None), step)

            # Save model, with respect to validation loss
            if valid_loss < best_valid_loss:
                # print('best model: %4f' % valid_loss)
                best_step_valid_loss = drop_counter
                best_valid_loss = valid_loss
                torch.save(
                    self.model.state_dict(),
                    os.path.join(self.config.checkpoint_dir,
                                 'best_model_valid_loss.pth'))

            # Save model, with respect to validation roc_auc
            if scores['roc_auc_macro'] > best_roc_auc:
                best_step_valid_roc = drop_counter
                best_roc_auc = scores['roc_auc_macro']
                torch.save(
                    self.model.state_dict(),
                    os.path.join(self.config.checkpoint_dir,
                                 'best_model_valid_roc.pth'))

            # Save best model according to training loss
            if epoch_loss < best_train_epoch_loss:
                best_step_train_loss = drop_counter
                best_train_epoch_loss = epoch_loss
                torch.save(
                    self.model.state_dict(),
                    os.path.join(self.config.checkpoint_dir,
                                 'best_model_train.pth'))

        print("{} Training finished. -----------------------  Elapsed: {}".
              format(datetime.now().strftime('%Y-%m-%d %H:%M:%S'),
                     timedelta(seconds=time.time() - start_t)))
        print(
            "Best step (validation loss) = {} . ".format(best_step_valid_loss))
        print("Best step (validation roc_auc) = {} .".format(
            best_step_valid_roc))
        print("Best step (training loss) = {} .".format(best_step_train_loss))

        # Save last model
        torch.save(
            self.model.state_dict(),
            os.path.join(self.config.checkpoint_dir, 'best_model_final.pth'))
Example #15
    def step(self, closure=None):

        loss = None
        if closure is not None:
            loss = closure()

        for group in self.param_groups:

            for p in group['params']:
                if p.grad is None:
                    continue

                # Add grad clipping
                if group['max_grad_norm'] > 0:
                    clip_grad_norm_(p, group['max_grad_norm'])

                grad = p.grad.data.float()
                if grad.is_sparse:
                    raise RuntimeError(
                        'RAdam does not support sparse gradients')

                p_data_fp32 = p.data.float()

                state = self.state[p]

                if len(state) == 0:
                    state['step'] = 0
                    state['exp_avg'] = torch.zeros_like(p_data_fp32)
                    state['exp_avg_sq'] = torch.zeros_like(p_data_fp32)
                else:
                    state['exp_avg'] = state['exp_avg'].type_as(p_data_fp32)
                    state['exp_avg_sq'] = state['exp_avg_sq'].type_as(
                        p_data_fp32)

                exp_avg, exp_avg_sq = state['exp_avg'], state['exp_avg_sq']
                beta1, beta2 = group['betas']

                exp_avg_sq.mul_(beta2).addcmul_(1 - beta2, grad, grad)
                exp_avg.mul_(beta1).add_(1 - beta1, grad)

                state['step'] += 1
                buffered = group['buffer'][int(state['step'] % 10)]
                if state['step'] == buffered[0]:
                    N_sma, step_size = buffered[1], buffered[2]
                else:
                    buffered[0] = state['step']
                    beta2_t = beta2**state['step']
                    N_sma_max = 2 / (1 - beta2) - 1
                    N_sma = N_sma_max - 2 * state['step'] * beta2_t / (1 -
                                                                       beta2_t)
                    buffered[1] = N_sma

                    # more conservative since it's an approximated value
                    if N_sma >= 5:
                        step_size = math.sqrt(
                            (1 - beta2_t) * (N_sma - 4) / (N_sma_max - 4) *
                            (N_sma - 2) / N_sma * N_sma_max /
                            (N_sma_max - 2)) / (1 - beta1**state['step'])
                    elif self.degenerated_to_sgd:
                        step_size = 1.0 / (1 - beta1**state['step'])
                    else:
                        step_size = -1
                    buffered[2] = step_size

                # more conservative since it's an approximated value
                if N_sma >= 5:
                    if group['weight_decay'] != 0:
                        p_data_fp32.add_(-group['weight_decay'] * group['lr'],
                                         p_data_fp32)
                    denom = exp_avg_sq.sqrt().add_(group['eps'])
                    p_data_fp32.addcdiv_(-step_size * group['lr'], exp_avg,
                                         denom)
                    p.data.copy_(p_data_fp32)
                elif step_size > 0:
                    if group['weight_decay'] != 0:
                        p_data_fp32.add_(-group['weight_decay'] * group['lr'],
                                         p_data_fp32)
                    p_data_fp32.add_(-step_size * group['lr'], exp_avg)
                    p.data.copy_(p_data_fp32)

        return loss
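A hedged sketch of the constructor this RAdam step() implies, inferred from the group keys it reads ('lr', 'betas', 'eps', 'weight_decay', 'max_grad_norm', 'buffer') and from self.degenerated_to_sgd; actual RAdam implementations may differ.

import torch

class RAdam(torch.optim.Optimizer):
    def __init__(self, params, lr=1e-3, betas=(0.9, 0.999), eps=1e-8,
                 weight_decay=0, max_grad_norm=0, degenerated_to_sgd=True):
        self.degenerated_to_sgd = degenerated_to_sgd
        defaults = dict(lr=lr, betas=betas, eps=eps,
                        weight_decay=weight_decay, max_grad_norm=max_grad_norm,
                        # per-group cache indexed by step % 10 in step() above
                        buffer=[[None, None, None] for _ in range(10)])
        super().__init__(params, defaults)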
Example #16
def train(train_loader, model, optimizer, lr_scheduler, tb_writer):
    '''
    :param train_loader:
    :param model:
    :param optimizer:
    :param lr_scheduler:
    :param tb_writer:
    :return:
    '''
    cur_lr = lr_scheduler.get_cur_lr()  # get the current learning rate
    rank = get_rank()

    average_meter = AverageMeter()

    def is_valid_number(x):
        return not (math.isnan(x) or math.isinf(x) or x > 1e4)

    world_size = get_world_size()
    num_per_epoch = len(train_loader.dataset) // \
        cfg.TRAIN.EPOCH // (cfg.TRAIN.BATCH_SIZE * world_size)
    start_epoch = cfg.TRAIN.START_EPOCH
    epoch = start_epoch

    if not os.path.exists(cfg.TRAIN.SNAPSHOT_DIR) and \
            get_rank() == 0:
        os.makedirs(cfg.TRAIN.SNAPSHOT_DIR)

    logger.info("model\n{}".format(describe(model.module)))  #打印模型
    end = time.time()
    for idx, data in enumerate(train_loader):

        if epoch != idx // num_per_epoch + start_epoch:  # save a checkpoint at each epoch boundary
            epoch = idx // num_per_epoch + start_epoch

            if get_rank() == 0:
                torch.save(
                    {
                        'epoch': epoch,
                        'state_dict': model.module.state_dict(),
                        'optimizer': optimizer.state_dict()
                    },
                    cfg.TRAIN.SNAPSHOT_DIR + '/checkpoint_e%d.pth' % (epoch))

            if epoch == cfg.TRAIN.EPOCH:
                return

            # once the configured epoch is reached, start fine-tuning the last
            # 3 layers of the backbone: rebuild the optimizer/scheduler to reset
            # which parameters are trainable, which stay frozen, and the
            # learning-rate factors
            if cfg.BACKBONE.TRAIN_EPOCH == epoch:
                logger.info('start training backbone.')
                optimizer, lr_scheduler = build_opt_lr(model.module, epoch)
                logger.info("model\n{}".format(describe(model.module)))

            lr_scheduler.step(epoch)
            cur_lr = lr_scheduler.get_cur_lr()
            logger.info('epoch: {}'.format(epoch + 1))

        tb_idx = idx  # tensorboard index
        if idx % num_per_epoch == 0 and idx != 0:
            # log each param group's learning rate to tensorboard
            # (separate loop variable so the outer `idx` is not shadowed)
            for pg_idx, pg in enumerate(optimizer.param_groups):
                logger.info('epoch {} lr {}'.format(epoch + 1, pg['lr']))
                if rank == 0:
                    tb_writer.add_scalar('lr/group{}'.format(pg_idx + 1),
                                         pg['lr'], tb_idx)

        data_time = average_reduce(time.time() - end)
        if rank == 0:
            tb_writer.add_scalar('time/data', data_time, tb_idx)

        outputs = model(data)
        loss = outputs['total_loss']

        if is_valid_number(loss.data.item()):  # filter out nan, inf, and losses > 1e4
            optimizer.zero_grad()
            loss.backward()
            reduce_gradients(model)  # all-reduce gradients across workers

            if rank == 0 and cfg.TRAIN.LOG_GRADS:  # monitor gradient statistics
                log_grads(model.module, tb_writer, tb_idx)

            # clip gradient
            clip_grad_norm_(model.parameters(), cfg.TRAIN.GRAD_CLIP)
            optimizer.step()

        batch_time = time.time() - end
        batch_info = {}
        batch_info['batch_time'] = average_reduce(batch_time)
        batch_info['data_time'] = average_reduce(data_time)
        for k, v in sorted(outputs.items()):
            batch_info[k] = average_reduce(v.data.item())

        average_meter.update(**batch_info)

        if rank == 0:
            for k, v in batch_info.items():
                tb_writer.add_scalar(k, v, tb_idx)

            if (idx + 1) % cfg.TRAIN.PRINT_FREQ == 0:
                info = "Epoch: [{}][{}/{}] lr: {:.6f}\n".format(
                    epoch + 1, (idx + 1) % num_per_epoch, num_per_epoch,
                    cur_lr)
                for cc, (k, v) in enumerate(batch_info.items()):
                    if cc % 2 == 0:
                        info += ("\t{:s}\t").format(getattr(average_meter, k))
                    else:
                        info += ("{:s}\n").format(getattr(average_meter, k))
                logger.info(info)
                print_speed(idx + 1 + start_epoch * num_per_epoch,
                            average_meter.batch_time.avg,
                            cfg.TRAIN.EPOCH * num_per_epoch)
        end = time.time()
Example #17
    def run_train(self, train_data, dev_data):
        self.print_all_model_parameters()

        if self.optim is None:
            self.optim = optim.Adam(filter(lambda p: p.requires_grad,
                                           self.parameters()),
                                    lr=self.learning_rate)

        # Track dev metrics changes
        best_dev_metrics = 0
        dev_metrics_history = []

        for epoch_id in range(self.start_epoch, self.num_epochs):
            print('Epoch {}'.format(epoch_id))
            if self.rl_variation_tag.startswith('rs'):
                # Reward shaping module sanity check:
                #   Make sure the reward shaping module output value is in the correct range
                train_scores = self.test_fn(train_data)
                dev_scores = self.test_fn(dev_data)
                print('Train set average fact score: {}'.format(
                    float(train_scores.mean())))
                print('Dev set average fact score: {}'.format(
                    float(dev_scores.mean())))

            # Update model parameters
            self.train()
            if self.rl_variation_tag.startswith('rs'):
                self.fn.eval()
                self.fn_kg.eval()
                if self.model.endswith('hypere'):
                    self.fn_secondary_kg.eval()
            self.batch_size = self.train_batch_size
            random.shuffle(train_data)
            batch_losses = []
            entropies = []
            if self.run_analysis:
                rewards = None
                fns = None
            for example_id in tqdm(range(0, len(train_data), self.batch_size)):

                self.optim.zero_grad()

                mini_batch = train_data[example_id:example_id +
                                        self.batch_size]
                if len(mini_batch) < self.batch_size:
                    continue
                loss = self.loss(mini_batch)
                loss['model_loss'].backward()
                if self.grad_norm > 0:
                    clip_grad_norm_(self.parameters(), self.grad_norm)

                self.optim.step()

                batch_losses.append(loss['print_loss'])
                if 'entropy' in loss:
                    entropies.append(loss['entropy'])
                if self.run_analysis:
                    if rewards is None:
                        rewards = loss['reward']
                    else:
                        rewards = torch.cat([rewards, loss['reward']])
                    if fns is None:
                        fns = loss['fn']
                    else:
                        fns = torch.cat([fns, loss['fn']])
            # Check training statistics
            stdout_msg = 'Epoch {}: average training loss = {}'.format(
                epoch_id, np.mean(batch_losses))
            if entropies:
                stdout_msg += ' entropy = {}'.format(np.mean(entropies))
            print(stdout_msg)
            self.save_checkpoint(checkpoint_id=epoch_id, epoch_id=epoch_id)
            if self.run_analysis:
                print('* Analysis: # path types seen = {}'.format(
                    self.num_path_types))
                num_hits = float(rewards.sum())
                hit_ratio = num_hits / len(rewards)
                print('* Analysis: # hits = {} ({})'.format(
                    num_hits, hit_ratio))
                num_fns = float(fns.sum())
                fn_ratio = num_fns / len(fns)
                print('* Analysis: false negative ratio = {}'.format(fn_ratio))

            # Check dev set performance
            if self.run_analysis or (epoch_id > 0
                                     and epoch_id % self.num_peek_epochs == 0):
                self.eval()
                self.batch_size = self.dev_batch_size
                dev_scores = self.forward(dev_data, verbose=False)
                print('Dev set performance: (correct evaluation)')
                _, _, _, _, mrr = src.eval.hits_and_ranks(dev_data,
                                                          dev_scores,
                                                          self.kg.dev_objects,
                                                          verbose=True)
                metrics = mrr
                print('Dev set performance: (include test set labels)')
                src.eval.hits_and_ranks(dev_data,
                                        dev_scores,
                                        self.kg.all_objects,
                                        verbose=True)
                # Action dropout annealing
                if self.model.startswith('point'):
                    eta = self.action_dropout_anneal_interval
                    if len(dev_metrics_history) > eta and metrics < min(
                            dev_metrics_history[-eta:]):
                        old_action_dropout_rate = self.action_dropout_rate
                        self.action_dropout_rate *= self.action_dropout_anneal_factor
                        print(
                            'Decreasing action dropout rate: {} -> {}'.format(
                                old_action_dropout_rate,
                                self.action_dropout_rate))
                # Save checkpoint
                if metrics > best_dev_metrics:
                    self.save_checkpoint(checkpoint_id=epoch_id,
                                         epoch_id=epoch_id,
                                         is_best=True)
                    best_dev_metrics = metrics
                    with open(
                            os.path.join(self.model_dir,
                                         'best_dev_iteration.dat'),
                            'w') as o_f:
                        o_f.write('{}'.format(epoch_id))
                else:
                    # Early stopping
                    if epoch_id >= self.num_wait_epochs and metrics < np.mean(
                            dev_metrics_history[-self.num_wait_epochs:]):
                        break
                dev_metrics_history.append(metrics)
                if self.run_analysis:
                    num_path_types_file = os.path.join(self.model_dir,
                                                       'num_path_types.dat')
                    dev_metrics_file = os.path.join(self.model_dir,
                                                    'dev_metrics.dat')
                    hit_ratio_file = os.path.join(self.model_dir,
                                                  'hit_ratio.dat')
                    fn_ratio_file = os.path.join(self.model_dir,
                                                 'fn_ratio.dat')
                    if epoch_id == 0:
                        with open(num_path_types_file, 'w') as o_f:
                            o_f.write('{}\n'.format(self.num_path_types))
                        with open(dev_metrics_file, 'w') as o_f:
                            o_f.write('{}\n'.format(metrics))
                        with open(hit_ratio_file, 'w') as o_f:
                            o_f.write('{}\n'.format(hit_ratio))
                        with open(fn_ratio_file, 'w') as o_f:
                            o_f.write('{}\n'.format(fn_ratio))
                    else:
                        with open(num_path_types_file, 'a') as o_f:
                            o_f.write('{}\n'.format(self.num_path_types))
                        with open(dev_metrics_file, 'a') as o_f:
                            o_f.write('{}\n'.format(metrics))
                        with open(hit_ratio_file, 'a') as o_f:
                            o_f.write('{}\n'.format(hit_ratio))
                        with open(fn_ratio_file, 'a') as o_f:
                            o_f.write('{}\n'.format(fn_ratio))
Example #18
    def train(engine, mini_batch):
        # You have to reset the gradients of all model parameters
        # before taking another step of gradient descent.
        engine.model.train()
        if engine.state.iteration % engine.config.iteration_per_update == 1 or \
            engine.config.iteration_per_update == 1:
            if engine.state.iteration > 1:
                engine.optimizer.zero_grad()

        device = next(engine.model.parameters()).device
        mini_batch.src = (mini_batch.src[0].to(device), mini_batch.src[1])
        mini_batch.tgt = (mini_batch.tgt[0].to(device), mini_batch.tgt[1])

        # Raw target variable has both BOS and EOS token.
        # The output of sequence-to-sequence does not have BOS token.
        # Thus, remove BOS token for reference.
        x, y = mini_batch.src, mini_batch.tgt[0][:, 1:]
        # |x| = (batch_size, length)
        # |y| = (batch_size, length)

        with autocast(not engine.config.off_autocast):
            # Run the feed-forward pass.
            # As before, the decoder input does not include the EOS token,
            # so remove the EOS token from the decoder input.
            y_hat = engine.model(x, mini_batch.tgt[0][:, :-1])
            # |y_hat| = (batch_size, length, output_size)

            loss = engine.crit(y_hat.contiguous().view(-1, y_hat.size(-1)),
                               y.contiguous().view(-1))
            backward_target = loss.div(y.size(0)).div(
                engine.config.iteration_per_update)

        if engine.config.gpu_id >= 0 and not engine.config.off_autocast:
            engine.scaler.scale(backward_target).backward()
        else:
            backward_target.backward()

        word_count = int(mini_batch.tgt[1].sum())
        p_norm = float(get_parameter_norm(engine.model.parameters()))
        g_norm = float(get_grad_norm(engine.model.parameters()))

        if engine.state.iteration % engine.config.iteration_per_update == 0 and \
            engine.state.iteration > 0:
            # In order to avoid gradient explosion, we apply gradient clipping.
            torch_utils.clip_grad_norm_(
                engine.model.parameters(),
                engine.config.max_grad_norm,
            )
            # Take a step of gradient descent.
            if engine.config.gpu_id >= 0 and not engine.config.off_autocast:
                # Use scaler instead of engine.optimizer.step() if using GPU.
                engine.scaler.step(engine.optimizer)
                engine.scaler.update()
            else:
                engine.optimizer.step()

        loss = float(loss / word_count)
        ppl = np.exp(loss)

        return {
            'loss': loss,
            'ppl': ppl,
            '|param|': p_norm if not np.isnan(p_norm) and not np.isinf(p_norm) else 0.,
            '|g_param|': g_norm if not np.isnan(g_norm) and not np.isinf(g_norm) else 0.,
        }
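Note that the loop above clips gradients that are still multiplied by the GradScaler's scale factor. The pattern recommended by the PyTorch AMP docs unscales first; a minimal sketch with generic model/optimizer/criterion names:

import torch

scaler = torch.cuda.amp.GradScaler()
for x, y in loader:
    optimizer.zero_grad()
    with torch.cuda.amp.autocast():
        loss = criterion(model(x), y)
    scaler.scale(loss).backward()
    scaler.unscale_(optimizer)               # bring grads back to fp32 scale
    clip_grad_norm_(model.parameters(), max_norm=5.0)
    scaler.step(optimizer)                   # skipped internally on overflow
    scaler.update()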
Example #19
def train(model, train_loader, criterion, scheduler, optimizer, epoch, params,
          args):
    start = time.time()
    total_loss = []

    model.train()
    model.is_training = True
    model.freeze_bn()

    pbar = tqdm(train_loader, desc='==> Train', position=1)
    idx = 0
    for (images, targets) in pbar:
        images = images.to(args.device).float()
        targets = targets.to(args.device)

        if args.mixup:
            images, targets_a, targets_b, lam = mixup_data(
                images, targets, args.alpha, use_cuda=args.is_cuda)

        regression, classification, anchors = model(images)

        if args.mixup:
            cls_loss, reg_loss = mixup_criterion(images, regression,
                                                 classification, anchors,
                                                 targets_a, targets_b, lam)
        else:
            cls_loss, reg_loss = criterion(classification, regression, anchors,
                                           targets)

        # print(cls_loss, reg_loss)
        cls_loss = cls_loss.mean()
        reg_loss = reg_loss.mean()
        loss = cls_loss + reg_loss
        if loss == 0 or not torch.isfinite(loss):
            print('loss is zero or non-finite, skipping batch')
            continue

        loss.backward()
        total_loss.append(loss.item())
        mean_loss = np.mean(total_loss)
        if (idx + 1) % args.grad_accum_steps == 0:
            clip_grad_norm_(model.parameters(), args.max_grad_norm)
            optimizer.step()
            # zero the accumulated gradients only after the step is applied
            optimizer.zero_grad()

        iter_step(epoch, mean_loss, cls_loss, reg_loss, optimizer, params,
                  args)
        idx += 1
        pbar.update()
        pbar.set_postfix({
            'Cls_loss': cls_loss.item(),
            'Reg_loss': reg_loss.item(),
            'Mean_loss': mean_loss,
        })
        # pbar.set_description()

    # end of training epoch
    scheduler.step(mean_loss)
    # result = {'time': time.time()-start, 'loss': mean_loss}
    # for key, value in result.items():
    #     print('    {:15s}: {}'.format(str(key), value))

    return mean_loss
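One detail the loop above omits: when accumulating gradients it is common to divide the loss by the number of accumulation steps so the effective gradient is the average over the accumulated mini-batches. A hedged sketch with a generic compute_loss:

for idx, (images, targets) in enumerate(loader):
    loss = compute_loss(model(images), targets) / args.grad_accum_steps
    loss.backward()
    if (idx + 1) % args.grad_accum_steps == 0:
        clip_grad_norm_(model.parameters(), args.max_grad_norm)
        optimizer.step()
        optimizer.zero_grad()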
Example #20
                log_prob_actions_v = adv_v * log_prob_v[range(BATCH_SIZE), actions_t]
                loss_policy_v = -log_prob_actions_v.mean()

                prob_v = F.softmax(logits_v, dim=1)
                entropy_loss_v = ENTROPY_BETA * (prob_v * log_prob_v).sum(dim=1).mean()

                # calculate policy gradients only
                loss_policy_v.backward(retain_graph=True)
                grads = np.concatenate([p.grad.data.cpu().numpy().flatten()
                                        for p in net.parameters()
                                        if p.grad is not None])

                # apply entropy and value gradients
                loss_v = entropy_loss_v + loss_value_v
                loss_v.backward()
                nn_utils.clip_grad_norm_(net.parameters(), CLIP_GRAD)
                optimizer.step()
                # get full loss
                loss_v += loss_policy_v

                tb_tracker.track("advantage",       adv_v, step_idx)
                tb_tracker.track("values",          value_v, step_idx)
                tb_tracker.track("batch_rewards",   vals_ref_v, step_idx)
                tb_tracker.track("loss_entropy",    entropy_loss_v, step_idx)
                tb_tracker.track("loss_policy",     loss_policy_v, step_idx)
                tb_tracker.track("loss_value",      loss_value_v, step_idx)
                tb_tracker.track("loss_total",      loss_v, step_idx)
                tb_tracker.track("grad_l2",         np.sqrt(np.mean(np.square(grads))), step_idx)
                tb_tracker.track("grad_max",        np.max(np.abs(grads)), step_idx)
                tb_tracker.track("grad_var",        np.var(grads), step_idx)
Example #21
def main(args):

    aa = AskingAgent(args)

    if args['user_type'] == 'oracle':
        user = NoisyUser(args)
    elif args['user_type'] == 'persona':
        user = PersonaUser(aa, args)
    else:
        print('no user type implemented')

    device = torch.device('cuda') if args['cuda'] else torch.device('cpu')
    print(device)

    writer = SummaryWriter(
        os.path.join(args['tensorboard_dir'],
                     args['comment'] + '_' + args['flavor']))
    writer.add_text('Args', args['comment'] + ' ' + str(args) + '\n')
    save_path = args['checkpoint_path']

    #==========loading data =============
    policynet = Policy(args)

    print('policy network model: ')
    print(policynet.model)
    writer.add_text('model', str(policynet.model))
    optimizer = optim.Adam(policynet.model.parameters(), lr=args['lr'])

    ftparams = []
    if args['ft_tag']:
        ftparams += [aa.tagweight, aa.tagbias, aa.lmda]
    if args['ft_emb']:
        if args['ft_rnn']:
            for m in aa.model.modules():
                if isinstance(m, nn.Dropout):
                    m.p = args['dropout']
                if isinstance(m, SRU):
                    m.dropout = args['dropout']
            ftparams += get_params(aa.model)
        else:
            ftparams += [aa.embedweight]
    if args['ft_emb'] or args['ft_tag']:
        print('Finetuning turned on ')
        nnoptimizer = optim.Adam(ftparams, lr=args['ft_lr'])
    else:
        nnoptimizer = None

    for episode in range(1, args['episodes']):
        if episode % (args['test_every']) == 0:
            batch = aa.testdata()
            mode = 'test'
            policynet.model.eval()
            aa.model.eval()

        elif episode % args['eval_every'] == 0:
            batch = aa.valdata()
            mode = 'val'
            policynet.model.eval()
            aa.model.eval()

        else:
            batch = aa.sampletrain(args['batch_size'])
            mode = 'train'
            policynet.model.train()
            aa.model.train()

        batch_s = len(batch[0])
        rank_batch, p_fx_batch, _ = infogain_rollout(batch, aa, user, args,
                                                     mode)

        action_batch = []
        logp_batch = []
        for cnt in range(1, len(p_fx_batch) + 1):
            p_f_x = p_fx_batch[cnt - 1]

            if not args['ft_tag'] and not args['ft_emb']:
                p_f_x = p_f_x.detach()

            if cnt == args['max_step']:
                action = np.zeros(batch_s)
                log_pact = torch.zeros(batch_s).to(device)
            else:
                state = policynet.get_state(p_f_x, cnt)
                action, log_pact, _ = policynet.select_action(state)

            action_batch.append(action)
            logp_batch.append(log_pact)

        rewards, logp_bs, scalars = reprocess_withmask(action_batch,
                                                       rank_batch, logp_batch,
                                                       device, args)

        if mode == 'train':
            if nnoptimizer:
                nnoptimizer.zero_grad()

            scalars = policynet.update_policy(optimizer, rewards, logp_bs,
                                              scalars)

            if nnoptimizer:
                print('finetuning')
                clip_grad_norm_(
                    [p for p in aa.model.parameters() if p.requires_grad], 3.0)
                nnoptimizer.step()

            if args['ft_tag']:
                aa.tag_inference()
                #print('w: {:.3f}, b: {:.3f}, lmd: {:.3f}'.format(aa.tagweight.item(), aa.tagbias.item(), aa.lmda.item()))
                #writer.add_scalar('tagmodel/weight', aa.tagweight.item(), episode) #*args['batch_size'])
                #writer.add_scalar('tagmodel/bias', aa.tagbias.item(), episode) #*args['batch_size'])
                writer.add_scalar('tagmodel/lmda', aa.lmda.item(),
                                  episode)  #*args['batch_size'])
                writer.add_scalar('tagmodel/weight', aa.tagweight.data.norm(),
                                  episode)  #*args['batch_size'])
                writer.add_scalar('tagmodel/bias', aa.tagbias.data.norm(),
                                  episode)  #*args['batch_size'])
            if args['ft_emb']:
                writer.add_scalar('tagmodel/embweight',
                                  aa.embedweight.data.norm(),
                                  episode)  #*args['batch_size'])
                if args['ft_rnn']:
                    writer.add_scalar('rnn-parameter/rnn_param_norm',
                                      compute_param_norm(aa.model), episode)
                    writer.add_scalar('rnn-parameter/rnn_grad_norm',
                                      compute_grad_norm(aa.model), episode)

        if writer is not None:
            for name, value in scalars:
                writer.add_scalar(mode + name, value,
                                  episode)  #*args['batch_size'])

        if episode % args['print_every'] == 0:
            print(mode)
            print('Step: {:,} '.format(episode * args['batch_size']) +
                  ' '.join([
                      '{} = {:.3f}'.format(name, value)
                      for name, value in scalars
                  ]))

        if episode % args['save_every'] == 0:
            torch.save(
                aa.state_dict(),
                args['checkpoint_dir'] + '/' + args['flavor'] + '_aa.pt')
            save_path = save_checkpoint(policynet.model,
                                        optimizer,
                                        episode,
                                        episode * args['batch_size'],
                                        dict(scalars)['/suc_rate'],
                                        args,
                                        prev_save_path=save_path)
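
Note: the training step above drives two optimizers from one backward pass. A minimal sketch of that pattern with hypothetical toy modules (not the project's real models):

import torch
import torch.nn as nn
from torch.nn.utils import clip_grad_norm_

policy = nn.Linear(8, 2)    # stands in for policynet
encoder = nn.Linear(8, 8)   # stands in for the finetuned aa.model
opt_policy = torch.optim.Adam(policy.parameters(), lr=1e-3)
opt_ft = torch.optim.Adam(encoder.parameters(), lr=1e-4)

loss = policy(encoder(torch.randn(4, 8))).sum()
opt_policy.zero_grad()
opt_ft.zero_grad()
loss.backward()
# clip only the finetuned parameters, as the example does
clip_grad_norm_([p for p in encoder.parameters() if p.requires_grad], 3.0)
opt_policy.step()
opt_ft.step()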
Example #22
for epoch in range(num_epochs):
    # Set initial hidden and cell states
    states = (torch.zeros(num_layers, batch_size, hidden_size).to(device),
              torch.zeros(num_layers, batch_size, hidden_size).to(device))

    for i in range(0, ids.size(1) - seq_length, seq_length):
        # Get mini-batch inputs and targets
        inputs = ids[:, i:i+seq_length].to(device)
        targets = ids[:, (i+1):(i+1)+seq_length].to(device)
        
        # Forward pass
        states = detach(states)
        outputs, states = model(inputs, states)
        loss = criterion(outputs, targets.reshape(-1))
        
        # Backward and optimize
        model.zero_grad()
        loss.backward()
        clip_grad_norm_(model.parameters(), 0.5)
        optimizer.step()

        step = (i+1) // seq_length
        if step % 100 == 0:
            print('Epoch [{}/{}], Step[{}/{}], Loss: {:.4f}, Perplexity: {:5.2f}'
                  .format(epoch+1, num_epochs, step, num_batches, loss.item(), np.exp(loss.item())))

# Test the model
with torch.no_grad():
    with open('sample.txt', 'w') as f:
        # Set initial hidden and cell states
        state = (torch.zeros(num_layers, 1, hidden_size).to(device),
                 torch.zeros(num_layers, 1, hidden_size).to(device))

        # Select one word id randomly
Example #23
def main(gpu):
    params = configreader.get_config(
        "./common/config/hyperparams.yaml")["pong"]
    params["device"] = f"cuda:{gpu}"
    params["train_freq"] = 4
    params["batch_size"] *= params["train_freq"]
    init_logger(params)

    env = gym.make(params["env_name"])
    env = ptan.common.wrappers.wrap_dqn(env)

    net = neuralnetworks.DQN(env.observation_space.shape, env.action_space.n)
    net = net.to(params["device"])

    wandb.watch(net)

    tgt_net = ptan.agent.TargetNet(net)
    selector = ptan.actions.EpsilonGreedyActionSelector(
        epsilon=params["epsilon_start"])
    epsilon_tracker = trackers.EpsilonTracker(selector, params)
    agent = ptan.agent.DQNAgent(net, selector, device=params["device"])

    exp_source = ptan.experience.ExperienceSourceFirstLast(
        env, agent, gamma=params["gamma"], steps_count=1)
    buffer = ptan.experience.ExperienceReplayBuffer(
        exp_source, buffer_size=params["replay_size"])
    optimizer = optim.Adam(net.parameters(),
                           lr=params["learning_rate"],
                           **params["optim_params"])

    frame_idx = 0

    with trackers.RewardTracker(params["stop_reward"]) as reward_tracker:
        while True:
            frame_idx += params["train_freq"]
            buffer.populate(params["train_freq"])
            epsilon_tracker.frame(frame_idx)

            new_rewards = exp_source.pop_total_rewards()

            if new_rewards:
                if reward_tracker.reward(new_rewards[0], frame_idx,
                                         selector.epsilon):
                    break

            if len(buffer) < params["replay_initial"]:
                continue

            optimizer.zero_grad()
            batch = buffer.sample(params["batch_size"])
            loss = losses.calc_loss_dqn(
                batch,
                net,
                tgt_net.target_model,
                gamma=params["gamma"],
                device=params["device"],
            )
            loss.backward()
            clip_grad_norm_(net.parameters(), params["gradient_clip"])
            optimizer.step()

            if frame_idx % params["target_net_sync"] < params["train_freq"]:
                tgt_net.sync()
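
Note: the sync test uses `frame_idx % params["target_net_sync"] < params["train_freq"]` rather than `== 0` because `frame_idx` advances `train_freq` frames at a time, so an exact multiple could be stepped over when `train_freq` does not divide the sync period. A standalone sketch with assumed values:

train_freq, target_net_sync = 4, 1000
for frame_idx in range(train_freq, 3000, train_freq):
    if frame_idx % target_net_sync < train_freq:
        print("sync at frame", frame_idx)  # fires once per ~1000 frames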
Example #24
def train(model,
          dataset,
          model_dir,
          summary_writer,
          epochs,
          lr,
          conf_thres,
          nms_thres,
          iou_thres,
          lambda_coord=5,
          lambda_no_obj=0.5,
          gradient_accumulations=2,
          clip_gradients=False,
          limit=None,
          debug=False,
          print_every=10,
          save_every=None,
          log_to_neptune=False):
    if log_to_neptune:
        env_path = Path(os.environ['HOME'], 'workspace/setup-box/neptune.env')
        load_dotenv(dotenv_path=env_path)

        neptune.init('petersiemen/sandbox',
                     api_token=os.getenv("NEPTUNE_API_TOKEN"))

    total = limit if limit is not None else len(dataset)

    logger.info(
        f'Start training on {total} images. Using lr: {lr}, '
        f'lambda_coord: {lambda_coord}, lambda_no_obj: {lambda_no_obj}, '
        f'conf_thres: {conf_thres}, nms_thres: {nms_thres}, iou_thres: {iou_thres}, '
        f'gradient_accumulations: {gradient_accumulations}, '
        f'clip_gradients: {clip_gradients}')
    metrics = Metrics()

    model.to(DEVICE)
    model.train()

    optimizer = torch.optim.Adam(model.get_trainable_parameters(), lr=lr)
    grid_sizes = model.grid_sizes

    data_loader = DataLoader(dataset,
                             batch_size=dataset.batch_size,
                             shuffle=True,
                             collate_fn=dataset.collate_fn)
    class_names = model.class_names

    for epoch in range(1, epochs + 1):
        for batch_i, (images, ground_truth_boxes,
                      image_paths) in tqdm(enumerate(data_loader),
                                           total=total):
            if len(images) != dataset.batch_size:
                logger.warning(
                    f"Skipping batch {batch_i} because it does not have correct size ({dataset.batch_size})"
                )
                continue

            images = images.to(DEVICE)

            coordinates, class_scores, confidence = model(images)

            obj_mask, noobj_mask, cls_mask, target_coordinates, target_confidence, target_class_scores = build_targets(
                coordinates, class_scores, ground_truth_boxes, grid_sizes)
            yolo_loss = YoloLoss(coordinates,
                                 confidence,
                                 class_scores,
                                 obj_mask,
                                 noobj_mask,
                                 cls_mask,
                                 target_coordinates,
                                 target_confidence,
                                 target_class_scores,
                                 lambda_coord=lambda_coord,
                                 lambda_no_obj=lambda_no_obj)

            class_scores = torch.sigmoid(class_scores)
            prediction = torch.cat(
                (coordinates, confidence.unsqueeze(-1), class_scores), -1)

            detections = non_max_suppression(prediction=prediction,
                                             conf_thres=conf_thres,
                                             nms_thres=nms_thres)

            ground_truth_map_objects = list(
                GroundTruth.from_ground_truths(image_paths,
                                               ground_truth_boxes))
            detection_map_objects = list(
                Detection.from_detections(image_paths, detections))

            metrics.add_detections_for_batch(detection_map_objects,
                                             ground_truth_map_objects,
                                             iou_thres=iou_thres)

            if debug:
                plot_batch(detections, ground_truth_boxes, images, class_names)

            loss = yolo_loss.get()
            # backward pass to calculate the weight gradients
            loss.backward()

            if clip_gradients:
                logger.debug("Clipping gradients with max_norm = 1")
                clip_grad_norm_(model.parameters(), max_norm=1)

            if batch_i % print_every == 0:  # log every print_every batches
                yolo_loss.capture(summary_writer, batch_i, during='train')
                #plot_weights_and_gradients(model, summary_writer, epoch * batch_i)
                log_performance(epoch, epochs, batch_i, total, yolo_loss,
                                metrics, class_names, summary_writer,
                                log_to_neptune)

            # Accumulates gradient before each step
            if batch_i % gradient_accumulations == 0:
                logger.debug(
                    f"Updating weights for batch {batch_i} (gradient_accumulations :{gradient_accumulations})"
                )
                # update the weights
                optimizer.step()
                # zero the parameter (weight) gradients
                optimizer.zero_grad()

            del images
            del ground_truth_boxes

            if limit is not None and batch_i + 1 >= limit:
                logger.info(
                    'Stop here after training {} batches (limit: {})'.format(
                        batch_i, limit))
                log_performance(epoch, epochs, batch_i, total, yolo_loss,
                                metrics, class_names, summary_writer,
                                log_to_neptune)
                save_model(model_dir, model, epoch, batch_i)
                return

            if save_every is not None and batch_i % save_every == 0:
                save_model(model_dir, model, epoch, batch_i)

        # save model after every epoch
        save_model(model_dir, model, epoch, None)
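
Note: the loop above applies the optimizer only every `gradient_accumulations` batches. A minimal gradient-accumulation sketch with a hypothetical model (a common variant that also scales the loss and clips just before the step):

import torch
import torch.nn as nn
from torch.nn.utils import clip_grad_norm_

model = nn.Linear(10, 1)
optimizer = torch.optim.Adam(model.parameters(), lr=1e-3)
accum = 2  # assumed accumulation factor

for batch_i in range(8):
    x, y = torch.randn(4, 10), torch.randn(4, 1)
    loss = nn.functional.mse_loss(model(x), y) / accum  # scale so summed grads match
    loss.backward()
    if (batch_i + 1) % accum == 0:
        clip_grad_norm_(model.parameters(), max_norm=1.0)
        optimizer.step()
        optimizer.zero_grad()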
Example #25
def f_step(config,
           vocab,
           model_F,
           model_D,
           optimizer_F,
           batch,
           temperature,
           drop_decay,
           cyc_rec_enable=True):
    model_D.eval()

    pad_idx = vocab.stoi['<pad>']
    eos_idx = vocab.stoi['<eos>']
    unk_idx = vocab.stoi['<unk>']
    vocab_size = len(vocab)
    loss_fn = nn.NLLLoss(reduction='none')

    inp_tokens, inp_lengths, raw_styles = batch_preprocess(
        batch, pad_idx, eos_idx)
    rev_styles = 1 - raw_styles
    batch_size = inp_tokens.size(0)
    token_mask = (inp_tokens != pad_idx).float()

    optimizer_F.zero_grad()

    # self reconstruction loss

    noise_inp_tokens = word_dropout(  #word_drop(
        inp_tokens,
        inp_lengths,
        config.inp_drop_prob * drop_decay,
        unk_idx  #vocab
    )
    noise_inp_lengths = get_lengths(noise_inp_tokens, eos_idx)

    slf_log_probs = model_F(
        noise_inp_tokens,
        inp_tokens,
        noise_inp_lengths,
        raw_styles,
        generate=False,
        differentiable_decode=False,
        temperature=temperature,
    )

    slf_rec_loss = loss_fn(slf_log_probs.transpose(1, 2),
                           inp_tokens) * token_mask
    slf_rec_loss = slf_rec_loss.sum() / batch_size
    slf_rec_loss *= config.slf_factor

    slf_rec_loss.backward()

    # cycle consistency loss

    if not cyc_rec_enable:
        optimizer_F.step()
        model_D.train()
        return slf_rec_loss.item(), 0, 0

    gen_log_probs = model_F(
        inp_tokens,
        None,
        inp_lengths,
        rev_styles,
        generate=True,
        differentiable_decode=True,
        temperature=temperature,
    )

    gen_soft_tokens = gen_log_probs.exp()
    gen_lengths = get_lengths(gen_soft_tokens.argmax(-1), eos_idx)

    cyc_log_probs = model_F(
        gen_soft_tokens,
        inp_tokens,
        gen_lengths,
        raw_styles,
        generate=False,
        differentiable_decode=False,
        temperature=temperature,
    )

    cyc_rec_loss = loss_fn(cyc_log_probs.transpose(1, 2),
                           inp_tokens) * token_mask
    cyc_rec_loss = cyc_rec_loss.sum() / batch_size
    cyc_rec_loss *= config.cyc_factor

    # style consistency loss

    adv_log_probs = model_D(gen_soft_tokens, gen_lengths, rev_styles)
    if config.discriminator_method == 'Multi':
        adv_labels = rev_styles + 1
    else:
        adv_labels = torch.ones_like(rev_styles)
    adv_loss = loss_fn(adv_log_probs, adv_labels)
    adv_loss = adv_loss.sum() / batch_size
    adv_loss *= config.adv_factor

    (cyc_rec_loss + adv_loss).backward()

    # update parameters

    clip_grad_norm_(model_F.parameters(), 5)
    optimizer_F.step()

    model_D.train()

    return slf_rec_loss.item(), cyc_rec_loss.item(), adv_loss.item()
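
Note: the reconstruction losses above rely on a per-token NLL that is masked at padding positions. A small self-contained sketch of that computation with assumed toy shapes:

import torch
import torch.nn as nn

loss_fn = nn.NLLLoss(reduction='none')
log_probs = torch.log_softmax(torch.randn(2, 5, 10), dim=-1)  # (batch, seq, vocab)
targets = torch.randint(0, 10, (2, 5))
token_mask = torch.ones(2, 5)  # 0.0 at pad positions in real data
per_token = loss_fn(log_probs.transpose(1, 2), targets) * token_mask
loss = per_token.sum() / 2  # divide by batch size, as above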
Example #26
    def train(self):
        self.model.cuda().train()
        for iter_idx in range(self.start_iter, self.max_iter):
            self.cur_epoch = int(float(iter_idx + 1) / self.epoch_iters)
            self.cur_iter = iter_idx
            inputs = self.get_batch('train')
            loss = self.forward(inputs)

            self.backward(loss)
            if self.config.trainer.clip_gradient > 0:
                clip_grad_norm_(self.model.parameters(),
                                self.config.trainer.clip_gradient)
            self.update()

            if iter_idx % self.config.trainer.print_freq == 0 and self.rank == 0:
                self.tb_logger.add_scalar('loss_train',
                                          self.metrics['losses'].avg, iter_idx)
                self.tb_logger.add_scalar('lr',
                                          self.lr_scheduler.get_lr()[0],
                                          iter_idx)
                log_formatter = get_log_format(self.multi_class)
                if self.multi_class:
                    self.tb_logger.add_scalar('mAP_train',
                                              self.metrics['mAP'].avg,
                                              iter_idx)
                    self.logger.info(
                        log_formatter.format(
                            iter_idx,
                            self.max_iter,
                            self.cur_epoch + 1,
                            self.config.trainer.epochs,
                            batch_time=self.metrics['batch_time'],
                            data_time=self.metrics['data_time'],
                            loss=self.metrics['losses'],
                            mAP=self.metrics['mAP'],
                            lr=self.lr_scheduler.get_lr()[0]))
                else:
                    self.tb_logger.add_scalar('acc1_train',
                                              self.metrics['top1'].avg,
                                              iter_idx)
                    self.tb_logger.add_scalar('acc5_train',
                                              self.metrics['top5'].avg,
                                              iter_idx)
                    self.logger.info(
                        log_formatter.format(
                            iter_idx,
                            self.max_iter,
                            self.cur_epoch + 1,
                            self.config.trainer.epochs,
                            batch_time=self.metrics['batch_time'],
                            data_time=self.metrics['data_time'],
                            loss=self.metrics['losses'],
                            top1=self.metrics['top1'],
                            top5=self.metrics['top5'],
                            lr=self.lr_scheduler.get_lr()[0]))

            if (iter_idx == self.max_iter - 1) or (iter_idx % self.epoch_iters == 0 and iter_idx > 0 and \
                    self.cur_epoch % self.config.trainer.eval_freq == 0):
                metric = self.evaluate()

                if self.rank == 0 and self.tb_logger is not None:
                    self.tb_logger.add_scalar('loss_val', metric.loss,
                                              iter_idx)
                    if self.multi_class:
                        self.tb_logger.add_scalar('mAP_val', metric.top1,
                                                  iter_idx)
                    else:
                        self.tb_logger.add_scalar('acc1_val', metric.top1,
                                                  iter_idx)
                        self.tb_logger.add_scalar('acc5_val', metric.top5,
                                                  iter_idx)

                if self.rank == 0:
                    # remember best prec@1 and save checkpoint
                    is_best = metric.top1 > self.best_prec1
                    self.best_prec1 = max(metric.top1, self.best_prec1)
                    self.save_checkpoint(
                        {
                            'epoch': self.cur_epoch,
                            'optimizer': self.optimizer.state_dict(),
                            'model': self.model.state_dict(),
                            'lr_scheduler': self.lr_scheduler.state_dict(),
                            'best_prec1': self.best_prec1
                        }, is_best)

                if self.multi_class:
                    self.logger.info(' * Best mAP {:.3f}'.format(
                        self.best_prec1))
                else:
                    self.logger.info(' * Best Prec@1 {:.3f}'.format(
                        self.best_prec1))

            end = time.time()
Example #27
def d_step(config, vocab, model_F, model_D, optimizer_D, batch, temperature):
    model_F.eval()
    pad_idx = vocab.stoi['<pad>']
    eos_idx = vocab.stoi['<eos>']
    vocab_size = len(vocab)
    loss_fn = nn.NLLLoss(reduction='none')

    inp_tokens, inp_lengths, raw_styles = batch_preprocess(
        batch, pad_idx, eos_idx)
    rev_styles = 1 - raw_styles
    batch_size = inp_tokens.size(0)

    with torch.no_grad():
        raw_gen_log_probs = model_F(
            inp_tokens,
            None,
            inp_lengths,
            raw_styles,
            generate=True,
            differentiable_decode=True,
            temperature=temperature,
        )
        rev_gen_log_probs = model_F(
            inp_tokens,
            None,
            inp_lengths,
            rev_styles,
            generate=True,
            differentiable_decode=True,
            temperature=temperature,
        )

    raw_gen_soft_tokens = raw_gen_log_probs.exp()
    raw_gen_lengths = get_lengths(raw_gen_soft_tokens.argmax(-1), eos_idx)

    rev_gen_soft_tokens = rev_gen_log_probs.exp()
    rev_gen_lengths = get_lengths(rev_gen_soft_tokens.argmax(-1), eos_idx)

    if config.discriminator_method == 'Multi':
        gold_log_probs = model_D(inp_tokens, inp_lengths)
        gold_labels = raw_styles + 1

        raw_gen_log_probs = model_D(raw_gen_soft_tokens, raw_gen_lengths)
        rev_gen_log_probs = model_D(rev_gen_soft_tokens, rev_gen_lengths)
        gen_log_probs = torch.cat((raw_gen_log_probs, rev_gen_log_probs), 0)
        raw_gen_labels = raw_styles + 1
        rev_gen_labels = torch.zeros_like(rev_styles)
        gen_labels = torch.cat((raw_gen_labels, rev_gen_labels), 0)
    else:
        raw_gold_log_probs = model_D(inp_tokens, inp_lengths, raw_styles)
        rev_gold_log_probs = model_D(inp_tokens, inp_lengths, rev_styles)
        gold_log_probs = torch.cat((raw_gold_log_probs, rev_gold_log_probs), 0)
        raw_gold_labels = torch.ones_like(raw_styles)
        rev_gold_labels = torch.zeros_like(rev_styles)
        gold_labels = torch.cat((raw_gold_labels, rev_gold_labels), 0)

        raw_gen_log_probs = model_D(raw_gen_soft_tokens, raw_gen_lengths,
                                    raw_styles)
        rev_gen_log_probs = model_D(rev_gen_soft_tokens, rev_gen_lengths,
                                    rev_styles)
        gen_log_probs = torch.cat((raw_gen_log_probs, rev_gen_log_probs), 0)
        raw_gen_labels = torch.ones_like(raw_styles)
        rev_gen_labels = torch.zeros_like(rev_styles)
        gen_labels = torch.cat((raw_gen_labels, rev_gen_labels), 0)

    adv_log_probs = torch.cat((gold_log_probs, gen_log_probs), 0)
    adv_labels = torch.cat((gold_labels, gen_labels), 0)
    adv_loss = loss_fn(adv_log_probs, adv_labels)
    assert len(adv_loss.size()) == 1
    adv_loss = adv_loss.sum() / batch_size
    loss = adv_loss

    optimizer_D.zero_grad()
    loss.backward()
    clip_grad_norm_(model_D.parameters(), 5)
    optimizer_D.step()

    model_F.train()

    return adv_loss.item()
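
Note: in the 'Multi' branch above, real sentences of style s get label s + 1 and style-transferred generations get label 0. A tiny sketch of that label construction (assumed two styles):

import torch

raw_styles = torch.tensor([0, 1, 1, 0])
gold_labels = raw_styles + 1                   # real: style id shifted by 1
raw_gen_labels = raw_styles + 1                # same-style generations count as real
rev_gen_labels = torch.zeros_like(raw_styles)  # transferred generations are class 0
gen_labels = torch.cat((raw_gen_labels, rev_gen_labels), 0)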
Example #28
    def train_epoch(self,
                    train,
                    optimizer,
                    verbose=VERBOSE_BATCH_WISE
                    ):
        '''
        Train an epoch with given train iterator and optimizer.
        '''
        total_loss, total_word_count = 0, 0
        total_grad_norm = 0
        avg_loss, avg_grad_norm = 0, 0
        sample_cnt = 0

        if verbose == VERBOSE_BATCH_WISE:
            print(optimizer)

        progress_bar = tqdm(train,
                            desc='Training: ',
                            unit='batch'
                            ) if verbose is VERBOSE_BATCH_WISE else train
        # Iterate whole train-set.
        for idx, mini_batch in enumerate(progress_bar):
            # The raw target variable has both BOS and EOS tokens.
            # The sequence-to-sequence output does not have a BOS token.
            # Thus, remove the BOS token from the reference.
            x, y = mini_batch.src, mini_batch.tgt[0][:, 1:]
            # |x| = (batch_size, length)
            # |y| = (batch_size, length)

            # You have to reset the gradients of all model parameters before taking another step of gradient descent.
            optimizer.zero_grad()

            # Take feed-forward
            # Similar as before, the input of decoder does not have EOS token.
            # Thus, remove EOS token for decoder input.
            y_hat = self.model(x, mini_batch.tgt[0][:, :-1])
            # |y_hat| = (batch_size, length, output_size)

            # Calculate loss and gradients with back-propagation.
            loss = self._get_loss(y_hat, y)
            loss.div(y.size(0)).backward()

            # Simple math to show stats.
            # Don't forget to detach final variables.
            total_loss += float(loss)
            total_word_count += int(mini_batch.tgt[1].sum())
            param_norm = float(utils.get_parameter_norm(self.model.parameters()))
            total_grad_norm += float(utils.get_grad_norm(self.model.parameters()))

            avg_loss = total_loss / total_word_count
            avg_grad_norm = total_grad_norm / (idx + 1)

            if verbose is VERBOSE_BATCH_WISE:
                progress_bar.set_postfix_str('|param|=%.2f |g_param|=%.2f loss=%.4e PPL=%.2f' % (param_norm,
                                                                                                 avg_grad_norm,
                                                                                                 avg_loss,
                                                                                                 exp(avg_loss)
                                                                                                 ))

            # In order to avoid exploding gradients, we apply gradient clipping.
            torch_utils.clip_grad_norm_(self.model.parameters(),
                                        self.config.max_grad_norm
                                        )
            # Take a step of gradient descent.
            optimizer.step()

            sample_cnt += mini_batch.tgt[0].size(0)

            if idx >= len(progress_bar) * self.config.train_ratio_per_epoch:
                break

        if verbose is VERBOSE_BATCH_WISE:
            progress_bar.close()

        return avg_loss, param_norm, avg_grad_norm
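
Note: `utils.get_parameter_norm` and `utils.get_grad_norm` are project helpers not shown here; hypothetical stand-ins consistent with how they are used above:

import torch

def get_parameter_norm(parameters, norm_type=2):
    # L2 norm over all parameter values.
    total = 0.0
    for p in parameters:
        total += float(p.data.norm(norm_type)) ** norm_type
    return total ** (1.0 / norm_type)

def get_grad_norm(parameters, norm_type=2):
    # L2 norm over all gradients (skipping params without grads).
    total = 0.0
    for p in parameters:
        if p.grad is not None:
            total += float(p.grad.norm(norm_type)) ** norm_type
    return total ** (1.0 / norm_type)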
Example #29
                           lr=params["learning_rate"],
                           eps=1e-3)

    batch = []

    # TRAINING
    with logger.RewardTracker(net, writer, stop_reward=195,
                              tag="a2c") as tracker:
        for step_idx, exp in enumerate(exp_source):
            batch.append(exp)

            # handle new rewards
            new_rewards = exp_source.pop_total_rewards()
            if new_rewards:
                if tracker.reward(new_rewards[0], step_idx):
                    break

            if len(batch) < params["batch_size"]:
                continue

            loss_policy, loss_v = calc_a2c_loss(batch, net, params)
            batch.clear()

            optimizer.zero_grad()

            loss_policy.backward(retain_graph=True)
            loss_v.backward()

            nn_utils.clip_grad_norm_(net.parameters(), params["grad_clip"])
            optimizer.step()
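
Note: the update above calls backward twice on losses that share one graph, hence `retain_graph=True` on the first call. A minimal sketch of why:

import torch

x = torch.randn(3, requires_grad=True)
y = (x * 2).sum()
z = (x * 3).sum()
y.backward(retain_graph=True)  # keep the graph alive for the second pass
z.backward()                   # gradients accumulate: x.grad is now 5 everywhere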
Example #30
for epoch in range(num_epochs):
    # Set initial hidden and cell states
    states = (torch.zeros(num_layers, batch_size, hidden_size).to(device),
              torch.zeros(num_layers, batch_size, hidden_size).to(device))

    for i in range(0, ids.size(1) - seq_length, seq_length):
        inputs = ids[:, i:i + seq_length].to(device)
        targets = ids[:, (i + 1):(i + 1 + seq_length)].to(device)

        states = detach(states)
        outputs, states = model(inputs, states)
        loss = criterion(outputs, targets.reshape(-1))

        model.zero_grad()
        loss.backward()
        clip_grad_norm_(model.parameters(), 0.5)
        optimizer.step()

        step = (i + 1) // seq_length
        if step % 100 == 0:
            print(
                'Epoch [{}/{}], Step[{}/{}], Loss: {:.4f}, Perplexity: {:5.2f}'
                .format(epoch + 1, num_epochs, step, num_batches,
                        loss.item(), np.exp(loss.item())))

# Test the model
with torch.no_grad():
    with open('sample.txt', 'w') as f:
        # Set initial hidden and cell states
        state = (torch.zeros(num_layers, 1, hidden_size).to(device),
                 torch.zeros(num_layers, 1, hidden_size).to(device))
Example #31
valid_criterion = ValidLoss()
optim = Adam(model.parameters(), lr=lr, weight_decay=1e-6)
scheduler = StepLR(optim, step_size=5, gamma=0.1)

for e in range(epochs):
    model.train()
    for i, (*inputs, score) in enumerate(dataloader):
        optim.zero_grad()
        inputs = [input_.to(device) for input_ in inputs]
        score = score.to(device).contiguous().view(-1)

        predicted = model(*inputs)

        loss = criterion(predicted, score)
        loss.backward()
        clip_grad_norm_(model.parameters(), 5.0)
        optim.step()

    valid_loss = 0.0
    acc = 0.0
    model.eval()
    total_num = 0
    for *inputs, score in valid_dataloader:
        batch_size = score.size(0)
        inputs = [input_.to(device) for input_ in inputs]
        score = score.to(device).contiguous().view(-1)
        predicted = model(*inputs)
        valid_loss += valid_criterion(predicted, score).item() * batch_size
        predicted = predicted.argmax(dim=1)
        acc += torch.eq(predicted, score).sum()
        total_num += batch_size
Example #32
    def iterate(self, src_tuple, target_tuple, training=True):
        # limit the number of tokens to avoid GPU overload
        if self.limit_num_tokens is not None:
            src_tuple, target_tuple = self._batch_limit_tokens(
                src_tuple, target_tuple)
        src, src_length = src_tuple
        target, target_length = target_tuple
        batch_dim, time_dim = (0, 1) if self.batch_first else (1, 0)
        num_words = sum(target_length) - target.size(batch_dim)

        if isinstance(src, PackedSequence) or \
                not isinstance(self.model_with_loss, DataParallel):
            if isinstance(src, PackedSequence):
                src = PackedSequence(src.data.to(self.device),
                                     src.batch_sizes.to(self.device))
            else:
                src = src.to(self.device)
            target = target.to(self.device)

        if self.batch_first:
            inputs = (src, target[:, :-1])
            target_labels = target[:, 1:].contiguous()
        else:
            inputs = (src, target[:-1])
            target_labels = target[1:]

        # compute output
        loss, accuracy = self.model_with_loss(inputs, target_labels)

        loss = loss.sum()
        loss_measure = float(loss / num_words)
        if self.avg_loss_time:
            loss /= num_words
        else:
            loss /= target.size(batch_dim)
        accuracy = float(accuracy.sum().float() / num_words)

        if training:
            # compute gradient and do SGD step
            self.optimizer.zero_grad()
            loss.backward()
            if self.grad_clip is not None:
                if isinstance(self.grad_clip, dict):
                    clip_encoder = self.grad_clip.get('encoder', 0)
                    clip_decoder = self.grad_clip.get('decoder', 0)
                    if clip_encoder > 0:
                        clip_grad_norm_(
                            self.model.encoder.parameters(), clip_encoder)
                    if clip_decoder > 0:
                        clip_grad_norm_(
                            self.model.decoder.parameters(), clip_decoder)
                elif self.grad_clip > 0:  # grad_clip is a number
                    clip_grad_norm_(self.model.parameters(), self.grad_clip)
            if self.embedding_grad_clip is not None and self.embedding_grad_clip > 0:
                if hasattr(self.model.encoder, 'embedder'):
                    clip_grad_norm_(self.model.encoder.embedder.parameters(),
                                    self.embedding_grad_clip)
                if hasattr(self.model.decoder, 'embedder'):
                    clip_grad_norm_(self.model.decoder.embedder.parameters(),
                                    self.embedding_grad_clip)
            self.optimizer.step()
        return loss_measure, accuracy, num_words
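
Note: a minimal sketch of the per-module clipping dict handled above, with a hypothetical toy model (the real encoder/decoder are full seq2seq modules):

import torch
import torch.nn as nn
from torch.nn.utils import clip_grad_norm_

class Seq2Seq(nn.Module):
    def __init__(self):
        super().__init__()
        self.encoder = nn.Linear(4, 4)
        self.decoder = nn.Linear(4, 2)

    def forward(self, x):
        return self.decoder(self.encoder(x))

model = Seq2Seq()
model(torch.randn(3, 4)).sum().backward()

grad_clip = {'encoder': 1.0, 'decoder': 5.0}  # assumed per-module budgets
if grad_clip.get('encoder', 0) > 0:
    clip_grad_norm_(model.encoder.parameters(), grad_clip['encoder'])
if grad_clip.get('decoder', 0) > 0:
    clip_grad_norm_(model.decoder.parameters(), grad_clip['decoder'])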
Example #33
    def train(
        self,
        training_batch_size: int = 50,
        learning_rate: float = 5e-4,
        validation_fraction: float = 0.1,
        stop_after_epochs: int = 20,
        max_num_epochs: Optional[int] = None,
        clip_max_norm: Optional[float] = 5.0,
        calibration_kernel: Optional[Callable] = None,
        exclude_invalid_x: bool = True,
        resume_training: bool = False,
        discard_prior_samples: bool = False,
        retrain_from_scratch_each_round: bool = False,
        show_train_summary: bool = False,
        dataloader_kwargs: Optional[dict] = None,
    ) -> DirectPosterior:
        r"""
        Return density estimator that approximates the distribution $p(\theta|x)$.

        Args:
            training_batch_size: Training batch size.
            learning_rate: Learning rate for Adam optimizer.
            validation_fraction: The fraction of data to use for validation.
            stop_after_epochs: The number of epochs to wait for improvement on the
                validation set before terminating training.
            max_num_epochs: Maximum number of epochs to run. If reached, we stop
                training even when the validation loss is still decreasing. If None, we
                train until validation loss increases (see also `stop_after_epochs`).
            clip_max_norm: Value at which to clip the total gradient norm in order to
                prevent exploding gradients. Use None for no clipping.
            calibration_kernel: A function to calibrate the loss with respect to the
                simulations `x`. See Lueckmann, Gonçalves et al., NeurIPS 2017.
            exclude_invalid_x: Whether to exclude simulation outputs `x=NaN` or `x=±∞`
                during training. Expect errors, silent or explicit, when `False`.
            resume_training: Can be used in case training time is limited, e.g. on a
                cluster. If `True`, the split between train and validation set, the
                optimizer, the number of epochs, and the best validation log-prob will
                be restored from the last time `.train()` was called.
            discard_prior_samples: Whether to discard samples simulated in round 1, i.e.
                from the prior. Training may be sped up by ignoring such less targeted
                samples.
            retrain_from_scratch_each_round: Whether to retrain the conditional density
                estimator for the posterior from scratch each round.
            show_train_summary: Whether to print the number of epochs and validation
                loss after the training.
            dataloader_kwargs: Additional or updated kwargs to be passed to the training
                and validation dataloaders (like, e.g., a collate_fn)

        Returns:
            Density estimator that approximates the distribution $p(\theta|x)$.
        """

        # Calibration kernels proposed in Lueckmann, Gonçalves et al., 2017.
        if calibration_kernel is None:
            calibration_kernel = lambda x: ones([len(x)], device=self._device)

        max_num_epochs = 2**31 - 1 if max_num_epochs is None else max_num_epochs

        # Starting index for the training set (1 = discard round-0 samples).
        start_idx = int(discard_prior_samples and self._round > 0)

        # For non-atomic loss, we can not reuse samples from previous rounds as of now.
        # SNPE-A can, by construction of the algorithm, only use samples from the last
        # round. SNPE-A is the only algorithm that has an attribute `_ran_final_round`,
        # so this is how we check for whether or not we are using SNPE-A.
        if self.use_non_atomic_loss or hasattr(self, "_ran_final_round"):
            start_idx = self._round

        theta, x, prior_masks = self.get_simulations(start_idx,
                                                     exclude_invalid_x,
                                                     warn_on_invalid=True)

        # Dataset is shared for training and validation loaders.
        dataset = data.TensorDataset(
            theta,
            x,
            prior_masks,
        )

        # Set the proposal to the last proposal that was passed by the user. For
        # atomic SNPE, it does not matter what the proposal is. For non-atomic
        # SNPE, we only use the latest data that was passed, i.e. the one from the
        # last proposal.
        proposal = self._proposal_roundwise[-1]

        train_loader, val_loader = self.get_dataloaders(
            dataset,
            training_batch_size,
            validation_fraction,
            resume_training,
            dataloader_kwargs=dataloader_kwargs,
        )

        # First round or if retraining from scratch:
        # Call the `self._build_neural_net` with the rounds' thetas and xs as
        # arguments, which will build the neural network.
        # This is passed into NeuralPosterior, to create a neural posterior which
        # can `sample()` and `log_prob()`. The network is accessible via `.net`.
        if self._neural_net is None or retrain_from_scratch_each_round:
            self._neural_net = self._build_neural_net(
                theta[self.train_indices], x[self.train_indices])
            # If data on training device already move net as well.
            if (not self._device == "cpu"
                    and f"{x.device.type}:{x.device.index}" == self._device):
                self._neural_net.to(self._device)

            test_posterior_net_for_multi_d_x(self._neural_net, theta, x)
            self._x_shape = x_shape_from_simulation(x)

        # Move entire net to device for training.
        self._neural_net.to(self._device)

        if not resume_training:
            self.optimizer = optim.Adam(
                list(self._neural_net.parameters()),
                lr=learning_rate,
            )
            self.epoch, self._val_log_prob = 0, float("-Inf")

        while self.epoch <= max_num_epochs and not self._converged(
                self.epoch, stop_after_epochs):

            # Train for a single epoch.
            self._neural_net.train()
            train_log_prob_sum = 0
            epoch_start_time = time.time()
            for batch in train_loader:
                self.optimizer.zero_grad()
                # Get batches on current device.
                theta_batch, x_batch, masks_batch = (
                    batch[0].to(self._device),
                    batch[1].to(self._device),
                    batch[2].to(self._device),
                )

                batch_loss = torch.mean(
                    self._loss(
                        theta_batch,
                        x_batch,
                        masks_batch,
                        proposal,
                        calibration_kernel,
                    ))

                train_log_prob_sum += batch_loss.sum().item()

                batch_loss.backward()
                if clip_max_norm is not None:
                    clip_grad_norm_(
                        self._neural_net.parameters(),
                        max_norm=clip_max_norm,
                    )
                self.optimizer.step()

            self.epoch += 1

            train_log_prob_sum /= int(theta.shape[0] *
                                      (1.0 - validation_fraction))
            self._summary["train_log_probs"].append(train_log_prob_sum)

            # Calculate validation performance.
            self._neural_net.eval()
            log_prob_sum = 0

            with torch.no_grad():
                for batch in val_loader:
                    theta_batch, x_batch, masks_batch = (
                        batch[0].to(self._device),
                        batch[1].to(self._device),
                        batch[2].to(self._device),
                    )
                    # Take negative loss here to get validation log_prob.
                    batch_log_prob = -self._loss(
                        theta_batch,
                        x_batch,
                        masks_batch,
                        proposal,
                        calibration_kernel,
                    )
                    log_prob_sum += batch_log_prob.sum().item()

            # Take mean over all validation samples.
            self._val_log_prob = log_prob_sum / (len(val_loader) *
                                                 val_loader.batch_size)
            # Log validation log prob for every epoch.
            self._summary["validation_log_probs"].append(self._val_log_prob)
            self._summary["epoch_durations_sec"].append(time.time() -
                                                        epoch_start_time)

            self._maybe_show_progress(self._show_progress_bars, self.epoch)

        self._report_convergence_at_end(self.epoch, stop_after_epochs,
                                        max_num_epochs)

        # Update summary.
        self._summary["epochs"].append(self.epoch)
        self._summary["best_validation_log_probs"].append(
            self._best_val_log_prob)

        # Update tensorboard and summary dict.
        self._summarize(
            round_=self._round,
            x_o=None,
            theta_bank=theta,
            x_bank=x,
        )

        # Update description for progress bar.
        if show_train_summary:
            print(self._describe_round(self._round, self._summary))

        return deepcopy(self._neural_net)
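
Note: a minimal sketch of the early-stopping rule described in the docstring above (assumed bookkeeping, not the library's `_converged` implementation): stop once the best validation log-prob is more than `stop_after_epochs` epochs old.

def converged(val_log_probs, stop_after_epochs=20):
    if len(val_log_probs) <= stop_after_epochs:
        return False
    best_epoch = max(range(len(val_log_probs)), key=val_log_probs.__getitem__)
    return len(val_log_probs) - 1 - best_epoch >= stop_after_epochs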
Example #34
            scale_std = np.std(batch_scales)
            batch_scale_v = torch.FloatTensor(batch_scales).to(device)

            optimizer.zero_grad()
            logits_v = net(states_v)
            log_prob_v = F.log_softmax(logits_v, dim=1)
            log_prob_actions_v = batch_scale_v * log_prob_v[range(BATCH_SIZE), batch_actions_t]
            loss_policy_v = -log_prob_actions_v.mean()

            prob_v = F.softmax(logits_v, dim=1)
            entropy_v = -(prob_v * log_prob_v).sum(dim=1).mean()
            entropy_loss_v = -ENTROPY_BETA * entropy_v
            loss_v = loss_policy_v + entropy_loss_v
            loss_v.backward()
            nn_utils.clip_grad_norm_(net.parameters(), GRAD_L2_CLIP)
            optimizer.step()

            # calc KL-div
            new_logits_v = net(states_v)
            new_prob_v = F.softmax(new_logits_v, dim=1)
            kl_div_v = -((new_prob_v / prob_v).log() * prob_v).sum(dim=1).mean()
            writer.add_scalar("kl", kl_div_v.item(), step_idx)

            grad_max = 0.0
            grad_means = 0.0
            grad_count = 0
            for p in net.parameters():
                grad_max = max(grad_max, p.grad.abs().max().item())
                grad_means += (p.grad ** 2).mean().sqrt().item()
                grad_count += 1
Example #35
def main(args):
    args.color_t = torch.rand(700, 3)

    if not os.path.exists(args.ckpt_dir):
        os.makedirs(args.ckpt_dir)
    if not os.path.exists(args.summary_dir):
        os.makedirs(args.summary_dir)

    device = torch.device(
        "cuda" if not args.nocuda and torch.cuda.is_available() else "cpu")

    model = SCALOR(args)
    model.to(device)
    model.train()

    optimizer = torch.optim.RMSprop(model.parameters(), lr=args.lr)
    global_step = 0

    if args.last_ckpt:
        global_step, args.start_epoch = \
            load_ckpt(model, optimizer, args.last_ckpt, device)

    writer = SummaryWriter(args.summary_dir)

    args.global_step = global_step

    log_tau_gamma = np.log(args.tau_end) / args.tau_ep


    D = torch.load(args.experience_replay)
    num_train = D.size

    for epoch in range(int(args.start_epoch), args.epochs):
        local_count = 0
        last_count = 0
        end_time = time.time()


        for _ in range(num_train // args.batch_size):

            chunk_size = epoch + 2
            chunk_size = min(chunk_size, args.chunk_size)

            observations, actions, rewards, nonterminals = D.sample(args.batch_size, chunk_size)
            
            tau = np.exp(global_step * log_tau_gamma)
            tau = max(tau, args.tau_end)
            args.tau = tau

            global_step += 1

            log_phase = global_step % args.print_freq == 0 or global_step == 1
            args.global_step = global_step
            args.log_phase = log_phase

            if np.random.binomial(1, min(epoch, 10)/10, 1)[0] and not log_phase:
                args.phase_generate = True
            else:
                args.phase_generate = False

            sample = observations[:,:,0:3].permute(1,0,2,3,4) / 255
            actions = actions.permute(1,0,2)

            imgs = sample.to(device)
            actions = actions.to(device)

            y_seq, log_like, kl_z_what, kl_z_where, kl_z_depth, \
            kl_z_pres, kl_z_bg, kl_edge_type, log_imp, counting, \
            log_disc_list, log_prop_list, scalor_log_list = model(imgs, actions)


            log_like = log_like.mean(dim=0)
            kl_z_what = kl_z_what.mean(dim=0)
            kl_z_where = kl_z_where.mean(dim=0)
            kl_z_depth = kl_z_depth.mean(dim=0)
            kl_z_pres = kl_z_pres.mean(dim=0)
            kl_z_bg = kl_z_bg.mean(0)
            kl_edge_type = kl_edge_type.mean(0)

            total_loss = - log_like + kl_z_what + kl_z_where + kl_z_depth + kl_z_pres + kl_z_bg + kl_edge_type

            optimizer.zero_grad()
            total_loss.backward()

            clip_grad_norm_(model.parameters(), args.cp)
            optimizer.step()

            local_count += imgs.data.shape[0]

            if log_phase:

                time_inter = time.time() - end_time
                end_time = time.time()

                count_inter = local_count - last_count

                print_scalor(global_step, epoch, local_count, count_inter,\
                               num_train, total_loss, log_like, kl_z_what, kl_z_where,\
                               kl_z_pres, kl_z_depth, time_inter)

                writer.add_scalar('train/total_loss', total_loss.item(), global_step=global_step)
                writer.add_scalar('train/log_like', log_like.item(), global_step=global_step)
                writer.add_scalar('train/What_KL', kl_z_what.item(), global_step=global_step)
                writer.add_scalar('train/Where_KL', kl_z_where.item(), global_step=global_step)
                writer.add_scalar('train/Pres_KL', kl_z_pres.item(), global_step=global_step)
                writer.add_scalar('train/Depth_KL', kl_z_depth.item(), global_step=global_step)
                writer.add_scalar('train/Bg_KL', kl_z_bg.item(), global_step=global_step)
                writer.add_scalar('train/Edge_KL', kl_edge_type.item(), global_step=global_step)
                # writer.add_scalar('train/Bg_alpha_KL', kl_z_bg_mask.item(), global_step=global_step)
                writer.add_scalar('train/tau', tau, global_step=global_step)

                log_summary(args, writer, imgs, y_seq, global_step, log_disc_list,
                            log_prop_list, scalor_log_list, prefix='train')

                last_count = local_count

                #print(args.generate_freq)
                #args.generate_freq = 2
                #if global_step % args.generate_freq == 0:
                ####################################### do generation ####################################
                model.eval()
                with torch.no_grad():
                    args.phase_generate = True
                    y_seq, log_like, kl_z_what, kl_z_where, kl_z_depth, \
                    kl_z_pres, kl_z_bg, kl_edge_type, log_imp, counting, \
                    log_disc_list, log_prop_list, scalor_log_list = model(imgs, actions)
                    args.phase_generate = False
                    
                    log_summary(args, writer, imgs, y_seq, global_step, log_disc_list,
                                log_prop_list, scalor_log_list, prefix='generate')
                model.train()
                ####################################### end generation ####################################

            if global_step % args.save_epoch_freq == 0 or global_step == 1:
                save_ckpt(args.ckpt_dir, model, optimizer, global_step, epoch,
                          local_count, args.batch_size, num_train)
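
Note: the temperature above decays exponentially from 1.0 toward `args.tau_end` over `args.tau_ep` steps. A standalone sketch with assumed values:

import numpy as np

tau_end, tau_ep = 0.5, 10000  # assumed schedule endpoints
log_tau_gamma = np.log(tau_end) / tau_ep
for global_step in (0, 5000, 10000, 20000):
    tau = max(np.exp(global_step * log_tau_gamma), tau_end)
    # tau: 1.0 at step 0, ~0.71 halfway, then held at tau_end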
Example #36
def train(train_loader, model, criterion, optimizer, epoch):
    batch_time = AverageMeter()
    data_time = AverageMeter()
    losses = AverageMeter()
    top1 = AverageMeter()
    top5 = AverageMeter()

    # In PyTorch 0.4, "volatile=True" is deprecated.
    torch.set_grad_enabled(True)

    if args.no_partialbn:
        model.module.partialBN(False)
    else:
        model.module.partialBN(True)

    # switch to train mode
    model.train()

    end = time.time()

    loss_summ = 0
    localtime = time.localtime()
    end_time = time.strftime("%Y/%m/%d-%H:%M:%S", localtime)
    for i, (input, target) in enumerate(train_loader):
        # discard final batch

        if i == len(train_loader) - 1:
            break
        # measure data loading time
        data_time.update(time.time() - end)

        # target size: [batch_size]
        # "async" became a reserved word in Python 3.7; the kwarg is now
        # non_blocking
        target = target.cuda(non_blocking=True)
        input_var = input
        target_var = target

        # compute output, output size: [batch_size, num_class]

        output = model(input_var)

        loss = criterion(output, target_var)
        loss = loss / args.iter_size
        loss_summ += loss
        # measure accuracy and record loss
        prec1, prec5 = accuracy(output.data, target, topk=(1, 5))
        losses.update(loss_summ.item(), input.size(0))
        top1.update(prec1.item(), input.size(0))
        top5.update(prec5.item(), input.size(0))

        loss.backward()

        if (i + 1) % args.iter_size == 0:
            # apply the update once iter_size gradients have accumulated;
            # clip between backward() and step() so the clipped gradients
            # are the ones actually applied
            if args.clip_gradient is not None:
                total_norm = clip_grad_norm_(model.parameters(),
                                             args.clip_gradient)
                if total_norm > args.clip_gradient:
                    print("clipping gradient: {} with coef {}".format(
                        total_norm, args.clip_gradient / total_norm))

            optimizer.step()
            optimizer.zero_grad()
            loss_summ = 0
            #if i % args.print_freq == 0:
            print(('Epoch: [{0}][{1}/{2}], lr: {lr:.7f}\t'
                   'Time {batch_time.val:.2f} ({batch_time.avg:.2f})\t'
                   'UTime {end_time:} \t'
                   'Data {data_time.val:.2f} ({data_time.avg:.2f})\t'
                   'Loss {loss.val:.3f} ({loss.avg:.3f})\t'
                   'Prec@1 {top1.val:.2f} ({top1.avg:.2f})\t'
                   'Prec@5 {top5.val:.2f} ({top5.avg:.2f})'.format(
                       epoch,
                       i,
                       len(train_loader),
                       batch_time=batch_time,
                       end_time=end_time,
                       data_time=data_time,
                       loss=losses,
                       top1=top1,
                       top5=top5,
                       lr=optimizer.param_groups[-1]['lr'])))


        # measure elapsed time
        batch_time.update(time.time() - end)
        end = time.time()
        localtime = time.localtime()
        end_time = time.strftime("%Y/%m/%d-%H:%M:%S", localtime)
Example #37
    def train(engine, mini_batch):
        # You have to reset the gradients of all model parameters
        # before taking another step of gradient descent.
        engine.model.train()
        if engine.state.iteration % engine.config.iteration_per_update == 1 or \
            engine.config.iteration_per_update == 1:
            if engine.state.iteration > 1:
                engine.optimizer.zero_grad()

        device = next(engine.model.parameters()).device
        mini_batch.src = (mini_batch.src[0].to(device), mini_batch.src[1])
        mini_batch.tgt = (mini_batch.tgt[0].to(device), mini_batch.tgt[1])

        # The raw target variable has both BOS and EOS tokens.
        # The sequence-to-sequence output does not have a BOS token.
        # Thus, remove the BOS token from the reference.
        x, y = mini_batch.src, mini_batch.tgt[0][:, 1:]
        # |x| = (batch_size, length)
        # |y| = (batch_size, length)

        # Sample rather than decode greedily, since is_greedy is set to False.
        y_hat, indice = engine.model.search(
            x, is_greedy=False, max_length=engine.config.max_length)

        with torch.no_grad():
            # Based on the result of sampling, get reward.
            actor_reward = MinimumRiskTrainingEngine._get_reward(
                indice,
                y,
                n_gram=engine.config.rl_n_gram,
                method=engine.config.rl_reward,
            )
            # |y_hat| = (batch_size, length, output_size)
            # |indice| = (batch_size, length)
            # |actor_reward| = (batch_size)

            # Draw as many samples as rl_n_samples and average their rewards
            # to form a baseline. In practice, n_samples = 1 turns out to be enough.
            baseline = []

            for _ in range(engine.config.rl_n_samples):
                _, sampled_indice = engine.model.search(
                    x,
                    is_greedy=False,
                    max_length=engine.config.max_length,
                )
                baseline += [
                    MinimumRiskTrainingEngine._get_reward(
                        sampled_indice,
                        y,
                        n_gram=engine.config.rl_n_gram,
                        method=engine.config.rl_reward,
                    )
                ]

            baseline = torch.stack(baseline).mean(dim=0)
            # |baseline| = (n_samples, batch_size) --> (batch_size)

            # Now we have a relative expected cumulative reward:
            # the score is actor_reward minus the baseline.
            reward = actor_reward - baseline
            # |reward| = (batch_size)

        # calculate gradients with back-propagation
        loss = MinimumRiskTrainingEngine._get_loss(y_hat,
                                                   indice,
                                                   reward=reward)
        backward_target = loss.div(y.size(0)).div(
            engine.config.iteration_per_update)
        backward_target.backward()

        p_norm = float(get_parameter_norm(engine.model.parameters()))
        g_norm = float(get_grad_norm(engine.model.parameters()))

        if engine.state.iteration % engine.config.iteration_per_update == 0 and \
            engine.state.iteration > 0:
            # In order to avoid exploding gradients, we apply gradient clipping.
            torch_utils.clip_grad_norm_(
                engine.model.parameters(),
                engine.config.max_grad_norm,
            )
            # Take a step of gradient descent.
            engine.optimizer.step()

        return {
            'actor':
            float(actor_reward.mean()),
            'baseline':
            float(baseline.mean()),
            'reward':
            float(reward.mean()),
            '|param|':
            p_norm if not np.isnan(p_norm) and not np.isinf(p_norm) else 0.,
            '|g_param|':
            g_norm if not np.isnan(g_norm) and not np.isinf(g_norm) else 0.,
        }
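
Note: a minimal sketch of the REINFORCE-with-baseline weighting used above (toy tensors; real log-probs come from the model's sampled sequences):

import torch

log_probs = torch.randn(4, requires_grad=True)    # per-sample log p(y|x), toy
actor_reward = torch.tensor([0.9, 0.2, 0.5, 0.7])
baseline = torch.stack([torch.rand(4) for _ in range(3)]).mean(dim=0)
loss = -((actor_reward - baseline) * log_probs).mean()
loss.backward()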
Example #38
File: main.py Project: zymale/TIN
def train(train_loader, model, criterion, optimizer, epoch, log, tf_writer):
    batch_time = AverageMeter()
    data_time = AverageMeter()
    losses = AverageMeter()
    top1 = AverageMeter()
    top5 = AverageMeter()
    mAPs = AverageMeter()

    if args.no_partialbn:
        model.module.partialBN(False)
    else:
        model.module.partialBN(True)

    # switch to train mode
    model.train()

    end = time.time()
    checkpoint_dir = os.path.join(args.root_model, args.store_name)
    for i, (input, target) in enumerate(train_loader):
        adjust_learning_rate(optimizer, epoch, args.lr_type, args.lr_steps, epoch + float(i) / len(train_loader))
        # measure data loading time
        data_time.update(time.time() - end)

        target = target.cuda()
        input_var = torch.autograd.Variable(input)
        target_var = torch.autograd.Variable(target)

        # compute output
        output = model(input_var)
        loss = criterion(output, target_var)

        # measure accuracy and record loss
        if args.multi_class:
            mAP = calculate_mAP(output.data, target)
            mAPs.update(mAP, input.size(0))
        else:
            prec1, prec5 = accuracy(output.data, target, topk=(1, 5))
            top1.update(prec1.item(), input.size(0))
            top5.update(prec5.item(), input.size(0))

        losses.update(loss.item(), input.size(0))

        # compute gradient and do SGD step
        loss.backward()

        if args.clip_gradient is not None:
            total_norm = clip_grad_norm_(model.parameters(), args.clip_gradient)

        optimizer.step()
        optimizer.zero_grad()

        # measure elapsed time
        batch_time.update(time.time() - end)
        end = time.time()

        if i % args.print_freq == 0:
            output = ('Epoch: [{0}][{1}/{2}], lr: {lr:.5f}\t'
                      'Time {batch_time.val:.3f} ({batch_time.avg:.3f})\t'
                      'Data {data_time.val:.3f} ({data_time.avg:.3f})\t'
                      'Loss {loss.val:.4f} ({loss.avg:.4f})\t'.format(
                          epoch, i, len(train_loader), batch_time=batch_time,
                          data_time=data_time, loss=losses,
                          lr=optimizer.param_groups[2]['lr']))
            if args.multi_class:
                output += 'mAP  {mAPs.val:.3f} ({mAPs.avg:.3f})'.format(mAPs=mAPs)
            else:
                output += ('Prec@1 {top1.val:.3f} ({top1.avg:.3f})\t'
                           'Prec@5 {top5.val:.3f} ({top5.avg:.3f})'.format(
                               top1=top1, top5=top5))
            print(output)
            log.write(output + '\n')
            log.flush()

    tf_writer.add_scalar('loss/train', losses.avg, epoch)
    if args.multi_class:
        tf_writer.add_scalar('acc/train_mAP', mAPs.avg, epoch)
    else:
        tf_writer.add_scalar('acc/train_top1', top1.avg, epoch)
        tf_writer.add_scalar('acc/train_top5', top5.avg, epoch)
    tf_writer.add_scalar('lr', optimizer.param_groups[-1]['lr'], epoch)
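
Example #38 relies on an `AverageMeter` utility defined elsewhere in the project. A minimal sketch of the conventional implementation, consistent with the `.val`, `.avg`, and `.update(value, n)` usage above:

class AverageMeter(object):
    """Tracks the latest value plus a running sum, count, and average."""

    def __init__(self):
        self.reset()

    def reset(self):
        self.val = 0
        self.sum = 0
        self.count = 0
        self.avg = 0

    def update(self, val, n=1):
        self.val = val
        self.sum += val * n
        self.count += n
        self.avg = self.sum / self.count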
Example #39
    def step(self, closure=None):
        """Performs a single optimization step.
        Arguments:
            closure (callable, optional): A closure that reevaluates the model
                and returns the loss.
        """
        loss = None
        if closure is not None:
            loss = closure()

        for group in self.param_groups:
            for p in group['params']:
                if p.grad is None:
                    continue
                grad = p.grad.data
                if grad.is_sparse:
                    raise RuntimeError('Adam does not support sparse '
                                       'gradients, please consider '
                                       'SparseAdam instead')

                state = self.state[p]

                # State initialization
                if not state:
                    state['step'] = 0
                    # Exponential moving average of gradient values
                    state['next_m'] = torch.zeros_like(p.data)
                    # Exponential moving average of squared gradient values
                    state['next_v'] = torch.zeros_like(p.data)

                next_m, next_v = state['next_m'], state['next_v']
                beta1, beta2 = group['b1'], group['b2']

                # Add grad clipping
                if group['max_grad_norm'] > 0:
                    clip_grad_norm_(p, group['max_grad_norm'])

                # Decay the first and second moment running average coefficient
                # In-place operations to update the averages at the same time
                next_m.mul_(beta1).add_(grad, alpha=1 - beta1)
                next_v.mul_(beta2).addcmul_(grad, grad, value=1 - beta2)
                update = next_m / (next_v.sqrt() + group['e'])

                # Just adding the square of the weights to the loss function
                # is *not* the correct way of using L2 regularization/weight
                # decay with Adam, since that will interact with the m and v
                # parameters in strange ways.
                #
                # Instead we want to decay the weights in a manner that
                # doesn't interact with the m/v parameters. This is
                # equivalent to adding the square of the weights to the loss
                # with plain (non-momentum) SGD.
                if group['weight_decay_rate'] > 0.0:
                    update += group['weight_decay_rate'] * p.data

                if group['t_total'] != -1:
                    schedule_fct = SCHEDULES[group['schedule']]
                    lr_scheduled = group['lr'] * schedule_fct(
                        state['step'] / group['t_total'], group['warmup'])
                else:
                    lr_scheduled = group['lr']

                update_with_lr = lr_scheduled * update
                p.data.add_(-update_with_lr)

                state['step'] += 1

                # Note: unlike standard Adam, this variant applies no bias
                # correction. The omitted terms would be:
                #     bias_correction1 = 1 - beta1 ** state['step']
                #     bias_correction2 = 1 - beta2 ** state['step']
                #     step_size = lr_scheduled * math.sqrt(bias_correction2) / bias_correction1

        return loss
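
The weight-decay comment in Example #39 describes decoupled weight decay, as popularized by AdamW. A runnable schematic of the contrast, with placeholder names and values (not this optimizer's exact variables):

import torch

p = torch.randn(3)          # parameter
grad = torch.randn(3)       # its gradient
m = torch.zeros(3)          # first-moment estimate
v = torch.zeros(3)          # second-moment estimate
beta1, beta2, eps, lr, wd = 0.9, 0.999, 1e-6, 1e-3, 0.01

# L2-in-loss: the decay term flows through the moment estimates,
# so it gets rescaled by Adam's adaptive denominator.
grad_l2 = grad + wd * p
m_l2 = beta1 * m + (1 - beta1) * grad_l2
v_l2 = beta2 * v + (1 - beta2) * grad_l2 ** 2
p_l2 = p - lr * m_l2 / (v_l2.sqrt() + eps)

# Decoupled: the moments see only the raw gradient, and the decay
# is applied directly to the weights, as in the `update` above.
m_dec = beta1 * m + (1 - beta1) * grad
v_dec = beta2 * v + (1 - beta2) * grad ** 2
p_dec = p - lr * (m_dec / (v_dec.sqrt() + eps) + wd * p)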
Example #40
def train_epoch(model,
                bimpm,
                criterion,
                train_iter,
                valid_iter,
                config,
                start_epoch=1,
                others_to_save=None,
                valid_nli_iter=None):
    current_lr = config.rl_lr

    highest_valid_bleu = -np.inf
    no_improve_cnt = 0

    # Print initial valid BLEU before we start RL.
    model.eval()
    total_reward, sample_cnt = 0, 0
    for batch_index, batch in enumerate(valid_iter):
        current_batch_word_cnt = torch.sum(batch.tgt[1])
        x = batch.src
        y = batch.tgt[0][:, 1:]
        batch_size = y.size(0)
        # |x| = (batch_size, length)
        # |y| = (batch_size, length)

        # feed-forward
        y_hat, indice = model.search(x,
                                     is_greedy=True,
                                     max_length=config.max_length)
        # |y_hat| = (batch_size, length, output_size)
        # |indice| = (batch_size, length)

        reward = get_bleu_reward(y, indice, n_gram=config.rl_n_gram)

        total_reward += float(reward.sum())
        sample_cnt += batch_size
        if sample_cnt >= len(valid_iter.dataset.examples):
            break
    avg_bleu = total_reward / sample_cnt
    print("initial valid BLEU: %.4f" %
          avg_bleu)  # You can figure-out improvement.

    if valid_nli_iter:
        nli_validation(valid_nli_iter, model, bimpm, config)
    model.train()  # Now, begin training.

    # Start RL
    nli_criterion = nn.CrossEntropyLoss(reduction='none')
    print("start rl epoch:", start_epoch)
    print("number of epoch to complete:", config.rl_n_epochs + 1)

    if config.reward_mode == 'combined':
        if config.gpu_id >= 0:
            nli_weight = torch.tensor([1.0], requires_grad=True, device="cuda")
            bleu_weight = torch.tensor([1.0],
                                       requires_grad=True,
                                       device="cuda")
        else:
            nli_weight = torch.tensor([1.0], requires_grad=True)
            bleu_weight = torch.tensor([1.0], requires_grad=True)

        print("nli_weight, bleu_weight:",
              nli_weight.data.cpu().numpy()[0],
              bleu_weight.data.cpu().numpy()[0])
        weight_optimizer = optim.Adam([nli_weight, bleu_weight], lr=0.0001)

    optimizer = optim.SGD(
        model.parameters(),
        lr=current_lr,
    )  # Default hyper-parameter is set for SGD.
    print("current learning rate: %f" % current_lr)
    print(optimizer)

    for epoch in range(start_epoch, config.rl_n_epochs + 1):
        sample_cnt = 0
        total_loss, total_actor_loss, total_sample_count, total_word_count, total_parameter_norm, total_grad_norm = 0, 0, 0, 0, 0, 0
        start_time = time.time()
        train_loss = np.inf
        epoch_accuracy = []

        for batch_index, batch in enumerate(train_iter):
            optimizer.zero_grad()

            current_batch_word_cnt = torch.sum(batch.tgt[1])
            x = batch.src
            y = batch.tgt[0][:, 1:]
            batch_size = y.size(0)
            if config.reward_mode != 'bleu':
                premise = batch.premise
                hypothesis = batch.hypothesis
                isSrcPremise = batch.isSrcPremise
                label = batch.labels

            # |x| = (batch_size, length)
            # |y| = (batch_size, length)

            # Sample from the model (is_greedy=False) instead of greedy decoding.
            y_hat, indice = model.search(x,
                                         is_greedy=False,
                                         max_length=config.max_length)

            if config.reward_mode == 'bleu':
                q_actor = get_bleu_reward(y, indice, n_gram=config.rl_n_gram)
                epoch_accuracy.append(q_actor.sum() / batch_size)
            else:
                padded_indice, padded_premise, padded_hypothesis = padding_three_tensors(
                    indice, premise, hypothesis, batch_size)

                # Put the predicted sentence into either the premise or the hypothesis.
                for i in range(batch_size):
                    if not isSrcPremise[i]:
                        padded_premise[i] = padded_indice[i]
                    else:
                        padded_hypothesis[i] = padded_indice[i]

                kwargs = {'p': padded_premise, 'h': padded_hypothesis}
                pred_logit = bimpm(**kwargs)
                accuracy = get_accuracy(pred_logit, label)
                epoch_accuracy.append(accuracy)

                # Based on the result of sampling, get reward.
                if config.reward_mode == 'nli':
                    q_actor = -get_nli_reward(pred_logit, label, nli_criterion)
                else:
                    q_actor = 1/(2 * nli_weight.pow(2)) * -get_nli_reward(pred_logit, label, nli_criterion) \
                        + 1/(2 * bleu_weight.pow(2)) * (get_bleu_reward(y, indice, n_gram=config.rl_n_gram)/100) \
                        + torch.log(nli_weight * bleu_weight)
            # |y_hat| = (batch_size, length, output_size)
            # |indice| = (batch_size, length)
            # |q_actor| = (batch_size)

            # Draw as many samples as n_samples and average their rewards.
            # In practice, n_samples = 1 turns out to be enough.
            baseline = []
            with torch.no_grad():
                for _ in range(config.n_samples):
                    _, sampled_indice = model.search(
                        x, is_greedy=False, max_length=config.max_length)

                    if config.reward_mode == 'bleu':
                        baseline_reward = get_bleu_reward(
                            y, sampled_indice, n_gram=config.rl_n_gram)
                        epoch_accuracy.append(baseline_reward.sum() /
                                              batch_size)
                    else:
                        padded_sampled_indice, padded_premise, padded_hypothesis = padding_three_tensors(
                            sampled_indice, premise, hypothesis, batch_size)

                        # Put the predicted sentence into either the premise or the hypothesis.
                        for i in range(batch_size):
                            if not isSrcPremise[i]:
                                padded_premise[i] = padded_sampled_indice[i]
                            else:
                                padded_hypothesis[i] = padded_sampled_indice[i]

                        kwargs = {'p': padded_premise, 'h': padded_hypothesis}
                        pred_logit = bimpm(**kwargs)
                        accuracy = get_accuracy(pred_logit, label)
                        epoch_accuracy.append(accuracy)

                        # Based on the result of sampling, get reward.
                        if config.reward_mode == 'nli':
                            baseline_reward = -get_nli_reward(
                                pred_logit, label, nli_criterion)
                        else:
                            baseline_reward = 1/(2 * nli_weight.pow(2)) * -get_nli_reward(pred_logit, label, nli_criterion) \
                                + 1/(2 * bleu_weight.pow(2)) * (get_bleu_reward(y, sampled_indice, n_gram=config.rl_n_gram)/100) \
                                + torch.log(nli_weight * bleu_weight)

                    baseline += [baseline_reward]
                baseline = torch.stack(baseline).mean(dim=0)
                # |baseline| = (n_samples, batch_size) --> (batch_size)

            # Now we have a relative expected cumulative reward: the advantage,
            # obtained by subtracting the baseline from q_actor.
            tmp_reward = q_actor - baseline
            # |tmp_reward| = (batch_size)
            # Calculate gradients via back-propagation.
            get_gradient(indice, y_hat, criterion, reward=tmp_reward)

            # Accumulate running statistics for reporting.
            total_loss += float(tmp_reward.sum())
            total_actor_loss += float(q_actor.sum())
            total_sample_count += batch_size
            total_word_count += int(current_batch_word_cnt)
            total_parameter_norm += float(
                utils.get_parameter_norm(model.parameters()))
            total_grad_norm += float(utils.get_grad_norm(model.parameters()))

            if (batch_index + 1) % config.print_every == 0:
                avg_loss = total_loss / total_sample_count
                avg_actor_loss = total_actor_loss / total_sample_count
                avg_parameter_norm = total_parameter_norm / config.print_every
                avg_grad_norm = total_grad_norm / config.print_every
                avg_epoch_accuracy = sum(epoch_accuracy) / len(epoch_accuracy)
                elapsed_time = time.time() - start_time

                print(
                    "epoch: %d batch: %d/%d\t|param|: %.2f\t|g_param|: %.2f\trwd: %.4f\tactor loss: %.4f\tAccuracy: %.2f\t%5d words/s %3d secs"
                    %
                    (epoch, batch_index + 1,
                     int(
                         len(train_iter.dataset.examples) //
                         config.batch_size), avg_parameter_norm, avg_grad_norm,
                     avg_loss, avg_actor_loss, avg_epoch_accuracy,
                     total_word_count // elapsed_time, elapsed_time))

                if config.reward_mode == 'combined':
                    print("nli_weight, bleu_weight:",
                          nli_weight.data.cpu().numpy()[0],
                          bleu_weight.data.cpu().numpy()[0])

                total_loss, total_actor_loss, total_sample_count, total_word_count, total_parameter_norm, total_grad_norm = 0, 0, 0, 0, 0, 0
                epoch_accuracy = []
                start_time = time.time()

                train_loss = avg_actor_loss

            # In order to avoid exploding gradients, we apply gradient clipping.
            torch_utils.clip_grad_norm_(model.parameters(),
                                        config.max_grad_norm)
            # Take a step of gradient descent.
            optimizer.step()
            if config.reward_mode == 'combined':
                weight_optimizer.step()

            sample_cnt += batch_size
            if sample_cnt >= len(train_iter.dataset.examples):
                break

        sample_cnt = 0
        total_reward = 0

        # Start validation
        with torch.no_grad():
            model.eval()  # Turn-off drop-out

            for batch_index, batch in enumerate(valid_iter):
                current_batch_word_cnt = torch.sum(batch.tgt[1])
                x = batch.src
                y = batch.tgt[0][:, 1:]
                batch_size = y.size(0)
                # |x| = (batch_size, length)
                # |y| = (batch_size, length)

                # feed-forward
                y_hat, indice = model.search(x,
                                             is_greedy=True,
                                             max_length=config.max_length)
                # |y_hat| = (batch_size, length, output_size)
                # |indice| = (batch_size, length)

                reward = get_bleu_reward(y, indice, n_gram=config.rl_n_gram)

                total_reward += float(reward.sum())
                sample_cnt += batch_size
                if sample_cnt >= len(valid_iter.dataset.examples):
                    break

            avg_bleu = total_reward / sample_cnt
            print("valid BLEU: %.4f" % avg_bleu)

            if highest_valid_bleu < avg_bleu:
                highest_valid_bleu = avg_bleu
                no_improve_cnt = 0
            else:
                no_improve_cnt += 1

            if valid_nli_iter:
                nli_validation(valid_nli_iter, model, bimpm, config)
            model.train()

        model_fn = config.model.split(".")
        model_fn = model_fn[:-1] + [
            "%02d" % (config.n_epochs + epoch),
            "%.2f-%.4f" % (train_loss, avg_bleu)
        ] + [model_fn[-1]] + [config.reward_mode]

        # PyTorch provides an efficient way to save and load models, based on Python pickle.
        to_save = {
            "model": model.state_dict(),
            "config": config,
            "epoch": config.n_epochs + epoch + 1,
            "current_lr": current_lr
        }
        if others_to_save is not None:
            for k, v in others_to_save.items():
                to_save[k] = v
        torch.save(to_save, '.'.join(model_fn))

        if config.early_stop > 0 and no_improve_cnt > config.early_stop:
            break
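
In Example #40's 'combined' reward mode, `nli_weight` and `bleu_weight` act as learned homoscedastic-uncertainty terms in the spirit of Kendall et al.'s multi-task loss weighting. Reading $\sigma_1 = \texttt{nli\_weight}$ and $\sigma_2 = \texttt{bleu\_weight}$, the reward assembled in the code corresponds to

$$ r_{\text{combined}} = \frac{1}{2\sigma_1^2}\, r_{\text{NLI}} + \frac{1}{2\sigma_2^2}\, \frac{r_{\text{BLEU}}}{100} + \log(\sigma_1 \sigma_2), $$

where $r_{\text{NLI}}$ and $r_{\text{BLEU}}$ come from `get_nli_reward` and `get_bleu_reward`; the $\log(\sigma_1\sigma_2)$ term keeps the learned weights from growing without bound, which would otherwise zero out both reward terms.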
Example #41
                batch.clear()

                optimizer.zero_grad()
                logits_v, value_v = net(states_v)

                loss_value_v = F.mse_loss(value_v.squeeze(-1), vals_ref_v)

                log_prob_v = F.log_softmax(logits_v, dim=1)
                adv_v = vals_ref_v - value_v.detach()
                log_prob_actions_v = adv_v * log_prob_v[range(BATCH_SIZE), actions_t]
                loss_policy_v = -log_prob_actions_v.mean()

                prob_v = F.softmax(logits_v, dim=1)
                entropy_loss_v = ENTROPY_BETA * (prob_v * log_prob_v).sum(dim=1).mean()

                loss_v = entropy_loss_v + loss_value_v + loss_policy_v
                loss_v.backward()
                nn_utils.clip_grad_norm_(net.parameters(), CLIP_GRAD)
                optimizer.step()

                tb_tracker.track("advantage", adv_v, step_idx)
                tb_tracker.track("values", value_v, step_idx)
                tb_tracker.track("batch_rewards", vals_ref_v, step_idx)
                tb_tracker.track("loss_entropy", entropy_loss_v, step_idx)
                tb_tracker.track("loss_policy", loss_policy_v, step_idx)
                tb_tracker.track("loss_value", loss_value_v, step_idx)
                tb_tracker.track("loss_total", loss_v, step_idx)
                tb_tracker.track("dict_size", len(preprocessor), step_idx)

    pass
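
For reference, Example #41 assembles the standard advantage actor-critic objective. With return estimate $R$ and advantage $A = R - V_\theta(s)$, the minimized loss is

$$ \mathcal{L} = \underbrace{-\log \pi_\theta(a \mid s)\, A}_{\text{policy}} + \underbrace{\big(V_\theta(s) - R\big)^2}_{\text{value}} + \beta \underbrace{\sum_a \pi_\theta(a \mid s) \log \pi_\theta(a \mid s)}_{-\,\text{entropy}}, $$

so minimizing the last term maximizes entropy, discouraging premature convergence to a deterministic policy.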