Example #1
    def run(self):
        logger.debug('loader %d start' % self.thread_id)
        while True:
            items = list()

            for _ in range(self.batch_size):
                if self.index >= self.dataset_count:
                    break

                input, label = self.dataset.get_item(self.index)

                if input is not None:
                    items.append((input, label))

                self.index += 1

            if len(items) == 0:
                # nothing left to load: push an empty batch so the consumer
                # knows this loader has finished, then stop the thread
                batch = self.create_empty_batch()
                self.queue.put(batch)
                break

            random.shuffle(items)

            batch = self.collate_fn(items)
            self.queue.put(batch)

        logger.debug('loader %d stop' % self.thread_id)
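The loader above pushes collated batches onto a shared queue and, once its slice of the dataset is exhausted, pushes an empty batch as a stop signal. Below is a minimal consumer-side sketch of that hand-off; the AudioLoader class, its constructor arguments, and the batch layout are assumptions chosen to match the training loops in the later examples.

import queue

batch_queue = queue.Queue(maxsize=8)
num_workers = 4

# AudioLoader is assumed to be a threading.Thread subclass whose run() is shown above.
loaders = [AudioLoader(dataset, batch_queue, batch_size=32, thread_id=idx)
           for idx in range(num_workers)]
for loader in loaders:
    loader.start()

remaining = num_workers
while remaining > 0:
    feats, targets, feat_lens, target_lens = batch_queue.get()
    if feats.shape[0] == 0:  # empty batch: one loader has finished
        remaining -= 1
        continue
    # ... run one training step on the batch ...

for loader in loaders:
    loader.join()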
Example #2
def supervised_train(model, config, epoch, total_time_step, queue,
                     criterion, optimizer, device, train_begin, worker_num,
                     print_every=10, teacher_forcing_ratio=0.90):
    r"""
    Args:
        train_begin: train begin time
        total_time_step: total time step in epoch
        epoch (int): present epoch
        config (Config): configuration
        model (torch.nn.Module): Model to be trained
        optimizer (torch.optim): optimizer for training
        teacher_forcing_ratio (float): probability that teacher forcing will be used (default: 0.90)
        print_every (int): number of time steps between log outputs
        queue (queue.Queue): queue for threading
        criterion (torch.nn.Module): one of PyTorch's loss functions.
          Refer to http://pytorch.org/docs/master/nn.html#loss-functions for a list of them.
        device (torch.device): device used ('cuda' or 'cpu')
        worker_num (int): the number of data-loading workers (CPU cores) used

    Returns: loss, cer
        - **loss** (float): loss of present epoch
        - **cer** (float): character error rate
    """
    epoch_loss_total = 0.
    print_loss_total = 0.
    total_num = 0
    total_dist = 0
    total_length = 0
    time_step = 0
    decay_speed = 1.0

    RAMPUP_POWER = 3
    RAMPUP_PERIOD = 3000
    EXP_DECAY_PERIOD = total_time_step * 3

    model.train()
    begin = epoch_begin = time.time()

    while True:
        # LR Warm-Up: during epoch 0, ramp the learning rate up to high_plateau_lr
        # over the first RAMPUP_PERIOD steps, following a cubic curve.
        if config.use_multistep_lr and epoch == 0 and time_step < RAMPUP_PERIOD:
            set_lr(optimizer, lr=config.high_plateau_lr * ((time_step + 1) / RAMPUP_PERIOD) ** RAMPUP_POWER)

        # LR Exponential-Decay: during epochs 1-3, decay the learning rate toward low_plateau_lr
        if config.use_multistep_lr and epoch in (1, 2, 3):
            decay_rate = config.low_plateau_lr / config.high_plateau_lr
            decay_speed *= decay_rate ** (1 / EXP_DECAY_PERIOD)
            set_lr(optimizer, config.high_plateau_lr * decay_speed)

        feats, scripts, feat_lens, target_lens = queue.get()

        if feats.shape[0] == 0:
            # empty feats means closing one loader
            worker_num -= 1
            logger.debug('left train_loader: %d' % worker_num)

            if worker_num == 0:
                break
            else:
                continue

        inputs = feats.to(device)
        scripts = scripts.to(device)
        targets = scripts[:, 1:]  # exclude the SOS token when computing the loss

        model.module.flatten_parameters()
        y_hat, logit = model(inputs, scripts, teacher_forcing_ratio=teacher_forcing_ratio)

        loss = criterion(logit.contiguous().view(-1, logit.size(-1)), targets.contiguous().view(-1))
        epoch_loss_total += loss.item()
        print_loss_total += loss.item()

        total_num += sum(feat_lens)
        dist, length = get_distance(targets, y_hat, id2char, EOS_TOKEN)
        total_dist += dist
        total_length += length

        optimizer.zero_grad()
        loss.backward()
        optimizer.step()

        time_step += 1
        torch.cuda.empty_cache()

        if time_step % print_every == 0:
            current = time.time()
            elapsed = current - begin
            epoch_elapsed = (current - epoch_begin) / 60.0
            train_elapsed = (current - train_begin) / 3600.0

            logger.info('timestep: {:4d}/{:4d}, loss: {:.4f}, cer: {:.2f}, elapsed: {:.2f}s {:.2f}m {:.2f}h'.format(
                time_step,
                total_time_step,
                print_loss_total / print_every,
                total_dist / total_length,
                elapsed, epoch_elapsed, train_elapsed)
            )
            print_loss_total = 0
            begin = time.time()

        if time_step % 1000 == 0:
            save_step_result(train_step_result, epoch_loss_total / total_num, total_dist / total_length)

        if time_step % 10000 == 0:
            torch.save(model, "./data/weight_file/epoch_%s_step_%s.pt" % (str(epoch), str(time_step)))

    logger.info('train() completed')

    return epoch_loss_total / total_num, total_dist / total_length
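`set_lr` is referenced above but not shown. A minimal sketch consistent with how it is called (the helper name comes from the example, the body is an assumption): overwrite the learning rate of each of the optimizer's parameter groups in place.

def set_lr(optimizer, lr):
    # Assign the given learning rate to every parameter group of the optimizer.
    for param_group in optimizer.param_groups:
        param_group['lr'] = lr

With this helper, epoch 0 warms the rate up to `config.high_plateau_lr` along a cubic curve over the first `RAMPUP_PERIOD` steps, and epochs 1-3 decay it exponentially toward `config.low_plateau_lr`.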
Example #3
def supervised_train(model,
                     hparams,
                     epoch,
                     total_time_step,
                     queue,
                     criterion,
                     optimizer,
                     device,
                     train_begin,
                     worker_num,
                     print_time_step=10,
                     teacher_forcing_ratio=0.90):
    """
    Args:
        model (torch.nn.Module): Model to be trained
        optimizer (torch.optim): optimizer for training
        teacher_forcing_ratio (float): probability that teacher forcing will be used (default: 0.90)
        print_time_step (int): number of time steps between log outputs
        queue (queue.Queue): queue for threading
        criterion (torch.nn.Module): one of PyTorch's loss functions. Refer to http://pytorch.org/docs/master/nn.html#loss-functions for a list of them.
        device (torch.device): device used ('cuda' or 'cpu')
        worker_num (int): the number of data-loading workers (CPU cores) used

    Returns: loss, cer
        - **loss** (float): loss of present epoch
        - **cer** (float): character error rate
    """
    total_loss = 0.
    total_num = 0
    total_dist = 0
    total_length = 0
    total_sent_num = 0
    time_step = 0

    model.train()
    begin = epoch_begin = time.time()

    while True:
        if hparams.use_multistep_lr and epoch == 0 and time_step < 1000:
            ramp_up(optimizer, time_step, hparams)
        if hparams.use_multistep_lr and epoch == 1:
            exp_decay(optimizer, total_time_step, hparams)
        feats, targets, feat_lengths, label_lengths = queue.get()
        if feats.shape[0] == 0:
            # empty feats means closing one loader
            worker_num -= 1
            logger.debug('left train_loader: %d' % (worker_num))

            if worker_num == 0:
                break
            else:
                continue
        optimizer.zero_grad()

        feats = feats.to(device)
        targets = targets.to(device)
        target = targets[:, 1:]  # exclude the SOS token when computing the loss
        model.module.flatten_parameters()

        y_hat, logit = model(feats,
                             targets,
                             teacher_forcing_ratio=teacher_forcing_ratio)
        loss = criterion(logit.contiguous().view(-1, logit.size(-1)),
                         target.contiguous().view(-1))

        total_loss += loss.item()
        total_num += sum(feat_lengths)
        dist, length = get_distance(target, y_hat, id2char, EOS_TOKEN)
        total_dist += dist
        total_length += length
        total_sent_num += target.size(0)
        loss.backward()
        optimizer.step()

        if time_step % print_time_step == 0:
            current = time.time()
            elapsed = current - begin
            epoch_elapsed = (current - epoch_begin) / 60.0
            train_elapsed = (current - train_begin) / 3600.0

            logger.info(
                'timestep: {:4d}/{:4d}, loss: {:.4f}, cer: {:.2f}, elapsed: {:.2f}s {:.2f}m {:.2f}h'
                .format(time_step, total_time_step, total_loss / total_num,
                        total_dist / total_length, elapsed, epoch_elapsed,
                        train_elapsed))
            begin = time.time()

        if time_step % 1000 == 0:
            save_step_result(train_step_result, total_loss / total_num,
                             total_dist / total_length)

        if time_step % 10000 == 0:
            torch.save(model, "model.pt")
            torch.save(
                model, "./data/weight_file/epoch_%s_step_%s.pt" %
                (str(epoch), str(time_step)))

        time_step += 1
        supervised_train.cumulative_batch_count += 1
        torch.cuda.empty_cache()  # free GPU memory; if you have enough GPU memory, delete this line

    loss = total_loss / total_num
    cer = total_dist / total_length
    logger.info('train() completed')
    return loss, cer
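`get_distance` accumulates the numerator and denominator of the character error rate (CER = total edit distance / total reference length). A rough sketch of such a helper using the python-Levenshtein package; the id-to-character mapping, the EOS/padding handling, and the padding id of 0 are assumptions rather than the actual implementation:

import Levenshtein as Lev

def get_distance(targets, y_hats, id2char, eos_id, pad_id=0):
    total_dist, total_length = 0, 0
    for target, y_hat in zip(targets, y_hats):
        # Convert id sequences to strings, dropping EOS and padding ids (assumed).
        ref = ''.join(id2char[int(t)] for t in target if int(t) not in (eos_id, pad_id))
        hyp = ''.join(id2char[int(y)] for y in y_hat if int(y) not in (eos_id, pad_id))
        total_dist += Lev.distance(hyp, ref)
        total_length += len(ref)
    return total_dist, total_length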
Example #4
def supervised_train(model, queue, perplexity, optimizer, device, print_every,
                     epoch, teacher_forcing_ratio, worker_num, total_time_step,
                     train_begin):
    print_loss_total = 0  # Reset every print_every
    epoch_loss_total = 0  # Reset every epoch
    total_num = 0
    time_step = 0

    model.train()
    begin = epoch_begin = time.time()

    while True:
        loss = perplexity
        inputs, targets, input_lens, target_lens = queue.get()

        if inputs.shape[0] == 0:
            # an empty batch means one loader has closed
            worker_num -= 1
            logger.debug('left train_loader: %d' % worker_num)

            if worker_num == 0:
                break
            else:
                continue

        inputs = inputs.to(device)
        targets = targets.to(device)

        model.module.flatten_parameters()
        outputs = model(inputs, teacher_forcing_ratio=teacher_forcing_ratio)

        # Get loss
        loss.reset()
        for step, step_output in enumerate(outputs):
            batch_size = targets.size(0)
            loss.eval_batch(step_output.contiguous().view(batch_size, -1),
                            targets[:, step])
        # Backpropagation
        model.zero_grad()
        loss.backward()
        optimizer.step()
        loss = loss.get_loss()

        epoch_loss_total += loss
        print_loss_total += loss
        total_num += sum(input_lens)

        time_step += 1
        torch.cuda.empty_cache()

        if time_step % print_every == 0:
            current = time.time()
            elapsed = current - begin
            epoch_elapsed = (current - epoch_begin) / 60.0
            train_elapsed = (current - train_begin) / 3600.0

            logger.info(
                'timestep: {:4d}/{:4d}, perplexity: {:.4f}, elapsed: {:.2f}s {:.2f}m {:.2f}h'
                .format(time_step, total_time_step,
                        print_loss_total / print_every, elapsed, epoch_elapsed,
                        train_elapsed))
            print_loss_total = 0
            begin = time.time()

        if time_step % 50000 == 0:
            torch.save(model,
                       "./data/epoch%s_%s.pt" % (str(epoch), str(time_step)))

    logger.info('train() completed')

    return epoch_loss_total / total_num
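In this example, `perplexity` behaves like a stateful loss object with `reset`, `eval_batch`, `backward`, and `get_loss` methods. Below is a minimal sketch of a wrapper exposing that interface, built on `nn.NLLLoss`; the class name, the padding id of 0, and the overflow clamp are assumptions, not the actual implementation used here.

import math
import torch.nn as nn

class Perplexity:
    """Accumulates per-step NLL over a batch and reports exp(mean NLL)."""

    def __init__(self, pad_id=0):
        self.criterion = nn.NLLLoss(ignore_index=pad_id, reduction='sum')
        self.pad_id = pad_id
        self.reset()

    def reset(self):
        self.acc_loss = 0
        self.norm_term = 0

    def eval_batch(self, step_output, target):
        # step_output: (batch, vocab) log-probabilities for one decoder step
        self.acc_loss = self.acc_loss + self.criterion(step_output, target)
        self.norm_term += target.ne(self.pad_id).sum().item()

    def backward(self):
        self.acc_loss.backward()

    def get_loss(self):
        nll = self.acc_loss.item() / max(self.norm_term, 1)
        return math.exp(min(nll, 100))  # clamp to avoid float overflow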