Ejemplo n.º 1
0
def _load_or_create_model(epochs, dataset, continue_from, learning_rate,
                          rnn_type, hidden_size, hidden_layers, momentum, cuda,
                          tensorboard_writer):
    """ Loads a checkpointed model to resume training, or builds a fresh one.

    :param epochs: Total number of epochs planned; sizes the result tensors.
    :param dataset: Dataset name used to locate the weights directory.
    :param continue_from: Checkpoint number to resume from; falsy starts fresh.
    :param learning_rate: Initial SGD learning rate.
    :param rnn_type: RNN cell type; must be a key of `_model.supported_rnns`.
    :param hidden_size: Number of hidden units in the RNN.
    :param hidden_layers: Number of hidden layers in the RNN.
    :param momentum: Nesterov SGD momentum.
    :param cuda: If True, moves the model to the GPU.
    :param tensorboard_writer: Optional writer used to replay prior scores.
    :returns: labels, model, optimizer,
              (avg_tr_loss, avg_val_loss, start_iter, start_epoch),
              (tr_loss_results, val_loss_results, cer_results, wer_results).
    """
    weights_dir = _util.getRelWeightsPath(dataset)
    if continue_from:
        continue_from = _get_checkpoint_filepath(weights_dir, continue_from)
        print('Loading checkpoint model {}'.format(continue_from))

        # Load onto CPU regardless of where the checkpoint was saved.
        package = torch.load(continue_from,
                             map_location=lambda storage, loc: storage)
        model = _model.LipReader.load_model_package(package)
        labels = _model.LipReader.get_labels(model)

        optimizer = torch.optim.SGD(model.parameters(),
                                    lr=learning_rate,
                                    momentum=momentum,
                                    nesterov=True)
        optimizer.load_state_dict(package['optim_dict'])

        # Index start at 0 for training
        start_epoch = int(package.get('epoch', 1)) - 1
        start_iter = package.get('iteration', None)
        if start_iter is None:
            # We saved model after epoch finished, start at the next epoch.
            start_epoch += 1
            start_iter = 0
        else:
            start_iter += 1
        # BUG FIX: these running losses are floats; `int()` silently truncated
        # them on every resume.
        avg_tr_loss = float(package.get('avg_tr_loss', 0))
        avg_val_loss = float(package.get('avg_val_loss', 0))
        tr_loss_results = package['tr_loss_results']
        val_loss_results = package['val_loss_results']
        cer_results = package['cer_results']
        wer_results = package['wer_results']

        # Replay previous scores into the tensorboard logs.
        if tensorboard_writer and package[
                'tr_loss_results'] is not None and start_epoch > 0:
            # REVIEW josephz: Also include train?
            # package['tr_loss_results']
            for i, (val_loss, wer, cer) in enumerate(
                    zip(package['val_loss_results'],
                        package['val_cer_results'],
                        package['val_wer_results'])):
                _tensorboard_log(tensorboard_writer,
                                 dataset,
                                 i,
                                 val_loss,
                                 wer,
                                 cer,
                                 mode="Validation")
    else:
        avg_tr_loss = avg_val_loss = start_iter = start_epoch = 0
        tr_loss_results = torch.Tensor(epochs)
        val_loss_results = torch.Tensor(epochs)
        cer_results = torch.Tensor(epochs)
        wer_results = torch.Tensor(epochs)

        labels_path = os.path.join(weights_dir, 'labels.json')
        # BUG FIX: a bare `except:` also swallowed KeyboardInterrupt/SystemExit.
        # Fall back to the hardcoded labels only on I/O or JSON-decode errors
        # (json.JSONDecodeError is a ValueError subclass).
        try:
            with open(labels_path) as label_file:
                labels = str(''.join(json.load(label_file)))
        except (OSError, ValueError):
            labels = _labels
            _getSharedLogger().warning(
                "Could not open '{}'... using hardcoded labels: '{}'".format(
                    labels_path, labels))

        rnn_type = rnn_type.lower()
        assert rnn_type in _model.supported_rnns, "rnn_type should be either lstm, rnn or gru"

        model = _model.LipReader(rnn_hidden_size=hidden_size,
                                 nb_layers=hidden_layers,
                                 labels=labels,
                                 rnn_type=_model.supported_rnns[rnn_type])
        optimizer = torch.optim.SGD(model.parameters(),
                                    lr=learning_rate,
                                    momentum=momentum,
                                    nesterov=True)

    if cuda:
        model.cuda()
    return labels, model, optimizer, \
      (avg_tr_loss, avg_val_loss, start_iter, start_epoch), \
      (tr_loss_results, val_loss_results, cer_results, wer_results)
Ejemplo n.º 2
0
def train(
    epochs=-1,
    dataset=None,
    batch=-1,
    checkpoint=False,
    train_split=-1.0,
    num_workers=-1,
    hidden_size=-1,
    hidden_layers=-1,
    rnn_type=None,
    cuda=False,
    learning_rate=-1.0,
    momentum=-1.0,
    max_norm=-1,
    annealing=-1.0,
    silent=False,
    tensorboard=False,
    continue_from=-1,
    seed=123456,
):
    """ Runs the primary training loop.

  :param epochs: Number of epochs to train for.
  :param dataset: Location containing dataset generated by 'generate_dataview'.
  :param batch: Number of sequences that are trained concurrently.
  :param checkpoint: Whether or not to save checkpoints for each epoch.
  :param train_split: Fraction of videos which will be in the train set, (1 - train_split) will be validation.
  :param num_workers: Number of workers to use during dataset loading.
  :param hidden_size: Number of hidden units in the RNN.
  :param hidden_layers: Number of hidden layers in RNN.
  :param rnn_type: Type of RNN cell to use; either rnn, gru, or lstm.
  :param cuda: Use CUDA to train this model.
  :param learning_rate: Initial training learning rate.
  :param momentum: Nesterov SGD momentum.
  :param max_norm: L2 norm cutoff to prevent gradient explosion.
  :param annealing: Annealing applied to learning rate every epoch.
  :param silent: Turn off progress tracking per iteration.
  :param tensorboard: Turn on tensorboard graphing.
  :param continue_from: Checkpoint number to start from.
  :param seed: RNG seed for reproducibility.
  """
    torch.manual_seed(seed)
    torch.cuda.manual_seed_all(seed)

    weights_dir = _util.getRelWeightsPath(dataset)
    dataset_dir = _util.getRelDatasetsPath(dataset)

    tensorboard_writer = _get_tensorboard_writer(weights_dir, tensorboard)

    # REVIEW josephz: Can this be further broken down?
    labels, model, optimizer, \
      (avg_tr_loss, avg_val_loss, start_iter, start_epoch), \
      (tr_loss_results, val_loss_results, cer_results, wer_results) = _load_or_create_model(
          epochs, dataset, continue_from, learning_rate,
          rnn_type, hidden_size, hidden_layers,
          momentum, cuda, tensorboard_writer)

    (train_dataset, train_loader), (test_dataset, test_loader) = _get_datasets(
        dataset_dir, train_split, labels, batch, num_workers)

    best_wer = None
    batch_time, data_time, tr_losses, val_losses = _init_averages()

    print(model)
    print("Number of parameters: %d" % _model.LipReader.get_param_size(model))

    # josephz: CTCLoss, see https://github.com/SeanNaren/warp-ctc
    criterion = CTCLoss()
    decoder = _decoder.GreedyDecoder(labels)

    for epoch in range(start_epoch, epochs):
        model.train()
        epoch_start = time.time()

        for i, (data) in enumerate(train_loader, start=start_iter):
            batch_start = time.time()

            inputs, targets, input_percentages, target_sizes = data
            assert len(inputs.shape) == 4 and inputs.shape[2:] == (68, 3)
            batch_size, seq_len, num_pts, pts_dim = inputs.shape
            input_sizes = input_percentages.mul(int(inputs.size(1))).int()

            # Measure elapsed data loading time.
            data_time.update(time.time() - batch_start)

            if cuda:
                inputs = inputs.cuda()

            out, output_sizes = model(inputs, input_sizes)
            out = out.transpose(0, 1)  # TxNxH

            # acts: Tensor of (seqLength x batch x outputDim) containing output activations from network (before softmax)
            # labels: 1 dimensional Tensor containing all the targets of the batch in one large sequence
            # act_lens: Tensor of size (batch) containing size of each output sequence from the network
            # label_lens: Tensor of (batch) containing label length of each example
            assert len(targets.shape) == 1
            assert len(
                out.shape) == 3 and out.shape[:2] == (seq_len, batch_size)
            tr_loss = criterion(out, targets, output_sizes, target_sizes)
            # Average loss by minibatch.
            tr_loss /= inputs.size(0)

            # Renamed from the misleading `val_loss_value`: this is the
            # per-batch *training* loss.
            tr_loss_value = tr_loss.item()
            if tr_loss_value == np.inf or tr_loss_value == -np.inf:
                print("WARNING: received an inf loss, setting loss value to 0")
                tr_loss_value = 0

            avg_tr_loss += tr_loss_value
            tr_losses.update(tr_loss_value, inputs.size(0))

            # Compute gradient.
            optimizer.zero_grad()
            tr_loss.backward()
            # Clip gradients in-place to prevent explosion.
            torch.nn.utils.clip_grad_norm_(model.parameters(), max_norm)

            # SGD step!
            optimizer.step()

            # Measure elapsed batch time.
            batch_time.update(time.time() - batch_start)
            if not silent:
                # BUG FIX: the original format string 'Epoch[{}][{}{}]' was
                # missing the '/' separator between iteration and total.
                print('Epoch[{}][{}/{}]'.format(epoch + 1, i + 1,
                                                len(train_loader)),
                      end='\t')
                print('Time {:0.3f} ({:0.3f})'.format(batch_time.val,
                                                      batch_time.avg),
                      end='\t')
                print('Data {:0.3f} ({:0.3f})'.format(data_time.val,
                                                      data_time.avg),
                      end='\t')
                print('Loss {:0.4f} ({:0.4f})'.format(tr_losses.val,
                                                      tr_losses.avg))
        avg_tr_loss /= len(train_loader)

        print('Training Summary Epoch: [{}]'.format(epoch + 1), end='\t')
        # BUG FIX: removed a duplicated "Time taken" print.
        print('Time taken (s): {:0.0f}'.format(time.time() - epoch_start))
        print('Average Training Loss: {:0.3f}'.format(avg_tr_loss))

        # Reset start iteration in preparation for next epoch.
        start_iter = 0

        total_cer = total_wer = 0
        model.eval()
        with torch.no_grad():
            for i, (data) in tqdm.tqdm(enumerate(test_loader),
                                       total=len(test_loader)):
                inputs, targets, input_percentages, target_sizes = data
                input_sizes = input_percentages.mul_(int(inputs.size(1))).int()
                batch_size, seq_len, num_pts, pts_dim = inputs.shape

                # Unflatten the concatenated targets into per-sample lists.
                split_targets = []
                offset = 0
                for size in target_sizes:
                    split_targets.append(targets[offset:offset + size])
                    offset += size

                if cuda:
                    inputs = inputs.cuda()

                out, output_sizes = model(inputs, input_sizes)
                out_loss = out.transpose(0, 1)  # TxNxH
                assert len(targets.shape) == 1
                assert len(
                    out_loss.shape) == 3 and out_loss.shape[:2] == (seq_len,
                                                                    batch_size)
                # out is supposed to be (seqLength x batch x outputDim).
                val_loss = criterion(out_loss, targets, output_sizes,
                                     target_sizes)

                val_loss_value = val_loss.item()
                if val_loss_value == np.inf or val_loss_value == -np.inf:
                    print(
                        "WARNING: received an inf loss, setting loss value to 0"
                    )
                    val_loss_value = 0

                avg_val_loss += val_loss_value
                val_losses.update(val_loss_value, inputs.size(0))

                decoded_output, _ = decoder.decode(out.data, output_sizes)
                target_strings = decoder.convert_to_strings(split_targets)

                # Accumulate WER/CER, normalized per reference length.
                for x in range(len(target_strings)):
                    transcript, reference = decoded_output[x][
                        0], target_strings[x][0]
                    total_wer += decoder.wer(transcript, reference) / float(
                        len(reference.split()))
                    total_cer += decoder.cer(transcript, reference) / float(
                        len(reference))
            avg_val_loss /= len(test_loader)

            val_loss_results[epoch] = avg_val_loss
            wer = wer_results[epoch] = 100 * total_wer / len(
                test_loader.dataset)  # .dataset?
            cer = cer_results[epoch] = 100 * total_cer / len(
                test_loader.dataset)

            print('Validation Summary Epoch: [{}]'.format(epoch + 1), end='\t')
            print('Average WER: {:0.3f}'.format(wer_results[epoch]), end='\t')
            print('Average CER: {:0.3f}'.format(cer_results[epoch]), end='\t')
            print('Average Validation Loss: {:0.3f}'.format(avg_val_loss))

            if tensorboard:
                _tensorboard_log(tensorboard_writer,
                                 dataset,
                                 epoch + 1,
                                 avg_val_loss,
                                 wer,
                                 cer,
                                 mode="Validation")
            if checkpoint:
                weights_path = _get_checkpoint_filepath(weights_dir, epoch + 1)
                torch.save(
                    _model.LipReader.serialize(model,
                                               optimizer=optimizer,
                                               epoch=epoch,
                                               loss_results=val_loss_results,
                                               wer_results=wer_results,
                                               cer_results=cer_results),
                    weights_path)

            # Do annealing.
            optim_state = optimizer.state_dict()
            optim_state['param_groups'][0]['lr'] /= annealing
            optimizer.load_state_dict(optim_state)

            if best_wer is None or best_wer > wer_results[epoch]:
                model_path = os.path.join(weights_dir, 'model.pth')
                # BUG FIX: the original printed the bound `str.format` method
                # itself instead of calling it with the destination path.
                print('Found better validated model, saving to {}'.format(
                    model_path))
                weights_path = _get_checkpoint_filepath(weights_dir, epoch + 1)

                # Reuse this epoch's checkpoint file if it exists; otherwise
                # serialize the model directly.
                if os.path.isfile(weights_path):
                    shutil.copyfile(weights_path, model_path)
                else:
                    torch.save(
                        _model.LipReader.serialize(
                            model,
                            optimizer=optimizer,
                            epoch=epoch,
                            loss_results=val_loss_results,
                            wer_results=wer_results,
                            cer_results=cer_results), model_path)
                best_wer = wer

        # BUG FIX: reset the running-loss accumulators every epoch. Previously
        # `avg_tr_loss` was only zeroed when a better model was found and
        # `avg_val_loss` was never reset, so both compounded their
        # already-averaged values across epochs.
        avg_tr_loss = 0
        avg_val_loss = 0
Ejemplo n.º 3
0
# Hyper-parameter configuration for this training run.
hidden_size = 800
hidden_layers = 5
train_split = 0.5
rnn_type = "gru"
epochs = 70
cuda = False
learning_rate = 3e-4
momentum = 0.9
max_norm = 400
anneal = 1.1
silent = True
checkpoint = True
tensorboard = True
continue_from = 0

# Resolve the standard project directories for this dataset.
weights_dir = _util.getRelWeightsPath(dataset)
dataset_dir = _util.getRelDatasetsPath(dataset)
raw_dir = _util.getRelRawPath(dataset)

# tensorboard_writer = _train._get_tensorboard_writer(weights_dir, tensorboard)
#
# labels, model, optimizer, \
#   avg_tr_loss, avg_val_loss, start_iter, start_epoch, \
#   loss_results, val_loss_results, cer_results, wer_results = _train._load_or_create_model(
#     epochs, dataset, continue_from, learning_rate, rnn_type, hidden_size, hidden_layers, momentum, cuda, tensorboard_writer)
#
# (train_dataset, train_loader), (test_dataset, test_loader) = _train._get_datasets(
#   dataset_dir, train_split, labels, batch, num_workers)
#
# for i, (data) in enumerate(train_loader, start=start_iter):
#   assert len(data) == 2
Ejemplo n.º 4
0
def train(
    data="StephenColbert/medium_no_vtx1",
    labels="labels.json",
    sentence_dataset=False,
    occlussion_threshold=0.8,
    train_split=0.8,
    num_workers=1,
    refresh=False,
    patience=10,
    batch_size=4,
    learning_rate=1e-4,
    annealings=2,
    enable_ctc=False,
    grad_norm=50,
    tr_epochs=50,
    max_tfr=0.9,
    min_tfr=0.0,
    num_layers=1,
    frame_dim=68 * 3,
    hidden_size=700,
    char_dim=300,
    rnn_type='LSTM',
    attention_type='1_layer_nn',
    attn_hidden_size=-1,
    bidirectional=False,
    rnn_dropout=0.0,
    seed=123456,
    cuda=False,
):
    """ Runs the primary training loop.

  :param data: Dataset name passed to `_get_datasets`.
  :param labels: Labels filename.
  :param sentence_dataset: Whether to build the sentence-level dataset.
  :param occlussion_threshold: Occlusion threshold forwarded to `_get_datasets`.
  :param train_split: Fraction of data used for the training set.
  :param num_workers: Number of DataLoader workers.
  :param refresh: Force dataset regeneration.
  :param patience: Epochs without improvement before the LR is annealed.
  :param batch_size: Number of sequences per batch.
  :param learning_rate: Initial Adam learning rate.
  :param annealings: Number of times to anneal learning rate before training is finished.
  :param enable_ctc: Enable the auxiliary CTC loss.
  :param grad_norm: Gradient-norm clipping threshold.
  :param tr_epochs: Epoch count over which teacher forcing decays linearly.
  :param max_tfr: Initial (maximum) teacher-forcing ratio.
  :param min_tfr: Floor for the teacher-forcing ratio.
  :param num_layers: Number of RNN layers.
  :param frame_dim: Flattened input frame dimension (68 landmarks x 3).
  :param hidden_size: RNN hidden size.
  :param char_dim: Character embedding dimension.
  :param rnn_type: RNN cell type.
  :param attention_type: Attention mechanism identifier.
  :param attn_hidden_size: Attention hidden size (-1 for the default).
  :param bidirectional: Use a bidirectional encoder.
  :param rnn_dropout: RNN dropout probability.
  :param seed: RNG seed for reproducibility.
  :param cuda: Train on GPU when True.
  """
    # Setup seed.
    torch.manual_seed(seed)
    torch.cuda.manual_seed_all(seed)
    rand = np.random.RandomState(seed=seed)

    # Setup device.
    # REVIEW josephz: Is there a clean way to use multiple or different GPUs?
    device = torch.device('cuda') if cuda else torch.device('cpu')
    print("Device: ", device)

    # Init Data.
    print("Initializing dataset '{}'".format(data))
    train_dataset, val_dataset, test_dataset = _get_datasets(
        data,
        train_split,
        sentence_dataset,
        threshold=occlussion_threshold,
        labels=labels,
        rand=rand,
        refresh=refresh,
        include_test=True)
    train_loader = _data.DataLoader(train_dataset,
                                    batch_size=batch_size,
                                    num_workers=num_workers,
                                    collate_fn=_data_loader._collate_fn)
    val_loader = _data.DataLoader(val_dataset,
                                  batch_size=batch_size,
                                  num_workers=num_workers,
                                  collate_fn=_data_loader._collate_fn)
    test_loader = _data.DataLoader(test_dataset,
                                   batch_size=batch_size,
                                   num_workers=num_workers,
                                   collate_fn=_data_loader._collate_fn)
    # Init Models.
    print("Initializing model")
    encoder, decoding_step = _init_models(train_dataset.char2idx, num_layers,
                                          frame_dim, hidden_size, char_dim,
                                          enable_ctc, rnn_type, attention_type,
                                          attn_hidden_size, bidirectional,
                                          rnn_dropout, device)

    # Initialize Logging.
    weights_dir = _util.getRelWeightsPath(data, use_existing=False)

    tensorboard_writer = tensorboardX.SummaryWriter(weights_dir)
    _getSharedLogger().info("Writing Tensorboard logs to '%s'", weights_dir)
    print()
    print("Try visualizing by running the following:")
    print(f"\ttensorboard --logdir='{weights_dir}'")
    print(
        "Then open the following URL in your local browser. "
        "\n\tIf you're running on a remote machine see `README_TENSORBOARD.md` for help..."
    )

    # REVIEW josephz: Multi-input support doesn't seem ready yet: https://github.com/lanpa/tensorboardX/issues/256
    # tensorboard_writer.add_graph(encoder,
    #   torch.autograd.Variable(
    #     torch.tensor([torch.zeros(batch_size, 100, 68, 3), torch.zeros(batch_size,))))
    # tensorboard_writer.add_graph(decoding_step,
    #   torch.autograd.Variable(
    #     torch.tensor(torch.zeros(batch_size,), torch.zeros(num_layers, batch_size, hidden_size), torch.zeros(batch_size,), torch.zeros(batch_size, 100,
    #       hidden_size))))

    # Train.
    val_cers = []
    train_decoder_losses = []
    train_ctc_losses = []

    best_val_cer = 1.0
    best_val_cer_idx = -1

    # Initial evaluation
    print("Initial evaluation...")
    decoder_loss, val_correct, val_count = _train.eval(encoder, decoding_step,
                                                       val_loader, device,
                                                       train_dataset.char2idx)
    val_cer = (val_count - val_correct).float() / val_count
    print("\tCER: ", str(val_cer))

    encoder_path = os.path.join(weights_dir, "best_encoder.pth")
    decoder_path = os.path.join(weights_dir, "best_decoder.pth")

    num_epochs = 0
    num_annealings = 0

    print("Beginning training loop")
    ts = time.time()
    while val_cer < best_val_cer or num_annealings < annealings:
        print("Epoch {}:".format(num_epochs + 1))

        if num_epochs - best_val_cer_idx > patience:
            # If the model does not improve after our set 'patience' number of epochs, we will reduce the learning rate.
            num_annealings += 1
            learning_rate /= 5
            print(f'\tAnnealing to {learning_rate}')
            # Rewind to the best weights seen so far before continuing.
            restore(encoder, encoder_path)
            restore(decoding_step, decoder_path)

            # Must set best val CER to here, or else this will also trigger next loop
            # if val CER does not go down.
            best_val_cer_idx = num_epochs

        # Apply linear teacher-forcing ratio decay.
        curr_tfr = max(min_tfr, max_tfr - num_epochs / tr_epochs)
        assert 0.0 <= curr_tfr <= 1.0
        print(f'\tCurrent Teacher Forcing Ratio: {curr_tfr}')

        # NOTE(review): a fresh Adam optimizer is constructed every epoch,
        # which discards Adam's moment estimates — confirm this is intended.
        avg_decoder_loss, avg_ctc_loss = _train.train(
            encoder,
            decoding_step,
            train_loader,
            opt=torch.optim.Adam(list(encoder.parameters()) +
                                 list(decoding_step.parameters()),
                                 lr=learning_rate),
            device=device,
            char2idx=train_dataset.char2idx,
            teacher_forcing_ratio=curr_tfr,
            grad_norm=grad_norm)
        print(f'\tAVG Decoder Loss: {avg_decoder_loss}')
        print(f'\tAVG CTC Loss: {avg_ctc_loss}')
        tensorboard_writer.add_scalar(os.path.join(data, 'avg decoder loss'),
                                      avg_decoder_loss,
                                      global_step=num_epochs)
        tensorboard_writer.add_scalar(os.path.join(data, 'avg CTC loss'),
                                      avg_ctc_loss,
                                      global_step=num_epochs)

        decoder_loss, val_correct, val_count = _train.eval(
            encoder, decoding_step, val_loader, device, train_dataset.char2idx)
        _, train_correct, train_count = _train.eval(encoder, decoding_step,
                                                    train_loader, device,
                                                    train_dataset.char2idx)

        val_cer = (val_count - val_correct).float() / val_count
        train_cer = (train_count - train_correct).float() / train_count

        encoder.save_best_model(val_cer, encoder_path)
        decoding_step.save_best_model(val_cer, decoder_path)

        print(f'\tTrain CER: {train_cer}')
        print(f'\tVal CER: {val_cer}')

        # ANALYSIS
        encoder.eval()
        decoding_step.eval()
        with torch.no_grad():
            # CER
            _, test_correct, test_count = _train.eval(encoder, decoding_step,
                                                      test_loader, device,
                                                      train_dataset.char2idx)
            test_cer = (test_count - test_correct).float() / test_count
            # BUG FIX: this previously printed `train_cer` under the
            # "Test CER" label.
            print(f'\tTest CER: {test_cer}')

            # Sample teacher forcing output
            print('Some teacher-forcing outputs:')
            _analysis.print_samples(encoder,
                                    decoding_step,
                                    test_loader,
                                    device,
                                    train_dataset.char2idx,
                                    max_=10)

            # Confusion matrix (best-effort; failure must not abort training).
            print('drawing confusion matrix:')
            try:
                _analysis.get_confusion_matrix(encoder, decoding_step,
                                               test_loader, device,
                                               test_dataset.char2idx,
                                               num_epochs)
            except Exception as e:
                # BUG FIX: the bare `except:` hid the actual error (and could
                # swallow KeyboardInterrupt); keep best-effort but report it.
                print(
                    'oops something wrong happened in drawing confusion matrix'
                )
                print(e)

            # Inference on the first couple of test samples with beam search.
            print('Some student-forcing outputs with beam search:')
            for frames, frame_lens, chars, char_lens in test_loader:
                # Only decode the first two samples of the first batch.
                frames, frame_lens, chars, char_lens = (frames[:2],
                                                        frame_lens[:2],
                                                        chars[:2],
                                                        char_lens[:2])
                frames, frame_lens, chars, char_lens = frames.to(
                    device), frame_lens.to(device), chars.to(
                        device), char_lens.to(device)
                pred, gt = _analysis.inference(encoder,
                                               decoding_step,
                                               frames,
                                               frame_lens,
                                               chars,
                                               char_lens,
                                               device,
                                               test_dataset.char2idx,
                                               beam_width=10,
                                               max_label_len=100)
                for gt_, pred_ in zip(gt, pred):
                    print(f'GTL\t: {gt_}')
                    print(f'Pred\t: {pred_}')
                break
        tensorboard_writer.add_scalars(os.path.join(data, 'CER'), {
            "Train": train_cer,
            "Val": val_cer
        },
                                       global_step=num_epochs)
        tensorboard_writer.add_scalar(os.path.join(data, 'learning rate'),
                                      learning_rate,
                                      global_step=num_epochs)

        val_cers.append(val_cer)
        train_decoder_losses.append(avg_decoder_loss)
        train_ctc_losses.append(avg_ctc_loss)

        if val_cer < best_val_cer:
            best_val_cer = val_cer
            best_val_cer_idx = num_epochs

        num_epochs += 1

    te = time.time()
    total_time = te - ts
    print()
    print("Training complete: Took '{}' seconds, or '{}' per epoch".format(
        total_time, total_time / num_epochs))
    print("Training Statistics")
    print("\tBest Val CER: '{}'".format(np.min(val_cers)))
    print("\tBest Decoder Loss: '{}'".format(np.min(train_decoder_losses)))
    print("\tBest CTC Loss: '{}'".format(np.min(train_ctc_losses)))
    print()
Ejemplo n.º 5
0
        return meta

    @staticmethod
    def is_parallel(model):
        return isinstance(model, torch.nn.parallel.DataParallel) or \
               isinstance(model, torch.nn.parallel.DistributedDataParallel)


if __name__ == '__main__':
    import os.path
    import argparse

    # Small CLI that prints summary information about a trained checkpoint.
    parser = argparse.ArgumentParser(
        description='LipReading model information')
    parser.add_argument('--model-path',
                        default=_util.getRelWeightsPath("lipreader", "v0"),
                        help='Path to model file created by training')
    args = parser.parse_args()
    # BUG FIX: removed an unused `torch.load` of the same file — it loaded the
    # entire checkpoint a second time and the result was never used;
    # `LipReader.load_model` reads the checkpoint itself.
    model = LipReader.load_model(args.model_path)

    print("Model name:         ", os.path.basename(args.model_path))
    print("DeepSpeech version: ", model._version)
    print("")
    print("Recurrent Neural Network Properties")
    print("  RNN Type:         ", model._rnn_type.__name__.lower())
    print("  RNN Layers:       ", model._hidden_layers)
    print("  RNN Size:         ", model._hidden_size)
    print("  Classes:          ", len(model._labels))
    print("")