Example #1
def create_infer_dags(
    neural_factory, neural_modules, tacotron2_params, infer_dataset, infer_batch_size, cpu_per_dl=1,
):
    # Unpack the Tacotron 2 modules: text embedding, encoder, (inference)
    # decoder and postnet; the other slots are not needed for inference.
    (_, text_embedding, t2_enc, t2_dec, t2_postnet, _, _) = neural_modules

    # Text-only data layer: reads the transcripts to synthesize and appends
    # BOS/EOS/PAD ids after the regular label vocabulary.
    data_layer = nemo_asr.TranscriptDataLayer(
        path=infer_dataset,
        labels=tacotron2_params['labels'],
        batch_size=infer_batch_size,
        num_workers=cpu_per_dl,
        # load_audio=False,
        bos_id=len(tacotron2_params['labels']),
        eos_id=len(tacotron2_params['labels']) + 1,
        pad_id=len(tacotron2_params['labels']) + 2,
        shuffle=False,
    )
    transcript, transcript_len = data_layer()

    # Inference DAG: embed the character/phone ids and run them through the
    # Tacotron 2 encoder.
    transcript_embedded = text_embedding(char_phone=transcript)
    transcript_encoded = t2_enc(char_phone_embeddings=transcript_embedded, embedding_length=transcript_len,)
    if isinstance(t2_dec, nemo_tts.Tacotron2DecoderInfer):
        # The inference decoder generates mel frames autoregressively and also
        # returns the stop-gate outputs, attention alignments and mel lengths.
        mel_decoder, gate, alignments, mel_len = t2_dec(
            char_phone_encoded=transcript_encoded, encoded_length=transcript_len,
        )
    else:
        raise ValueError("The Neural Module for tacotron2 decoder was not understood")
    # The postnet refines the decoder's mel-spectrogram prediction.
    mel_postnet = t2_postnet(mel_input=mel_decoder)

    return [mel_postnet, gate, alignments, mel_len]
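
A minimal usage sketch for the function above, assuming NeMo 0.x-style imports; the build_tacotron2_modules helper, "tacotron2.yaml" and "infer.json" are placeholders for whatever the surrounding script provides, not NeMo APIs:

import nemo
# import nemo.collections.asr as nemo_asr   # TranscriptDataLayer (NeMo 0.x)
# import nemo.collections.tts as nemo_tts   # Tacotron 2 modules (NeMo 0.x)

neural_factory = nemo.core.NeuralModuleFactory()

# Hypothetical helper: in practice the surrounding script builds
# `tacotron2_params` (parsed YAML config, must contain 'labels') and the
# `neural_modules` tuple itself.
tacotron2_params, neural_modules = build_tacotron2_modules("tacotron2.yaml")

infer_tensors = create_infer_dags(
    neural_factory=neural_factory,
    neural_modules=neural_modules,
    tacotron2_params=tacotron2_params,
    infer_dataset="infer.json",
    infer_batch_size=32,
)
# infer_tensors is [mel_postnet, gate, alignments, mel_len]; evaluating them
# (e.g. with neural_factory.infer(tensors=infer_tensors, checkpoint_dir=...))
# produces the predicted mel spectrograms.
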
Example #2
def create_dag(args, cfg, num_gpus):
    # Defining nodes
    data = nemo_asr.TranscriptDataLayer(
        path=args.train_dataset,
        labels=cfg['target']['labels'],
        eos_id=cfg['target']['eos_id'],
        pad_id=cfg['target']['pad_id'],
        batch_size=cfg['optimization']['batch_size'],
        drop_last=True,
    )
    data_eval = nemo_asr.AudioToTextDataLayer(
        manifest_filepath=args.eval_datasets,
        labels=cfg['target']['labels'],
        eos_id=cfg['target']['eos_id'],
        batch_size=cfg['inference']['batch_size'],
        load_audio=False,
    )
    decoder = nemo.backends.pytorch.DecoderRNN(
        voc_size=len(cfg['target']['labels']), bos_id=cfg['target']['bos_id'], **cfg['DecoderRNN'],
    )
    num_data = len(data)
    batch_size = cfg['optimization']['batch_size']
    num_epochs = cfg['optimization']['params']['num_epochs']
    steps_per_epoch = num_data // batch_size
    total_steps = num_epochs * steps_per_epoch
    # Keep the decoder's teacher_forcing value at a constant 1.0 for the whole run.
    vsc = ValueSetterCallback
    tf_callback = vsc(
        decoder,
        'teacher_forcing',
        policies=[vsc.Policy(vsc.Method.Const(1.0), start=0.0, end=1.0),],
        total_steps=total_steps,
    )
    seq_loss = nemo.backends.pytorch.SequenceLoss(
        pad_id=cfg['target']['pad_id'], smoothing_coef=cfg['optimization']['smoothing_coef'],
    )
    saver_callback = nemo.core.ModuleSaverCallback(
        save_modules_list=[decoder], folder=args.checkpoint_dir, step_freq=args.checkpoint_save_freq,
    )

    # Creating DAG
    texts, _ = data()
    log_probs, _ = decoder(targets=texts)
    train_loss = seq_loss(log_probs=log_probs, targets=texts)
    evals = []
    # The eval data layer also emits (unused) audio tensors; only the transcripts are needed.
    _, _, texts, _ = data_eval()
    log_probs, _ = decoder(targets=texts)
    eval_loss = seq_loss(log_probs=log_probs, targets=texts)
    evals.append((args.eval_datasets, (eval_loss, log_probs, texts)))

    # Update config
    cfg['num_params'] = {'decoder': decoder.num_weights}
    cfg['num_params']['total'] = sum(cfg['num_params'].values())
    cfg['input']['train'] = {'num_data': num_data}
    cfg['optimization']['steps_per_epoch'] = steps_per_epoch
    cfg['optimization']['total_steps'] = total_steps

    return (train_loss, evals), cfg, [tf_callback, saver_callback]
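
A usage sketch for create_dag, assuming NeMo 0.x; the argparse fields, "config.yaml", the optimizer name and learning rate below are illustrative placeholders, and cfg must contain the 'target', 'optimization', 'inference' and 'DecoderRNN' sections the function reads:

import argparse

import nemo
from ruamel.yaml import YAML

args = argparse.Namespace(
    train_dataset="train.txt",        # placeholder paths
    eval_datasets="eval.txt",
    checkpoint_dir="checkpoints",
    checkpoint_save_freq=1000,
)

with open("config.yaml") as f:
    cfg = YAML(typ="safe").load(f)

neural_factory = nemo.core.NeuralModuleFactory()

(train_loss, evals), cfg, callbacks = create_dag(args, cfg, num_gpus=1)

# `evals` would normally be wrapped in evaluation callbacks; here only the
# training loss is optimized. Optimizer choice and hyperparameters are
# illustrative, not taken from the example above.
neural_factory.train(
    tensors_to_optimize=[train_loss],
    callbacks=callbacks,
    optimizer="novograd",
    optimization_params={
        "num_epochs": cfg['optimization']['params']['num_epochs'],
        "lr": 1e-3,
    },
)
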