def main():
    import argparse

    import torch
    from torch.utils.data import DataLoader

    # TrainDatasets, LengthsBatchSampler and collate_fn are assumed to come from
    # the project's dataset module; hparams is the shared hyperparameter holder.
    from utils import hparams as hp
    parser = argparse.ArgumentParser()
    parser.add_argument('--hp_file', metavar='FILE', default='hparams.py')
    parser.add_argument('--train_script', default=None)
    args = parser.parse_args()

    hp.configure(args.hp_file)
    if args.train_script is not None:
        hp.train_script = args.train_script
    print(f'train script = {hp.train_script}')
    datasets = TrainDatasets(hp.train_script, hp)
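    # LengthsBatchSampler (project helper) presumably groups utterances by length
    # so each batch stays within hp.max_seqlen, using lengths cached in hp.lengths_file.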
    sampler = LengthsBatchSampler(datasets,
                                  hp.max_seqlen,
                                  hp.lengths_file,
                                  shuffle=True,
                                  shuffle_one_time=False,
                                  shuffle_all=False)
    dataloader = DataLoader(datasets,
                            batch_sampler=sampler,
                            num_workers=4,
                            collate_fn=collate_fn)
    DEVICE = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
    from tqdm import tqdm
    pbar = tqdm(dataloader)
    for d in pbar:
        text, wav_input, pos_text, pos_wav2vec2, text_lengths, wav2vec2_lengths = d

        text = text.to(DEVICE, non_blocking=True)
        wav_input = wav_input.to(DEVICE, non_blocking=True)
        pos_text = pos_text.to(DEVICE, non_blocking=True)
        pos_wav2vec2 = pos_wav2vec2.to(DEVICE, non_blocking=True)
        text_lengths = text_lengths.to(DEVICE, non_blocking=True)
        wav2vec2_lengths = wav2vec2_lengths.to(DEVICE, non_blocking=True)
def synthesize(text):
    # `lang` is assumed to be a module-level language code (e.g. "en") matching
    # one of the multilingual model's training languages.
    model_input = text + "|00-" + lang + "|" + lang

    # Make the Multilingual_Text_to_Speech modules importable
    sys.path.append(
        os.path.join(os.path.dirname(__file__),
                     "dependencies/Multilingual_Text_to_Speech"))

    if "utils" in sys.modules: del sys.modules["utils"]

    from synthesize import synthesize
    from utils import build_model

    # Load the multilingual pretrained model
    model = build_model(
        os.path.abspath("./dependencies/checkpoints/generated_switching.pyt"))
    model.eval()

    # generate the spectrogram
    spectrogram = synthesize(model, "|" + model_input)

    # Make the WaveRNN modules importable
    sys.path.append(
        os.path.join(os.path.dirname(__file__), "dependencies/WaveRNN"))

    if "utils" in sys.modules: del sys.modules["utils"]

    from models.fatchord_version import WaveRNN
    from utils import hparams as hp
    from gen_wavernn import generate
    import torch

    # Load WaveRNN pretrained model
    hp.configure("hparams.py")
    model = WaveRNN(
        rnn_dims=hp.voc_rnn_dims,
        fc_dims=hp.voc_fc_dims,
        bits=hp.bits,
        pad=hp.voc_pad,
        upsample_factors=hp.voc_upsample_factors,
        feat_dims=hp.num_mels,
        compute_dims=hp.voc_compute_dims,
        res_out_dims=hp.voc_res_out_dims,
        res_blocks=hp.voc_res_blocks,
        hop_length=hp.hop_length,
        sample_rate=hp.sample_rate,
        mode=hp.voc_mode).to(
            torch.device('cuda' if torch.cuda.is_available() else 'cpu'))
    model.load(
        os.path.join(os.path.dirname(__file__),
                     "dependencies/checkpoints/wavernn_weight.pyt"))

    waveform = generate(model, spectrogram, hp.voc_gen_batched, hp.voc_target,
                        hp.voc_overlap)

    # `generate` is assumed to return the raw audio bytes to write out;
    # "x" mode fails if ./temp/result.wav already exists.
    with open("./temp/result.wav", "xb") as f:
        f.write(waveform)
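
# A minimal usage sketch (assumes the dependency checkouts and ./temp exist):
#
#   lang = "en"                 # hypothetical language code
#   synthesize("Hello world.")  # writes ./temp/result.wav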
def main():
    parser = argparse.ArgumentParser()
    parser.add_argument('--hp_file', type=str, default='hparams.py')
    args = parser.parse_args()
    hp.configure(args.hp_file)
    fill_variables(hp)
    log_config(hp)

    os.makedirs(hp.save_dir, exist_ok=True)

    n_gpus = torch.cuda.device_count()
    args.n_gpus = n_gpus

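    # Launch one worker process per GPU when several are visible; otherwise
    # train in-process on device 0 (assumed semantics of run_distributed).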
    if n_gpus > 1:
        run_distributed(run_training, args, hp)
    else:
        run_training(0, args, hp, None)
def main():
    # Parse Arguments
    parser = argparse.ArgumentParser(description='Train Tacotron TTS')
    parser.add_argument('--force_train',
                        '-f',
                        action='store_true',
                        help='Forces the model to train past total steps')
    parser.add_argument('--force_gta',
                        '-g',
                        action='store_true',
                        help='Force the model to create GTA features')
    parser.add_argument(
        '--force_cpu',
        '-c',
        action='store_true',
        help='Forces CPU-only training, even when in CUDA capable environment')
    parser.add_argument('--hp_file',
                        metavar='FILE',
                        default='hparams.py',
                        help='The file to use for the hyperparameters')
    args = parser.parse_args()

    hp.configure(args.hp_file)  # Load hparams from file
    paths = Paths(hp.data_path, hp.voc_model_id, hp.tts_model_id)

    force_train = args.force_train
    force_gta = args.force_gta

    if not args.force_cpu and torch.cuda.is_available():
        device = torch.device('cuda')
        for session in hp.tts_schedule:
            _, _, _, batch_size = session
            if batch_size % torch.cuda.device_count() != 0:
                raise ValueError(
                    '`batch_size` must be evenly divisible by n_gpus!')
    else:
        device = torch.device('cpu')
    print('Using device:', device)

    # Instantiate Tacotron Model
    print('\nInitialising Tacotron Model...\n')
    model = Tacotron(embed_dims=hp.tts_embed_dims,
                     num_chars=len(symbols),
                     encoder_dims=hp.tts_encoder_dims,
                     decoder_dims=hp.tts_decoder_dims,
                     n_mels=hp.num_mels,
                     fft_bins=hp.num_mels,
                     postnet_dims=hp.tts_postnet_dims,
                     encoder_K=hp.tts_encoder_K,
                     lstm_dims=hp.tts_lstm_dims,
                     postnet_K=hp.tts_postnet_K,
                     num_highways=hp.tts_num_highways,
                     dropout=hp.tts_dropout,
                     stop_threshold=hp.tts_stop_threshold).to(device)

    optimizer = optim.Adam(model.parameters())
    restore_checkpoint('tts', paths, model, optimizer, create_if_missing=True)
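    # restore_checkpoint resumes from the latest 'tts' checkpoint under `paths`,
    # creating a fresh one first when none exists (create_if_missing=True).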

    if not force_gta:
        for i, session in enumerate(hp.tts_schedule):
            current_step = model.get_step()

            r, lr, max_step, batch_size = session

            training_steps = max_step - current_step

            # Do we need to change to the next session?
            if current_step >= max_step:
                # Are there no further sessions than the current one?
                if i == len(hp.tts_schedule) - 1:
                    # There are no more sessions. Check if we force training.
                    if force_train:
                        # Don't finish the loop - train forever
                        training_steps = 999_999_999
                    else:
                        # We have completed training. Breaking is same as continue
                        break
                else:
                    # There is a following session, go to it
                    continue

            model.r = r

            simple_table([(f'Steps with r={r}',
                           str(training_steps // 1000) + 'k Steps'),
                          ('Batch Size', batch_size), ('Learning Rate', lr),
                          ('Outputs/Step (r)', model.r)])

            train_set, attn_example = get_tts_datasets(paths.data, batch_size,
                                                       r)
            tts_train_loop(paths, model, optimizer, train_set, lr,
                           training_steps, attn_example)

        print('Training Complete.')
        print(
            'To continue training increase tts_total_steps in hparams.py or use --force_train\n'
        )

    print('Creating Ground Truth Aligned Dataset...\n')

    train_set, attn_example = get_tts_datasets(paths.data, 8, model.r)
    create_gta_features(model, train_set, paths.gta)

    print(
        '\n\nYou can now train WaveRNN on GTA features - use python train_wavernn.py --gta\n'
    )
    def __iter__(self):
        if self.shuffle:
            np.random.shuffle(self.all_indices)

        for indices in self.all_indices:
            yield indices

    def __len__(self):
        return len(self.all_indices)
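
# The two methods above belong to a batch-sampler class whose definition is not
# part of this snippet. A self-contained sketch of such a sampler (assumed
# shape, modelled on the NumBatchSampler used below):
import numpy as np
from torch.utils.data import Sampler

class FixedNumBatchSampler(Sampler):
    """Pre-chunks dataset indices into fixed-size batches, then yields them."""

    def __init__(self, dataset, batch_size, shuffle=True):
        n_batches = len(dataset) // batch_size  # drop the ragged tail
        indices = np.arange(n_batches * batch_size)
        self.all_indices = indices.reshape(n_batches, batch_size)
        self.shuffle = shuffle

    def __iter__(self):
        if self.shuffle:
            np.random.shuffle(self.all_indices)  # shuffles batch order in place
        for indices in self.all_indices:
            yield indices

    def __len__(self):
        return len(self.all_indices)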

def get_dataset(script_file='examples/LJSpeech/data/train/script_16000/train_id_sort_xlen.txt'):
    print(f'script_file = {script_file}')
    return TrainDatasets(script_file)

def get_test_dataset(script_file='examples/LJSpeech/data/dev/script_16000/dev_id.txt'):
    print(f'script_file = {script_file}')
    return TestDatasets(script_file)

if __name__ == '__main__':
    hp.configure('configs/hparams_LJSpeech.py')
    datasets = get_test_dataset('examples/LJSpeech/data/dev/script_16000/dev_id.txt')

    sampler = NumBatchSampler(datasets, 1, shuffle=False)
    dataloader = DataLoader(datasets, batch_sampler=sampler, num_workers=4, collate_fn=collate_fn_test)

    from tqdm import tqdm
    pbar = tqdm(dataloader)
    for d in pbar:
        print(d[1])
    parser.add_argument('--unbatched',
                        '-u',
                        dest='batched',
                        action='store_false',
                        help='Slower Unbatched Generation (better quality)')
    parser.add_argument(
        '--force_cpu',
        '-c',
        action='store_true',
        help='Forces CPU-only training, even when in CUDA capable environment')
    parser.add_argument('--hp_file',
                        metavar='FILE',
                        default='hparams.py',
                        help='The file to use for the hyperparameters')
    parser.set_defaults(batched=True)
    parser.set_defaults(input_text=None)
    args = parser.parse_args()

    hp.configure(args.hp_file)  # Load hparams from file

    batched = args.batched
    input_text = args.input_text

    if not args.force_cpu and torch.cuda.is_available():
        device = torch.device('cuda')
    else:
        device = torch.device('cpu')
    print('Using device:', device)

    print('\nInitialising WaveRNN Model...\n')
    parser.add_argument('--beam_width', type=int, default=None)
    parser.add_argument('--log_params', action='store_true')
    parser.add_argument('--calc_wer', action='store_true')
    parser.add_argument('--segment', type=int, default=10000)
    parser.add_argument('--silence_file', type=str, default=None)
    parser.add_argument('--lm_type', type=str, default='LSTM')
    args = parser.parse_args()
    hp_file = args.hp_file
    model_name = args.load_name  # save dir name

    model_path = os.path.dirname(model_name)

    if hp_file is None:
        hp_file = os.path.join(model_path, 'hparams.py')

    hp.configure(hp_file)
    fill_variables(hp)

    hp.silence_file = args.silence_file

    if args.beam_width is not None:
        print(f'beam width is set to {args.beam_width}')
        hp.beam_width = args.beam_width

    script_file = hp.eval_file
    if args.test_script is not None:
        script_file = args.test_script

    if hp.lm_weight is not None:
        args.lm_weight = hp.lm_weight
    print(f'lm weight = {args.lm_weight}')
parser.add_argument('--extension',
                    '-e',
                    metavar='EXT',
                    default='.wav',
                    help='file extension to search for in dataset folder')
parser.add_argument(
    '--num_workers',
    '-w',
    metavar='N',
    type=valid_n_workers,
    default=cpu_count() - 1,
    help='The number of worker threads to use for preprocessing')
parser.add_argument('--hp_file',
                    metavar='FILE',
                    default='hparams.py',
                    help='The file to use for the hyperparameters')
args = parser.parse_args()

hp.configure(args.hp_file)  # Load hparams from file
if args.path is None:
    args.path = hp.wav_path

extension = args.extension
path = args.path


def convert_file(path: Path):
    y = load_wav(path)
    peak = np.abs(y).max()
    if hp.peak_norm or peak > 1.0:
        y /= peak
    mel = melspectrogram(y)
    if hp.voc_mode == 'RAW':
        # mu-law encode the waveform into 2**hp.bits discrete levels
        quant = encode_mu_law(y, mu=2**hp.bits)
def thak():
    class Tshamsoo():
        force_cpu = os.getenv('FORCE_CPU', False)
        hp_file = 'hparams.py'
        vocoder = os.getenv('VOCODER', 'wavernn')
        batched = os.getenv('BATCHED', True)
        target = os.getenv('TARGET', None)
        overlap = os.getenv('OVERLAP', None)
        tts_weights = None
        save_attn = os.getenv('SAVE_ATTN', False)
        voc_weights = None
        iters = os.getenv('GL_ITERS', 32)

    args = Tshamsoo()
    if args.vocoder in ['griffinlim', 'gl']:
        args.vocoder = 'griffinlim'
    elif args.vocoder in ['wavernn', 'wr']:
        args.vocoder = 'wavernn'
    else:
        raise ValueError('Must provide a valid vocoder type!')

    hp.configure(args.hp_file)  # Load hparams from file

    tts_weights = args.tts_weights
    save_attn = args.save_attn

    paths = Paths(hp.data_path, hp.voc_model_id, hp.tts_model_id)

    if not args.force_cpu and torch.cuda.is_available():
        device = torch.device('cuda')
    else:
        device = torch.device('cpu')
    print('Using device:', device)

    if args.vocoder == 'wavernn':
        # set defaults for any arguments that depend on hparams
        if args.target is None:
            args.target = hp.voc_target
        if args.overlap is None:
            args.overlap = hp.voc_overlap
        if args.batched is None:
            args.batched = hp.voc_gen_batched

        batched = args.batched
        target = int(args.target)
        overlap = int(args.overlap)

        print('\nInitialising WaveRNN Model...\n')
        # Instantiate WaveRNN Model
        voc_model = WaveRNN(rnn_dims=hp.voc_rnn_dims,
                            fc_dims=hp.voc_fc_dims,
                            bits=hp.bits,
                            pad=hp.voc_pad,
                            upsample_factors=hp.voc_upsample_factors,
                            feat_dims=hp.num_mels,
                            compute_dims=hp.voc_compute_dims,
                            res_out_dims=hp.voc_res_out_dims,
                            res_blocks=hp.voc_res_blocks,
                            hop_length=hp.hop_length,
                            sample_rate=hp.sample_rate,
                            mode=hp.voc_mode).to(device)

        voc_load_path = args.voc_weights if args.voc_weights else paths.voc_latest_weights
        voc_model.load(voc_load_path)
    else:
        voc_model = None
        batched = None
        target = None
        overlap = None

    print('\nInitialising Tacotron Model...\n')

    # Instantiate Tacotron Model
    tts_model = Tacotron(embed_dims=hp.tts_embed_dims,
                         num_chars=len(symbols),
                         encoder_dims=hp.tts_encoder_dims,
                         decoder_dims=hp.tts_decoder_dims,
                         n_mels=hp.num_mels,
                         fft_bins=hp.num_mels,
                         postnet_dims=hp.tts_postnet_dims,
                         encoder_K=hp.tts_encoder_K,
                         lstm_dims=hp.tts_lstm_dims,
                         postnet_K=hp.tts_postnet_K,
                         num_highways=hp.tts_num_highways,
                         dropout=hp.tts_dropout,
                         stop_threshold=hp.tts_stop_threshold).to(device)

    tts_load_path = tts_weights if tts_weights else paths.tts_latest_weights
    tts_model.load(tts_load_path)
    return args, voc_model, tts_model, batched, target, overlap, save_attn
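
# Hypothetical usage of thak(): unpack everything needed for generation.
#
#   args, voc_model, tts_model, batched, target, overlap, save_attn = thak()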
def main():
    # Parse Arguments
    parser = argparse.ArgumentParser(description='TTS Generator')

    parser.add_argument(
        '--tts_weights',
        type=str,
        help='[string/path] Load in different FastSpeech weights')

    parser.add_argument('--hp_file',
                        metavar='FILE',
                        default='hparams.py',
                        help='The file to use for the hyperparameters')
    parser.add_argument(
        '--alpha',
        type=float,
        default=1.,
        help='Parameter for controlling length regulator for speedup '
        'or slow-down of generated speech, e.g. alpha=2.0 is double-time')

    args = parser.parse_args()

    hp.configure(args.hp_file)

    os.makedirs('onnx', exist_ok=True)

    input_text = "the forms of printed letters should be beautiful, and that their arrangement on the page should be reasonable and a help to the shapeliness of the letters themselves."
    tts_weights = args.tts_weights

    paths = Paths(hp.data_path, hp.voc_model_id, hp.tts_model_id)

    device = torch.device('cpu')
    print('Using device:', device)

    print('\nInitialising Forward TTS Model...\n')
    tts_model = ForwardTacotron(embed_dims=hp.forward_embed_dims,
                                num_chars=len(symbols),
                                durpred_rnn_dims=hp.forward_durpred_rnn_dims,
                                durpred_conv_dims=hp.forward_durpred_conv_dims,
                                rnn_dim=hp.forward_rnn_dims,
                                postnet_k=hp.forward_postnet_K,
                                postnet_dims=hp.forward_postnet_dims,
                                prenet_k=hp.forward_prenet_K,
                                prenet_dims=hp.forward_prenet_dims,
                                highways=hp.forward_num_highways,
                                dropout=hp.forward_dropout,
                                n_mels=hp.num_mels).to(device)

    tts_load_path = tts_weights or paths.forward_latest_weights
    tts_model.load(tts_load_path)

    encoder = DurationPredictor(tts_model)
    decoder = Tacotron(tts_model)

    tts_model.eval()
    encoder.eval()
    decoder.eval()

    opset_version = 10

    with torch.no_grad():
        input_seq = text_to_sequence(input_text.strip(), hp.tts_cleaner_names)
        input_seq = torch.as_tensor(input_seq, dtype=torch.long,
                                    device=device).unsqueeze(0)
        '''
        FIRST STEP: predict symbols duration
        '''
        torch.onnx.export(encoder,
                          input_seq,
                          "./onnx/forward_tacotron_duration_prediction.onnx",
                          opset_version=opset_version,
                          do_constant_folding=True,
                          input_names=["input_seq"],
                          output_names=["embeddings", "duration"])

        x, durations = encoder(input_seq)
        '''
        SECOND STEP: expand symbols by durations
        '''
        x = encoder.lr(x, durations)
        '''
        THIRD STEP: generate mel
        '''
        torch.onnx.export(decoder,
                          x,
                          "./onnx/forward_tacotron_regression.onnx",
                          opset_version=opset_version,
                          do_constant_folding=True,
                          input_names=["data"],
                          output_names=["mel"])

    print('Done!')
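
# A minimal sketch (assumes onnxruntime is installed and the exports above ran)
# of driving the two graphs; the length-regulator expansion between them is the
# one step that still runs in Python. Names here are illustrative only.
def run_forward_tacotron_onnx(seq, expand_fn):
    # seq: int64 numpy array of shape [1, T]; expand_fn replicates encoder.lr
    import onnxruntime as ort
    dur_sess = ort.InferenceSession("./onnx/forward_tacotron_duration_prediction.onnx")
    reg_sess = ort.InferenceSession("./onnx/forward_tacotron_regression.onnx")
    embeddings, duration = dur_sess.run(None, {"input_seq": seq})
    expanded = expand_fn(embeddings, duration)
    mel, = reg_sess.run(None, {"data": expanded})
    return mel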
def get_dataset(
    script_file='examples/LJSpeech/data/train/script_16000/train_id_sort_xlen.txt'
):
    print(f'script_file = {script_file}')
    return TrainDatasets(script_file)


def get_test_dataset(
        script_file='examples/LJSpeech/data/dev/script_16000/dev_id.txt'):
    print(f'script_file = {script_file}')
    return TestDatasets(script_file)


if __name__ == '__main__':
    hp.configure('configs/fastSpeech2/hparams_LJ_melgan.py')
    datasets = get_dataset(hp.train_script)

    sampler = NumBatchSampler(datasets, 3, shuffle=False)
    dataloader = DataLoader(datasets,
                            batch_sampler=sampler,
                            num_workers=4,
                            collate_fn=collate_fn)

    from tqdm import tqdm
    pbar = tqdm(dataloader)
    for dic in pbar:
        import pdb
        pdb.set_trace()  # break on each batch to inspect its contents
        print(dic[1])
def main():
    # Parse Arguments
    parser = argparse.ArgumentParser(description='Train Tacotron TTS')
    parser.add_argument('--force_train',
                        '-f',
                        action='store_true',
                        help='Forces the model to train past total steps')
    parser.add_argument('--force_gta',
                        '-g',
                        action='store_true',
                        help='Force the model to create GTA features')
    parser.add_argument(
        '--force_cpu',
        '-c',
        action='store_true',
        help='Forces CPU-only training, even when in CUDA capable environment')
    parser.add_argument('--hp_file',
                        metavar='FILE',
                        default='hparams.py',
                        help='The file to use for the hyperparameters')
    args = parser.parse_args()

    hp.configure(args.hp_file)  # Load hparams from file

    paths = Paths(hp.data_path, hp.voc_model_id, hp.tts_model_id)

    force_gta = args.force_gta

    if not args.force_cpu and torch.cuda.is_available():
        device = torch.device('cuda')
        for session in hp.forward_schedule:
            _, _, batch_size = session
            if batch_size % torch.cuda.device_count() != 0:
                raise ValueError(
                    '`batch_size` must be evenly divisible by n_gpus!')
    else:
        device = torch.device('cpu')
    print('Using device:', device)

    # Instantiate Forward TTS Model
    print('\nInitialising Forward TTS Model...\n')
    model = ForwardTacotron(embed_dims=hp.forward_embed_dims,
                            num_chars=len(symbols),
                            durpred_rnn_dims=hp.forward_durpred_rnn_dims,
                            durpred_conv_dims=hp.forward_durpred_conv_dims,
                            rnn_dim=hp.forward_rnn_dims,
                            postnet_k=hp.forward_postnet_K,
                            postnet_dims=hp.forward_postnet_dims,
                            prenet_k=hp.forward_prenet_K,
                            prenet_dims=hp.forward_prenet_dims,
                            highways=hp.forward_num_highways,
                            dropout=hp.forward_dropout,
                            n_mels=hp.num_mels).to(device)

    model_parameters = filter(lambda p: p.requires_grad, model.parameters())
    params = sum([np.prod(p.size()) for p in model_parameters])
    print(f'num params {params}')

    optimizer = optim.Adam(model.parameters())
    restore_checkpoint('forward',
                       paths,
                       model,
                       optimizer,
                       create_if_missing=True)

    if not force_gta:
        for i, session in enumerate(hp.forward_schedule):
            current_step = model.get_step()

            lr, max_step, batch_size = session

            training_steps = max_step - current_step

            simple_table([('Steps', str(training_steps // 1000) + 'k Steps'),
                          ('Batch Size', batch_size), ('Learning Rate', lr)])

            train_set, mel_example = get_tts_datasets(paths.data,
                                                      batch_size,
                                                      1,
                                                      alignments=True)
            train_loop(paths, model, optimizer, train_set, lr, training_steps,
                       mel_example)

    train_set, mel_example = get_tts_datasets(paths.data,
                                              8,
                                              1,
                                              alignments=True)
    create_gta_features(model, train_set, paths.gta)
    print('Training Complete.')
def main():
    # Parse Arguments
    parser = argparse.ArgumentParser(description='TTS Generator')

    parser.add_argument('--mel',
                        type=str,
                        help='[string/path] path to test mel file')

    parser.add_argument('--hp_file',
                        metavar='FILE',
                        default='hparams.py',
                        help='The file to use for the hyperparameters')

    parser.add_argument('--batched',
                        '-b',
                        dest='batched',
                        action='store_true',
                        help='Fast Batched Generation')

    parser.add_argument(
        '--voc_weights',
        type=str,
        help='[string/path] Load in different FastSpeech weights',
        default="pretrained/wave_800K.pyt")

    args = parser.parse_args()

    if not os.path.exists('onnx'):
        os.mkdir('onnx')

    hp.configure(args.hp_file)

    device = torch.device('cpu')
    print('Using device:', device)

    #####
    print('\nInitialising WaveRNN Model...\n')
    # Instantiate WaveRNN Model
    voc_model = WaveRNN(rnn_dims=hp.voc_rnn_dims,
                        fc_dims=hp.voc_fc_dims,
                        bits=hp.bits,
                        pad=hp.voc_pad,
                        upsample_factors=hp.voc_upsample_factors,
                        feat_dims=hp.num_mels,
                        compute_dims=hp.voc_compute_dims,
                        res_out_dims=hp.voc_res_out_dims,
                        res_blocks=hp.voc_res_blocks,
                        hop_length=hp.hop_length,
                        sample_rate=hp.sample_rate,
                        mode=hp.voc_mode).to(device)

    voc_load_path = args.voc_weights
    voc_model.load(voc_load_path)

    voc_upsampler = WaveRNNUpsamplerONNX(voc_model, args.batched,
                                         hp.voc_target, hp.voc_overlap)
    voc_infer = WaveRNNONNX(voc_model)

    voc_model.eval()
    voc_upsampler.eval()
    voc_infer.eval()

    opset_version = 11

    with torch.no_grad():
        mels = np.load(args.mel)
        mels = torch.from_numpy(mels)
        mels = mels.unsqueeze(0)
        mels = voc_upsampler.pad_tensor(mels)

        mels_onnx = mels.clone()

        torch.onnx.export(voc_upsampler,
                          mels_onnx,
                          "./onnx/wavernn_upsampler.onnx",
                          opset_version=opset_version,
                          do_constant_folding=True,
                          input_names=["mels"],
                          output_names=["upsample_mels", "aux"])

        mels, aux = voc_upsampler(mels)
        mels = mels[:, 550:-550, :]

        mels, aux = voc_upsampler.fold(mels, aux)

        h1, h2, x = voc_infer.get_initial_parameters(mels)

        aux_split = voc_infer.split_aux(aux)

        b_size, seq_len, _ = mels.size()

        if seq_len:
            m_t = mels[:, 0, :]

            a1_t, a2_t, a3_t, a4_t = \
                (a[:, 0, :] for a in aux_split)

            rnn_input = (m_t, a1_t, a2_t, a3_t, a4_t, h1, h2, x)
            torch.onnx.export(voc_infer,
                              rnn_input,
                              "./onnx/wavernn_rnn.onnx",
                              opset_version=opset_version,
                              do_constant_folding=True,
                              input_names=[
                                  "m_t", "a1_t", "a2_t", "a3_t", "a4_t", "h1",
                                  "h2", "x"
                              ],
                              output_names=["logits", "h1", "h2"])

    print('Done!')
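
# Note: wavernn_rnn.onnx computes one autoregressive sample step. A driver
# would loop over seq_len, sample x from the returned logits, and feed x, h1
# and h2 back in as the next step's inputs (assumed from the export's
# input/output names above).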
def init_hparams(hp_file):
    hp.configure(hp_file)
    def __init__(self):
        # Parse Arguments
        parser = argparse.ArgumentParser(description='TTS')
        self.args = parser.parse_args()
        self.args.vocoder = 'wavernn'
        self.args.hp_file = 'hparams.py'
        self.args.voc_weights = False
        self.args.tts_weights = False
        self.args.save_attn = False
        self.args.batched = True
        self.args.target = None
        self.args.overlap = None
        self.args.force_cpu = False
        #================ vocoder ================#
        if self.args.vocoder in ['griffinlim', 'gl']:
            self.args.vocoder = 'griffinlim'
        elif self.args.vocoder in ['wavernn', 'wr']:
            self.args.vocoder = 'wavernn'
        else:
            raise ValueError('Must provide a valid vocoder type!')

        hp.configure(self.args.hp_file)  # Load hparams from file

        # set defaults for any arguments that depend on hparams
        if self.args.vocoder == 'wavernn':
            if self.args.target is None:
                self.args.target = hp.voc_target
            if self.args.overlap is None:
                self.args.overlap = hp.voc_overlap
            if self.args.batched is None:
                self.args.batched = hp.voc_gen_batched

        #================ others ================#
        paths = Paths(hp.data_path, hp.voc_model_id, hp.tts_model_id)
        print("hello")
        print(paths.base)
        if not self.args.force_cpu and torch.cuda.is_available():
            device = torch.device('cuda')
        else:
            device = torch.device('cpu')
        print('Using device:', device)

        # === Wavernn === #
        if self.args.vocoder == 'wavernn':
            print('\nInitialising WaveRNN Model...\n')
            self.voc_model = WaveRNN(rnn_dims=hp.voc_rnn_dims,
                                     fc_dims=hp.voc_fc_dims,
                                     bits=hp.bits,
                                     pad=hp.voc_pad,
                                     upsample_factors=hp.voc_upsample_factors,
                                     feat_dims=hp.num_mels,
                                     compute_dims=hp.voc_compute_dims,
                                     res_out_dims=hp.voc_res_out_dims,
                                     res_blocks=hp.voc_res_blocks,
                                     hop_length=hp.hop_length,
                                     sample_rate=hp.sample_rate,
                                     mode=hp.voc_mode).to(device)

            voc_load_path = self.args.voc_weights if self.args.voc_weights else paths.voc_latest_weights
            #print(paths.voc_latest_weights)
            self.voc_model.load(voc_load_path)

        # === Tacotron === #
        if hp.tts_model == 'tacotron':
            print('\nInitialising Tacotron Model...\n')
            self.tts_model = Tacotron(
                embed_dims=hp.tts_embed_dims,
                num_chars=len(symbols),
                encoder_dims=hp.tts_encoder_dims,
                decoder_dims=hp.tts_decoder_dims,
                n_mels=hp.num_mels,
                fft_bins=hp.num_mels,
                postnet_dims=hp.tts_postnet_dims,
                encoder_K=hp.tts_encoder_K,
                lstm_dims=hp.tts_lstm_dims,
                postnet_K=hp.tts_postnet_K,
                num_highways=hp.tts_num_highways,
                dropout=hp.tts_dropout,
                stop_threshold=hp.tts_stop_threshold).to(device)

            tts_load_path = self.args.tts_weights if self.args.tts_weights else paths.tts_latest_weights
            self.tts_model.load(tts_load_path)

        # === Tacotron2 === #
        elif hp.tts_model == 'tacotron2':
            print('\nInitializing Tacotron2 Model...\n')
            self.tts_model = Tacotron2().to(device)
            tts_load_path = self.args.tts_weights if self.args.tts_weights else paths.tts_latest_weights
            self.tts_model.load(tts_load_path)

        # === Information === #
        if hp.tts_model == 'tacotron':
            if self.args.vocoder == 'wavernn':
                voc_k = self.voc_model.get_step() // 1000
                tts_k = self.tts_model.get_step() // 1000

                simple_table([
                    ('Tacotron', str(tts_k) + 'k'), ('r', self.tts_model.r),
                    ('Vocoder Type', 'WaveRNN'), ('WaveRNN', str(voc_k) + 'k'),
                    ('Generation Mode',
                     'Batched' if self.args.batched else 'Unbatched'),
                    ('Target Samples',
                     self.args.target if self.args.batched else 'N/A'),
                    ('Overlap Samples',
                     self.args.overlap if self.args.batched else 'N/A')
                ])

            elif self.args.vocoder == 'griffinlim':
                tts_k = self.tts_model.get_step() // 1000
                simple_table([('Tacotron', str(tts_k) + 'k'),
                              ('r', self.tts_model.r),
                              ('Vocoder Type', 'Griffin-Lim'),
                              ('GL Iters', self.args.iters)])

        elif hp.tts_model == 'tacotron2':
            if self.args.vocoder == 'wavernn':
                voc_k = self.voc_model.get_step() // 1000
                tts_k = self.tts_model.get_step() // 1000

                simple_table([
                    ('Tacotron2', str(tts_k) + 'k'),
                    ('Vocoder Type', 'WaveRNN'), ('WaveRNN', str(voc_k) + 'k'),
                    ('Generation Mode',
                     'Batched' if self.args.batched else 'Unbatched'),
                    ('Target Samples',
                     self.args.target if self.args.batched else 'N/A'),
                    ('Overlap Samples',
                     self.args.overlap if self.args.batched else 'N/A')
                ])

            elif self.args.vocoder == 'griffinlim':
                tts_k = self.tts_model.get_step() // 1000
                simple_table([('Tacotron2', str(tts_k) + 'k'),
                              ('Vocoder Type', 'Griffin-Lim'),
                              ('GL Iters', self.args.iters)])
def main():

    # Parse Arguments
    parser = argparse.ArgumentParser(description='Train WaveRNN Vocoder')
    parser.add_argument('--lr',
                        '-l',
                        type=float,
                        help='[float] override hparams.py learning rate')
    parser.add_argument('--batch_size',
                        '-b',
                        type=int,
                        help='[int] override hparams.py batch size')
    parser.add_argument('--force_train',
                        '-f',
                        action='store_true',
                        help='Forces the model to train past total steps')
    parser.add_argument('--gta',
                        '-g',
                        action='store_true',
                        help='train wavernn on GTA features')
    parser.add_argument(
        '--force_cpu',
        '-c',
        action='store_true',
        help='Forces CPU-only training, even when in CUDA capable environment')
    parser.add_argument('--hp_file',
                        metavar='FILE',
                        default='hparams.py',
                        help='The file to use for the hyperparameters')
    args = parser.parse_args()

    hp.configure(args.hp_file)  # load hparams from file
    if args.lr is None:
        args.lr = hp.voc_lr
    if args.batch_size is None:
        args.batch_size = hp.voc_batch_size

    paths = Paths(hp.data_path, hp.voc_model_id, hp.tts_model_id)

    batch_size = args.batch_size
    force_train = args.force_train
    train_gta = args.gta
    lr = args.lr

    if not args.force_cpu and torch.cuda.is_available():
        device = torch.device('cuda')
        if batch_size % torch.cuda.device_count() != 0:
            raise ValueError(
                '`batch_size` must be evenly divisible by n_gpus!')
    else:
        device = torch.device('cpu')
    print('Using device:', device)

    print('\nInitialising Model...\n')

    # Instantiate WaveRNN Model
    voc_model = WaveRNN(rnn_dims=hp.voc_rnn_dims,
                        fc_dims=hp.voc_fc_dims,
                        bits=hp.bits,
                        pad=hp.voc_pad,
                        upsample_factors=hp.voc_upsample_factors,
                        feat_dims=hp.num_mels,
                        compute_dims=hp.voc_compute_dims,
                        res_out_dims=hp.voc_res_out_dims,
                        res_blocks=hp.voc_res_blocks,
                        hop_length=hp.hop_length,
                        sample_rate=hp.sample_rate,
                        mode=hp.voc_mode).to(device)

    # Check to make sure the hop length is correctly factorised
    assert np.cumprod(hp.voc_upsample_factors)[-1] == hp.hop_length

    optimizer = optim.Adam(voc_model.parameters())
    restore_checkpoint('voc',
                       paths,
                       voc_model,
                       optimizer,
                       create_if_missing=True)

    train_set, test_set = get_vocoder_datasets(paths.data, batch_size,
                                               train_gta)

    total_steps = 10_000_000 if force_train else hp.voc_total_steps

    simple_table([
        ('Remaining', str(
            (total_steps - voc_model.get_step()) // 1000) + 'k Steps'),
        ('Batch Size', batch_size), ('LR', lr),
        ('Sequence Len', hp.voc_seq_len), ('GTA Train', train_gta)
    ])

    loss_func = F.cross_entropy if voc_model.mode == 'RAW' else discretized_mix_logistic_loss

    voc_train_loop(paths, voc_model, loss_func, optimizer, train_set, test_set,
                   lr, total_steps)

    print('Training Complete.')
    print(
        'To continue training increase voc_total_steps in hparams.py or use --force_train'
    )
    # Normalise integer PCM to float32 in [-1.0, 1.0)
    if wav.dtype == np.int16:
        wav = wav / 32768.0
    elif wav.dtype == np.int32:
        wav = wav / 2147483648.0
    elif wav.dtype == np.uint8:
        wav = (wav - 128) / 128.0

    wav = wav.astype(np.float32)

    return sr, wav


if __name__ == '__main__':
    parser = argparse.ArgumentParser()
    parser.add_argument('--hp_file', metavar='FILE', default='hparams.py')
    parser.add_argument('-d',
                        '--data_path',
                        type=str,
                        required=True,
                        help="root directory of wav files")
    parser.add_argument('-o',
                        '--out_path',
                        type=str,
                        default=None,
                        help="save directory of mel files")
    args = parser.parse_args()
    if args.out_path is None:
        args.out_path = args.data_path
    hp.configure(args.hp_file)

    main(hp, args)
    os.chdir(CHECKPOINTS_FOLDER)
    os.system("curl -O -L 'https://github.com/Tomiinek/Multilingual_Text_to_Speech/releases/download/v1.0/" + wavernn_chpt + "'")


print("Cur Dir", os.getcwd())

if "utils" in sys.modules:
    del sys.modules["utils"]

sys.path.append(WAVERNN_FOLDER)

from gen_wavernn import generate
from utils import hparams as hp
from models.fatchord_version import WaveRNN

hp.configure(WAVERNN_FOLDER + '/hparams.py')
model = WaveRNN(rnn_dims=hp.voc_rnn_dims,
                fc_dims=hp.voc_fc_dims,
                bits=hp.bits,
                pad=hp.voc_pad,
                upsample_factors=hp.voc_upsample_factors,
                feat_dims=hp.num_mels,
                compute_dims=hp.voc_compute_dims,
                res_out_dims=hp.voc_res_out_dims,
                res_blocks=hp.voc_res_blocks,
                hop_length=hp.hop_length,
                sample_rate=hp.sample_rate,
                mode=hp.voc_mode).to('cpu')
model.load(CHECKPOINTS_FOLDER + "/" + wavernn_chpt)

y = []

ix = 1
while os.path.exists(CHR_FOLDER + "/" + str(ix) + ".npy"):
    print("Found", CHR_FOLDER + "/" + str(ix) + ".npy")
    y.append(np.load(CHR_FOLDER + "/" + str(ix) + ".npy"))
    ix += 1

idx = 1
for s in y:
    # Vocode each spectrogram; the call pattern (model, spectrogram, batched,
    # target, overlap) follows generate() as used earlier in this file.
    waveform = generate(model, s, hp.voc_gen_batched, hp.voc_target, hp.voc_overlap)
    idx += 1
def main_work():
    parser = argparse.ArgumentParser(description='Get durations for Tacotron')
    parser.add_argument('--hp_file', metavar='FILE', default='hparams.py', help='The file to use for the hyperparameters')
    parser.add_argument('--model_name', default='taco', help='taco or dctts')

    args = parser.parse_args()

    hp.configure(args.hp_file)  # Load hparams from file
    model = args.model_name

    # Frame shift in milliseconds differs between the two model families
    if model == "dctts":
        time_step = 50
    else:
        time_step = 12.5

    transcript_file = Path(f'{hp.data_path}/{hp.metadata}')
    outfile = Path(f'{hp.data_path}/train_durations.csv')
    transcript = read_transcript(transcript_file)

    # Check that the label and mel directories exist and are non-empty
    if not os.path.exists(f'{hp.data_path}/labels/label_state_align/'):
        print("No label_state_align directory found!")
        exit()
    if len(os.listdir(f'{hp.data_path}/labels/label_state_align/')) == 0:
        print(f'{hp.data_path}/labels/label_state_align/ is empty')
        exit()
    if not os.path.exists(f'{hp.data_path}/mel'):
        print("No mel directory found!")
        exit()
    if len(os.listdir(f'{hp.data_path}/mel')) == 0:
        print(f'{hp.data_path}/mel is empty')
        exit()

    for labfile in os.listdir(f'{hp.data_path}/labels/label_state_align/'):
        print(f'Processing {labfile} ... ')
        labfile = Path(labfile)
        #os.makedirs(f'{hp.data_path}/attention_guides_dctts', exist_ok=True)
        #out_guide_file = Path(f'{hp.data_path}/attention_guides_dctts/{labfile.stem}.npy')
        os.makedirs(f'{hp.data_path}/attention_guides', exist_ok=True)
        out_guide_file = Path(f'{hp.data_path}/attention_guides/{labfile.stem}.npy')

        labfile = Path(os.path.join(f'{hp.data_path}/labels/label_state_align/', labfile))
        mono, lengths = merlin_state_label_to_monophones(labfile)

        mel_file = labfile.stem

        # NOTE THE DIMENSIONS -- dctts nframe is in [0] and taco is in [1]
        mel_features = np.load(f'{hp.data_path}/mel/{mel_file}.npy')
        if model == "dctts":
            audio_msec_length = mel_features.shape[0] * time_step
        else:
            audio_msec_length = mel_features.shape[1] * time_step

        resampled_lengths = resample_timings(lengths, 5.0, time_step, total_duration=audio_msec_length)

        if resampled_lengths is not None:
            resampled_lengths_in_frames = (resampled_lengths / time_step).astype(int)

            timings = match_up((mono, resampled_lengths_in_frames), transcript[labfile.stem]['phones'])

            assert len(transcript[labfile.stem]['phones']) == len(timings), (len(transcript[labfile.stem]['phones']), len(timings), transcript[labfile.stem]['phones'], timings)
            transcript[labfile.stem]['duration'] = timings

            guided_attention_matrix = durations_to_attention_matrix(np.array(timings))

            save_guided_attention(guided_attention_matrix, out_guide_file)

        else:
            print(f'{labfile} was not successfully processed!')

    write_transcript(transcript, outfile, duration=True)