Ejemplo n.º 1
0
def get_tts_datasets(path: Path, batch_size, r, model_type='tacotron'):
    """Build the train/val DataLoaders for a TTS model.

    Args:
        path: directory containing the pickled train/val datasets and text dict.
        batch_size: batch size for both loaders.
        r: reduction factor forwarded to the collate function.
        model_type: 'tacotron' or 'forward' — selects the Dataset class.

    Returns:
        (train_set, val_set) DataLoaders. The train loader uses a
        length-binned sampler; the val loader keeps file order.

    Raises:
        ValueError: if model_type is neither 'tacotron' nor 'forward'.
    """
    text_dict = unpickle_binary(path / 'text_dict.pkl')
    # filter_max_len drops over-long items and yields (ids, lengths) here.
    train_ids, train_lens = filter_max_len(unpickle_binary(path / 'train_dataset.pkl'))
    val_ids, _val_lens = filter_max_len(unpickle_binary(path / 'val_dataset.pkl'))

    # Dispatch on model type instead of an if/elif chain.
    dataset_classes = {'tacotron': TacoDataset, 'forward': ForwardDataset}
    if model_type not in dataset_classes:
        raise ValueError(
            f'Unknown model: {model_type}, must be either [tacotron, forward]!'
        )
    dataset_cls = dataset_classes[model_type]
    train_dataset = dataset_cls(path, train_ids, text_dict)
    val_dataset = dataset_cls(path, val_ids, text_dict)

    def _collate(batch):
        # Bind the reduction factor r into the collate function.
        return collate_tts(batch, r)

    # Bin similar lengths together to reduce padding inside a batch.
    train_sampler = BinnedLengthSampler(train_lens, batch_size, batch_size * 3)

    train_set = DataLoader(train_dataset,
                           collate_fn=_collate,
                           batch_size=batch_size,
                           sampler=train_sampler,
                           num_workers=1,
                           pin_memory=True)

    val_set = DataLoader(val_dataset,
                         collate_fn=_collate,
                         batch_size=batch_size,
                         sampler=None,
                         num_workers=1,
                         shuffle=False,
                         pin_memory=True)

    return train_set, val_set
Ejemplo n.º 2
0
def extract_pitch(save_path: Path) -> Tuple[float, float]:
    """Aggregate frame-level pitch into per-phoneme averages and save them.

    For every (item_id, mel_len) in train+val data, loads the phoneme
    durations and raw pitch track, averages the nonzero pitch frames within
    each phoneme's span (clipped to hp.pitch_max_freq), normalizes the
    result, and saves one .npy per item into save_path.

    Args:
        save_path: directory that receives '{item_id}.npy' pitch arrays.

    Returns:
        (mean, var) of the pitch as computed by normalize_pitch.

    Raises:
        ValueError: if an item's durations do not sum to its mel length.
    """
    train_data = unpickle_binary('data/train_dataset.pkl')
    val_data = unpickle_binary('data/val_dataset.pkl')
    all_data = filter_max_len(train_data + val_data)
    phoneme_pitches = []
    for prog_idx, (item_id, mel_len) in enumerate(all_data, 1):
        dur = np.load(paths.alg / f'{item_id}.npy')
        # Explicit check instead of assert: asserts vanish under `python -O`.
        if np.sum(dur) != mel_len:
            raise ValueError(
                f'Duration sum mismatch for {item_id}: {np.sum(dur)} != {mel_len}')

        pitch = np.load(paths.raw_pitch / f'{item_id}.npy')

        # Cumulative durations give each phoneme's [start, end) frame span.
        durs_cum = np.cumsum(np.pad(dur, (1, 0)))
        # FIX: np.float was deprecated in NumPy 1.20 and removed in 1.24;
        # builtin float preserves the original float64 dtype.
        pitch_char = np.zeros((dur.shape[0], ), dtype=float)
        # zip truncates to len(dur) spans; range(mel_len) is just an index source.
        for idx, a, b in zip(range(mel_len), durs_cum[:-1], durs_cum[1:]):
            # Keep only voiced frames (nonzero) below the frequency cap.
            values = pitch[a:b][np.where(pitch[a:b] != 0.0)[0]]
            values = values[np.where(values < hp.pitch_max_freq)[0]]
            pitch_char[idx] = np.mean(values) if len(values) > 0 else 0.0
        phoneme_pitches.append((item_id, pitch_char))
        bar = progbar(prog_idx, len(all_data))
        msg = f'{bar} {prog_idx}/{len(all_data)} Files '
        stream(msg)

    mean, var = normalize_pitch(phoneme_pitches)
    for item_id, phoneme_pitch in phoneme_pitches:
        np.save(str(save_path / f'{item_id}.npy'),
                phoneme_pitch,
                allow_pickle=False)

    print(f'\nPitch mean: {mean} var: {var}')
    return mean, var
Ejemplo n.º 3
0
def get_tts_datasets(path: Path, batch_size, r, model_type='tacotron'):
    """Build train/val DataLoaders for a TTS model.

    For 'forward' models with hp.forward_filter_attention enabled, items
    with bad attention scores are dropped from both splits first.

    Args:
        path: directory with the pickled datasets, text dict and (optionally)
            the attention score dict.
        batch_size: batch size for both loaders.
        r: reduction factor forwarded to the collate function.
        model_type: 'tacotron' or 'forward'.

    Returns:
        (train_set, val_set) DataLoaders.

    Raises:
        ValueError: for an unknown model_type.
    """
    text_dict = unpickle_binary(path / 'text_dict.pkl')
    train_data = filter_max_len(unpickle_binary(path / 'train_dataset.pkl'))
    val_data = filter_max_len(unpickle_binary(path / 'val_dataset.pkl'))
    n_train_before = len(train_data)

    # Optionally drop items whose tacotron attention was judged bad.
    if model_type == 'forward' and hp.forward_filter_attention:
        att_scores = unpickle_binary(path / 'att_score_dict.pkl')
        train_data = filter_bad_attentions(train_data, att_scores)
        val_data = filter_bad_attentions(val_data, att_scores)
        print(
            f'Using {len(train_data)} train files. '
            f'Filtered {n_train_before - len(train_data)} files due to bad attention!'
        )

    train_ids, train_lens = zip(*train_data)
    val_ids, _val_lens = zip(*val_data)

    dataset_classes = {'tacotron': TacoDataset, 'forward': ForwardDataset}
    if model_type not in dataset_classes:
        raise ValueError(
            f'Unknown model: {model_type}, must be either [tacotron, forward]!'
        )
    dataset_cls = dataset_classes[model_type]
    train_dataset = dataset_cls(path, train_ids, text_dict)
    val_dataset = dataset_cls(path, val_ids, text_dict)

    def _collate(batch):
        return collate_tts(batch, r)

    # Bin similar-length items to reduce padding within a batch.
    train_sampler = BinnedLengthSampler(train_lens, batch_size, batch_size * 3)

    train_set = DataLoader(train_dataset,
                           collate_fn=_collate,
                           batch_size=batch_size,
                           sampler=train_sampler,
                           num_workers=0,
                           pin_memory=True)

    val_set = DataLoader(val_dataset,
                         collate_fn=_collate,
                         batch_size=batch_size,
                         sampler=None,
                         num_workers=0,
                         shuffle=False,
                         pin_memory=True)

    return train_set, val_set
Ejemplo n.º 4
0
def get_vocoder_datasets(path: Path,
                         batch_size: int,
                         train_gta: bool,
                         max_mel_len: int,
                         hop_length: int,
                         voc_pad: int,
                         voc_seq_len: int,
                         voc_mode: str,
                         bits: int,
                         num_gen_samples: int):
    """Build vocoder train/val DataLoaders plus a few single-item gen samples.

    Args:
        path: directory with the pickled train/val datasets.
        batch_size: batch size for train/val loaders.
        train_gta: whether the VocoderDataset reads GTA mels.
        max_mel_len: mel-length cutoff passed to filter_max_len.
        hop_length, voc_pad, voc_seq_len, voc_mode, bits: collator settings.
        num_gen_samples: number of single-item samples to take for generation.

    Returns:
        (train_set, val_set, val_set_samples) where val_set is a
        materialized (fixed) list of batches and val_set_samples is a list
        of at most num_gen_samples batch_size-1 batches.
    """
    # Local stdlib import so this block stays self-contained.
    from itertools import islice

    train_data = unpickle_binary(path / 'train_dataset.pkl')
    val_data = unpickle_binary(path / 'val_dataset.pkl')
    train_ids, train_lens = zip(*filter_max_len(train_data, max_mel_len))
    val_ids, val_lens = zip(*filter_max_len(val_data, max_mel_len))
    train_dataset = VocoderDataset(path, train_ids, train_gta)
    val_dataset = VocoderDataset(path, val_ids, train_gta)
    voc_collator = VocCollator(hop_length=hop_length,
                               voc_pad=voc_pad,
                               voc_seq_len=voc_seq_len,
                               voc_mode=voc_mode,
                               bits=bits)
    train_set = DataLoader(train_dataset,
                           collate_fn=voc_collator,
                           batch_size=batch_size,
                           num_workers=0,
                           shuffle=True,
                           pin_memory=True)

    val_set = DataLoader(val_dataset,
                         collate_fn=voc_collator,
                         batch_size=batch_size,
                         num_workers=0,
                         shuffle=False,
                         pin_memory=True)

    np.random.seed(42)  # fix numpy seed to obtain the same val set every time, I know its hacky
    val_set = list(val_set)
    np.random.seed()

    val_set_samples = DataLoader(val_dataset,
                                 batch_size=1,
                                 num_workers=0,
                                 shuffle=False,
                                 pin_memory=True)

    # FIX: islice stops iterating after num_gen_samples batches; the previous
    # enumerate-filter walked the ENTIRE val loader and discarded the rest.
    # max(..., 0) keeps the old empty-list behavior for non-positive counts.
    val_set_samples = list(islice(val_set_samples, max(num_gen_samples, 0)))

    return train_set, val_set, val_set_samples
Ejemplo n.º 5
0
    def train(self, model_tts: ForwardTacotron, model_asr: Wav2Vec2ForCTC,
              optimizer_tts: Optimizer, optimizer_asr: Optimizer) -> None:
        """Run the joint TTS/ASR training schedule from hp.forward_schedule.

        For each (lr, max_step, bs) session the TTS model has not yet
        completed, builds fresh TTS and ASR dataloaders and delegates to
        self.train_session.

        Args:
            model_tts: the forward TTS model being trained.
            model_asr: the Wav2Vec2 CTC model being trained.
            optimizer_tts: optimizer for the TTS model.
            optimizer_asr: optimizer for the ASR model.
        """
        print("Loading ASR training data...")
        asr_train_data = unpickle_binary('./data/speech-sme-asr/train_asr.pkl')
        # FIX: keep the raw test data under its own name. Previously the
        # variable was overwritten with the DataLoader returned by
        # get_test_dataloader, so from the second session onward a DataLoader
        # (not the dataset) was fed back into get_test_dataloader.
        asr_test_data = unpickle_binary('./data/speech-sme-asr/test_asr.pkl')
        asr_trainer = init_trainer(asr_train_data, asr_test_data)
        # The processor does not depend on the session: load it once,
        # outside the schedule loop.
        asr_pr = Wav2Vec2Processor.from_pretrained(
            './asr_output/pretrained_processor')

        for i, (lr, max_step, bs) in enumerate(hp.forward_schedule, 1):
            if model_tts.get_step() >= max_step:
                continue  # this session was already completed in a prior run
            tts_train_set, tts_val_set = get_tts_datasets(
                path=self.paths.data,
                batch_size=bs,
                r=1,
                model_type='forward')

            asr_train_set = asr_trainer.get_train_dataloader()
            asr_test_set = asr_trainer.get_test_dataloader(asr_test_data)

            tts_session = ForwardSession(
                self.paths.data,
                index=i,
                r=1,
                lr=lr,
                max_step=max_step,
                bs=bs,
                train_set=tts_train_set,
                val_set=tts_val_set,
            )
            asr_session = ASRSession(asr_pr,
                                     index=i,
                                     r=1,
                                     lr=lr,
                                     max_step=max_step,
                                     bs=4,  # NOTE(review): ASR batch size is hard-coded, not bs — confirm intended
                                     train_set=asr_train_set,
                                     test_set=asr_test_set)
            self.train_session(model_tts, model_asr, optimizer_tts,
                               tts_session, asr_session, asr_trainer,
                               optimizer_asr)
Ejemplo n.º 6
0
def get_vocoder_datasets(path: Path, batch_size, train_gta):
    """Build vocoder train/val DataLoaders plus a few single-item gen samples.

    Args:
        path: directory with the pickled train/val datasets.
        batch_size: batch size for train/val loaders.
        train_gta: whether the VocoderDataset reads GTA mels.

    Returns:
        (train_set, val_set, val_set_samples) where val_set is a
        materialized (fixed) list of batches and val_set_samples is a list
        of at most hp.voc_gen_num_samples batch-size-1 batches.
    """
    # Local stdlib import so this block stays self-contained.
    from itertools import islice

    train_data = unpickle_binary(path / 'train_dataset.pkl')
    val_data = unpickle_binary(path / 'val_dataset.pkl')
    train_ids, train_lens = filter_max_len(train_data)
    val_ids, val_lens = filter_max_len(val_data)

    train_dataset = VocoderDataset(path, train_ids, train_gta)
    val_dataset = VocoderDataset(path, val_ids, train_gta)

    train_set = DataLoader(train_dataset,
                           collate_fn=collate_vocoder,
                           batch_size=batch_size,
                           num_workers=2,
                           shuffle=True,
                           pin_memory=True)

    val_set = DataLoader(val_dataset,
                         collate_fn=collate_vocoder,
                         batch_size=batch_size,
                         num_workers=1,
                         shuffle=False,
                         pin_memory=True)

    np.random.seed(
        42
    )  # fix numpy seed to obtain the same val set every time, I know its hacky
    val_set = list(val_set)
    np.random.seed()

    val_set_samples = DataLoader(val_dataset,
                                 batch_size=1,
                                 num_workers=1,
                                 shuffle=False,
                                 pin_memory=True)

    # FIX: islice stops after hp.voc_gen_num_samples batches; the previous
    # enumerate-filter walked the ENTIRE val loader and discarded the rest.
    # max(..., 0) keeps the old empty-list behavior for non-positive counts.
    val_set_samples = list(islice(val_set_samples, max(hp.voc_gen_num_samples, 0)))

    return train_set, val_set, val_set_samples
Ejemplo n.º 7
0
 def __init__(self, paths: Paths) -> None:
     """Set up logging, the loss function and the persisted top-k model list.

     Args:
         paths: project path container; provides voc_log and voc_top_k dirs.
     """
     self.paths = paths
     self.writer = SummaryWriter(log_dir=paths.voc_log, comment='v1')
     # RAW mode trains on discrete targets (cross-entropy); otherwise use
     # the discretized mixture-of-logistics loss.
     self.loss_func = F.cross_entropy if hp.voc_mode == 'RAW' else discretized_mix_logistic_loss
     path_top_k = paths.voc_top_k/'top_k.pkl'
     if os.path.exists(path_top_k):
         # Restore the top-k models saved by a previous run.
         self.top_k_models = unpickle_binary(path_top_k)
         # Re-log the audio of each restored top model to TensorBoard.
         # Entries appear to be (mel_loss, generated_wav, step, name) tuples
         # — TODO confirm against the code that writes top_k.pkl.
         for i, (mel_loss, g_wav, m_step, m_name) in enumerate(self.top_k_models, 1):
             self.writer.add_audio(
                 tag=f'Top_K_Models/generated_top_{i}',
                 snd_tensor=g_wav, global_step=m_step, sample_rate=hp.sample_rate)
     else:
         self.top_k_models = []
Ejemplo n.º 8
0
    print(f'Using {len(wav_files)} wav files that are indexed in metafile.\n')

    # At least one worker, even if args.num_workers is 0 or negative.
    n_workers = max(1, args.num_workers)

    simple_table([('Sample Rate', hp.sample_rate), ('Bit Depth', hp.bits),
                  ('Mu Law', hp.mu_law), ('Hop Length', hp.hop_length),
                  ('CPU Usage', f'{n_workers}/{cpu_count()}'),
                  ('Num Validation', hp.n_val)])

    pool = Pool(processes=n_workers)
    dataset = []
    cleaned_texts = []
    preprocessor = Preprocessor(paths, text_dict)

    # Upsample punctuation-heavy items: entries listed in these pickles are
    # duplicated in the dataset to increase their sampling frequency.
    # NOTE(review): the LJSpeech paths are hard-coded relative paths — this
    # only works when run from the expected working directory.
    end_idx = unpickle_binary('../../LJSpeech-1.1/ending_idx.pkl')
    bracket_idx = unpickle_binary('../../LJSpeech-1.1/bracket_idx.pkl')
    # imap_unordered yields results as workers finish, so dataset order is
    # nondeterministic; i (1-based) is currently unused.
    for i, (item_id, length, cleaned_text) in enumerate(
            pool.imap_unordered(preprocessor, wav_files), 1):
        if item_id in text_dict:
            if item_id in end_idx:
                # 15x oversampling for sentence-ending punctuation items
                # (factor presumably tuned empirically — confirm).
                for _ in range(15):
                    dataset += [(item_id, length)]
                    cleaned_texts += [(item_id, cleaned_text)]
            elif item_id in bracket_idx:
                # 7x oversampling for bracketed items.
                for _ in range(7):
                    dataset += [(item_id, length)]
                    cleaned_texts += [(item_id, cleaned_text)]
            else:
                dataset += [(item_id, length)]
                cleaned_texts += [(item_id, cleaned_text)]
Ejemplo n.º 9
0
def get_tts_datasets(path: Path,
                     batch_size: int,
                     r: int,
                     max_mel_len,
                     filter_attention=True,
                     filter_min_alignment=0.5,
                     filter_min_sharpness=0.9,
                     model_type='tacotron') -> Tuple[DataLoader, DataLoader]:
    """Build train/val DataLoaders for a TTS model.

    For 'forward' models with filter_attention enabled, drops items whose
    attention alignment/sharpness scores fall below the given thresholds.

    Args:
        path: directory with the pickled datasets, text dict and
            (optionally) attention score dict.
        batch_size: batch size for both loaders.
        r: reduction factor forwarded to the collate function.
        max_mel_len: mel-length cutoff passed to filter_max_len.
        filter_attention: whether to apply attention-score filtering.
        filter_min_alignment: minimum alignment score to keep an item.
        filter_min_sharpness: minimum sharpness score to keep an item.
        model_type: 'tacotron' or 'forward'.

    Returns:
        (train_set, val_set) DataLoaders.

    Raises:
        ValueError: for an unknown model_type.
    """
    tokenizer = Tokenizer()

    text_dict = unpickle_binary(path / 'text_dict.pkl')
    train_data = filter_max_len(unpickle_binary(path / 'train_dataset.pkl'), max_mel_len)
    val_data = filter_max_len(unpickle_binary(path / 'val_dataset.pkl'), max_mel_len)
    n_train_before = len(train_data)

    if model_type == 'forward' and filter_attention:
        att_scores = unpickle_binary(path / 'att_score_dict.pkl')

        def _keep_good_attentions(data):
            # Shared filtering call for both splits.
            return filter_bad_attentions(dataset=data,
                                         attention_score_dict=att_scores,
                                         min_alignment=filter_min_alignment,
                                         min_sharpness=filter_min_sharpness)

        train_data = _keep_good_attentions(train_data)
        val_data = _keep_good_attentions(val_data)
        print(f'Using {len(train_data)} train files. '
              f'Filtered {n_train_before - len(train_data)} files due to bad attention!')

    train_ids, train_lens = zip(*train_data)
    val_ids, _val_lens = zip(*val_data)

    dataset_classes = {'tacotron': TacoDataset, 'forward': ForwardDataset}
    if model_type not in dataset_classes:
        raise ValueError(f'Unknown model: {model_type}, must be either [tacotron, forward]!')
    dataset_cls = dataset_classes[model_type]
    train_dataset = dataset_cls(path=path, dataset_ids=train_ids,
                                text_dict=text_dict, tokenizer=tokenizer)
    val_dataset = dataset_cls(path=path, dataset_ids=val_ids,
                              text_dict=text_dict, tokenizer=tokenizer)

    def _collate(batch):
        return collate_tts(batch, r)

    # Bin similar-length items to reduce padding within a batch.
    train_sampler = BinnedLengthSampler(train_lens, batch_size, batch_size * 3)

    train_set = DataLoader(train_dataset,
                           collate_fn=_collate,
                           batch_size=batch_size,
                           sampler=train_sampler,
                           num_workers=0,
                           pin_memory=True)

    val_set = DataLoader(val_dataset,
                         collate_fn=_collate,
                         batch_size=batch_size,
                         sampler=None,
                         num_workers=0,
                         shuffle=False,
                         pin_memory=True)

    return train_set, val_set