Example #1
def main(url: str, debug: bool = True) -> None:
    """ 
        Usage case

        bool debug: print debug info
    """
    title, paragraphs = fetch.parse_webpage(url)
    stopwords = nltk.corpus.stopwords.words('english')
    cleaned = text.clean_text(paragraphs, stopwords)
    sent_tok = nltk.sent_tokenize(paragraphs)
    unique = list(set(cleaned))

    # text.generate_wordcloud(' '.join(cleaned), stopwords)

    model, db, clusters, word_clusters, n_clusters, n_noise = models.word2vec_model(
        unique, min_count=1, window=5, verbose=True)
    notes = text.create_notes(paragraphs, word_clusters)

    if debug:
        print('-' * 40)
        print(clusters.shape, len(unique))
        print(*word_clusters, sep='\n')
        print('-' * 40)
        print(notes)
        print('-' * 40)
        print(
            f"{round(100 * (1 - (len(notes) / len(paragraphs))), 3)}% reduction in size from the original"
        )
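
A minimal way to exercise this entry point, assuming the snippet's fetch, text and models helper modules are importable and the NLTK corpora have been downloaded (the URL below is only an illustration):

import nltk

nltk.download('stopwords')  # required once for nltk.corpus.stopwords
nltk.download('punkt')      # required once for nltk.sent_tokenize
main('https://en.wikipedia.org/wiki/Natural_language_processing', debug=True)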
Example #2
def process_wav(path: Path):
    wav_id = path.stem
    m, x = convert_file(path)
    np.save(paths.mel / f'{wav_id}.npy', m, allow_pickle=False)
    np.save(paths.quant / f'{wav_id}.npy', x, allow_pickle=False)
    text = text_dict[wav_id]
    text = clean_text(text)
    return wav_id, m.shape[-1], text
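
A hedged usage sketch, assuming the module-level paths, text_dict and convert_file referenced by the snippet are already configured and that the (hypothetical) file below has an entry in text_dict:

from pathlib import Path

wav_id, n_frames, text = process_wav(Path('data/wavs/example_0001.wav'))  # hypothetical path
print(wav_id, n_frames, text)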
Example #3
    def __call__(self, path: Path) -> Tuple[str, int, str]:
        wav_id = path.stem
        m, x = self._convert_file(path)
        np.save(self.paths.mel / f'{wav_id}.npy', m, allow_pickle=False)
        np.save(self.paths.quant / f'{wav_id}.npy', x, allow_pickle=False)
        text = self.text_dict[wav_id]
        text = clean_text(text)
        return wav_id, m.shape[-1], text
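
Packaging the same logic as a callable object makes it easy to fan preprocessing out over a worker pool; a sketch, assuming a hypothetical Preprocessor class that stores paths and text_dict and exposes this __call__:

from multiprocessing import Pool
from pathlib import Path

processor = Preprocessor(paths, text_dict)            # hypothetical constructor
wav_files = sorted(Path('data/wavs').glob('*.wav'))   # hypothetical input directory
with Pool(processes=4) as pool:
    for wav_id, n_frames, text in pool.imap_unordered(processor, wav_files):
        print(wav_id, n_frames)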
Example #4
def synthesize(input_text, tts_model, voc_model, alpha=1.0, device=torch.device('cuda')):
    text = clean_text(input_text.strip())
    x = text_to_sequence(text)
    _, m, _ = tts_model.generate(x, alpha=alpha)
    if voc_model == 'griffinlim':
        wav = reconstruct_waveform(m, n_iter=32)
    elif isinstance(voc_model, WaveRNN):
        m = torch.tensor(m).unsqueeze(0)
        wav = voc_model.generate(m, '/tmp/sample.wav', True, hp.voc_target, hp.voc_overlap, hp.mu_law)
    else:
        m = torch.tensor(m).unsqueeze(0).to(device)
        with torch.no_grad():
            wav = voc_model.inference(m).cpu().numpy()
    return wav
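
A short usage sketch for the Griffin-Lim branch, assuming tts_model is an already loaded TTS model with the generate method used above; writing the result with soundfile is an assumption, not part of the snippet:

import soundfile as sf

wav = synthesize('Hello world.', tts_model, 'griffinlim', alpha=1.0)
sf.write('/tmp/hello.wav', wav, hp.sample_rate)  # hp.sample_rate assumed from the project's hparams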
Example #5
def synthesize(input_text, tts_model, voc_model, alpha=1.0):
    text = clean_text(input_text.strip())
    x = text_to_sequence(text)
    _, m, _ = tts_model.generate(x, alpha=alpha)
    # Fix mel spectrogram scaling to be from 0 to 1
    m = (m + 4) / 8
    np.clip(m, 0, 1, out=m)
    if voc_model == 'griffinlim':
        wav = reconstruct_waveform(m, n_iter=32)
    else:
        m = torch.tensor(m).unsqueeze(0)
        wav = voc_model.generate(m, '/tmp/sample.wav', True, hp.voc_target,
                                 hp.voc_overlap, hp.mu_law)
        print()
    return wav
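
The rescaling assumes the TTS model emits mel values in roughly the [-4, 4] range: (m + 4) / 8 maps that span onto [0, 1], and np.clip removes any overshoot. A tiny numeric check:

import numpy as np

m = np.array([-4.0, 0.0, 4.0, 5.0])
m = (m + 4) / 8
np.clip(m, 0, 1, out=m)
print(m)  # [0.  0.5 1.  1. ] -- the out-of-range value is clipped to 1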
Example #6
def synthesize(input_text: str,
               tts_model: ForwardTacotron,
               voc_model: torch.nn.Module,
               alpha=1.0,
               pitch_function: Callable[[torch.Tensor],
                                        torch.Tensor] = lambda x: x):
    text = clean_text(input_text.strip())
    x = text_to_sequence(text)
    _, m, _, _ = tts_model.generate(x,
                                    alpha=alpha,
                                    pitch_function=pitch_function)
    if voc_model == 'griffinlim':
        wav = reconstruct_waveform(m, n_iter=32)
    elif isinstance(voc_model, WaveRNN):
        m = torch.tensor(m).unsqueeze(0)
        wav = voc_model.generate(m, '/tmp/sample.wav', True, hp.voc_target,
                                 hp.voc_overlap, hp.mu_law)
    else:
        m = torch.tensor(m).unsqueeze(0).cuda()
        with torch.no_grad():
            wav = voc_model.inference(m).cpu().numpy()
    return wav
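
The pitch_function argument is a hook for reshaping the predicted pitch contour before synthesis; a hedged sketch that raises the pitch by 10%, again using the Griffin-Lim branch and an already loaded tts_model:

wav = synthesize('Hello world.', tts_model, 'griffinlim',
                 alpha=1.0,
                 pitch_function=lambda pitch: pitch * 1.1)  # scale the predicted pitch by 10%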
Example #7
        pitch_emb_dims=hp.forward_pitch_emb_dims,
        pitch_proj_dropout=hp.forward_pitch_proj_dropout,
        rnn_dim=hp.forward_rnn_dims,
        postnet_k=hp.forward_postnet_K,
        postnet_dims=hp.forward_postnet_dims,
        prenet_k=hp.forward_prenet_K,
        prenet_dims=hp.forward_prenet_dims,
        highways=hp.forward_num_highways,
        dropout=hp.forward_dropout,
        n_mels=hp.num_mels).to(device)

    tts_load_path = tts_weights if tts_weights else paths.forward_latest_weights
    tts_model.load(tts_load_path)

    if input_text:
        text = clean_text(input_text.strip())
        inputs = [text_to_sequence(text)]
    else:
        with open('sentences.txt') as f:
            inputs = [clean_text(l.strip()) for l in f]
        inputs = [text_to_sequence(t) for t in inputs]

    tts_k = tts_model.get_step() // 1000

    if args.vocoder == 'wavernn':
        voc_k = voc_model.get_step() // 1000
        simple_table([('Forward Tacotron', str(tts_k) + 'k'),
                      ('Vocoder Type', 'WaveRNN'),
                      ('WaveRNN', str(voc_k) + 'k'),
                      ('Generation Mode',
                       'Batched' if batched else 'Unbatched'),
Example #8
    def dual_transform(self, model_tts, model_asr, optimizer_tts,
                       optimizer_asr, asr_test_set, m_loss_avg, dur_loss_avg,
                       device, asr_current_step, e, epochs, duration_avg,
                       total_iters, tts_s_loss, asr_s_loss, tts_lr,
                       tts_dt_path):
        print('\n\nStarting DualTransformation loop...\n')
        # exit()
        tmp_dir = './checkpoints/sme_speech_tts.asr_forward/dual_transform_tmp'
        os.makedirs(tmp_dir, exist_ok=True)
        # generate tmp ASR training data
        asr_train_data = []
        input_set = get_unpaired_txt(35)
        # print(input_set)
        text = [clean_text(v) for v in input_set]
        inputs = [text_to_sequence(t) for t in text]

        # generate unpaired data for ASR from TTS
        for i, x in enumerate(inputs, 1):
            _, m, dur = model_tts.generate(x, alpha=1.)
            wav = reconstruct_waveform(m, n_iter=32)
            wav_path = os.path.join(tmp_dir, f'{i}.wav')
            save_wav(wav, wav_path)
            asr_train_data.append((wav_path, text[i - 1]))

        # print(asr_train_data)
        dt_asr_data = load_dt_data(asr_train_data)
        # reinit trainer with only tmp train data
        asr_trainer_dt = init_trainer(dt_asr_data, None)
        dt_train = asr_trainer_dt.get_train_dataloader()

        # unsupervised training loop for ASR
        for step, inputs in enumerate(dt_train, 1):
            # model_asr.cpu()
            model_asr.train()
            model_asr.to(device)
            for k, v in inputs.items():
                if isinstance(v, torch.Tensor):
                    inputs[k] = v.to(device)
            # model_asr.train()
            outputs = model_asr(**inputs)
            asr_u_loss = outputs["loss"] if isinstance(outputs,
                                                       dict) else outputs[0]
            # asr_u_loss.detach()
            # asr_u_loss = asr_s_loss.mean()

            # model_name = step + asr_current_step
            msg_asr = f'| ASR MODEL (unsupervised training) : '\
                    f'| Epoch: {e}/{epochs} ({step}/{len(dt_train)}) | Loss ASR: {asr_u_loss:#.4} '\
                    f' ||||||||||||||||||||||||||||||||||||||||||||||||'
            stream(msg_asr)

        # for f in os.listdir(tmp_dir):
        #     file_path = os.path.join(tmp_dir, f)
        #     if f.endswith('.wav'):
        #         os.unlink(file_path)

        # generate tmp TTS data from ASR
        # model_asr.to(device)
        asr_predict_for_dt(model_asr)

        subprocess.check_output(
            'python preprocess.py -p "./data/speech-sme-tts" -d=True',
            shell=True,
            stderr=subprocess.STDOUT)
        print('Finished preprocessing for tmp data!')

        tmp_tts_train = get_tts_datasets(tts_dt_path,
                                         batch_size=2,
                                         r=1,
                                         model_type='forward_dt')
        print("Loaded tmp dataset!")
        # unsupervised TTS training

        for i, (x, m, ids, x_lens, mel_lens,
                dur) in enumerate(tmp_tts_train, 1):
            start = time.time()
            model_tts.to(device)
            model_tts.train()
            # optimizer_tts.zero_grad()
            x, m, dur, x_lens, mel_lens = x.to(device), m.to(device), dur.to(device), \
                                          x_lens.to(device), mel_lens.to(device)

            m1_hat, m2_hat, dur_hat = model_tts(x, m, dur, mel_lens)

            m1_loss = self.l1_loss(m1_hat, m, mel_lens)
            m2_loss = self.l1_loss(m2_hat, m, mel_lens)

            dur_loss = self.l1_loss(dur_hat.unsqueeze(1), dur.unsqueeze(1),
                                    x_lens)

            tts_u_loss = m1_loss + m2_loss + 0.1 * dur_loss
            # optimizer_tts.zero_grad()
            # tts_u_loss.backward()
            torch.nn.utils.clip_grad_norm_(model_tts.parameters(),
                                           hp.tts_clip_grad_norm)
            # optimizer_tts.step()
            m_loss_avg.add(m1_loss.item() + m2_loss.item())
            dur_loss_avg.add(dur_loss.item())
            step = model_tts.get_step()
            k = step // 1000

            duration_avg.add(time.time() - start)
            # pitch_loss_avg.add(pitch_loss.item())

            speed = 1. / duration_avg.get()
            msg_tts = f'| TTS MODEL (unsupervised training ): '\
                  f'| Epoch: {e}/{epochs} ({i}/{total_iters}) | Mel Loss: {m_loss_avg.get():#.4} ' \
                  f'| Dur Loss: {dur_loss_avg.get():#.4} ' \
                  f'| {speed:#.2} steps/s | Step: {k}k | '

            stream(msg_tts)
        # m_val_loss, dur_val_loss = self.evaluate(model_tts, tts_session.val_set)
        #TODO: combine L and update
        # asr_s_loss = torch.tensor(asr_s_loss).to(device)
        combined_loss = 0.5 * (tts_s_loss + asr_s_loss) + (tts_u_loss +
                                                           asr_u_loss)
        # backwards
        combined_loss = combined_loss.to(device)  # .to() is not in-place; keep the returned tensor
        # print(combined_loss)
        combined_loss.backward()
        optimizer_tts.step()

        for state in optimizer_asr.state.values():
            for k, v in state.items():
                if torch.is_tensor(v):
                    state[k] = v.to(device)

        optimizer_asr.step()

        m_loss_avg.reset()
        duration_avg.reset()
        # pitch_loss_avg.reset()
        dt_msg = f'\n\nFinished DT loop in epoch {e}!\n'
        stream(dt_msg)
        print(' ')
        return tts_u_loss, asr_u_loss