def main(url: str, debug: bool = True) -> None:
    """Summarize the webpage at `url` into clustered notes.

    :param url: address of the webpage to summarize
    :param debug: print debug info
    """
    title, paragraphs = fetch.parse_webpage(url)
    stopwords = nltk.corpus.stopwords.words('english')
    cleaned = text.clean_text(paragraphs, stopwords)
    sent_tok = nltk.sent_tokenize(paragraphs)  # sentence split (unused downstream)
    unique = list(set(cleaned))
    # text.generate_wordcloud(' '.join(cleaned), stopwords)
    model, db, clusters, word_clusters, n_clusters, n_noise = models.word2vec_model(
        unique, min_count=1, window=5, verbose=True)
    notes = text.create_notes(paragraphs, word_clusters)
    if debug:
        print('-' * 40)
        print(clusters.shape, len(unique))
        print(*word_clusters, sep='\n')
        print('-' * 40)
        print(notes)
        print('-' * 40)
        print(f"{round(100 * (1 - (len(notes) / len(paragraphs))), 3)}% "
              f"reduction in size from the original")
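# Hedged usage sketch: `fetch`, `text`, and `models` above are this project's
# own modules; the argparse wrapper below is only an illustration of how
# `main` could be invoked, not the project's actual CLI.
if __name__ == '__main__':
    import argparse

    parser = argparse.ArgumentParser(description='Summarize a webpage into notes')
    parser.add_argument('url', help='address of the webpage to summarize')
    parser.add_argument('--quiet', action='store_true', help='suppress debug output')
    args = parser.parse_args()
    main(args.url, debug=not args.quiet)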
def process_wav(path: Path):
    wav_id = path.stem
    m, x = convert_file(path)
    # save the mel spectrogram and quantized waveform for this utterance
    np.save(paths.mel / f'{wav_id}.npy', m, allow_pickle=False)
    np.save(paths.quant / f'{wav_id}.npy', x, allow_pickle=False)
    text = text_dict[wav_id]
    text = clean_text(text)
    return wav_id, m.shape[-1], text
def __call__(self, path: Path) -> Tuple[str, int, str]:
    wav_id = path.stem
    m, x = self._convert_file(path)
    np.save(self.paths.mel / f'{wav_id}.npy', m, allow_pickle=False)
    np.save(self.paths.quant / f'{wav_id}.npy', x, allow_pickle=False)
    text = self.text_dict[wav_id]
    text = clean_text(text)
    return wav_id, m.shape[-1], text
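# A minimal sketch of how the callable preprocessor above could drive a
# parallel pass over a corpus. The `Pool` usage is an assumption based on the
# per-file design; `wav_files` stands in for the list of corpus .wav paths.
from multiprocessing import Pool

def preprocess_all(processor, wav_files, n_workers=4):
    dataset = []
    with Pool(processes=n_workers) as pool:
        # each worker saves its mel/quant arrays and returns (id, n_frames, text)
        for wav_id, n_frames, text in pool.imap_unordered(processor, wav_files):
            dataset.append((wav_id, n_frames, text))
    return dataset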
def synthesize(input_text, tts_model, voc_model, alpha=1.0,
               device=torch.device('cuda')):
    text = clean_text(input_text.strip())
    x = text_to_sequence(text)
    _, m, _ = tts_model.generate(x, alpha=alpha)
    if voc_model == 'griffinlim':
        wav = reconstruct_waveform(m, n_iter=32)
    elif isinstance(voc_model, WaveRNN):
        m = torch.tensor(m).unsqueeze(0)
        wav = voc_model.generate(m, '/tmp/sample.wav', True,
                                 hp.voc_target, hp.voc_overlap, hp.mu_law)
    else:
        m = torch.tensor(m).unsqueeze(0).to(device)
        with torch.no_grad():
            wav = voc_model.inference(m).cpu().numpy()
    return wav
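# Usage sketch (the helper name and output path are illustrative; `save_wav`
# is the project's own utility). Passing the string 'griffinlim' as
# `voc_model` selects the iterative phase-reconstruction branch above, so no
# neural vocoder has to be loaded.
def demo_griffinlim(tts_model):
    wav = synthesize('Hello world.', tts_model, voc_model='griffinlim')
    save_wav(wav, '/tmp/demo_griffinlim.wav')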
def synthesize(input_text, tts_model, voc_model, alpha=1.0):
    text = clean_text(input_text.strip())
    x = text_to_sequence(text)
    _, m, _ = tts_model.generate(x, alpha=alpha)
    # Rescale the mel spectrogram from [-4, 4] to [0, 1]
    m = (m + 4) / 8
    np.clip(m, 0, 1, out=m)
    if voc_model == 'griffinlim':
        wav = reconstruct_waveform(m, n_iter=32)
    else:
        m = torch.tensor(m).unsqueeze(0)
        wav = voc_model.generate(m, '/tmp/sample.wav', True,
                                 hp.voc_target, hp.voc_overlap, hp.mu_law)
    return wav
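# A minimal sketch generalizing the normalization above: the model is assumed
# to emit mels in roughly [-4, 4] (which is what (m + 4) / 8 implies), so the
# affine map sends -4 -> 0 and 4 -> 1; e.g. m = 2 maps to (2 + 4) / 8 = 0.75.
# np.clip then guards against the occasional value outside that range.
def to_unit_range(m, lo=-4.0, hi=4.0):
    """Map mel values from [lo, hi] to [0, 1], clipping outliers."""
    m = (m - lo) / (hi - lo)
    return np.clip(m, 0, 1)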
def synthesize(input_text: str,
               tts_model: ForwardTacotron,
               voc_model: torch.nn.Module,
               alpha=1.0,
               pitch_function: Callable[[torch.Tensor], torch.Tensor] = lambda x: x):
    text = clean_text(input_text.strip())
    x = text_to_sequence(text)
    _, m, _, _ = tts_model.generate(x, alpha=alpha, pitch_function=pitch_function)
    if voc_model == 'griffinlim':
        wav = reconstruct_waveform(m, n_iter=32)
    elif isinstance(voc_model, WaveRNN):
        m = torch.tensor(m).unsqueeze(0)
        wav = voc_model.generate(m, '/tmp/sample.wav', True,
                                 hp.voc_target, hp.voc_overlap, hp.mu_law)
    else:
        m = torch.tensor(m).unsqueeze(0).cuda()
        with torch.no_grad():
            wav = voc_model.inference(m).cpu().numpy()
    return wav
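# Hedged usage sketch for the pitch hook: `pitch_function` is assumed to be
# applied to the predicted pitch contour inside `tts_model.generate`, so a
# simple lambda can shift prosody globally. The 1.2 factor is illustrative.
def demo_pitch_shift(tts_model, voc_model):
    # raise the predicted pitch by 20% before vocoding
    return synthesize('Hello world.', tts_model, voc_model,
                      pitch_function=lambda pitch: pitch * 1.2)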
    # (fragment: tail of the ForwardTacotron model construction)
    pitch_emb_dims=hp.forward_pitch_emb_dims,
    pitch_proj_dropout=hp.forward_pitch_proj_dropout,
    rnn_dim=hp.forward_rnn_dims,
    postnet_k=hp.forward_postnet_K,
    postnet_dims=hp.forward_postnet_dims,
    prenet_k=hp.forward_prenet_K,
    prenet_dims=hp.forward_prenet_dims,
    highways=hp.forward_num_highways,
    dropout=hp.forward_dropout,
    n_mels=hp.num_mels).to(device)

tts_load_path = tts_weights if tts_weights else paths.forward_latest_weights
tts_model.load(tts_load_path)

if input_text:
    text = clean_text(input_text.strip())
    inputs = [text_to_sequence(text)]
else:
    with open('sentences.txt') as f:
        inputs = [clean_text(l.strip()) for l in f]
    inputs = [text_to_sequence(t) for t in inputs]

tts_k = tts_model.get_step() // 1000

if args.vocoder == 'wavernn':
    voc_k = voc_model.get_step() // 1000
    simple_table([('Forward Tacotron', str(tts_k) + 'k'),
                  ('Vocoder Type', 'WaveRNN'),
                  ('WaveRNN', str(voc_k) + 'k'),
                  ('Generation Mode', 'Batched' if batched else 'Unbatched'),
def dual_transform(self, model_tts, model_asr, optimizer_tts, optimizer_asr,
                   asr_test_set, m_loss_avg, dur_loss_avg, device,
                   asr_current_step, e, epochs, duration_avg, total_iters,
                   tts_s_loss, asr_s_loss, tts_lr, tts_dt_path):
    print('\n\nStarting DualTransformation loop...\n')
    tmp_dir = './checkpoints/sme_speech_tts.asr_forward/dual_transform_tmp'
    os.makedirs(tmp_dir, exist_ok=True)

    # generate tmp ASR training data: synthesize speech for unpaired text
    asr_train_data = []
    input_set = get_unpaired_txt(35)
    text = [clean_text(v) for v in input_set]
    inputs = [text_to_sequence(t) for t in text]
    for i, x in enumerate(inputs, 1):
        _, m, dur = model_tts.generate(x, alpha=1.)
        wav = reconstruct_waveform(m, n_iter=32)
        wav_path = os.path.join(tmp_dir, f'{i}.wav')
        save_wav(wav, wav_path)
        asr_train_data.append((wav_path, text[i - 1]))
    dt_asr_data = load_dt_data(asr_train_data)

    # reinit trainer with only the tmp train data
    asr_trainer_dt = init_trainer(dt_asr_data, None)
    dt_train = asr_trainer_dt.get_train_dataloader()

    # unsupervised training loop for ASR
    for step, inputs in enumerate(dt_train, 1):
        model_asr.train()
        model_asr.to(device)
        for k, v in inputs.items():
            if isinstance(v, torch.Tensor):
                inputs[k] = v.to(device)
        outputs = model_asr(**inputs)
        asr_u_loss = outputs["loss"] if isinstance(outputs, dict) else outputs[0]
        msg_asr = f'| ASR MODEL (unsupervised training): ' \
                  f'| Epoch: {e}/{epochs} ({step}/{len(dt_train)}) ' \
                  f'| Loss ASR: {asr_u_loss:#.4} '
        stream(msg_asr)

    # generate tmp TTS data from ASR transcriptions
    asr_predict_for_dt(model_asr)
    subprocess.check_output('python preprocess.py -p "./data/speech-sme-tts" -d=True',
                            shell=True, stderr=subprocess.STDOUT)
    print('Finished preprocessing for tmp data!')
    tmp_tts_train = get_tts_datasets(tts_dt_path, batch_size=2, r=1,
                                     model_type='forward_dt')
    print('Loaded tmp dataset!')

    # unsupervised training loop for TTS; per-step backward/step is deferred
    # because the combined loss is backpropagated after the loop
    for i, (x, m, ids, x_lens, mel_lens, dur) in enumerate(tmp_tts_train, 1):
        start = time.time()
        model_tts.to(device)
        model_tts.train()
        x, m, dur, x_lens, mel_lens = x.to(device), m.to(device), dur.to(device), \
            x_lens.to(device), mel_lens.to(device)
        m1_hat, m2_hat, dur_hat = model_tts(x, m, dur, mel_lens)
        m1_loss = self.l1_loss(m1_hat, m, mel_lens)
        m2_loss = self.l1_loss(m2_hat, m, mel_lens)
        dur_loss = self.l1_loss(dur_hat.unsqueeze(1), dur.unsqueeze(1), x_lens)
        tts_u_loss = m1_loss + m2_loss + 0.1 * dur_loss
        torch.nn.utils.clip_grad_norm_(model_tts.parameters(), hp.tts_clip_grad_norm)
        m_loss_avg.add(m1_loss.item() + m2_loss.item())
        dur_loss_avg.add(dur_loss.item())
        step = model_tts.get_step()
        k = step // 1000
        duration_avg.add(time.time() - start)
        speed = 1. / duration_avg.get()
        msg_tts = f'| TTS MODEL (unsupervised training): ' \
                  f'| Epoch: {e}/{epochs} ({i}/{total_iters}) ' \
                  f'| Mel Loss: {m_loss_avg.get():#.4} ' \
                  f'| Dur Loss: {dur_loss_avg.get():#.4} ' \
                  f'| {speed:#.2} steps/s | Step: {k}k | '
        stream(msg_tts)

    # combine supervised and unsupervised losses and update both models
    combined_loss = 0.5 * (tts_s_loss + asr_s_loss) + (tts_u_loss + asr_u_loss)
    combined_loss = combined_loss.to(device)
    combined_loss.backward()
    optimizer_tts.step()
    # move the ASR optimizer state to the right device before stepping
    for state in optimizer_asr.state.values():
        for k, v in state.items():
            if torch.is_tensor(v):
                state[k] = v.to(device)
    optimizer_asr.step()

    m_loss_avg.reset()
    duration_avg.reset()
    stream(f'\n\nFinished DT loop in epoch {e}!\n')
    print(' ')
    return tts_u_loss, asr_u_loss
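# The combined objective above, isolated as a minimal sketch for clarity:
# paired (supervised) losses are down-weighted by 0.5 against the unpaired
# (dual-transform) losses. The weight value comes straight from the code;
# the function name is an illustration, not part of the project.
def combine_dt_losses(tts_s_loss, asr_s_loss, tts_u_loss, asr_u_loss,
                      supervised_weight=0.5):
    return supervised_weight * (tts_s_loss + asr_s_loss) + (tts_u_loss + asr_u_loss)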