# Example 1 (score: 0)
def build_from_path(input_dir, out_dir, n_jobs=4, tqdm=lambda x: x):
    """
    Preprocess an LJ-Speech-style dataset from a given input path to a given
    output directory.

    Args:
        - input_dir: input directory that contains the files to preprocess;
          expects a ``wavs.txt`` metadata file and a ``wavs/`` subdirectory
        - out_dir: output directory of the preprocessed dataset
        - n_jobs: Optional, number of worker processes to parallelize across
        - tqdm: Optional, provides a nice progress bar

    Returns:
        - A list of tuples describing the train examples. This should be
          written to train.txt
    """

    # We use ProcessPoolExecutor to parallelize across processes; this is just
    # for optimization purposes and it can be omitted.
    executor = ProcessPoolExecutor(max_workers=n_jobs)
    futures = []
    with open(os.path.join(input_dir, 'wavs.txt'), encoding='utf-8') as f:
        # enumerate replaces the manual `index` counter; examples are 1-based.
        for index, line in enumerate(f, start=1):
            # Each metadata line is "<wav id><------><transcript>".
            parts = line.strip().split('<------>')
            wav_path = os.path.join(input_dir, 'wavs', '{}.wav'.format(parts[0]))
            text1 = parts[1]
            # ch2p converts the Chinese transcript to pinyin (project helper).
            text = ch2p(text1)
            print(str(text1) + "====>" + str(text))
            futures.append(executor.submit(partial(_process_utterance, out_dir, index, wav_path, text)))
    return [future.result() for future in tqdm(futures)]
# Example 2 (score: 0)
    def tts_synthesize(self, get_txt, res):
        """
        Synthesize speech for ``get_txt`` and fill ``res`` with 16 kHz WAV
        bytes.

        Args:
            - get_txt: raw input text; normalized by ``process_txt`` and split
              on ',' into per-sentence chunks.
            - res: response-like object mutated as a side effect —
              ``res.data`` receives the WAV bytes and ``res.content_type`` is
              set to 'audio/wav'.
        """
        import datetime
        import glob

        txt = process_txt(get_txt)
        split_sentence = txt.split(',')
        # Accumulate raw audio bytes from each synthesized chunk.
        result = b''

        start_time = datetime.datetime.now()
        app_logger.info('synthesizing ...')

        # Convert each chunk to pinyin before synthesis (Chinese TTS frontend).
        inputs = []
        for x in split_sentence:
            inputs.append(ch2p(x))
            app_logger.info('to pinyin: ' + str([len(j) for j in inputs]) + ' ' + str(inputs))
        out = synthesizer.synthesize(inputs)
        for wav in out:
            # VAD pass trims non-speech from each chunk before concatenation.
            _, wav = vad_check_wav(wav_path=wav)
            result += wav

        # Unique temp filename so concurrent requests cannot collide.
        uuid_str = str(uuid.uuid1()).replace('-', '')
        tmp_fn = os.path.realpath('./tmp/%s.wav' % uuid_str)
        tmp_path = os.path.dirname(tmp_fn)
        app_logger.info('tmp_path: ' + str(tmp_path))

        self.write_wave_vad(wav_path=tmp_fn, audio=result, sample_rate=16000)

        app_logger.info('self.vol: ' + str(self.vol))

        total_len = sum(len(j) for j in split_sentence)
        app_logger.info('total_len: ' + str(total_len))

        # Post-process the temp file (project helper); may produce a new file.
        new_fn = self.handle_wav(tmp_fn)  # , char_len=total_len
        app_logger.info('new_fn after handle_wav: ' + str(new_fn))

        # BUGFIX: the original opened the file without a context manager (a
        # failing read() would leak the handle) and round-tripped the bytes
        # through io.BytesIO for no effect — assign them directly.
        with open(new_fn, 'rb') as fp:
            res.data = fp.read()

        end_time = datetime.datetime.now()
        used_time_ms = (end_time - start_time).total_seconds() * 1000
        app_logger.info('used_time_ms:' + str(used_time_ms))

        res.content_type = 'audio/wav'

        # BUGFIX: replaced the shell `os.system('rm -f ...')` with direct
        # removal — no subshell, no quoting issues with the path.
        for stale in glob.glob(os.path.join(tmp_path, uuid_str + '*.wav')):
            try:
                os.remove(stale)
            except OSError:
                pass  # best-effort cleanup, matching the original `rm -f`
# Example 3 (score: 0)
 def generate(self, text=None):
     """Run Tacotron inference on `text` and reconstruct audio via Griffin-Lim.

     Args:
         - text: input text; converted to pinyin by `ch2p` before encoding.

     NOTE(review): `waveform` is computed but never returned or stored —
     the snippet may be truncated; confirm against the original source.
     """
     # Chinese text -> pinyin, then pinyin -> integer id sequence (batch of 1).
     text = ch2p(text)
     sequence = np.array(text_to_sequence(text,
                                          ['basic_cleaners']))[None, :]
     sequence = torch.autograd.Variable(
         torch.from_numpy(sequence)).cuda().long()
     # Tacotron-style inference: mel spectrogram + postnet refinement.
     mel_outputs, mel_outputs_postnet, _, alignments = self.model.inference(
         sequence)
     taco_stft = TacotronSTFT(self.hparams.filter_length,
                              self.hparams.hop_length,
                              self.hparams.win_length,
                              sampling_rate=self.hparams.sampling_rate)
     # Undo the log/normalization applied during training.
     mel_decompress = taco_stft.spectral_de_normalize(mel_outputs_postnet)
     mel_decompress = mel_decompress.transpose(1, 2).data.cpu()
     # Project mel back to a linear spectrogram via the mel filterbank;
     # the 1000x scale is an empirical gain for Griffin-Lim input.
     spec_from_mel_scaling = 1000
     spec_from_mel = torch.mm(mel_decompress[0], taco_stft.mel_basis)
     spec_from_mel = spec_from_mel.transpose(0, 1).unsqueeze(0)
     spec_from_mel = spec_from_mel * spec_from_mel_scaling
     # 60 Griffin-Lim iterations to estimate phase; last frame dropped.
     waveform = griffin_lim(
         torch.autograd.Variable(spec_from_mel[:, :, :-1]),
         taco_stft.stft_fn, 60)
def build_from_path(in_dir,
                    out_dir,
                    silence_threshold,
                    fft_size,
                    num_workers=cpu_count(),
                    tqdm=lambda x: x):
    """Queue one preprocessing job per entry of ``<in_dir>/wavs.txt``.

    Each metadata line ("<wav id><------><transcript>") is converted to pinyin
    and handed to ``_process_utterance`` in a worker process.

    Args:
        - in_dir: dataset root containing ``wavs.txt`` and a ``wavs/`` folder
        - out_dir: destination directory for preprocessed output
        - silence_threshold / fft_size: forwarded to ``_process_utterance``
        - num_workers: size of the process pool
        - tqdm: optional progress-bar wrapper

    Returns:
        - The list of per-utterance results, in file order.
    """
    pool = ProcessPoolExecutor(max_workers=num_workers)
    pending = []
    metadata_path = os.path.join(in_dir, 'wavs.txt')
    with open(metadata_path, encoding='utf-8') as metadata:
        # Examples are numbered from 1, in the order they appear in the file.
        for index, line in enumerate(metadata, start=1):
            fields = line.strip().split('<------>')
            wav_path = os.path.join(in_dir, 'wavs', '%s.wav' % fields[0])
            raw_text = fields[1]
            pinyin_text = ch2p(raw_text)
            print("%s.wav: %s ===>%s" % (fields[0], raw_text, pinyin_text))
            job = partial(_process_utterance, out_dir, index, wav_path,
                          pinyin_text, silence_threshold, fft_size)
            pending.append(pool.submit(job))
    return [job_future.result() for job_future in tqdm(pending)]