Code example #1
 def synthesize(self, text, mel_targets=None, reference_mel=None):
     cleaner_names = [x.strip() for x in hparams.cleaners.split(',')]
     seq = text_to_sequence(text, cleaner_names)
     feed_dict = {
         self.model.inputs: [np.asarray(seq, dtype=np.int32)],
         self.model.input_lengths: np.asarray([len(seq)], dtype=np.int32),
     }
     if mel_targets is not None:
         mel_targets = np.expand_dims(mel_targets, 0)
         feed_dict.update({
             self.model.mel_targets:
             np.asarray(mel_targets, dtype=np.float32)
         })
     if reference_mel is not None:
         reference_mel = np.expand_dims(reference_mel, 0)
         feed_dict.update({
             self.model.reference_mel:
             np.asarray(reference_mel, dtype=np.float32)
         })
     wav = self.session.run(self.wav_output, feed_dict=feed_dict)
     wav = audio.inv_preemphasis(wav)
     wav = wav[:audio.find_endpoint(wav)]
     out = io.BytesIO()
     audio.save_wav(wav, out)
     return out.getvalue()
Code example #2
  def synthesize(self, text):
    cleaner_names = [x.strip() for x in hparams.cleaners.split(',')]
    print('***cleaner_names:', cleaner_names)
    print('***text:', text)
    # split the input into sentences and synthesize each one separately
    texts = tokenizer.tokenize(text)
    waves = []

    for text in texts:
      seq = text_to_sequence(text, cleaner_names)
      print('***seq:', seq)

      feed_dict = {
        self.model.inputs: [np.asarray(seq, dtype=np.int32)],
        self.model.input_lengths: np.asarray([len(seq)], dtype=np.int32)
      }
      wav = self.session.run(self.wav_output, feed_dict=feed_dict)
      wav = audio.inv_preemphasis(wav)
      wav = wav[:audio.find_endpoint(wav)]
      waves.append(wav)
    # concatenate the per-sentence waveforms into a single output
    wavestack = waves[0]
    for wave in waves[1:]:
      wavestack = np.hstack((wavestack, wave))
    out = io.BytesIO()
    audio.save_wav(wavestack, out)
    return out.getvalue()
Code example #3
    def synthesize(self,
                   text,
                   mel_targets=None,
                   reference_mel=None,
                   reference_weight=None,
                   alignment_path=None,
                   reference_path=None,
                   style_path=None,
                   weight_path=None):
        cleaner_names = [x.strip() for x in hparams.cleaners.split(',')]
        seq = text_to_sequence(text, cleaner_names)
        feed_dict = {
            self.model.inputs: [np.asarray(seq, dtype=np.int32)],
            self.model.input_lengths: np.asarray([len(seq)], dtype=np.int32),
        }
        if mel_targets is not None:
            mel_targets = np.expand_dims(mel_targets, 0)
            feed_dict.update({
                self.model.mel_targets:
                np.asarray(mel_targets, dtype=np.float32)
            })
        elif reference_mel is not None:
            reference_mel = np.expand_dims(reference_mel, 0)
            feed_dict.update({
                self.model.reference_mel:
                np.asarray(reference_mel, dtype=np.float32)
            })
        elif reference_weight is not None:
            feed_dict.update({
                self.model.reference_weight:
                np.asarray(reference_weight, dtype=np.float32)
            })

        wav, alignments, style_embeddings, style_weights = self.session.run(
            [
                self.wav_output, self.alignments, self.style_embeddings,
                self.style_weights
            ],
            feed_dict=feed_dict)
        wav = audio.inv_preemphasis(wav)
        end_point = audio.find_endpoint(wav)
        wav = wav[:end_point]
        out = io.BytesIO()
        audio.save_wav(wav, out)
        n_frame = int(
            end_point /
            (hparams.frame_shift_ms / 1000 * hparams.sample_rate)) + 1
        text = '\n'.join(textwrap.wrap(text, 70, break_long_words=False))
        plot.plot_alignment(alignments[:, :n_frame],
                            alignment_path,
                            info='%s' % (text))
        plot.plot_weight(style_weights, weight_path)
        # np.save(reference_path, refer_embeddings)
        np.save(style_path, style_embeddings)
        return out.getvalue()
Code example #4
File: synthesizer.py Project: DorsetProject/tacotron
 def synthesize(self, text):
     cleaner_names = [x.strip() for x in hparams.cleaners.split(',')]
     seq = text_to_sequence(text, cleaner_names)
     feed_dict = {
         self.model.inputs: [np.asarray(seq, dtype=np.int32)],
         self.model.input_lengths: np.asarray([len(seq)], dtype=np.int32)
     }
     wav = self.session.run(self.wav_output, feed_dict=feed_dict)
     out = io.BytesIO()
     audio.save_wav(audio.inv_preemphasis(wav), out)
     return out.getvalue()
Code example #5
 def synthesize(self, text, out):
     cleaner_names = [x.strip() for x in hparams.cleaners.split(',')]
     seq = text_to_sequence(text, cleaner_names)
     feed_dict = {
         self.model.inputs: [np.asarray(seq, dtype=np.int32)],
         self.model.input_lengths: np.asarray([len(seq)], dtype=np.int32)
     }
     wav = self.session.run(self.wav_output, feed_dict=feed_dict)
     wav = audio.inv_preemphasis(wav)
     wav = wav[:audio.find_endpoint(wav)]
     audio.save_wav(wav, out)
     return
Code example #6
File: synthesizer.py Project: keithito/tacotron
 def synthesize(self, text):
   cleaner_names = [x.strip() for x in hparams.cleaners.split(',')]
   seq = text_to_sequence(text, cleaner_names)
   feed_dict = {
     self.model.inputs: [np.asarray(seq, dtype=np.int32)],
     self.model.input_lengths: np.asarray([len(seq)], dtype=np.int32)
   }
   wav = self.session.run(self.wav_output, feed_dict=feed_dict)
   wav = audio.inv_preemphasis(wav)
   wav = wav[:audio.find_endpoint(wav)]
   out = io.BytesIO()
   audio.save_wav(wav, out)
   return out.getvalue()
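Each of these methods returns the full WAV file contents as bytes (the audio.save_wav output captured in a BytesIO buffer). A minimal usage sketch, assuming a keithito/tacotron-style Synthesizer with a load(checkpoint_path) method; the module path, checkpoint path and text below are illustrative only:

from synthesizer import Synthesizer  # assumed module layout (as in keithito/tacotron)

synth = Synthesizer()
synth.load('/path/to/model.ckpt')              # restore the trained graph and weights
wav_bytes = synth.synthesize('Hello world.')   # a complete WAV file as a bytes object
with open('output.wav', 'wb') as f:            # the bytes can be written to disk as-is
    f.write(wav_bytes)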
Code example #7
 def synthesize(self, in_file):
   src_spectrogram = audio.spectrogram(in_file,
                                       num_src_freq=hparams.num_src_freq,
                                       frame_length_ms=hparams.src_frame_length_ms).astype(np.float32)
   feed_dict = {
     self.model.inputs: [np.asarray(src_spectrogram, dtype=np.float32)],
     self.model.input_lengths: np.asarray([len(src_spectrogram)], dtype=np.int32)
   }
   wav = self.session.run(self.wav_output, feed_dict=feed_dict)
   wav = audio.inv_preemphasis(wav)
   wav = wav[:audio.find_endpoint(wav)]
   out = io.BytesIO()
   audio.save_wav(wav, out)
   return out.getvalue()
Code example #8
File: eval.py Project: Bruce-Stark/SV-Tacotron
 def synthesize(self, text):
   # convert the Chinese text to phonetic (pinyin) characters
   text = Pinyin().get_pinyin(text, " ", tone_marks='numbers')
   cleaner_names = [x.strip() for x in hparams.cleaners.split(',')]
   # phonetic characters to an ID sequence
   seq = text_to_sequence(text, cleaner_names)
   feed_dict = {
     self.model.inputs: [np.asarray(seq, dtype=np.int32)],
     self.model.input_lengths: np.asarray([len(seq)], dtype=np.int32)}
   wav = self.session.run(self.wav_output, feed_dict=feed_dict)
   wav = audio.inv_preemphasis(wav)
   out = io.BytesIO()
   audio.save_wav(wav, out)
   return out.getvalue()
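Here Pinyin().get_pinyin(text, " ", tone_marks='numbers') first converts the Chinese input into space-separated pinyin syllables with numeric tones, and only that romanized text is passed through the cleaners and text_to_sequence. A small sketch of this preprocessing step, assuming the Pinyin class comes from the xpinyin package:

from xpinyin import Pinyin  # assumed source of the Pinyin class used above

p = Pinyin()
print(p.get_pinyin(u'你好', ' ', tone_marks='numbers'))  # -> 'ni3 hao3'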
Code example #9
    def synthesize(self, input_path):
        s, sr = sf.read(input_path)
        spec = audio.melspectrogram(s).astype(np.float32).T

        feed_dict = {
            self.model.inputs: [np.asarray(spec, dtype=np.float32)],
            self.model.input_lengths: np.asarray([spec.shape[0]],
                                                 dtype=np.int32)
        }
        wav = self.session.run(self.wav_output, feed_dict=feed_dict)
        wav = audio.inv_preemphasis(wav)
        wav = wav[:audio.find_endpoint(wav)]
        out = io.BytesIO()
        audio.save_wav(wav, out)
        return out.getvalue()
Code example #10
    def synthesize(self, text):  # for demo_server
        cleaner_names = [x.strip() for x in hparams.cleaners.split(',')]
        seq = text_to_sequence(text, cleaner_names)
        feed_dict = {
            self.model.inputs: [np.asarray(seq, dtype=np.int32)],
            self.model.input_lengths: np.asarray([len(seq)], dtype=np.int32)
        }
        wav = self.session.run(self.wav_output, feed_dict=feed_dict)
        wav = audio.inv_preemphasis(wav)
        wav = wav[:audio.find_endpoint(wav)]
        out = io.BytesIO()

        audio.save_wav(wav, out)
        # print(type(out))
        return out.getvalue()  # returns bytes obj
Code example #11
 def synthesize(self, text):
   text = arpa.to_arpa(text)
   cleaner_names = [x.strip() for x in hparams.cleaners.split(',')]
   seq = text_to_sequence(text, cleaner_names)
   feed_dict = {
     self.model.inputs: [np.asarray(seq, dtype=np.int32)],
     self.model.input_lengths: np.asarray([len(seq)], dtype=np.int32)
   }
   wav = self.session.run(self.wav_output, feed_dict=feed_dict)
   wav = audio.inv_preemphasis(wav)
   wav = wav[:audio.find_endpoint(wav)]
   out = io.BytesIO()
   # audio.save_wav(wav, out)
   # NOTE: the wav is written to the fixed path below rather than to the
   # BytesIO buffer, so the returned value is empty bytes.
   audio.save_wav(wav, "/content/drive/MyDrive/voice_cloning/out_sample")
   print("finished")
   return out.getvalue()
Code example #12
File: estimator.py Project: meelement/tacotron-2
def main(args):
    os.makedirs(args.model_dir, exist_ok=True)
    estimator = tf.estimator.Estimator(
        model_fn=model_fn,
        model_dir=args.model_dir,
        params=hparams,
        config=RunConfig(
            save_summary_steps=args.summary_interval,
            save_checkpoints_steps=args.checkpoint_interval,
            session_config=SESS_CFG,
            # log_step_count_steps=100,
            keep_checkpoint_max=2))
    if args.mode == 'train':
        os.makedirs(args.data_dir, exist_ok=True)
        estimator.train(input_fn=lambda: train_input_fn(args.data_dir))
    elif args.mode == 'predict':
        assert len(args.texts), "No text to predict"
        results = estimator.predict(
            input_fn=lambda: predict_input_fn(args.texts))
        for idx, wav in enumerate(results):
            wav = inv_preemphasis(wav)
            # wav = wav[:find_endpoint(wav)]
            # sp.save('wav_{}.npy'.format(idx), wav, allow_pickle=False)
            save_wav(wav, 'output_{}.wav'.format(idx))
            # break
    elif args.mode == 'export':
        os.makedirs(args.export_dir, exist_ok=True)
        estimator.export_saved_model(
            args.export_dir,
            tf.estimator.export.build_raw_serving_input_receiver_fn(
                {
                    'inputs':
                    tf.placeholder(
                        dtype=tf.int32, shape=(None, None), name='inputs'),
                    'lengths':
                    tf.placeholder(
                        dtype=tf.int32, shape=(None, ), name='lengths'),
                },
                default_batch_size=None),
            # assets_extra=None,
            # as_text=False,
            # checkpoint_path=None,
            # experimental_mode=ModeKeys.PREDICT
        )
    else:
        raise KeyError('Unknown Mode <{}>'.format(args.mode))
Code example #13
File: eval.py Project: back2zion/Tacotron-Korean
 def synthesize(self, text, base_path, idx):
     seq = text_to_sequence(text)
     feed_dict = {
         self.model.inputs: [np.asarray(seq, dtype=np.int32)],
         self.model.input_lengths: np.asarray([len(seq)], dtype=np.int32)
     }
     input_seq, wav, alignment = self.session.run(
         [self.inputs, self.wav_output, self.alignments],
         feed_dict=feed_dict)
     wav = audio.inv_preemphasis(wav)
     wav = wav[:audio.find_endpoint(wav)]
     out = io.BytesIO()
     audio.save_wav(wav, out)
     input_seq = sequence_to_text(input_seq)
     plot.plot_alignment(alignment, '%s-%d-align.png' % (base_path, idx),
                         input_seq)
     return out.getvalue()
Code example #14
    def synthesize(self,
                   path_in,
                   path_re,
                   mel_targets=None,
                   reference_mel=None,
                   alignment_path=None):
        wav_in = audio.load_wav(path_in)
        wav_re = audio.load_wav(path_re)
        mel_in = audio.melspectrogram(wav_in).astype(np.float32)
        mel_re = audio.melspectrogram(wav_re).astype(np.float32)
        # print(mel_jp)
        feed_dict = {
            self.model.inputs: [mel_in.T],
            # number of frames: audio.melspectrogram returns (n_mels, T), so
            # the sequence length is the second dimension of mel_in
            self.model.input_lengths: np.asarray([mel_in.shape[1]],
                                                 dtype=np.int32),
            self.model.inputs_jp: [mel_re.T],
        }
        # if mel_targets is not None:
        #   mel_targets = np.expand_dims(mel_targets, 0)
        #   print(reference_mel.shapex)
        #   feed_dict.update({self.model.mel_targets: np.asarray(mel_targets, dtype=np.float32)})
        # if reference_mel is not None:
        #   reference_mel = np.expand_dims(reference_mel, 0)
        #   print(reference_mel.shapex)
        #   feed_dict.update({self.model.reference_mel: np.asarray(reference_mel, dtype=np.float32)})

        wav_out, alignments = self.session.run(
            [self.wav_output, self.alignments], feed_dict=feed_dict)
        wav = audio.inv_preemphasis(wav_out)
        end_point = audio.find_endpoint(wav)
        wav = wav[:end_point]
        nowTime = datetime.datetime.now().strftime("%Y%m%d%H%M%S")  # current timestamp
        randomNum = random.randint(0, 100)  # random integer n, where 0 <= n <= 100
        if randomNum <= 10:
            randomNum = str(0) + str(randomNum)
        uniqueNum = str(nowTime) + str(randomNum)
        out_dir = "static\\out\\" + uniqueNum + ".wav"
        out_name = uniqueNum + ".wav"

        audio.save_wav(wav, out_dir)
        out = io.BytesIO()
        audio.save_wav(wav, out)
        # n_frame = int(end_point / (hparams.frame_shift_ms / 1000* hparams.sample_rate)) + 1
        # plot.plot_alignment(alignments[:,:n_frame], alignment_path, info='%s' % (path))
        return out_dir, out_name
Code example #15
 def synthesize(self, text):
     #print('synthesize:',text)
     cleaner_names = [x.strip() for x in hparams.cleaners.split(',')]
     #text=sentence_to_pinyin(text)
     #print('text:',text)
     #print('cleaner_names:',cleaner_names)
     seq = text_to_sequence_zh(text, cleaner_names)
     print(seq)
     feed_dict = {
         self.model.inputs: [np.asarray(seq, dtype=np.int32)],
         self.model.input_lengths: np.asarray([len(seq)], dtype=np.int32)
     }
     wav = self.session.run(self.wav_output, feed_dict=feed_dict)
     wav = audio.inv_preemphasis(wav)
     wav = wav[:audio.find_endpoint(wav)]
     out = io.BytesIO()
     audio.save_wav(wav, out)
     return out.getvalue()
Code example #16
 def synthesize(self, text, title):
     cleaner_names = [x.strip() for x in hparams.cleaners.split(',')]
     seq = text_to_sequence(text, cleaner_names)
     feed_dict = {
         self.model.inputs: [np.asarray(seq, dtype=np.int32)],
         self.model.input_lengths: np.asarray([len(seq)], dtype=np.int32)
     }
     wav = self.session.run(self.wav_output, feed_dict=feed_dict)
     wav = audio.inv_preemphasis(wav)
     wav = wav[:audio.find_endpoint(wav)]
     out = io.BytesIO()
     audio.save_wav(wav, out)
     cwd = os.getcwd()
     audio_dir = cwd + "/narration/saved_audio/" + title + ".wav"
     print(audio_dir)
     with open(audio_dir, "wb") as f:
         f.write(out.getvalue())
     os.system("aplay " + audio_dir)
     return out.getvalue()
Code example #17
 def synthesize(self, text):
     cleaner_names = [x.strip() for x in hparams.cleaners.split(',')]
     # g2p = G2p()
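     # the input text is expected as "<c_text>|<p_text>"; each half is converted
     # to its own sequence and fed to the corresponding c_* / p_* placeholders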
     c_text=text.split('|')[0]
     p_text=text.split('|')[1]
     c_seq = text_to_sequence(c_text, cleaner_names)
     p_seq = text_to_sequence(p_text, cleaner_names)
     feed_dict = {
         self.model.c_inputs: [np.asarray(c_seq, dtype=np.int32)],
         self.model.p_inputs: [np.asarray(p_seq, dtype=np.int32)],
         self.model.c_input_lengths: np.asarray([len(c_seq)], dtype=np.int32),
         self.model.p_input_lengths: np.asarray([len(p_seq)], dtype=np.int32)
     }
     wav = self.session.run(self.wav_output, feed_dict=feed_dict)
     wav = audio.inv_preemphasis(wav)
     wav = wav[:audio.find_endpoint(wav)]
     out = io.BytesIO()
     audio.save_wav(wav, out)
     return out.getvalue()
Code example #18
File: synthesizer.py Project: bayalievu/tacotron
 def synthesize(self, text):
     with Synthesizer.mutex:
         if not Synthesizer.processing:
             Synthesizer.processing = True
             cleaner_names = [
                 x.strip() for x in hparams.cleaners.split(',')
             ]
             seq = text_to_sequence(text, cleaner_names)
             feed_dict = {
                 self.model.inputs: [np.asarray(seq, dtype=np.int32)],
                 self.model.input_lengths:
                 np.asarray([len(seq)], dtype=np.int32)
             }
             wav = self.session.run(self.wav_output, feed_dict=feed_dict)
             wav = audio.inv_preemphasis(wav)
             wav = wav[:audio.find_endpoint(wav)]
             out = io.BytesIO()
             audio.save_wav(wav, out)
             Synthesizer.processing = False
             return out.getvalue()
         else:
             return None
Code example #19
File: synthesizer.py Project: param17/Songbird
    def synthesize(self, images_dir, output_wav_dir):
        for path, _, filenames in os.walk(images_dir):
            for i in trange(len(filenames)):
                test_file = filenames[i]
                if str.endswith(test_file, '.png'):
                    base_file_name, _ = os.path.splitext(test_file)
                    raw_image = imread(os.path.join(path, test_file),
                                       mode='RGB')
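                    # resize to a fixed 224x224 RGB array before feeding it to the model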
                    processed_image = imresize(raw_image, (224, 224, 3))

                    feed_dict = {
                        self.model.inputs:
                        [np.asarray(processed_image, dtype=np.float32)],
                    }
                    wav = self.session.run(self.wav_output,
                                           feed_dict=feed_dict)
                    wav = audio.inv_preemphasis(wav)
                    wav = wav[:audio.find_endpoint(wav)]
                    audio_out_path = os.path.join(
                        output_wav_dir, 'eval-{}.wav'.format(base_file_name))
                    audio.save_wav(wav, audio_out_path)
                    print('Wav - {} generated successfully!'.format(
                        audio_out_path))
Code example #20
 def synthesize(self, lab_name):
     lab = np.load(lab_name)
     lab = np.expand_dims(lab, axis=0)
     feed_dict = {
         self.model.inputs:
         lab,
         self.model.input_lengths:
         np.asarray([lab.shape[1]], dtype=np.int32),
         # change 0 to 1 or others based on the speaker
         self.model.speaker_ids:
         np.asarray([2], dtype=np.int32)
     }
     wav, mel_outputs = self.session.run(
         [self.wav_output, self.model.mel_outputs[0]], feed_dict=feed_dict)
     wav = audio.inv_preemphasis(wav)
     end_point = audio.find_endpoint(wav)
     wav = wav[:end_point]
     # trim the mel output to the frames covered by the trimmed waveform
     # (frame shift taken from hparams, as in the other examples)
     n_frames = int(end_point / (hparams.frame_shift_ms / 1000 * hparams.sample_rate)) + 1
     mel_outputs = mel_outputs[:n_frames, :]
     out = io.BytesIO()
     audio.save_wav(wav, out)
     return out.getvalue(), mel_outputs
Code example #21
    def synthesize(self, text, identity, path=None, path_align=None):
        cleaner_names = [x.strip() for x in hparams.cleaners.split(',')]
        seq = text_to_sequence2(text, cleaner_names)[:-1]
        print(seq)
        print(sequence_to_text2(seq))
        feed_dict = {
            self.model.inputs: [np.asarray(seq, dtype=np.int32)],
            self.model.input_lengths: np.asarray([len(seq)], dtype=np.int32),
            self.model.identities: np.asarray([identity], dtype=np.int32),
        }
        wav, alignment = self.session.run([self.wav_output, self.alignment],
                                          feed_dict=feed_dict)
        if path_align is not None:
            plot.plot_alignment(alignment, path_align)
        wav = audio.inv_preemphasis(wav)
        #wav = wav[:audio.find_endpoint(wav)]
        out = io.BytesIO()
        if path is not None:
            audio.save_wav(wav, path)
        else:
            audio.save_wav(wav, './1.wav')

        # NOTE: the wav is saved to disk rather than to the BytesIO buffer,
        # so the return value is empty bytes.
        return out.getvalue()
Code example #22
File: synthesizer.py Project: Jim-Song/n_tacotron
 def synthesize(self, text, mel_spec):
     cleaner_names = [x.strip() for x in self.hparams.cleaners.split(',')]
     seq = text_to_sequence(text, cleaner_names)
     if self.hparams.enable_fv1 or self.hparams.enable_fv2:
         feed_dict = {
             self.model.inputs: [np.asarray(seq, dtype=np.int32)],
             self.model.input_lengths: np.asarray([len(seq)],
                                                  dtype=np.int32),
             self.net.data2: mel_spec
         }
     else:
         feed_dict = {
             self.model.inputs: [np.asarray(seq, dtype=np.int32)],
             self.model.input_lengths: np.asarray([len(seq)],
                                                  dtype=np.int32),
         }
     wav, alignment = self.session.run(
         [self.wav_output, self.model.alignments], feed_dict=feed_dict)
     wav = audio.inv_preemphasis(wav)
     alignment = alignment[0]
     #wav = wav[:audio.find_endpoint(wav)]
     #out = io.BytesIO()
     #audio.save_wav(wav, out)
     return wav, alignment