Example #1
def main():
    # Target data
    filename = "120_kmeans_obj.pkl"

    kmeans = k.load_pkl(filename)
    spec, label = load_test_data()

    print("spec", spec.shape)
    print("label", label.shape)

    spec_ = np.empty((513, ), np.float32)
    for i in range(len(label)):
        spec_ = np.vstack((spec_, kmeans.cluster_centers_[label[i]]))
    spec_ = np.delete(spec_, 0, 0)

    print("compare data structure ----")
    print("spec: ", spec.shape)
    print("spec_: ", spec_.shape)

    print("spec data:", spec)
    print("spec_ data:", spec_)

    print("min-max spce_ data:", min_max(spec_))

    waveform = audio.inv_spectrogram(spec)
    waveform_ = audio.inv_spectrogram(spec_)
    waveformmm_ = audio.inv_spectrogram(min_max(spec_))

    audio.save_wav(waveform, 'ideal_out.wav')
    audio.save_wav(waveform_, 'ideal_out_.wav')
    audio.save_wav(waveformmm_, 'ideal_outmm_.wav')
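Example #1 calls a min_max helper that is not shown in this listing. The following is only a sketch of a typical per-spectrogram min-max normalization, under the assumption that this is roughly what the helper does; it is not the original implementation:

import numpy as np

def min_max(spec):
    # Hypothetical stand-in: scale the spectrogram values into [0, 1].
    spec = np.asarray(spec, dtype=np.float32)
    return (spec - spec.min()) / (spec.max() - spec.min() + 1e-8)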
Example #2
    def save_states(self, global_epoch, mel_outputs, linear_outputs, ling, mel,
                    linear, lengths):
        print("Save intermediate states at epoch {}".format(global_epoch))

        # idx = np.random.randint(0, len(input_lengths))
        idx = min(1, len(lengths) - 1)

        # Predicted mel spectrogram
        if mel_outputs is not None:
            mel_output = mel_outputs[idx].cpu().data.numpy()
            mel_output = prepare_spec_image(audio._denormalize(mel_output))
            self.writer.add_image("Predicted mel spectrogram", mel_output,
                                  global_epoch)
        # Predicted spectrogram
        if linear_outputs is not None:
            linear_output = linear_outputs[idx].cpu().data.numpy()
            spectrogram = prepare_spec_image(audio._denormalize(linear_output))
            self.writer.add_image("Predicted spectrogram", spectrogram,
                                  global_epoch)
            # Predicted audio signal
            signal = audio.inv_spectrogram(linear_output.T)
            signal /= np.max(np.abs(signal))
            path = join(self.checkpoint_dir,
                        "epoch{:09d}_predicted.wav".format(global_epoch))
            try:
                self.writer.add_audio("Predicted audio signal",
                                      signal,
                                      global_epoch,
                                      sample_rate=self.fs)
            except Exception as e:
                warn(str(e))
                pass
            audio.save_wav(signal, path)

        # Target mel spectrogram

        if mel_outputs is not None:
            #ling = ling[idx].cpu().data.numpy()
            #mel = prepare_spec_image(audio._denormalize(mel))
            #self.writer.add_image("Source mel spectrogram", ling, global_epoch)
            mel = mel[idx].cpu().data.numpy()
            mel = prepare_spec_image(audio._denormalize(mel))
            self.writer.add_image("Target mel spectrogram", mel, global_epoch)
        if linear_outputs is not None:
            linear = linear[idx].cpu().data.numpy()
            spectrogram = prepare_spec_image(audio._denormalize(linear))
            self.writer.add_image("Target spectrogram", spectrogram,
                                  global_epoch)
            # Target audio signal
            signal = audio.inv_spectrogram(linear.T)
            signal /= np.max(np.abs(signal))
            try:
                self.writer.add_audio("Target audio signal",
                                      signal,
                                      global_epoch,
                                      sample_rate=self.fs)
            except Exception as e:
                warn(str(e))
                pass
Example #3
def tts(model, text, p=0):
    """Convert text to speech waveform given a deepvoice3 model.
    """
    if use_cuda:
        model = model.cuda()
    model.eval()

    sequence = np.array(_frontend.text_to_sequence(text, p=p))
    sequence = Variable(torch.from_numpy(sequence)).unsqueeze(0)
    text_positions = torch.arange(1, sequence.size(-1) + 1).unsqueeze(0).long()
    text_positions = Variable(text_positions)
    if use_cuda:
        sequence = sequence.cuda()
        text_positions = text_positions.cuda()

    # Greedy decoding
    mel_outputs, linear_outputs, alignments, done = model(
        sequence, text_positions=text_positions)

    linear_output = linear_outputs[0].cpu().data.numpy()
    spectrogram = audio._denormalize(linear_output)
    alignment = alignments[0].cpu().data.numpy()
    mel = mel_outputs[0].cpu().data.numpy()

    # Predicted audio signal
    waveform = audio.inv_spectrogram(linear_output.T)

    return waveform, alignment, spectrogram, mel
Example #4
def tts(model, text, p=0):
    """Convert text to speech waveform given a deepvoice3 model.

    Args:
        text (str) : Input text to be synthesized
        p (float) : Replace word with pronunciation if p > 0. Default is 0.
    """
    if use_cuda:
        model = model.cuda()
    model.eval()
    model.make_generation_fast_()

    sequence = np.array(_frontend.text_to_sequence(text, p=p))
    sequence = Variable(torch.from_numpy(sequence)).unsqueeze(0)
    text_positions = torch.arange(1, sequence.size(-1) + 1).unsqueeze(0).long()
    text_positions = Variable(text_positions)
    if use_cuda:
        sequence = sequence.cuda()
        text_positions = text_positions.cuda()

    # Greedy decoding
    mel_outputs, linear_outputs, alignments, done = model(
        sequence, text_positions=text_positions)

    linear_output = linear_outputs[0].cpu().data.numpy()
    spectrogram = audio._denormalize(linear_output)
    alignment = alignments[0].cpu().data.numpy()
    mel = mel_outputs[0].cpu().data.numpy()

    # Predicted audio signal
    waveform = audio.inv_spectrogram(linear_output.T)

    return waveform, alignment, spectrogram, mel
Example #5
def tts(model, text, p=0, speaker_id=None, fast=False):
    """Convert text to speech waveform given a deepvoice3 model.

    Args:
        text (str) : Input text to be synthesized
        p (float) : Replace word with pronunciation if p > 0. Default is 0.
    """
    model = model.to(device)
    model.eval()
    if fast:
        model.make_generation_fast_()

    sequence = np.array(_frontend.text_to_sequence(text, p=p))
    sequence = torch.from_numpy(sequence).unsqueeze(0).long().to(device)
    text_positions = torch.arange(1,
                                  sequence.size(-1) +
                                  1).unsqueeze(0).long().to(device)
    speaker_ids = None if speaker_id is None else torch.LongTensor(
        [speaker_id]).to(device)

    # Greedy decoding
    with torch.no_grad():
        mel_outputs, linear_outputs, alignments, done = model(
            sequence, text_positions=text_positions, speaker_ids=speaker_ids)

    linear_output = linear_outputs[0].cpu().data.numpy()
    spectrogram = audio._denormalize(linear_output)
    alignment = alignments[0].cpu().data.numpy()
    mel = mel_outputs[0].cpu().data.numpy()
    mel = audio._denormalize(mel)

    # Predicted audio signal
    waveform = audio.inv_spectrogram(linear_output.T)

    return waveform, alignment, spectrogram, mel
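For orientation, this is a minimal way such a tts() function might be driven; the input text, output file name, and the pre-loaded model are placeholders, and audio.save_wav is the same helper used throughout these examples:

# Hypothetical usage sketch; loading `model` from a checkpoint is not shown here.
waveform, alignment, spectrogram, mel = tts(
    model, "Hello world.", p=0.0, speaker_id=None, fast=True)
audio.save_wav(waveform, "hello_world.wav")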
Example #6
def save_and_plot_fn(args, log_dir, step, loss, prefix):
    idx, (seq, spec, align) = args

    audio_path = os.path.join(
        log_dir, '{}-step-{:09d}-audio{:03d}.wav'.format(prefix, step, idx))
    align_path = os.path.join(
        log_dir, '{}-step-{:09d}-align{:03d}.png'.format(prefix, step, idx))

    waveform = inv_spectrogram(spec.T)
    save_audio(waveform, audio_path)

    info_text = 'step={:d}, loss={:.5f}'.format(step, loss)
    if 'korean_cleaners' in [x.strip() for x in hparams.cleaners.split(',')]:
        log('Training korean : Use jamo')
        plot.plot_alignment(align,
                            align_path,
                            info=info_text,
                            text=sequence_to_text(seq,
                                                  skip_eos_and_pad=True,
                                                  combine_jamo=True),
                            isKorean=True)
    else:
        log('Training non-korean : X use jamo')
        plot.plot_alignment(align,
                            align_path,
                            info=info_text,
                            text=sequence_to_text(seq,
                                                  skip_eos_and_pad=True,
                                                  combine_jamo=False),
                            isKorean=False)
Example #7
  def synthesize(self, text, speaker_id=0):
    """Convert text to speech waveform given a deepvoice3 model.

    Args:
        text (str) : Input text to be synthesized
        p (float) : Replace word with pronunciation if p > 0. Default is 0.
    """
    sequence = np.array(self._frontend.text_to_sequence(text))
    sequence = Variable(torch.from_numpy(sequence)).unsqueeze(0)
    text_positions = torch.arange(1, sequence.size(-1) + 1).unsqueeze(0).long()
    text_positions = Variable(text_positions)
    speaker_ids = None if speaker_id is None else Variable(torch.LongTensor([speaker_id]))
    if self.use_cuda:
        sequence = sequence.cuda()
        text_positions = text_positions.cuda()
        speaker_ids = None if speaker_ids is None else speaker_ids.cuda()

    # Greedy decoding
    mel_outputs, linear_outputs, alignments, done = self.model(
        sequence, text_positions=text_positions, speaker_ids=speaker_ids)

    linear_output = linear_outputs[0].cpu().data.numpy()

    # Predicted audio signal
    waveform = audio.inv_spectrogram(linear_output.T)
    out = io.BytesIO()
    audio.save_wav(waveform, out)
    return out
Example #8
    def eval_batch(self,
                   batch_x,
                   batch_xl,
                   batch_ym=None,
                   batch_ys=None,
                   batch_yl=None):

        time_start = time()

        logging.debug('batch_x.shape=%s, batch_xl.shape=%s' %
                      (batch_x.shape, batch_xl.shape))

        # if self.write_debug_files:
        #     np.save('eval_x', self.batch_x[0])
        #     logging.debug ('eval_x.npy written.')
        #     np.save('eval_xl', self.batch_xl[0])
        #     logging.debug ('eval_xl.npy written.')

        logging.debug(u'%fs self.session.run...' % (time() - time_start))
        if batch_ym is None:
            spectrograms = self.sess.run(fetches=self.linear_outputs,
                                         feed_dict={
                                             self.inputs: batch_x,
                                             self.input_lengths: batch_xl,
                                         })
        else:
            step_out, loss_out, opt_out, spectrograms, alignment = self.sess.run(
                [
                    self.global_step, self.loss, self.optimize,
                    self.linear_outputs, self.alignments
                ],
                feed_dict={
                    self.inputs: batch_x,
                    self.input_lengths: batch_xl,
                    self.mel_targets: batch_ym,
                    self.linear_targets: batch_ys,
                    self.target_lengths: batch_yl
                })

        logging.debug(u'generating wav for %s' % self.decode_input(batch_x[0]))

        spectrogram = spectrograms[0]

        logging.debug('spectrogram.shape=%s' % repr(spectrogram.shape))

        # if self.write_debug_files:
        #     np.save('eval_spectrogram', spectrogram)
        #     logging.debug ('eval_spectrogram.npy written.')

        logging.debug(u'%fs audio.inv_spectrogram...' % (time() - time_start))
        wav = audio.inv_spectrogram(spectrogram.T, self.hp)

        logging.debug(u'%fs wav.' % (time() - time_start))

        return wav
Example #9
def tts(model, text, p=0, speaker_id=None, fast=False, wavenet=None):
    """Convert text to speech waveform given a deepvoice3 model.

    Args:
        text (str) : Input text to be synthesized
        p (float) : Replace word with pronunciation if p > 0. Default is 0.
    """
    model = model.to(device)
    model.eval()
    if fast:
        model.make_generation_fast_()

    sequence = np.array(_frontend.text_to_sequence(text, p=p))
    print('sequence to synthesize: ', sequence)
    sequence = torch.from_numpy(sequence).unsqueeze(0).long().to(device)
    text_positions = torch.arange(1, sequence.size(-1) + 1).unsqueeze(0).long().to(device)
    speaker_ids = None if speaker_id is None else torch.LongTensor([speaker_id]).to(device)

    # Greedy decoding
    with torch.no_grad():
        mel_outputs, linear_outputs, alignments, done = model(
            sequence, text_positions=text_positions, speaker_ids=speaker_ids)

    linear_output = linear_outputs[0].cpu().data.numpy()
    spectrogram = audio._denormalize(linear_output)
    alignment = alignments[0].cpu().data.numpy()
    mel = mel_outputs[0].cpu().data.numpy()
    mel = audio._denormalize(mel)

    # Predicted audio signal
    if wavenet is not None:
        wavenet = wavenet.to(device)
        wavenet.eval()
        if fast:
            wavenet.make_generation_fast_()

        # TODO: assuming scalar input
        initial_value = 0.0
        initial_input = torch.zeros(1, 1, 1).fill_(initial_value).to(device)
        # (B, T, C) -> (B, C, T)
        c = mel_outputs.transpose(1, 2).contiguous()
        g = None
        Tc = c.size(-1)
        length = Tc * 256
        initial_input = initial_input.to(device)
        c = c.to(device)
        waveform = wavenet.incremental_forward(
            initial_input, c=c, g=g, T=length, tqdm=tqdm, softmax=True, quantize=True,
            log_scale_min=float(np.log(1e-14)))
        waveform = waveform.view(-1).cpu().data.numpy()
    else:
        waveform = audio.inv_spectrogram(linear_output.T)

    return waveform, alignment, spectrogram, mel
Example #10
    def say(self, txt, trim_silence=True, dyn_range_compress=True):

        time_start = time()

        logging.debug(u'%fs synthesizing %s' % (time() - time_start, txt))

        input_data = np.zeros((1, self.hp['max_inp_len']), dtype='int32')
        input_lengths = np.zeros((1, ), dtype='int32')

        logging.debug('input_data.shape=%s, input_lengths.shape=%s' %
                      (input_data.shape, input_lengths.shape))

        self._encode_input(txt, 0, input_data, input_lengths)

        logging.debug('input_data=%s input_lengths=%s' %
                      (input_data[0], input_lengths[0]))

        if self.write_debug_files:
            np.save('say_x', input_data[0])
            logging.debug('say_x.npy written.')
            np.save('say_xl', input_lengths[0])
            logging.debug('say_xl.npy written.')

        logging.debug(u'%fs self.session.run...' % (time() - time_start))
        spectrograms = self.sess.run(fetches=self.linear_outputs,
                                     feed_dict={
                                         self.inputs: input_data,
                                         self.input_lengths: input_lengths,
                                     })
        spectrogram = spectrograms[0]

        logging.debug('spectrogram.shape=%s' % repr(spectrogram.shape))

        if self.write_debug_files:
            np.save('say_spectrogram', spectrogram)
            logging.debug('say_spectrogram.npy written.')

        # np.set_printoptions(threshold=np.inf)

        logging.debug(u'%fs audio.inv_spectrogram...' % (time() - time_start))
        wav = audio.inv_spectrogram(spectrogram.T, self.hp, use_fgla=True)

        if dyn_range_compress:
            logging.debug(u'%fs dynamic range compression...' %
                          (time() - time_start))
            wav = audio.dyn_range_compress(wav, self.hp)

        if trim_silence:
            logging.debug(u'%fs trim silence...' % (time() - time_start))
            wav = audio.trim_silence(wav, self.hp)

        logging.debug(u'%fs wav.' % (time() - time_start))
        return wav
Example #11
def save_states(global_step,
                mel_outputs,
                linear_outputs,
                attn,
                y,
                input_lengths,
                checkpoint_dir=None):
    print("Save intermediate states at step {}".format(global_step))

    # idx = np.random.randint(0, len(input_lengths))
    idx = min(1, len(input_lengths) - 1)
    input_length = input_lengths[idx]

    # Alignment
    # Multi-hop attention
    if attn.dim() == 4:
        for i, alignment in enumerate(attn):
            alignment_dir = join(checkpoint_dir,
                                 "alignment_layer{}".format(i + 1))
            os.makedirs(alignment_dir, exist_ok=True)
            path = join(
                alignment_dir,
                "step{:09d}_layer_{}_alignment.png".format(global_step, i + 1))
            alignment = alignment[idx].cpu().data.numpy()
            save_alignment(path, alignment)

        # Save averaged alignment
        alignment_dir = join(checkpoint_dir, "alignment_ave")
        os.makedirs(alignment_dir, exist_ok=True)
        path = join(alignment_dir,
                    "step{:09d}_alignment.png".format(global_step))
        alignment = attn.mean(0)[idx].cpu().data.numpy()
        save_alignment(path, alignment)
    else:
        assert False

    # Predicted spectrogram
    path = join(checkpoint_dir,
                "step{:09d}_predicted_spectrogram.png".format(global_step))
    linear_output = linear_outputs[idx].cpu().data.numpy()
    save_spectrogram(path, linear_output)

    # Predicted audio signal
    signal = audio.inv_spectrogram(linear_output.T)
    path = join(checkpoint_dir, "step{:09d}_predicted.wav".format(global_step))
    audio.save_wav(signal, path)

    # Target spectrogram
    path = join(checkpoint_dir,
                "step{:09d}_target_spectrogram.png".format(global_step))
    linear_output = y[idx].cpu().data.numpy()
    save_spectrogram(path, linear_output)
Example #12
def tts(model, text, p=0., speaker_id=None):
    """
    Convert text to speech waveform given a deepvoice3 model.

    Args:
        model (DeepVoiceTTS): Model used to synthesize waveform.
        text (str) : Input text to be synthesized
        p (float) : Replace word with pronunciation if p > 0. Default is 0.

    Returns:
        waveform (numpy.ndarray): Shape(T_wav, ), predicted waveform, where
            T_wav means the length of the synthesized waveform.
        alignment (numpy.ndarray): Shape(T_dec, T_enc), predicted alignment
            matrix, where T_dec means the time steps of decoder outputs and
            T_enc means the time steps of encoder outputs.
        spectrogram (numpy.ndarray): Shape(T_lin, C_lin), predicted linear
            spectrogram, where T_lin means the time steps of the linear
            spectrogram and C_lin means the channels of the linear spectrogram.
        mel (numpy.ndarray): Shape(T_mel, C_mel), predicted mel spectrogram,
            where T_mel means the time steps of mel spectrogram and C_mel means
            the channels of mel spectrogram.
    """
    model.eval()

    sequence = np.array(_frontend.text_to_sequence(text, p=p)).astype("int64")
    sequence = np.reshape(sequence, (1, -1))
    text_positions = np.arange(1, sequence.shape[1] + 1, dtype="int64")
    text_positions = np.reshape(text_positions, (1, -1))

    sequence = dg.to_variable(sequence)
    text_positions = dg.to_variable(text_positions)
    speaker_ids = None if speaker_id is None else fluid.layers.fill_constant(
        shape=[1, 1], value=speaker_id)

    # sequence: shape(1, input_length, 1)
    # text_positions: shape(1, input_length, 1)
    # Greedy decoding
    mel_outputs, linear_outputs, alignments, done = model.transduce(
        sequence, text_positions, speaker_ids)

    # reshape to the desired shape
    linear_output = linear_outputs.numpy().squeeze().T
    spectrogram = audio._denormalize(linear_output)
    alignment = alignments.numpy()[0]
    mel = mel_outputs.numpy().squeeze().T
    mel = audio._denormalize(mel)

    # Predicted audio signal
    waveform = audio.inv_spectrogram(linear_output.T)

    return waveform, alignment, spectrogram, mel
Example #13
def save_and_plot_fn(args, log_dir, step, loss, prefix):
    idx, (seq, spec, align) = args

    audio_path = os.path.join(
        log_dir, '{}-step-{:09d}-audio{:03d}.wav'.format(prefix, step, idx))
    align_path = os.path.join(
        log_dir, '{}-step-{:09d}-audio{:03d}.png'.format(prefix, step, idx))

    waveform = inv_spectrogram(spec.T)
    save_audio(waveform, audio_path)

    info_text = 'step={:d}, loss={:.5f}'.format(step, loss)
    plot.plot_alignment(
        align, align_path, info=info_text,
        text=sequence_to_text(seq,
                              skip_eos_and_pad=True, combine_jamo=True))
Example #14
def copy_synthesis(wav_file, out_path):
    """Perform copy synthesis on the wav file and write the synthesized wav to disk at out_path
    """
    filename = os.path.splitext(os.path.basename(wav_file))[0]

    y = audio.load_wav(wav_file)
    if cfg.rescaling:
        y = y / np.abs(y).max() * cfg.rescaling_max

    mag = audio.spectrogram(y)

    y_hat = audio.inv_spectrogram(mag)

    out_path = os.path.join(out_path, filename + "_synthesized.wav")
    print(f"Writing {out_path} to disk")
    audio.save_wav(y_hat, out_path)
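A hypothetical driver loop for copy_synthesis(); the glob pattern and output directory are placeholders:

import glob
import os

out_dir = "copy_synth_out"
os.makedirs(out_dir, exist_ok=True)
for wav_file in sorted(glob.glob("wavs/*.wav")):
    # Analysis/resynthesis round trip: wav -> spectrogram -> Griffin-Lim -> wav.
    copy_synthesis(wav_file, out_dir)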
Example #15
def save_states(global_step, attn, linear_outputs, input_lengths, logs_dir):
    """Save intermediate states
    """
    print(f"Save intermediate states at step {global_step:09d}")

    idx = min(1, len(input_lengths) - 1)

    # Alignment
    # Multi-hop attention
    if attn is not None and attn.dim() == 4:
        for i, alignment in enumerate(attn):
            # Save alignment to disk
            alignment = alignment[idx].cpu().data.numpy()

            alignment_dir = join(logs_dir, f"alignment_layer{i + 1}")
            os.makedirs(alignment_dir, exist_ok=True)

            path = join(alignment_dir,
                        f"step{global_step:09d}_layer_{i + 1}_alignment.png")

            save_alignment(path, alignment)

        # Save averaged alignment
        alignment_dir = join(logs_dir, "alignment_ave")
        os.makedirs(alignment_dir, exist_ok=True)

        path = join(alignment_dir,
                    f"step{global_step:09d}_layer_alignment.png")

        alignment = attn.mean(0)[idx].cpu().data.numpy()

        save_alignment(path, alignment)

    linear_output = linear_outputs[idx].cpu().data.numpy()

    # Predicted audio signal
    signal = audio.inv_spectrogram(linear_output.T)
    signal /= np.max(np.abs(signal))

    wavs_dir = join(logs_dir, "wavs")
    os.makedirs(wavs_dir, exist_ok=True)

    path = join(wavs_dir, f"step{global_step:09d}_predicted.wav")

    audio.save_wav(signal, path)
Example #16
def generate(model_path, model_name, generate_path, generate_name, piece):
    
    """Synthesize audio from an array of embeddings.
    
    Args:
    encodings: Numpy array with shape [batch_size, time, dim].
    save_paths: Iterable of output file names.
    checkpoint_path: Location of the pretrained model. [model.ckpt-200000]
    samples_per_save: Save files after every amount of generated samples.

    """
    
    # Create directory for encoding
    if os.path.exists(generate_path) is False:
        os.makedirs(generate_path)

    net = AutoEncoder()
    net = load_model(net, model_path, model_name)
    cuda_available = torch.cuda.is_available()
    if cuda_available is True:
        net = net.cuda()

    net.eval()

    # Load audio for encoding
    piece = audio.load_wav(piece)
    spec = audio.spectrogram(piece).astype(np.float32)
    spec = torch.from_numpy(spec.T)
    spec = torch.FloatTensor(spec)
    
    spec = torch.unsqueeze(spec, 0)
    spec = Variable(spec, volatile=True).contiguous()

    if cuda_available is True:
        spec = spec.cuda()

    generated_spec = net(spec)
    generated_spec = generated_spec.data.cpu().numpy()
    generated_spec = np.squeeze(generated_spec)
    
    waveform = audio.inv_spectrogram(generated_spec.T)
    wav_name = generate_path + generate_name + '.wav'

    audio.save_wav(waveform, wav_name)
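A hypothetical call to generate(); every path and name here is a placeholder. Note that the function concatenates generate_path and generate_name directly, so generate_path needs a trailing separator:

generate(model_path="checkpoints/",
         model_name="autoencoder.pth",
         generate_path="generated/",   # trailing slash required by the concatenation above
         generate_name="piece1",
         piece="input_piece.wav")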
Example #17
def tts(model, text, speaker_id=None, fast=False):
    """Convert text to speech waveform given a deepvoice3 model.
    """
    model = model.to(device)
    model.eval()

    if fast:
        model.make_generation_fast_()

    if cfg.frontend == "en":
        sequence = np.array(english.text_to_sequence(text))
    else:
        raise NotImplementedError

    sequence = torch.from_numpy(sequence).unsqueeze(0).long().to(device)
    text_positions = torch.arange(1,
                                  sequence.size(-1) +
                                  1).unsqueeze(0).long().to(device)

    speaker_ids = None if speaker_id is None else torch.LongTensor(
        [speaker_id]).to(device)

    # Greedy decoding
    with torch.no_grad():
        mel_outputs, linear_outputs, alignments, done = model(
            sequence, text_positions=text_positions, speaker_ids=speaker_ids)

    linear_output = linear_outputs[0].cpu().data.numpy()
    spectrogram = audio._denormalize(linear_output)
    alignment = alignments[0].cpu().data.numpy()
    mel = mel_outputs[0].cpu().data.numpy()
    mel = audio._denormalize(mel)

    # Predicted audio signal
    waveform = audio.inv_spectrogram(linear_output.T)

    return waveform, alignment, spectrogram, mel
Example #18
    MEL_DIR = join(train_dir, 'Acoustic_frame/mel')
    LINEAR_DIR = join(train_dir, 'Acoustic_frame/linear')
    ling_name = ling + '.npy'
    ling = np.load(join(LING_DIR, ling_name))
    ling = norm_minmax(ling,
                       np.load(join(train_dir, 'stat_linguistic_frame.npy')))
    ling = torch.from_numpy(ling).unsqueeze(0).to(device)
    speaker_list = ['ema', 'emb', 'emc', 'emd', 'eme']
    emotions = [0, 1, 2, 3]

    for ref_spk in speaker_list:
        for emo in emotions:
            spk_emo = '{}00{}27.npy'.format(ref_spk, str(emo))
            mel = np.load(join(MEL_DIR, spk_emo))
            mel = torch.from_numpy(mel).unsqueeze(0).to(device)
            _, _, linear_output = model(ling, mel)

            linear_output = linear_output[0].data.cpu().numpy()
            signal = audio.inv_spectrogram(linear_output.T)
            signal /= np.max(np.abs(signal))
            path = join(result_dir, spk_emo.replace('.npy', '.wav'))
            audio.save_wav(signal, path)

            linear = np.load(join(LINEAR_DIR, spk_emo))
            signal = audio.inv_spectrogram(linear.T)
            signal /= np.max(np.abs(signal))
            path = join(result_dir, spk_emo.replace('.npy', '_refer.wav'))
            audio.save_wav(signal, path)

            print('%s' % spk_emo)
Example #19
def plot_graph_and_save_audio(args,
                              base_path=None,
                              start_of_sentence=None,
                              end_of_sentence=None,
                              pre_word_num=0,
                              post_word_num=0,
                              pre_surplus_idx=0,
                              post_surplus_idx=1,
                              use_short_concat=False,
                              use_manual_attention=False,
                              save_alignment=False,
                              librosa_trim=False,
                              attention_trim=False,
                              time_str=None,
                              isKorean=True):

    idx, (wav, alignment, path, text, sequence) = args

    if base_path:
        plot_path = "{}/{}.png".format(base_path, get_time())
    elif path:
        plot_path = path.rsplit('.', 1)[0] + ".png"
    else:
        plot_path = None

    #plot_path = add_prefix(plot_path, time_str)
    if use_manual_attention:
        plot_path = add_postfix(plot_path, "manual")

    if plot_path:
        plot.plot_alignment(alignment, plot_path, text=text, isKorean=isKorean)

    if use_short_concat:
        wav = short_concat(wav, alignment, text, start_of_sentence,
                           end_of_sentence, pre_word_num, post_word_num,
                           pre_surplus_idx, post_surplus_idx)

    if attention_trim and end_of_sentence:
        end_idx_counter = 0
        attention_argmax = alignment.argmax(0)
        end_idx = min(len(sequence) - 1, max(attention_argmax))
        max_counter = min((attention_argmax == end_idx).sum(), 5)

        for jdx, attend_idx in enumerate(attention_argmax):
            if len(attention_argmax) > jdx + 1:
                if attend_idx == end_idx:
                    end_idx_counter += 1

                if attend_idx == end_idx and attention_argmax[jdx +
                                                              1] > end_idx:
                    break

                if end_idx_counter >= max_counter:
                    break
            else:
                break

        spec_end_idx = hparams.reduction_factor * jdx + 3
        wav = wav[:spec_end_idx]

    audio_out = inv_spectrogram(wav.T)

    if librosa_trim and end_of_sentence:
        yt, index = librosa.effects.trim(audio_out,
                                         frame_length=5120,
                                         hop_length=256,
                                         top_db=50)
        audio_out = audio_out[:index[-1]]

    if save_alignment:
        alignment_path = "{}/{}.npy".format(base_path, idx)
        np.save(alignment_path, alignment, allow_pickle=False)

    if path or base_path:
        if path:
            current_path = add_postfix(path, idx)
        elif base_path:
            current_path = plot_path.replace(".png", ".wav")

        save_audio(audio_out, current_path)
        return True
    else:
        io_out = io.BytesIO()
        save_audio(audio_out, io_out)
        result = io_out.getvalue()
        return result
Example #20
def save_waveform_from_spec(spectrogram, filename):
    waveform = audio.inv_spectrogram(spectrogram)
    audio.save_wav(waveform, filename)
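A hypothetical call site for save_waveform_from_spec(); the .npy path is a placeholder, and the array is assumed to already be in the (num_freq, T) layout that audio.inv_spectrogram expects:

import numpy as np

spectrogram = np.load("linear_spectrogram.npy")
save_waveform_from_spec(spectrogram, "reconstructed.wav")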
Example #21
        processed_to_raw_map = {
            idx: speaker
            for idx, speaker in enumerate(vctk.available_speakers)
        }

        for i in range(5):
            random_voice_idx = random.randint(0, len(Mel) - 1)
            random_speaker_id = X[random_voice_idx][1]
            lin = Y[random_voice_idx]
            mel = Mel[random_voice_idx]

            speaker_dir = join(current_model_dir,
                               'speaker{}'.format(random_speaker_id))
            os.makedirs(speaker_dir, exist_ok=True)

            audio.save_wav(audio.inv_spectrogram(lin.T),
                           join(speaker_dir, 'sample_voice.wav'))

            with open(text_list_file_path, "rb") as f:
                lines = f.readlines()
                for idx, line in enumerate(lines):
                    text = line.decode("utf-8")[:-1]
                    waveform, alignment, _, _ = tts(
                        model,
                        mel,
                        text,
                        p=replace_pronunciation_prob,
                        fast=False)
                    dst_wav_path = join(speaker_dir, "text{}.wav".format(idx))
                    audio.save_wav(waveform, dst_wav_path)
Example #22
  File "/Users/huangshengjie/Desktop/测试lmdb/第三方mel-griffin.py", line 203, in <module>
    wav = spectrogram2wav(mag)
  File "/Users/huangshengjie/Desktop/测试lmdb/第三方mel-griffin.py", line 112, in spectrogram2wav
    wav = griffin_lim(mag)
  File "/Users/huangshengjie/Desktop/测试lmdb/第三方mel-griffin.py", line 134, in griffin_lim
    X_t = invert_spectrogram(X_best)
  File "/Users/huangshengjie/Desktop/测试lmdb/第三方mel-griffin.py", line 155, in invert_spectrogram
    return librosa.istft(spectrogram, hop_length, win_length=win_length)
  File "/Users/huangshengjie/opt/anaconda3/envs/py36/lib/python3.6/site-packages/librosa/core/spectrum.py", line 288, in istft
    ifft_window = util.pad_center(ifft_window, n_fft)
  File "/Users/huangshengjie/opt/anaconda3/envs/py36/lib/python3.6/site-packages/librosa/util/utils.py", line 304, in pad_center
    'at least input size ({:d})').format(size, n))
librosa.util.exceptions.ParameterError: Target size (894) must be at least input size (1000)
'''
# wav = spectrogram2wav(mag)  # both come out blank
wav = audio.inv_spectrogram(mag)

audio.save_wav(wav, './hello2.wav')
'''
Third-party data:

(318 frames, 512 dims)
mel.shape =  (318, 512)
mel.T.shape =  (512, 318)

# when n_mels = 80:
(318, 80)
mel.shape =  (318, 80)
mel.T.shape =  (80, 318)
'''
'''
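The traceback in Example #22 comes from librosa.util.pad_center refusing to pad a 1000-sample analysis window into an 894-point FFT during istft. A minimal sketch, with assumed parameter values, of keeping win_length and n_fft consistent for Griffin-Lim inversion:

import librosa

# Assumed STFT parameters; the point illustrated is only that win_length <= n_fft.
n_fft = 1024
win_length = 1024
hop_length = 256

def invert_spectrogram(stft_matrix):
    # stft_matrix: complex STFT of shape (1 + n_fft // 2, T)
    return librosa.istft(stft_matrix, hop_length=hop_length, win_length=win_length)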
Example #23
def save_states(global_step,
                mel_outputs,
                linear_outputs,
                attn,
                mel,
                y,
                input_lengths,
                checkpoint_dir=None):
    print("Save intermediate states at step {}".format(global_step))

    # idx = np.random.randint(0, len(input_lengths))
    idx = min(1, len(input_lengths) - 1)
    input_length = input_lengths[idx]

    # Alignment
    # Multi-hop attention
    if attn is not None and attn.dim() == 4:
        for i, alignment in enumerate(attn):
            alignment = alignment[idx].cpu().data.numpy()
            tag = "alignment_layer{}".format(i + 1)

            # save files as well for now
            alignment_dir = join(checkpoint_dir,
                                 "alignment_layer{}".format(i + 1))
            os.makedirs(alignment_dir, exist_ok=True)
            path = join(
                alignment_dir,
                "step{:09d}_layer_{}_alignment.png".format(global_step, i + 1))
            save_alignment(path, alignment)

        # Save averaged alignment
        alignment_dir = join(checkpoint_dir, "alignment_ave")
        os.makedirs(alignment_dir, exist_ok=True)
        path = join(alignment_dir,
                    "step{:09d}_alignment.png".format(global_step))
        alignment = attn.mean(0)[idx].cpu().data.numpy()
        save_alignment(path, alignment)

        tag = "averaged_alignment"

    # Predicted mel spectrogram
    if mel_outputs is not None:
        mel_output = mel_outputs[idx].cpu().data.numpy()
        mel_output = prepare_spec_image(audio._denormalize(mel_output))

    # Predicted spectrogram
    if linear_outputs is not None:
        linear_output = linear_outputs[idx].cpu().data.numpy()
        spectrogram = prepare_spec_image(audio._denormalize(linear_output))

        # Predicted audio signal
        signal = audio.inv_spectrogram(linear_output.T)
        signal /= np.max(np.abs(signal))
        path = join(checkpoint_dir,
                    "step{:09d}_predicted.wav".format(global_step))

        audio.save_wav(signal, path)

    # Target mel spectrogram
    if mel_outputs is not None:
        mel_output = mel[idx].cpu().data.numpy()
        mel_output = prepare_spec_image(audio._denormalize(mel_output))

    # Target spectrogram
    if linear_outputs is not None:
        linear_output = y[idx].cpu().data.numpy()
        spectrogram = prepare_spec_image(audio._denormalize(linear_output))
Example #24
    def train(self, num_epochs=DEFAULT_NUM_EPOCHS):

        logging.info('counting steps...')

        num_steps = 0
        while True:
            if os.path.exists(DSFN_X % (self.voice, num_steps)):
                num_steps += 1
            else:
                break

        logging.info('counting steps... %d steps found.' % num_steps)

        if DEBUG_LIMIT:
            logging.warn('limiting number of steps to %d for debugging' %
                         DEBUG_LIMIT)
            num_steps = DEBUG_LIMIT

        batch_size = self.hp['batch_size']
        max_inp_len = self.hp['max_inp_len']
        max_num_frames = self.hp['max_iters'] * self.hp[
            'outputs_per_step'] * self.hp['frame_shift_ms'] * self.hp[
                'sample_rate'] / 1000

        n_fft, hop_length, win_length = audio.stft_parameters(self.hp)
        max_mfc_frames = 1 + int((max_num_frames - n_fft) / hop_length)

        batch_x = np.zeros((batch_size, max_inp_len), dtype='int32')
        batch_xl = np.zeros((batch_size, ), dtype='int32')
        batch_ys = np.zeros((batch_size, max_mfc_frames, self.hp['num_freq']),
                            dtype='float32')
        batch_ym = np.zeros((batch_size, max_mfc_frames, self.hp['num_mels']),
                            dtype='float32')
        batch_yl = np.zeros((batch_size, ), dtype='int32')

        sample_idxs = list(range(num_steps))

        for epoch_idx in range(num_epochs):
            epoch = self.epoch_start + epoch_idx

            random.shuffle(sample_idxs)
            epoch_loss = 0
            num_batches = 0

            for i, sample_idx in enumerate(sample_idxs):

                x = np.load(DSFN_X % (self.voice, sample_idx))
                xl = np.load(DSFN_XL % (self.voice, sample_idx))
                ys = np.load(DSFN_YS % (self.voice, sample_idx))
                ym = np.load(DSFN_YM % (self.voice, sample_idx))
                yl = np.load(DSFN_YL % (self.voice, sample_idx))

                batch_x[i % batch_size] = x[0]
                batch_xl[i % batch_size] = xl[0]
                batch_ys[i % batch_size] = ys[0]
                batch_ym[i % batch_size] = ym[0]
                batch_yl[i % batch_size] = yl[0]

                if (i % batch_size) == (batch_size - 1):

                    num_batches += 1

                    ts = self.decode_input(x[0])
                    logging.debug(u'ts %d %s' % (sample_idx, ts))

                    step_out, loss_out, opt_out, spectrogram, alignment = self.sess.run(
                        [
                            self.global_step, self.loss, self.optimize,
                            self.linear_outputs, self.alignments
                        ],
                        feed_dict={
                            self.inputs: batch_x,
                            self.input_lengths: batch_xl,
                            self.mel_targets: batch_ym,
                            self.linear_targets: batch_ys,
                            self.target_lengths: batch_yl
                        })

                    epoch_loss += loss_out

                    logging.info(
                        'epoch: %5d, step %4d/%4d loss: %7.5f, avg loss: %7.5f'
                        % (epoch, i + 1, num_steps, loss_out,
                           epoch_loss / num_batches))

            cpfn = CHECKPOINT_FN % (self.voice, epoch)
            logging.info('Saving checkpoint to: %s' % cpfn)
            self.saver.save(self.sess, cpfn, global_step=step_out)

            logging.info('Saving audio and alignment...')

            # import pdb; pdb.set_trace()

            # input_seq, spectrogram, alignment = sess.run([inputs, input_lengths, linear_outputs, alignments],
            #                                              feed_dict={inputs         : eval_x,
            #                                                         input_lengths  : eval_xl,
            #                                                         mel_targets    : eval_ym,
            #                                                         linear_targets : eval_ys})

            waveform = audio.inv_spectrogram(spectrogram[0].T, self.hp)

            wavfn = WAV_FN % (self.voice, epoch)
            audio.save_wav(waveform, wavfn, self.hp)
            logging.info('%s written.' % wavfn)

            specfn = SPEC_FN % (self.voice, epoch)
            cmd = 'sox %s -n spectrogram -o %s' % (wavfn, specfn)
            logging.info(cmd)
            os.system(cmd)

            # import pdb; pdb.set_trace()

            plotfn = ALIGN_FN % (self.voice, epoch)
            self._plot_alignment(alignment[0],
                                 plotfn,
                                 info='epoch=%d, loss=%.5f' %
                                 (epoch, loss_out))
            logging.info('alignment %s plotted to %s' %
                         (alignment[0].shape, plotfn))

            # save batch as well so we can debug training later if needed
            np.save(BATCH_X_FN % (self.voice, epoch), batch_x)
            logging.info('%s written.' % (BATCH_X_FN % (self.voice, epoch)))
            np.save(BATCH_XL_FN % (self.voice, epoch), batch_xl)
            logging.info('%s written.' % (BATCH_XL_FN % (self.voice, epoch)))
            np.save(BATCH_YM_FN % (self.voice, epoch), batch_ym)
            logging.info('%s written.' % (BATCH_YM_FN % (self.voice, epoch)))
            np.save(BATCH_YS_FN % (self.voice, epoch), batch_ys)
            logging.info('%s written.' % (BATCH_YS_FN % (self.voice, epoch)))
            np.save(BATCH_YL_FN % (self.voice, epoch), batch_yl)
            logging.info('%s written.' % (BATCH_YL_FN % (self.voice, epoch)))
Example #25
def save_states(global_step, writer, mel_outputs, linear_outputs, attn, mel, y,
                input_lengths, checkpoint_dir=None):
    print("Save intermediate states at step {}".format(global_step))

    # idx = np.random.randint(0, len(input_lengths))
    idx = min(1, len(input_lengths) - 1)
    input_length = input_lengths[idx]

    # Alignment
    # Multi-hop attention
    if attn is not None and attn.dim() == 4:
        for i, alignment in enumerate(attn):
            alignment = alignment[idx].cpu().data.numpy()
            tag = "alignment_layer{}".format(i + 1)
            writer.add_image(tag, np.uint8(cm.viridis(np.flip(alignment, 1).T) * 255), global_step)

            # save files as well for now
            alignment_dir = join(checkpoint_dir, "alignment_layer{}".format(i + 1))
            os.makedirs(alignment_dir, exist_ok=True)
            path = join(alignment_dir, "step{:09d}_layer_{}_alignment.png".format(
                global_step, i + 1))
            save_alignment(path, alignment)

        # Save averaged alignment
        alignment_dir = join(checkpoint_dir, "alignment_ave")
        os.makedirs(alignment_dir, exist_ok=True)
        path = join(alignment_dir, "step{:09d}_alignment.png".format(global_step))
        alignment = attn.mean(0)[idx].cpu().data.numpy()
        save_alignment(path, alignment)

        tag = "averaged_alignment"
        writer.add_image(tag, np.uint8(cm.viridis(np.flip(alignment, 1).T) * 255), global_step)

    # Predicted mel spectrogram
    if mel_outputs is not None:
        mel_output = mel_outputs[idx].cpu().data.numpy()
        if hparams.vocoder != "world":
            mel_output = prepare_spec_image(audio._denormalize(mel_output))
            writer.add_image("Predicted mel spectrogram", mel_output, global_step)
        else:
            mel_output_prep = mel_output
            try:
                writer.add_image("Predicted WORLD output", mel_output_prep, global_step)
            except:
                pass

            mel_output = denormalize(mel_output)
            nfft = pw.get_cheaptrick_fft_size(hparams.sample_rate)
            f0 = mel_output[:,0].astype(np.float64)
            sp = pw.decode_spectral_envelope(mel_output[:,1:(hparams.coded_env_dim+1)].astype(np.float64), hparams.sample_rate, nfft)
            ap = pw.decode_aperiodicity(mel_output[:,(hparams.coded_env_dim+1):hparams.num_mels].astype(np.float64), hparams.sample_rate, nfft)

            signal = pw.synthesize(f0, sp, ap, hparams.sample_rate, pw.default_frame_period)
            path = join(checkpoint_dir, "step{:09d}_out.wav".format(
                        global_step))
            audio.save_wav(signal, path)

            try:
                signal /= np.max(np.abs(signal))
                writer.add_audio("Target audio signal", signal, global_step, sample_rate=fs)
            except:
                print("Unexpected error :", sys.exc_info())

            mel_tgt = mel[idx].cpu().data.numpy()
            mel_tgt = denormalize(mel_tgt)

            f0 = mel_tgt[:,0].astype(np.float64)
            sp = pw.decode_spectral_envelope(mel_tgt[:,1:(hparams.coded_env_dim+1)].astype(np.float64), hparams.sample_rate, nfft)
            ap = pw.decode_aperiodicity(mel_tgt[:,(hparams.coded_env_dim+1):hparams.num_mels].astype(np.float64), hparams.sample_rate, nfft)

            signal = pw.synthesize(f0, sp, ap, hparams.sample_rate, pw.default_frame_period)
            try:
                signal /= np.max(np.abs(signal))
                writer.add_audio("Target audio signal", signal, global_step, sample_rate=hparams.sample_rate)
            except:
                print("Unexpected error :", sys.exc_info())
    # Predicted spectrogram
    if linear_outputs is not None:
        linear_output = linear_outputs[idx].cpu().data.numpy()
        spectrogram = prepare_spec_image(audio._denormalize(linear_output))
        writer.add_image("Predicted linear spectrogram", spectrogram, global_step)

        # Predicted audio signal
        signal = audio.inv_spectrogram(linear_output.T)
        signal /= np.max(np.abs(signal))
        path = join(checkpoint_dir, "step{:09d}_predicted.wav".format(
            global_step))
        try:
            writer.add_audio("Predicted audio signal", signal, global_step, sample_rate=fs)
        except Exception as e:
            warn(str(e))
            pass
        audio.save_wav(signal, path)

    # Target mel spectrogram
    if mel_outputs is not None:
        mel_output = mel[idx].cpu().data.numpy()
        mel_output = prepare_spec_image(audio._denormalize(mel_output))
        writer.add_image("Target mel spectrogram", mel_output, global_step)

    # Target spectrogram
    if linear_outputs is not None:
        linear_output = y[idx].cpu().data.numpy()
        spectrogram = prepare_spec_image(audio._denormalize(linear_output))
        writer.add_image("Target linear spectrogram", spectrogram, global_step)

    #ei
    path = join(checkpoint_dir, "step{:09d}_mel_target.npy".format(
                global_step))
    mel_output = mel[idx].cpu().data.numpy()
    np.save(path, denormalize(mel_output))

    path = join(checkpoint_dir, "step{:09d}_mel_out.npy".format(
                global_step))
    mel_output = denormalize(mel_outputs[idx].cpu().data.numpy())
    np.save(path, mel_output)
Example #26
def save_states(global_step,
                writer,
                mel_outputs,
                converter_outputs,
                attn,
                mel,
                y,
                input_lengths,
                checkpoint_dir=None):
    def save_world(tuple_outputs, save_str, global_step=global_step):
        _, tar_f0, tar_sp, tar_ap = tuple_outputs
        fig = plt.figure()
        f0 = tar_f0[idx].cpu().data.numpy() * 400
        ax = fig.add_subplot(1, 1, 1)
        ax.plot(f0)
        save_f0 = save_str + ' f0'
        writer.add_figure(save_f0, fig, global_step)

        # sp save
        sp = tar_sp[idx].cpu().data.numpy()
        s = prepare_spec_image(sp)
        save_sp = save_str + ' sp'
        writer.add_image(save_sp, s.transpose(2, 0, 1), global_step)

        # ap save
        ap = tar_ap[idx].cpu().data.numpy()
        a = prepare_spec_image(ap)
        save_ap = save_str + ' ap'
        writer.add_image(save_ap, a.transpose(2, 0, 1), global_step)

    print("Save intermediate states at step {}".format(global_step))

    # idx = np.random.randint(0, len(input_lengths))
    idx = min(1, len(input_lengths) - 1)
    input_length = input_lengths[idx]

    # Alignment
    # Multi-hop attention
    if attn is not None and attn.dim() == 4:
        for i, alignment in enumerate(attn):
            alignment = alignment[idx].cpu().data.numpy()
            tag = "alignment_layer{}".format(i + 1)
            writer.add_image(tag,
                             np.uint8(cm.viridis(np.flip(alignment, 1)) *
                                      255).T, global_step)  # transpose removed

            # save files as well for now
            alignment_dir = join(checkpoint_dir,
                                 "alignment_layer{}".format(i + 1))
            os.makedirs(alignment_dir, exist_ok=True)
            path = join(
                alignment_dir,
                "step{:09d}_layer_{}_alignment.png".format(global_step, i + 1))
            save_alignment(path, alignment, global_step)

    # Predicted mel spectrogram
    if mel_outputs is not None:
        mel_output = mel_outputs[idx].cpu().data.numpy()
        mel_output = prepare_spec_image(audio._denormalize(mel_output))
        writer.add_image("Predicted mel spectrogram",
                         mel_output.transpose(2, 0, 1), global_step)

        #target
        mel_output = mel[idx].cpu().data.numpy()
        mel_output = prepare_spec_image(audio._denormalize(mel_output))
        writer.add_image("Target mel spectrogram",
                         mel_output.transpose(2, 0, 1), global_step)

    if converter_outputs is not None:
        # Predicted world parameter
        if type(converter_outputs) is tuple:
            #save predicted
            save_world(converter_outputs, 'Predicted')

            #save target
            save_world(y, 'Target')

            #save world signal
            _, f0s, sps, aps = converter_outputs
            f0 = f0s[idx].cpu().data.numpy() * 400
            sp = sps[idx].cpu().data.numpy()
            ap = aps[idx].cpu().data.numpy()
            # world vocoder
            signal = audio.world_synthesize(f0, sp, ap)
            signal /= np.max(np.abs(signal))
        # Predicted spectrogram
        else:
            linear_output = converter_outputs[idx].cpu().data.numpy()
            spectrogram = prepare_spec_image(audio._denormalize(linear_output))
            writer.add_image("Predicted linear spectrogram",
                             spectrogram.transpose(2, 0, 1), global_step)

            # Predicted audio signal
            signal = audio.inv_spectrogram(linear_output.T)
            signal /= np.max(np.abs(signal))
            path = join(checkpoint_dir,
                        "step{:09d}_predicted.wav".format(global_step))

            #target
            linear_output = y[idx].cpu().data.numpy()
            spectrogram = prepare_spec_image(audio._denormalize(linear_output))
            writer.add_image("Target linear spectrogram",
                             spectrogram.transpose(2, 0, 1), global_step)

        try:
            writer.add_audio("Predicted audio signal",
                             signal,
                             global_step,
                             sample_rate=hparams.sample_rate)
        except Exception as e:
            warn(str(e))
            pass
        audio.save_wav(signal, path)
Example #27
def save_states(global_step,
                writer,
                mel_outputs,
                linear_outputs,
                attn,
                mel,
                y,
                input_lengths,
                checkpoint_dir=None):
    print("Save intermediate states at step {}".format(global_step))

    # idx = np.random.randint(0, len(input_lengths))
    idx = min(1, len(input_lengths) - 1)
    input_length = input_lengths[idx]

    # Alignment
    # Multi-hop attention
    if attn is not None and attn.dim() == 4:
        for i, alignment in enumerate(attn):
            alignment = alignment[idx].cpu().data.numpy()
            tag = "alignment_layer{}".format(i + 1)
            writer.add_image(
                tag, np.uint8(cm.viridis(np.flip(alignment, 1).T) * 255),
                global_step)

            # save files as well for now
            alignment_dir = join(checkpoint_dir,
                                 "alignment_layer{}".format(i + 1))
            os.makedirs(alignment_dir, exist_ok=True)
            path = join(
                alignment_dir,
                "step{:09d}_layer_{}_alignment.png".format(global_step, i + 1))
            save_alignment(path, alignment)

        # Save averaged alignment
        alignment_dir = join(checkpoint_dir, "alignment_ave")
        os.makedirs(alignment_dir, exist_ok=True)
        path = join(alignment_dir,
                    "step{:09d}_alignment.png".format(global_step))
        alignment = attn.mean(0)[idx].cpu().data.numpy()
        save_alignment(path, alignment)

        tag = "averaged_alignment"
        writer.add_image(tag,
                         np.uint8(cm.viridis(np.flip(alignment, 1).T) * 255),
                         global_step)

    # Predicted mel spectrogram
    if mel_outputs is not None:
        mel_output = mel_outputs[idx].cpu().data.numpy()
        mel_output = prepare_spec_image(audio._denormalize(mel_output))
        writer.add_image("Predicted mel spectrogram", mel_output, global_step)

    # Predicted spectrogram
    if linear_outputs is not None:
        linear_output = linear_outputs[idx].cpu().data.numpy()
        spectrogram = prepare_spec_image(audio._denormalize(linear_output))
        writer.add_image("Predicted linear spectrogram", spectrogram,
                         global_step)

        # Predicted audio signal
        signal = audio.inv_spectrogram(linear_output.T)
        signal /= np.max(np.abs(signal))
        path = join(checkpoint_dir,
                    "step{:09d}_predicted.wav".format(global_step))
        try:
            writer.add_audio("Predicted audio signal",
                             signal,
                             global_step,
                             sample_rate=hparams.sample_rate)
        except Exception as e:
            warn(str(e))
            pass
        audio.save_wav(signal, path)

    # Target mel spectrogram
    if mel_outputs is not None:
        mel_output = mel[idx].cpu().data.numpy()
        mel_output = prepare_spec_image(audio._denormalize(mel_output))
        writer.add_image("Target mel spectrogram", mel_output, global_step)

    # Target spectrogram
    if linear_outputs is not None:
        linear_output = y[idx].cpu().data.numpy()
        spectrogram = prepare_spec_image(audio._denormalize(linear_output))
        writer.add_image("Target linear spectrogram", spectrogram, global_step)
Example #28
def spec_to_wav(decode, wav_name):
    spec = np.load(decode)
    waveform = audio.inv_spectrogram(spec.T)
    audio.save_wav(waveform, wav_name)
Example #29
    def eval_model(self, global_epoch, train_seq2seq, train_postnet):
        happy_ref = np.load('../feat/Acoustic_frame/mel/emc00103.npy')
        happy_ref = torch.from_numpy(happy_ref).unsqueeze(0)
        sad_ref = np.load('../feat/Acoustic_frame/mel/ema00203.npy')
        sad_ref = torch.from_numpy(sad_ref).unsqueeze(0)
        angry_ref = np.load('../feat/Acoustic_frame/mel/eme00303.npy')
        angry_ref = torch.from_numpy(angry_ref).unsqueeze(0)
        running_loss = 0.
        running_linear_loss = 0.
        running_mel_loss = 0.
        for step, (ling, mel, linear, lengths,
                   speaker_ids) in enumerate(self.valid_loader):
            self.model.eval()
            ismultispeaker = speaker_ids is not None
            if train_seq2seq:
                ling = ling.to(self.device)
                mel = mel.to(self.device)
                happy_ref = happy_ref.to(self.device)
                sad_ref = sad_ref.to(self.device)
                angry_ref = angry_ref.to(self.device)
            if train_postnet:
                linear = linear.to(self.device)
            lengths = lengths.to(self.device)
            speaker_ids = speaker_ids.to(
                self.device) if ismultispeaker else None
            target_mask = sequence_mask(lengths,
                                        max_len=mel.size(1)).unsqueeze(-1)
            with torch.no_grad():
                # Apply model
                if train_seq2seq and train_postnet:
                    _, mel_outputs, linear_outputs = self.model(
                        ling, mel, speaker_ids=speaker_ids)
                """
                elif train_seq2seq:
                    mel_style = self.model.gst(tmel)
                    style_embed = mel_style.expand_as(smel)
                    mel_input = smel + style_embed
                    mel_outputs = self.model.seq2seq(mel_input)
                    linear_outputs = None
                elif train_postnet:
                    linear_outputs = self.model.postnet(tmel)
                    mel_outputs = None
                """

            # Losses
            if train_seq2seq:
                mel_l1_loss, mel_binary_div = self.spec_loss(
                    mel_outputs, mel, target_mask)
                mel_loss = (1 - self.w) * mel_l1_loss + self.w * mel_binary_div
            if train_postnet:
                linear_l1_loss, linear_binary_div = self.spec_loss(
                    linear_outputs, linear, target_mask)
                linear_loss = (
                    1 - self.w) * linear_l1_loss + self.w * linear_binary_div

            # Combine losses
            if train_seq2seq and train_postnet:
                loss = mel_loss + linear_loss
            elif train_seq2seq:
                loss = mel_loss
            elif train_postnet:
                loss = linear_loss
            running_loss += loss.item()
            if train_postnet:
                running_linear_loss += linear_loss.item()
            if train_seq2seq:
                running_mel_loss += mel_loss.item()
        B = ling.size(0)
        if ismultispeaker:
            speaker_ids = np.zeros(B)
            speaker_ids = torch.LongTensor(speaker_ids).to(self.device)
        else:
            speaker_ids = None
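        # Re-synthesize the last validation batch with each emotion reference
        # mel as the style input (happy / sad / angry).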
        _, happy_mel_outputs, happy_linear_outputs = self.model(
            ling, happy_ref, speaker_ids)
        _, sad_mel_outputs, sad_linear_outputs = self.model(
            ling, sad_ref, speaker_ids)
        _, angry_mel_outputs, angry_linear_outputs = self.model(
            ling, angry_ref, speaker_ids)

        if global_epoch % self.eval_interval == 0:
            for idx in range(B):
                if mel_outputs is not None:
                    happy_mel_output = happy_mel_outputs[idx].cpu().data.numpy(
                    )
                    happy_mel_output = prepare_spec_image(
                        audio._denormalize(happy_mel_output))
                    self.writer.add_image(
                        "(Eval) Happy mel spectrogram {}".format(idx),
                        happy_mel_output, global_epoch)

                    sad_mel_output = sad_mel_outputs[idx].cpu().data.numpy()
                    sad_mel_output = prepare_spec_image(
                        audio._denormalize(sad_mel_output))
                    self.writer.add_image(
                        "(Eval) Sad mel spectrogram {}".format(idx),
                        sad_mel_output, global_epoch)

                    angry_mel_output = angry_mel_outputs[idx].cpu().data.numpy(
                    )
                    angry_mel_output = prepare_spec_image(
                        audio._denormalize(angry_mel_output))
                    self.writer.add_image(
                        "(Eval) Angry mel spectrogram {}".format(idx),
                        angry_mel_output, global_epoch)

                    mel_output = mel_outputs[idx].cpu().data.numpy()
                    mel_output = prepare_spec_image(
                        audio._denormalize(mel_output))
                    self.writer.add_image(
                        "(Eval) Predicted mel spectrogram {}".format(idx),
                        mel_output, global_epoch)

                    mel1 = mel[idx].cpu().data.numpy()
                    mel1 = prepare_spec_image(audio._denormalize(mel1))
                    self.writer.add_image(
                        "(Eval) Source mel spectrogram {}".format(idx), mel1,
                        global_epoch)

                if linear_outputs is not None:
                    linear_output = linear_outputs[idx].cpu().data.numpy()
                    spectrogram = prepare_spec_image(
                        audio._denormalize(linear_output))
                    self.writer.add_image(
                        "(Eval) Predicted spectrogram {}".format(idx),
                        spectrogram, global_epoch)
                    signal = audio.inv_spectrogram(linear_output.T)
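                    # Peak-normalize so the reconstructed wav does not clip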
                    signal /= np.max(np.abs(signal))
                    path = join(
                        self.checkpoint_dir,
                        "epoch{:09d}_{}_predicted.wav".format(
                            global_epoch, idx))
                    audio.save_wav(signal, path)
                    try:
                        self.writer.add_audio(
                            "(Eval) Predicted audio signal {}".format(idx),
                            signal,
                            global_epoch,
                            sample_rate=self.fs)
                    except Exception as e:
                        warn(str(e))
                        pass

                    happy_linear_output = happy_linear_outputs[idx].cpu(
                    ).data.numpy()
                    spectrogram = prepare_spec_image(
                        audio._denormalize(happy_linear_output))
                    self.writer.add_image(
                        "(Eval) Happy spectrogram {}".format(idx), spectrogram,
                        global_epoch)
                    signal = audio.inv_spectrogram(happy_linear_output.T)
                    signal /= np.max(np.abs(signal))
                    path = join(
                        self.checkpoint_dir,
                        "epoch{:09d}_{}_happy.wav".format(global_epoch, idx))
                    audio.save_wav(signal, path)
                    try:
                        self.writer.add_audio(
                            "(Eval) Happy audio signal {}".format(idx),
                            signal,
                            global_epoch,
                            sample_rate=self.fs)
                    except Exception as e:
                        warn(str(e))
                        pass

                    angry_linear_output = angry_linear_outputs[idx].cpu(
                    ).data.numpy()
                    spectrogram = prepare_spec_image(
                        audio._denormalize(angry_linear_output))
                    self.writer.add_image(
                        "(Eval) Angry spectrogram {}".format(idx), spectrogram,
                        global_epoch)
                    signal = audio.inv_spectrogram(angry_linear_output.T)
                    signal /= np.max(np.abs(signal))
                    path = join(
                        self.checkpoint_dir,
                        "epoch{:09d}_{}_angry.wav".format(global_epoch, idx))
                    audio.save_wav(signal, path)
                    try:
                        self.writer.add_audio(
                            "(Eval) Angry audio signal {}".format(idx),
                            signal,
                            global_epoch,
                            sample_rate=self.fs)
                    except Exception as e:
                        warn(str(e))
                        pass

                    sad_linear_output = sad_linear_outputs[idx].cpu(
                    ).data.numpy()
                    spectrogram = prepare_spec_image(
                        audio._denormalize(sad_linear_output))
                    self.writer.add_image(
                        "(Eval) Sad spectrogram {}".format(idx), spectrogram,
                        global_epoch)
                    signal = audio.inv_spectrogram(sad_linear_output.T)
                    signal /= np.max(np.abs(signal))
                    path = join(
                        self.checkpoint_dir,
                        "epoch{:09d}_{}_sad.wav".format(global_epoch, idx))
                    audio.save_wav(signal, path)
                    try:
                        self.writer.add_audio(
                            "(Eval) Sad audio signal {}".format(idx),
                            signal,
                            global_epoch,
                            sample_rate=self.fs)
                    except Exception as e:
                        warn(str(e))
                        pass

                    linear1 = linear[idx].cpu().data.numpy()
                    spectrogram = prepare_spec_image(
                        audio._denormalize(linear1))
                    self.writer.add_image(
                        "(Eval) Target spectrogram {}".format(idx),
                        spectrogram, global_epoch)
                    signal = audio.inv_spectrogram(linear1.T)
                    signal /= np.max(np.abs(signal))
                    try:
                        self.writer.add_audio(
                            "(Eval) Target audio signal {}".format(idx),
                            signal,
                            global_epoch,
                            sample_rate=self.fs)
                    except Exception as e:
                        warn(str(e))
                        pass

        avg_loss = running_loss / len(self.valid_loader)
        avg_linear_loss = running_linear_loss / len(self.valid_loader)
        avg_mel_loss = running_mel_loss / len(self.valid_loader)
        self.writer.add_scalar("valid loss (per epoch)", avg_loss,
                               global_epoch)
        self.writer.add_scalar("valid linear loss (per epoch)",
                               avg_linear_loss, global_epoch)
        self.writer.add_scalar("valid mel loss (per epoch)", avg_mel_loss,
                               global_epoch)
        print("Valid Loss: {}".format(avg_loss))
Beispiel #30
0
    sad = np.load(join(args.mel_dir, sad_name))
    angry = np.load(join(args.mel_dir, angry_name))

    lingX = torch.from_numpy(Xling).unsqueeze(0).to(device)
    melX = torch.from_numpy(melX).unsqueeze(0).to(device)
    happy = torch.from_numpy(happy).unsqueeze(0).to(device)
    sad = torch.from_numpy(sad).unsqueeze(0).to(device)
    angry = torch.from_numpy(angry).unsqueeze(0).to(device)

    style_n, mel_output, linear_output = model(lingX, melX)
    style_h, happy_mel_output, happy_linear_output = model(lingX, happy)
    style_s, sad_mel_output, sad_linear_output = model(lingX, sad)
    style_a, angry_mel_output, angry_linear_output = model(lingX, angry)

    linear_output = linear_output[0].data.cpu().numpy()
    signal = audio.inv_spectrogram(linear_output.T)
    signal /= np.max(np.abs(signal))
    path = join(args.result_dir, Xmel_name.replace('.npy', '.wav'))
    audio.save_wav(signal, path)

    happy_linear_output = happy_linear_output[0].data.cpu().numpy()
    signal = audio.inv_spectrogram(happy_linear_output.T)
    signal /= np.max(np.abs(signal))
    path = join(args.result_dir, happy_name.replace('.npy', '.wav'))
    audio.save_wav(signal, path)

    sad_linear_output = sad_linear_output[0].data.cpu().numpy()
    signal = audio.inv_spectrogram(sad_linear_output.T)
    signal /= np.max(np.abs(signal))
    path = join(args.result_dir, sad_name.replace('.npy', '.wav'))
    audio.save_wav(signal, path)
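
    # Sketch for the angry re-synthesis, assuming it follows the same pattern
    # as the neutral/happy/sad outputs above:
    angry_linear_output = angry_linear_output[0].data.cpu().numpy()
    signal = audio.inv_spectrogram(angry_linear_output.T)
    signal /= np.max(np.abs(signal))
    path = join(args.result_dir, angry_name.replace('.npy', '.wav'))
    audio.save_wav(signal, path)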