Example #1
0
    def validation_for_A_dir(self):
        """Convert every wav in validation_A_dir from speaker A to B and
        write the synthesized audio to output_A_dir.

        Pipeline per file: load -> pad -> WORLD decompose -> convert pitch
        statistics A->B -> normalize MCEPs with A's stats -> run the A2B
        generator -> de-normalize with B's stats -> WORLD re-synthesis.
        """
        num_mcep = 24          # mel-cepstral dimensionality
        sampling_rate = 16000
        frame_period = 5.0     # WORLD analysis frame period in ms
        validation_A_dir = self.validation_A_dir
        output_A_dir = self.output_A_dir

        print("Generating Test Data B from A...")
        for file in os.listdir(validation_A_dir):
            filePath = os.path.join(validation_A_dir, file)
            wav, _ = librosa.load(filePath, sr=sampling_rate, mono=True)
            # Pad so the frame count is a multiple of 4 (generator stride
            # requirement).
            wav = preprocess.wav_padding(wav=wav,
                                         sr=sampling_rate,
                                         frame_period=frame_period,
                                         multiple=4)
            f0, timeaxis, sp, ap = preprocess.world_decompose(
                wav=wav, fs=sampling_rate, frame_period=frame_period)
            # Log-Gaussian normalized pitch transformation from A's to B's
            # statistics.
            f0_converted = preprocess.pitch_conversion(
                f0=f0,
                mean_log_src=self.log_f0s_mean_A,
                std_log_src=self.log_f0s_std_A,
                mean_log_target=self.log_f0s_mean_B,
                std_log_target=self.log_f0s_std_B)
            coded_sp = preprocess.world_encode_spectral_envelop(
                sp=sp, fs=sampling_rate, dim=num_mcep)
            coded_sp_transposed = coded_sp.T
            coded_sp_norm = (coded_sp_transposed -
                             self.coded_sps_A_mean) / self.coded_sps_A_std
            coded_sp_norm = np.array([coded_sp_norm])  # add batch dimension

            coded_sp_norm = torch.from_numpy(coded_sp_norm).float()
            if torch.cuda.is_available():
                coded_sp_norm = coded_sp_norm.cuda()

            # Inference only: avoid building the autograd graph.
            with torch.no_grad():
                coded_sp_converted_norm = self.generator_A2B(coded_sp_norm)
            coded_sp_converted_norm = coded_sp_converted_norm.cpu().detach(
            ).numpy()
            coded_sp_converted_norm = np.squeeze(coded_sp_converted_norm)
            # De-normalize with target speaker B's statistics.
            coded_sp_converted = coded_sp_converted_norm * \
                self.coded_sps_B_std + self.coded_sps_B_mean
            coded_sp_converted = coded_sp_converted.T
            # WORLD requires a C-contiguous array.
            coded_sp_converted = np.ascontiguousarray(coded_sp_converted)
            decoded_sp_converted = preprocess.world_decode_spectral_envelop(
                coded_sp=coded_sp_converted, fs=sampling_rate)
            wav_transformed = preprocess.world_speech_synthesis(
                f0=f0_converted,
                decoded_sp=decoded_sp_converted,
                ap=ap,
                fs=sampling_rate,
                frame_period=frame_period)
            librosa.output.write_wav(path=os.path.join(output_A_dir,
                                                       os.path.basename(file)),
                                     y=wav_transformed,
                                     sr=sampling_rate)
        print("finish!")
Example #2
0
def process_file(filePath):
    """Load a wav file and extract its WORLD features.

    Args:
        filePath: path to the input wav file.

    Returns:
        Tuple ``(coded_sp, f0)``: the 24-dim mel-cepstral envelope
        (frames x dims) and the fundamental-frequency contour.
    """
    num_mcep = 24          # mel-cepstral dimensionality
    sampling_rate = 16000
    frame_period = 5.0     # WORLD analysis frame period in ms
    wav, _ = librosa.load(filePath, sr=sampling_rate, mono=True)
    # Pad so the frame count is a multiple of 4.
    wav = preprocess.wav_padding(wav=wav,
                                 sr=sampling_rate,
                                 frame_period=frame_period,
                                 multiple=4)
    f0, timeaxis, sp, ap = preprocess.world_decompose(
            wav=wav, fs=sampling_rate, frame_period=frame_period)
    coded_sp = preprocess.world_encode_spectral_envelop(
            sp=sp, fs=sampling_rate, dim=num_mcep)
    return coded_sp, f0
def test(filename):
    """Convert one wav file A->B and return the waveform shaped (1, T, 1).

    The normalized mel-cepstrum is split into fixed-length segments
    (seg_and_pad), each segment is converted with `infer`, and the
    synthesized audio pieces are concatenated back together.
    """
    wav, _ = librosa.load(filename, sr=hp.rate)
    f0, timeaxis, sp, ap = world_decompose(wav, hp.rate)
    # Log-Gaussian pitch transformation from speaker A's to B's statistics.
    f0_converted = pitch_conversion(f0, log_f0s_mean_A, log_f0s_std_A,
                                    log_f0s_mean_B, log_f0s_std_B)
    coded_sp = world_encode_spectral_envelop(sp, hp.rate, hp.num_mceps)
    coded_sp_transposed = coded_sp.T
    coded_sp_norm = (coded_sp_transposed - coded_sps_A_mean) / coded_sps_A_std
    # Split into hp.n_frames-long segments (the last one is padded).
    coded_sp_norm = seg_and_pad(coded_sp_norm, hp.n_frames)

    wav_forms = []
    for i, sp_norm in enumerate(coded_sp_norm):
        sp_norm = np.expand_dims(sp_norm, axis=-1)
        coded_sp_converted_norm = infer(sp_norm)
        # De-normalize with target speaker B's statistics.
        coded_sp_converted = coded_sp_converted_norm * coded_sps_B_std + coded_sps_B_mean
        # WORLD needs C-contiguous float64, transposed back to frames-first.
        coded_sp_converted = np.array(coded_sp_converted, dtype=np.float64).T
        coded_sp_converted = np.ascontiguousarray(coded_sp_converted)
        decode_sp_converted = world_decode_spectral_envelop(
            coded_sp_converted, hp.rate)
        if len(f0) < (i + 1) * hp.output_size:
            # Final (padded) segment: trim envelope/f0/aperiodicity down to
            # the true remaining length, synthesize, and stop.
            decode_sp_converted = decode_sp_converted[:len(f0) %
                                                      hp.output_size]
            f0_piece = f0_converted[i * hp.output_size:i * hp.output_size +
                                    len(f0) % hp.output_size]
            ap_piece = ap[i * hp.output_size:i * hp.output_size +
                          len(f0) % hp.output_size]
            wav_transformed = world_speech_synthesis(f0_piece,
                                                     decode_sp_converted,
                                                     ap_piece, hp.rate,
                                                     hp.duration)
            wav_forms.append(wav_transformed)
            break
        else:
            # Full segment: take the matching f0/aperiodicity slice and fall
            # through to the synthesis below.
            f0_piece = f0_converted[i * hp.output_size:(i + 1) *
                                    hp.output_size]
            ap_piece = ap[i * hp.output_size:(i + 1) * hp.output_size]

        wav_transformed = world_speech_synthesis(f0_piece, decode_sp_converted,
                                                 ap_piece, hp.rate,
                                                 hp.duration)
        wav_forms.append(wav_transformed)

    wav_forms = np.concatenate(wav_forms)
    # Shape to (1, num_samples, 1) for downstream consumers.
    wav_forms = np.expand_dims(wav_forms, axis=-1)
    wav_forms = np.expand_dims(wav_forms, axis=0)

    return wav_forms
Example #4
0
def world_encode_data_toSave(num_mcep, hdf5_dir, wav_dir, sr, frame_period=5.0, coded_dim24=24, coded_dim36=36):
    """WORLD-analyze every wav under wav_dir and cache the features to HDF5.

    For each file, the WORLD decomposition (f0, time axis, spectral envelope,
    aperiodicity) and 24/36/32-dim mel-cepstral encodings are computed and
    written to ``<hdf5_dir>/<stem>.h5`` via ``hdf5_write``.

    Args:
        num_mcep: which coded dimensionality to return (24, 36 or 32).
        hdf5_dir: output directory for the .h5 cache files.
        wav_dir: directory of input wav files.
        sr: sampling rate used to load the audio.
        frame_period: WORLD analysis frame period in ms.
        coded_dim24, coded_dim36: dimensionalities for the 24/36 encodings.

    Returns:
        ``(f0s, timeaxes, sps, aps, coded_sps)`` — lists with one entry per
        file; ``coded_sps`` is the encoding selected by ``num_mcep``.

    Raises:
        ValueError: if num_mcep is not 24, 36 or 32.
    """
    # Validate up front instead of after the (expensive) analysis loop;
    # `assert` is also stripped under python -O, so raise explicitly.
    if num_mcep not in (24, 36, 32):
        raise ValueError("spectral envelope dimension mismatch")

    f0s = list()
    timeaxes = list()
    sps = list()
    aps = list()
    coded_sps24 = list()
    coded_sps36 = list()
    coded_sps32 = list()

    for file in os.listdir(wav_dir):
        file_path = os.path.join(wav_dir, file)
        wav, _ = librosa.load(file_path, sr=sr, mono=True)

        f0, timeaxis, sp, ap = world_decompose(wav=wav, fs=sr, frame_period=frame_period)
        coded_sp24 = world_encode_spectral_envelop(sp=sp, fs=sr, dim=coded_dim24)
        coded_sp36 = world_encode_spectral_envelop(sp=sp, fs=sr, dim=coded_dim36)
        coded_sp32 = world_encode_spectral_envelop(sp=sp, fs=sr, dim=32)

        f0s.append(f0)
        timeaxes.append(timeaxis)
        sps.append(sp)
        aps.append(ap)
        coded_sps24.append(coded_sp24)
        coded_sps36.append(coded_sp36)
        coded_sps32.append(coded_sp32)

        # Cache every feature of this file so later runs can skip analysis.
        item = {"f0": f0, "timeaxe": timeaxis, "ap": ap, "sp": sp,
                "coded24": coded_sp24, "coded36": coded_sp36, "coded32": coded_sp32}
        hdf5_file_name = os.path.join(hdf5_dir, os.path.splitext(file)[0] + ".h5")
        hdf5_write(hdf5_file_name, item)

    # Pick the requested encoding (validated above).
    coded_sps = {24: coded_sps24, 36: coded_sps36, 32: coded_sps32}[num_mcep]

    return f0s, timeaxes, sps, aps, coded_sps
Example #5
0
def world_encode_data_toSave_spec(num_mcep, hdf5_dir, wav_dir, sr, frame_period=5.0, coded_dim24=24, coded_dim36=36):
    """WORLD-analyze every wav under wav_dir, add a pitch-dependent STFT
    spectrogram, and cache all features to HDF5.

    Per file these representations are stored (comment from original author):
      1. 32-dim MCEP          [32  x frame]
      2. 512-dim cheaptrick   [512 x frame]
      3. 512-dim spectrogram  [512 x frame]

    Args:
        num_mcep: which coded dimensionality to return (24, 36 or 32).
        hdf5_dir: output directory for the .h5 cache files.
        wav_dir: directory of input wav files.
        sr: sampling rate used to load the audio.
        frame_period: WORLD analysis frame period in ms.
        coded_dim24, coded_dim36: dimensionalities for the 24/36 encodings.

    Returns:
        ``(f0s, timeaxes, sps, aps, coded_sps, spectrograms)`` — lists with
        one entry per file; ``coded_sps`` is selected by ``num_mcep``.

    Raises:
        ValueError: if num_mcep is not 24, 36 or 32.
    """
    # Validate up front instead of after the expensive analysis loop;
    # `assert` is also stripped under python -O, so raise explicitly.
    if num_mcep not in (24, 36, 32):
        raise ValueError("spectral envelope dimension mismatch")

    f0s = list()
    timeaxes = list()
    sps = list()
    aps = list()
    coded_sps24 = list()
    coded_sps36 = list()
    coded_sps32 = list()
    spectrograms = list()

    def calc_spec_wav(wav, f0):
        """Per-frame magnitude spectra; window length depends on voicing.

        For unvoiced frames (f0 == 0) a fixed 1024-sample window is used,
        otherwise three pitch periods.  Each frame yields the first STFT
        column (513 bins for n_fft=1024).
        """
        # NOTE(review): sr is hard-coded to 44000 here regardless of the
        # outer `sr` argument, and f0[i] indexing assumes WORLD frames align
        # with these 5 ms hops — TODO confirm.
        sr = 44000
        duration = 0.005
        hop_size = int(sr * duration) # 44000 * 0.005 = 80 sample
        spectrograms = list()

        for i in range(wav.shape[0] // (hop_size) + 1):
            start = i * hop_size
            if f0[i] == 0:
                segment_wav = wav[start : start+1024]
            else:
                segment_wav = wav[start : start+int(3 * 1 / f0[i] * 44000)]
            fft_size = 1024
            # D = np.abs(librosa.stft(segment_wav, n_fft=fft_size, hop_length=segment_wav.shape[0]*2))
            if segment_wav.shape[0] == 0:
                # Past end of signal: emit an all-zero frame (513 bins).
                D = np.zeros((513))
            else:
                D = np.abs(librosa.stft(segment_wav, n_fft=fft_size, hop_length=2048))[:, 0]
            magnitude = D#[:-1]
            spectrograms.append(magnitude)

        return spectrograms

    for file in os.listdir(wav_dir):
        print("----")
        print(file)
        file_path = os.path.join(wav_dir, file)
        wav, _ = librosa.load(file_path, sr=sr, mono=True)

        f0, timeaxis, sp, ap = world_decompose(wav=wav, fs=sr, frame_period=frame_period)
        spectrogram = calc_spec_wav(wav, f0)

        coded_sp24 = world_encode_spectral_envelop(sp=sp, fs=sr, dim=coded_dim24)
        coded_sp36 = world_encode_spectral_envelop(sp=sp, fs=sr, dim=coded_dim36)
        coded_sp32 = world_encode_spectral_envelop(sp=sp, fs=sr, dim=32)
        spectrogram = np.array(spectrogram)

        f0s.append(f0)
        timeaxes.append(timeaxis)
        sps.append(sp)
        aps.append(ap)
        coded_sps24.append(coded_sp24)
        coded_sps36.append(coded_sp36)
        coded_sps32.append(coded_sp32)
        spectrograms.append(spectrogram)

        # Cache every feature of this file so later runs can skip analysis.
        item = {"f0": f0, "timeaxe": timeaxis, "ap": ap, "sp": sp, "coded24": coded_sp24, "coded36": coded_sp36,
                "coded32": coded_sp32, "spectrogram": spectrogram}
        hdf5_file_name = os.path.join(hdf5_dir, os.path.splitext(file)[0]+".h5")
        hdf5_write(hdf5_file_name, item)

    # Pick the requested encoding (validated above).
    coded_sps = {24: coded_sps24, 36: coded_sps36, 32: coded_sps32}[num_mcep]

    return f0s, timeaxes, sps, aps, coded_sps, spectrograms
Example #6
0
if __name__ == '__main__':
    parser = argparse.ArgumentParser(description='hdf5 DB write')

    wav_file_default = '/root/onejin/S2SCycleGAN/data/debug/ma/arctic_a0010.wav'
    hdf5_file_default = '/root/onejin/S2SCycleGAN/data/debug_hdf5/ma/arctic_a0010.h5'
    parser.add_argument('--wav_file', type=str, help='Directory for A.', default=wav_file_default)
    parser.add_argument('--hdf_file', type=str, help='Directory for B.', default=hdf5_file_default)

    argv = parser.parse_args()
    wav_file = argv.wav_file
    hdf_file = argv.hdf_file

    sr = 44000
    wav, _ = librosa.load(wav_file, sr=sr, mono=True)

    f0, timeaxis, sp, ap = world_decompose(wav=wav, fs=sr, frame_period=5.0)
    def calc_spec_wav(wav, f0):

        sr = 44000
        duration = 0.005
        hop_size = int(sr * duration) # 44000 * 0.005 = 80 sample
        spectrograms = list()

        for i in range(wav.shape[0] // (hop_size) + 1):
            start = i * hop_size
            if f0[i] == 0:
                segment_wav = wav[start : start+1024]
            else:
                segment_wav = wav[start : start+int(3 * 1 / f0[i] * 44000)]
            fft_size = 1024
            # D = np.abs(librosa.stft(segment_wav, n_fft=fft_size, hop_length=segment_wav.shape[0]*2))
model = CycleGAN2()
latest = tf.train.latest_checkpoint(hp.weights_dir)
model.load_weights(latest)

print('Loading cached data...')
with open('./datasets/JSUT/jsut.p', 'rb') as f:
    coded_sps_A_norm, coded_sps_A_mean, coded_sps_A_std, log_f0s_mean_A, log_f0s_std_A = pickle.load(
        f)

with open('./datasets/target_voice/target_voice.p', 'rb') as f:
    coded_sps_B_norm, coded_sps_B_mean, coded_sps_B_std, log_f0s_mean_B, log_f0s_std_B = pickle.load(
        f)

wav, _ = librosa.load('./outputs/100002.wav', sr=hp.rate)
f0, timeaxis, sp, ap = world_decompose(wav, hp.rate)
f0_converted = pitch_conversion(f0, log_f0s_mean_A, log_f0s_std_A,
                                log_f0s_mean_B, log_f0s_std_B)
coded_sp = world_encode_spectral_envelop(sp, hp.rate, hp.num_mceps)
coded_sp_transposed = coded_sp.T
coded_sp_norm = (coded_sp_transposed - coded_sps_A_mean) / coded_sps_A_std
coded_sp_norm = seg_and_pad(coded_sp_norm, hp.n_frames)

wav_forms = []
for i, sp_norm in enumerate(coded_sp_norm):
    sp_norm = np.expand_dims(sp_norm, axis=-1)
    coded_sp_converted_norm = model([sp_norm, sp_norm])[1][0]
    coded_sp_converted = coded_sp_converted_norm * coded_sps_B_std + coded_sps_B_mean
    coded_sp_converted = np.array(coded_sp_converted, dtype=np.float64).T
    coded_sp_converted = np.ascontiguousarray(coded_sp_converted)
    decode_sp_converted = world_decode_spectral_envelop(
Example #8
0
def conversion(model_path, data_dir, output_dir, no_spec=False):
    """Convert every wav in data_dir with a trained EncDecGen model.

    Loads the model once, then per file runs WORLD analysis, model
    prediction (converted f0 and mel-cepstrum) and WORLD re-synthesis into
    output_dir.  Files that fail are reported and skipped so one bad file
    does not abort the batch.

    Args:
        model_path: checkpoint to load into EncDecGen.
        data_dir: directory of input wavs.
        output_dir: created if missing; converted wavs are written here.
        no_spec: if True, keep the *source* spectral envelope but rescale
            its per-frame energy to match the converted envelope's energy.
    """
    sampling_rate = 16000
    num_mcep = 23
    frame_period = 5.0

    if not os.path.exists(output_dir):
        os.makedirs(output_dir)

    model = EncDecGen(num_mfc_features=23, pre_train=None)
    model.load(filepath=model_path)

    for file in os.listdir(data_dir):

        try:

            wav = scwav.read(os.path.join(data_dir, file))
            wav = wav[1].astype(np.float64)
            wav = preproc.wav_padding(wav=wav, sr=sampling_rate, \
                    frame_period=frame_period, multiple=4)
            f0, sp, ap = preproc.world_decompose(wav=wav, \
                    fs=sampling_rate, frame_period=frame_period)
            code_sp = preproc.world_encode_spectral_envelop(sp, \
                                    sampling_rate, dim=num_mcep)

            # Remember unvoiced frames so they can be zeroed again after
            # the f0 contour is interpolated/smoothed for the model.
            z_idx = np.where(f0<10.0)[0]
            f0 = scisig.medfilt(f0, kernel_size=3)
            f0 = generate_interpolation(f0)
            f0 = smooth(f0, window_len=13)
            f0 = np.reshape(f0, (1,-1,1))
            code_sp = np.reshape(code_sp, (1,-1,num_mcep))

            # Model expects (batch, features, time).
            code_sp = np.transpose(code_sp, axes=(0,2,1))
            f0 = np.transpose(f0, axes=(0,2,1))

            # Prediction
            _, f0_conv, code_sp_conv = model.test(input_mfc=code_sp, \
                                                  input_pitch=f0)

            code_sp_conv = np.transpose(code_sp_conv, axes=(0,2,1))

            f0_conv = np.asarray(np.reshape(f0_conv,(-1,)), np.float64)
            code_sp_conv = np.asarray(np.squeeze(code_sp_conv), np.float64)
            # WORLD requires a C-contiguous float64 array.
            code_sp_conv = np.copy(code_sp_conv, order='C')
            sp_conv = preproc.world_decode_spectral_envelop(code_sp_conv, \
                                                            sampling_rate)
            # Restore silence in the frames that were unvoiced originally.
            f0_conv[z_idx] = 0.0

            if no_spec:
                # Keep the source envelope shape; transplant the converted
                # per-frame energy onto it.
                ec = np.reshape(np.sqrt(np.sum(np.square(sp), axis=1)), (-1,1))
                ec_conv = np.reshape(np.sqrt(np.sum(np.square(sp_conv), axis=1)), \
                                     (-1,1))

                # Making sure silence remains silence
                sil_zone = np.where(ec<1e-10)[0]
                ec_conv[sil_zone] = 1e-10

                # NOTE(review): division assumes ec is bounded away from
                # zero outside sil_zone — verify for near-silent frames.
                sp = np.divide(np.multiply(sp, ec_conv), ec)
                sp = np.copy(sp, order='C')

                wav_transformed = preproc.world_speech_synthesis(f0=f0_conv, \
                                    decoded_sp=sp, \
                                    ap=ap, fs=sampling_rate, \
                                    frame_period=frame_period)
            else:
                wav_transformed = preproc.world_speech_synthesis(f0=f0_conv, \
                                    decoded_sp=sp_conv, \
                                    ap=ap, fs=sampling_rate, \
                                    frame_period=frame_period)

            librosa.output.write_wav(os.path.join(output_dir, \
                    os.path.basename(file)), wav_transformed, sampling_rate)
            print("Reconstructed file "+os.path.basename(file))
        except Exception as ex:
            # Best-effort batch: report the error and continue.
            print(ex)
def train(emo_pair, train_dir, model_dir, model_name, \
            random_seed, validation_dir, output_dir, \
            pre_train=None, lambda_encoder=1, lambda_decoder=1, \
            lambda_generator=1):
    """Train the EncDecGen model on momenta/pitch/MFC data.

    Loads momenta_train.mat / momenta_valid.mat from train_dir, runs up to
    1000 epochs of mini-batch (size 1) training, logs train/validation
    losses, saves the model each epoch, plots pitch/momentum curves every
    100 epochs, and every 100 epochs converts the wavs in validation_dir
    into output_dir as an audible progress check.

    Args:
        emo_pair: emotion-pair tag appended to log/output names.
        train_dir: directory holding momenta_train.mat / momenta_valid.mat.
        model_dir, model_name: where the model checkpoint is saved.
        random_seed: numpy RNG seed for reproducible sampling.
        validation_dir: directory of validation wavs, or None to skip.
        output_dir: root for converted validation audio.
        pre_train: optional pre-trained model to initialize from.
        lambda_encoder, lambda_decoder, lambda_generator: loss weights.
    """
    np.random.seed(random_seed)

    # Training hyper-parameters.
    num_epochs = 1000
    mini_batch_size = 1
    encoder_learning_rate = 0.0001
    decoder_learning_rate = 0.0001
    generator_learning_rate = 0.0001

    # Signal-processing constants for validation-time WORLD analysis.
    sampling_rate = 16000
    num_mcep = 23
    frame_period = 5.0
    n_frames = 128

    lambda_encoder = lambda_encoder
    lambda_decoder = lambda_decoder
    lambda_generator = lambda_generator

    # Run tag used in every log/plot/output path.
    le_ld_lg = "le_"+str(lambda_encoder)+"_ld_"+str(lambda_decoder) \
                +"_lg_"+str(lambda_generator)+'_'+emo_pair

    logger_file = './log/' + le_ld_lg + '.log'

    if not os.path.exists('./log'):
        os.mkdir('./log')

    if os.path.exists(logger_file):
        os.remove(logger_file)

    # NOTE(review): the file removed above ("<tag>.log") differs from the
    # file basicConfig writes to ("logger_<tag>.log") — verify intent.
    logging.basicConfig(filename="./log/logger_"+le_ld_lg+".log", \
                            level=logging.DEBUG)

    logging.info("encoder_loss - L1")
    logging.info("decoder_loss - L1")
    logging.info("generator_loss - L1")

    logging.info("lambda_encoder - {}".format(lambda_encoder))
    logging.info("lambda_decoder - {}".format(lambda_decoder))
    logging.info("lambda_generator - {}".format(lambda_generator))

    if not os.path.isdir("./generated_pitch_spect/" + le_ld_lg):
        os.makedirs("./generated_pitch_spect/" + le_ld_lg)

    logging.info('Loading Data...')

    start_time = time.time()

    # Pre-extracted features: source/target f0, MFCs, and A2B momenta.
    data_train = scio.loadmat(os.path.join(train_dir, 'momenta_train.mat'))
    data_valid = scio.loadmat(os.path.join(train_dir, 'momenta_valid.mat'))

    pitch_A_train = np.expand_dims(data_train['src_f0_feat'], axis=-1)
    pitch_B_train = np.expand_dims(data_train['tar_f0_feat'], axis=-1)
    mfc_A_train = data_train['src_mfc_feat']
    mfc_B_train = data_train['tar_mfc_feat']
    momenta_A2B_train = np.expand_dims(data_train['momenta_f0'], axis=-1)

    pitch_A_valid = np.expand_dims(data_valid['src_f0_feat'], axis=-1)
    pitch_B_valid = np.expand_dims(data_valid['tar_f0_feat'], axis=-1)
    mfc_A_valid = data_valid['src_mfc_feat']
    mfc_B_valid = data_valid['tar_mfc_feat']
    momenta_A2B_valid = np.expand_dims(data_valid['momenta_f0'], axis=-1)

    # Sample the validation set once; the train set is re-sampled per epoch.
    mfc_A_valid, pitch_A_valid, mfc_B_valid, pitch_B_valid, momenta_A2B_valid \
        = preproc.sample_data(mfc_A=mfc_A_valid, pitch_A=pitch_A_valid, \
                              mfc_B=mfc_B_valid, pitch_B=pitch_B_valid, \
                              momenta_A2B=momenta_A2B_valid)

    if validation_dir is not None:
        validation_output_dir = os.path.join(output_dir, le_ld_lg)
        if not os.path.exists(validation_output_dir):
            os.makedirs(validation_output_dir)

    end_time = time.time()
    time_elapsed = end_time - start_time

    logging.info('Loading Done.')

    logging.info('Time Elapsed for Data Preprocessing: %02d:%02d:%02d' % (time_elapsed // 3600, \
                                                                   (time_elapsed % 3600 // 60), \
                                                                   (time_elapsed % 60 // 1)))

    model = EncDecGen(
        num_mfc_features=23,
        pre_train=pre_train)  #use pre_train arg to provide trained model

    for epoch in range(1, num_epochs + 1):

        logging.info('Epoch: %d' % epoch)

        start_time_epoch = time.time()

        # Re-sample (shuffle/pair) the training data each epoch.
        mfc_A, pitch_A, mfc_B, \
                pitch_B, momenta_A2B = preproc.sample_data(mfc_A=mfc_A_train, \
                                        pitch_A=pitch_A_train, mfc_B=mfc_B_train, \
                                        pitch_B=pitch_B_train, momenta_A2B=momenta_A2B_train)

        n_samples = mfc_A.shape[0]

        batch_enc_loss = list()
        batch_dec_loss = list()
        batch_gen_loss = list()
        batch_tot_loss = list()

        for i in range(n_samples // mini_batch_size):

            start = i * mini_batch_size
            end = (i + 1) * mini_batch_size

            encoder_loss, decoder_loss, generator_loss, \
            gen_momenta, gen_pitch, gen_mfc \
                = model.train(input_mfc_A=mfc_A[start:end], \
                            input_mfc_B=mfc_B[start:end], \
                            input_pitch_A=pitch_A[start:end], \
                            input_pitch_B=pitch_B[start:end], \
                            input_momenta_A2B=momenta_A2B[start:end], \
                            lambda_encoder=lambda_encoder, \
                            lambda_decoder=lambda_decoder, \
                            lambda_generator=lambda_generator, \
                            encoder_learning_rate=encoder_learning_rate, \
                            decoder_learning_rate=decoder_learning_rate, \
                            generator_learning_rate = generator_learning_rate)

            batch_enc_loss.append(encoder_loss)
            batch_dec_loss.append(decoder_loss)
            batch_gen_loss.append(generator_loss)
            # Weighted total mirrors the training objective.
            batch_tot_loss.append(lambda_encoder*encoder_loss \
                    + lambda_decoder*decoder_loss + lambda_generator*generator_loss)

        # Checkpoint every epoch (overwrites the same file).
        model.save(directory=model_dir, filename=model_name)

        logging.info("Train Encoder Loss- {}".format(np.mean(batch_enc_loss)))
        logging.info("Train Decoder Loss- {}".format(np.mean(batch_dec_loss)))
        logging.info("Train Generator Loss- {}".format(
            np.mean(batch_gen_loss)))
        logging.info("Train Total Loss- {}".format(np.mean(batch_tot_loss)))

        # Getting results on validation set

        valid_enc_loss = list()
        valid_dec_loss = list()
        valid_gen_loss = list()
        valid_tot_loss = list()

        for i in range(mfc_A_valid.shape[0]):

            gen_momenta, gen_pitch, gen_mfc, \
            enc_loss, dec_loss, gen_loss, \
                = model.compute_test_loss(input_mfc_A=mfc_A_valid[i:i+1], \
                             input_pitch_A=pitch_A_valid[i:i+1], \
                             input_momenta_A2B=momenta_A2B_valid[i:i+1], \
                             input_mfc_B=mfc_B_valid[i:i+1], \
                             input_pitch_B=pitch_B_valid[i:i+1])

            valid_enc_loss.append(enc_loss)
            valid_dec_loss.append(dec_loss)
            valid_gen_loss.append(gen_loss)
            valid_tot_loss.append(lambda_encoder*enc_loss \
                    + lambda_decoder*dec_loss + lambda_generator*gen_loss)

            # Every 100 epochs, plot generated vs. target pitch/momentum
            # curves for visual inspection.
            if epoch % 100 == 0:
                pylab.figure(figsize=(12, 12))
                pylab.plot(pitch_A_valid[i].reshape(-1, ),
                           label="Input Neutral")
                pylab.plot(pitch_B_valid[i].reshape(-1, ),
                           label="Target Angry")
                pylab.plot(gen_pitch.reshape(-1, ), label="Generated Angry")
                pylab.plot(momenta_A2B_valid[i].reshape(-1, ),
                           label="Target Momentum")
                pylab.plot(gen_momenta.reshape(-1, ),
                           label="Generated Momentum")
                pylab.legend(loc=1)
                pylab.title("Epoch " + str(epoch) + " example " + str(i + 1))
                pylab.savefig("./generated_pitch_spect/"+le_ld_lg+'/'+str(epoch)\
                                + "_"+str(i+1)+".png")
                pylab.close()

        logging.info("Valid Encoder Loss- {}".format(np.mean(valid_enc_loss)))
        logging.info("Valid Decoder Loss- {}".format(np.mean(valid_dec_loss)))
        logging.info("Valid Generator Loss- {}".format(
            np.mean(valid_gen_loss)))
        logging.info("Valid Total Loss- {}".format(np.mean(valid_tot_loss)))

        end_time_epoch = time.time()
        time_elapsed_epoch = end_time_epoch - start_time_epoch

        logging.info('Time Elapsed for This Epoch: %02d:%02d:%02d' % (time_elapsed_epoch // 3600, \
                (time_elapsed_epoch % 3600 // 60), (time_elapsed_epoch % 60 // 1)))

        # Every 100 epochs, also convert the validation wavs end-to-end and
        # write audio for listening tests.
        if validation_dir is not None:
            if epoch % 100 == 0:
                logging.info('Generating Validation Data B from A...')
                sys.stdout.flush()
                for file in sorted(os.listdir(validation_dir)):
                    try:
                        filepath = os.path.join(validation_dir, file)
                        wav = scwav.read(filepath)
                        wav = wav[1].astype(np.float64)
                        wav = preproc.wav_padding(wav=wav, sr=sampling_rate, \
                                frame_period=frame_period, multiple=4)
                        f0, sp, ap = preproc.world_decompose(wav=wav, \
                                fs=sampling_rate, frame_period=frame_period)
                        code_sp = preproc.world_encode_spectral_envelop(sp, \
                                                sampling_rate, dim=num_mcep)

                        # Remember unvoiced frames; restore them after the
                        # f0 contour is interpolated/smoothed for the model.
                        z_idx = np.where(f0 < 10.0)[0]
                        f0 = scisig.medfilt(f0, kernel_size=3)
                        f0 = generate_interpolation(f0)
                        f0 = smooth(f0, window_len=13)
                        f0 = np.reshape(f0, (1, -1, 1))
                        code_sp = np.reshape(code_sp, (1, -1, num_mcep))

                        # Model expects (batch, features, time).
                        code_sp = np.transpose(code_sp, axes=(0, 2, 1))
                        f0 = np.transpose(f0, axes=(0, 2, 1))

                        # Prediction
                        _, f0_conv, code_sp_conv = model.test(input_mfc=code_sp, \
                                                              input_pitch=f0)

                        code_sp_conv = np.transpose(code_sp_conv,
                                                    axes=(0, 2, 1))

                        f0_conv = np.asarray(np.reshape(f0_conv, (-1, )),
                                             np.float64)
                        code_sp_conv = np.asarray(np.squeeze(code_sp_conv),
                                                  np.float64)
                        # WORLD requires a C-contiguous float64 array.
                        code_sp_conv = np.copy(code_sp_conv, order='C')
                        sp_conv = preproc.world_decode_spectral_envelop(code_sp_conv, \
                                                                        sampling_rate)

                        f0_conv[z_idx] = 0.0
                        wav_transformed = preproc.world_speech_synthesis(f0=f0_conv, \
                                            decoded_sp=sp_conv, \
                                            ap=ap, fs=sampling_rate, \
                                            frame_period=frame_period)
                        librosa.output.write_wav(os.path.join(validation_output_dir, \
                                os.path.basename(file)), wav_transformed, sampling_rate)
                        logging.info("Reconstructed file " +
                                     os.path.basename(file))
                    except Exception as ex:
                        # Best-effort: log and continue with the next file.
                        logging.info(ex)
Example #10
0
def conversion(model_f0_path, model_mcep_path, mcep_nmz_path, data_dir,
               conversion_direction, output_dir, emo_pair):
    """Convert every wav in data_dir A->B using separate f0 and MCEP models.

    Per file: WORLD analysis, MCEP conversion through the mcep model
    (normalized with the A statistics from mcep_nmz_path), f0 conversion
    through the f0 model, then WORLD re-synthesis into output_dir.  Failing
    files are reported and skipped.

    Args:
        model_f0_path: checkpoint for the pitch-conversion model.
        model_mcep_path: checkpoint for the mel-cepstrum conversion model.
        mcep_nmz_path: .npz archive with mean_A/std_A/mean_B/std_B stats.
        data_dir: directory of input wavs.
        conversion_direction: only 'A2B' is supported; anything else raises.
        output_dir: created if missing; converted wavs are written here.
        emo_pair: unused here; kept for interface compatibility.
    """
    num_mceps = 24
    sampling_rate = 16000
    frame_period = 5.0

    # Single existence check (the original duplicated this guard).
    if not os.path.exists(output_dir):
        os.makedirs(output_dir)

    mcep_normalization_params = np.load(mcep_nmz_path)
    mcep_mean_A = mcep_normalization_params['mean_A']
    mcep_std_A = mcep_normalization_params['std_A']
    mcep_mean_B = mcep_normalization_params['mean_B']
    mcep_std_B = mcep_normalization_params['std_B']

    for file in os.listdir(data_dir):

        try:

            filepath = os.path.join(data_dir, file)
            wav, _ = librosa.load(filepath, sr=sampling_rate, mono=True)
            wav = preproc.wav_padding(wav = wav, sr = sampling_rate, \
                    frame_period = frame_period, multiple = 4)
            f0, sp, ap = preproc.world_decompose(wav = wav, \
                    fs = sampling_rate, frame_period = frame_period)

            # 24-dim MCEPs for the spectral model ...
            coded_sp = preproc.world_encode_spectral_envelope(sp = sp, \
                    fs = sampling_rate, dim = num_mceps)

            # ... and a separate 23-dim encoding fed to the f0 model.
            coded_sp_f0 = preproc.world_encode_spectral_envelope(sp=sp, \
                    fs=sampling_rate, dim=23)

            coded_sp_transposed = coded_sp.T

            if conversion_direction == 'A2B':

                # Normalize with source-speaker (A) statistics.
                coded_sp_norm = (coded_sp_transposed -
                                 mcep_mean_A) / mcep_std_A

                # test mceps
                coded_sp_converted_norm = mcep_conversion(model_mcep_path=model_mcep_path, \
                                                features=np.array([coded_sp_norm]), \
                                                direction=conversion_direction)
                # test f0: remember unvoiced frames, then smooth/interpolate
                # the contour for the model.
                f0 = scisig.medfilt(f0, kernel_size=3)
                z_idx = np.where(f0 < 10.0)[0]

                f0 = generate_interpolation(f0)
                f0 = smooth(f0, window_len=13)
                f0 = np.reshape(f0, (1, 1, -1))

                coded_sp_f0 = np.expand_dims(coded_sp_f0, axis=0)
                coded_sp_f0 = np.transpose(coded_sp_f0, (0, 2, 1))

                f0_converted = f0_conversion(model_f0_path=model_f0_path,
                                             input_mfc=coded_sp_f0,
                                             input_pitch=f0,
                                             direction='A2B')

                f0_converted = np.asarray(np.reshape(f0_converted, (-1, )),
                                          np.float64)
                # Restore silence in originally-unvoiced frames.
                f0_converted[z_idx] = 0.0
                f0_converted = np.ascontiguousarray(f0_converted)

            else:
                raise Exception("Please specify A2B as conversion direction")

            # De-normalize with target-speaker (B) statistics.
            coded_sp_converted = coded_sp_converted_norm * mcep_std_B + mcep_mean_B
            coded_sp_converted = coded_sp_converted.T
            # WORLD requires a C-contiguous array.
            coded_sp_converted = np.ascontiguousarray(coded_sp_converted)
            decoded_sp_converted = preproc.world_decode_spectral_envelope(coded_sp=coded_sp_converted, \
                    fs=sampling_rate)
            wav_transformed = preproc.world_speech_synthesis(f0=f0_converted, \
                    decoded_sp=decoded_sp_converted, ap=ap, fs=sampling_rate, \
                    frame_period=frame_period)
            librosa.output.write_wav(os.path.join(output_dir, \
                    os.path.basename(file)), wav_transformed, sampling_rate)

            print("Processed " + filepath)
        except Exception as ex:
            # Best-effort batch: report the error and continue.
            print(ex)