Example 1
    def __convert_single_only_au_AutoVC_format_to_dataset__(
            self, filename, build_train_dataset=True):
        """
        Convert a single file (only audio in AutoVC embedding format) to numpy arrays
        :param filename:
        :param is_map_to_std_face:
        :return:
        """

        global_clip_index, video_name = filename

        audio_file = os.path.join(
            self.src_dir, 'raw_wav',
            '{:05d}_{}_audio.wav'.format(global_clip_index, video_name[:-4]))
        if not build_train_dataset:
            # For test clips, first copy the raw wav into raw_wav/.
            import shutil
            shutil.copy(
                os.path.join(self.src_dir, 'test_wav_files', video_name),
                audio_file)

        # Loudness-normalize the clip to -20 dBFS in place.
        sound = AudioSegment.from_file(audio_file, "wav")
        normalized_sound = match_target_amplitude(sound, -20.0)
        normalized_sound.export(audio_file, format='wav')

        # Mel-spectrogram and speaker-normalized F0 (second argument selects
        # the male/female F0 range used for normalization).
        from src.autovc.retrain_version.vocoder_spec.extract_f0_func import extract_f0_func_audiofile
        S, f0_norm = extract_f0_func_audiofile(audio_file, 'M')

        # One-hot quantized, interpolated F0 contour.
        from src.autovc.utils import quantize_f0_interp
        f0_onehot = quantize_f0_interp(f0_norm)

        # Mean speaker embedding from Resemblyzer.
        from thirdparty.resemblyer_util.speaker_emb import get_spk_emb
        mean_emb, _ = get_spk_emb(audio_file)

        return S, mean_emb, f0_onehot
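
Both examples call match_target_amplitude without defining it. A minimal
sketch of the usual pydub loudness-normalization recipe (an assumption
about this helper, not a verbatim copy from the repository):

from pydub import AudioSegment

def match_target_amplitude(sound, target_dBFS):
    # Apply uniform gain so the clip's average loudness reaches
    # target_dBFS (e.g. -20.0, as used in the examples above).
    change_in_dBFS = target_dBFS - sound.dBFS
    return sound.apply_gain(change_in_dBFS)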
Example 2

shape_3d = np.loadtxt(
    'examples_cartoon/{}_face_close_mouth.txt'.format(DEMO_CH))
''' STEP 3: Generate audio data as input to the audio branch '''
au_data = []
au_emb = []
ains = glob.glob1('examples', '*.wav')
ains = [item for item in ains if item != 'tmp.wav']
ains.sort()
for ain in ains:
    # Resample to 16 kHz; ffmpeg writes tmp.wav, which then replaces the original.
    os.system(
        'ffmpeg -y -loglevel error -i examples/{} -ar 16000 examples/tmp.wav'.
        format(ain))
    shutil.copyfile('examples/tmp.wav', 'examples/{}'.format(ain))
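    # Note: os.system with str.format breaks on filenames containing spaces
    # or shell metacharacters; a subprocess-based sketch of the same step:
    # subprocess.run(['ffmpeg', '-y', '-loglevel', 'error',
    #                 '-i', os.path.join('examples', ain),
    #                 '-ar', '16000', 'examples/tmp.wav'], check=True)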

    # Speaker embedding via Resemblyzer (mean embedding over the file).
    from thirdparty.resemblyer_util.speaker_emb import get_spk_emb
    me, ae = get_spk_emb('examples/{}'.format(ain))
    au_emb.append(me.reshape(-1))

    print('Processing audio file', ain)
    # Convert the wav into AutoVC mel input (see Example 3 for the implementation).
    c = AutoVC_mel_Convertor('examples')
    au_data_i = c.convert_single_wav_to_autovc_input(
        audio_filename=os.path.join('examples', ain),
        autovc_model_path=opt_parser.load_AUTOVC_name)
    au_data += au_data_i
if os.path.isfile('examples/tmp.wav'):
    os.remove('examples/tmp.wav')

fl_data = []
rot_tran, rot_quat, anchor_t_shape = [], [], []
for au, info in au_data:
Example 3
    def convert_single_wav_to_autovc_input(self, audio_filename,
                                           autovc_model_path):
        def pad_seq(x, base=32):
            len_out = int(base * ceil(float(x.shape[0]) / base))
            len_pad = len_out - x.shape[0]
            assert len_pad >= 0
            return np.pad(x, ((0, len_pad), (0, 0)), 'constant'), len_pad
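        # Example: x of shape (100, 80) is padded to (128, 80) with
        # len_pad = 28, since 128 is the next multiple of base = 32 (the
        # temporal factor the AutoVC generator is assumed to require).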

        device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

        # Build the AutoVC generator and load the trained checkpoint.
        G = Generator(16, 256, 512, 16).eval().to(device)
        g_checkpoint = torch.load(autovc_model_path, map_location=device)
        G.load_state_dict(g_checkpoint['model'])

        # Target speaker embedding (a fixed Obama embedding shipped with the repo).
        emb = np.loadtxt('src/autovc/retrain_version/obama_emb.txt')
        emb_trg = torch.from_numpy(
            emb[np.newaxis, :].astype('float32')).to(device)

        aus = []
        audio_file = audio_filename

        # Loudness-normalize the input to -20 dBFS in place.
        sound = AudioSegment.from_file(audio_file, "wav")
        normalized_sound = match_target_amplitude(sound, -20.0)
        normalized_sound.export(audio_file, format='wav')

        # Source mel-spectrogram and speaker-normalized F0 ('F' selects the
        # female F0 range), plus quantized F0 and speaker embedding.
        from src.autovc.retrain_version.vocoder_spec.extract_f0_func import extract_f0_func_audiofile
        x_real_src, f0_norm = extract_f0_func_audiofile(audio_file, 'F')
        from src.autovc.utils import quantize_f0_interp
        f0_org_src = quantize_f0_interp(f0_norm)
        from thirdparty.resemblyer_util.speaker_emb import get_spk_emb
        emb, _ = get_spk_emb(audio_file)
        ''' normal length version '''
        # x_real, len_pad = pad_seq(x_real_src.astype('float32'))
        # f0_org, _ = pad_seq(f0_org_src.astype('float32'))
        # x_real = torch.from_numpy(x_real[np.newaxis, :].astype('float32')).to(device)
        # emb_org = torch.from_numpy(emb[np.newaxis, :].astype('float32')).to(device)
        # f0_org = torch.from_numpy(f0_org[np.newaxis, :].astype('float32')).to(device)
        # print('source shape:', x_real.shape, emb_org.shape, emb_trg.shape, f0_org.shape)
        #
        # with torch.no_grad():
        #     x_identic, x_identic_psnt, code_real = G(x_real, emb_org, f0_org, emb_trg, f0_org)
        # print('converted shape:', x_identic_psnt.shape, code_real.shape)
        ''' long split version '''
        l = x_real_src.shape[0]
        x_identic_psnt = []
        # Chunk long inputs to bound GPU memory. 4096 is a multiple of 32, so
        # pad_seq only pads the final chunk and trimming len_pad after the
        # concatenation below stays correct.
        step = 4096
        for i in range(0, l, step):
            x_real = x_real_src[i:i + step]
            f0_org = f0_org_src[i:i + step]

            x_real, len_pad = pad_seq(x_real.astype('float32'))
            f0_org, _ = pad_seq(f0_org.astype('float32'))
            x_real = torch.from_numpy(
                x_real[np.newaxis, :].astype('float32')).to(device)
            emb_org = torch.from_numpy(
                emb[np.newaxis, :].astype('float32')).to(device)
            # emb_trg = torch.from_numpy(emb[np.newaxis, :].astype('float32')).to(device)
            f0_org = torch.from_numpy(
                f0_org[np.newaxis, :].astype('float32')).to(device)
            print('source shape:', x_real.shape, emb_org.shape, emb_trg.shape,
                  f0_org.shape)

            with torch.no_grad():
                # Convert this chunk to the target speaker's voice.
                x_identic, x_identic_psnt_i, code_real = G(
                    x_real, emb_org, f0_org, emb_trg, f0_org)
                x_identic_psnt.append(x_identic_psnt_i)

        x_identic_psnt = torch.cat(x_identic_psnt, dim=1)
        print('converted shape:', x_identic_psnt.shape, code_real.shape)

        # Trim the padding frames added to the final chunk.
        if len_pad == 0:
            uttr_trg = x_identic_psnt[0, :, :].cpu().numpy()
        else:
            uttr_trg = x_identic_psnt[0, :-len_pad, :].cpu().numpy()

        # Package as (mel, info) with info = (clip_index, filename, speaker_emb),
        # the shape consumed by the "for au, info in au_data" loop in Example 2.
        aus.append((uttr_trg, (0, audio_filename, emb)))

        return aus
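
Usage sketch (mirrors the driver loop in Example 2; the wav and checkpoint
paths are assumptions and depend on your checkout):

c = AutoVC_mel_Convertor('examples')
au_data = c.convert_single_wav_to_autovc_input(
    audio_filename='examples/input.wav',
    autovc_model_path='examples/ckpt/ckpt_autovc.pth')
mel, (clip_index, wav_name, spk_emb) = au_data[0]
print(mel.shape)  # (T, 80) converted mel-spectrogram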