Example #1
def correlate(clip_filename, audio, output_filename):
    """Replace the low-quality sound of a clip (video or audio file) with a
    high-quality recording, aligned by cross-correlating the two waveforms."""
    match = re.search(r"^\w+\.(mp3|wav|flac|ogg)$", clip_filename, re.IGNORECASE)
    save_video = True
    if match:
        low_quality_sound = AudioFileClip(clip_filename)
        save_video = False
    else:  # The file seems to be a video
        video_clip = VideoFileClip(clip_filename)
        low_quality_sound = video_clip.audio
    high_quality_sound = AudioFileClip(audio)
    audio_fps = max(low_quality_sound.fps, high_quality_sound.fps)
    lqsa = low_quality_sound.to_soundarray(nbytes=4, buffersize=1000, fps=audio_fps)
    hqsa = high_quality_sound.to_soundarray(nbytes=4, buffersize=1000, fps=audio_fps)
    sample_len = 10000
    sample_start = max(0, np.argmax(hqsa[:, 1]) - sample_len // 2)
    sample = hqsa[sample_start:sample_start+sample_len]
    correlation = np.correlate(lqsa[:, 1], sample[:, 1])
    offset = np.argmax(correlation) - sample_start
    good_sound = AudioArrayClip(mix(offset, lqsa, hqsa), fps=audio_fps)
    if save_video:
        video_clip.audio = good_sound
        video_clip.write_videofile(output_filename,
                                   codec='mpeg4',
                                   bitrate='4000000',
                                   audio_codec='pcm_s32le',
                                   audio_fps=audio_fps,
                                   #audio_bitrate='500000',
                                   preset='superslow',
                                   threads=4)
    else:
        good_sound.write_audiofile(output_filename,
                                   codec='pcm_s32le')
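
A possible call, assuming the module-level imports (re, numpy, moviepy) and the mix() helper used above are available; the file names are purely illustrative:

# Replace a camera clip's on-board sound with an aligned external recording.
correlate("camera_take.mp4", "field_recorder.wav", "take_synced.mp4")
# Clean up an audio-only capture instead (matches the audio-extension regex).
correlate("rough_capture.wav", "field_recorder.wav", "clean_capture.wav")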
Example #2
def splitshift(sound, n):
    """
    Split stereo channels and pitchshift each of them.
    Then combine them and return an AudioArrayClip of the values.
    pitchshift() returns int16, not float, so divide by 32768 (max val of int16).
    """
    sound1 = pitchshift(sound[:, 0], n)
    sound2 = pitchshift(sound[:, 1], n)
    combined = np.column_stack([sound1, sound2]).astype(float) / 32768
    return AudioArrayClip(combined, fps=44100)
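
A hedged usage sketch, assuming pitchshift() is the project's own helper (taking a mono array and a number of semitones, returning int16) and that the stereo input comes from MoviePy's to_soundarray():

from moviepy.editor import AudioFileClip

stereo = AudioFileClip("song.wav").to_soundarray(fps=44100)  # (N, 2) float array
shifted = splitshift(stereo, 3)   # shift both channels up by three semitones
shifted.write_audiofile("song_shifted.wav", fps=44100)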
Example #3
    def to_video(self) -> moviepy.editor.VideoFileClip:
        """Recreates video clip from self-contained list
        of full frames.

        Returns:
            moviepy.editor.VideoFileClip: Reconstructed video clip
        """
        video_frames = [f.video_frame for f in self.frames]
        afs = [af.audio_frames for af in self.frames]
        audio_frames = np.array([item for sublist in afs for item in sublist])
        video = moviepy.editor.ImageSequenceClip(video_frames, self.video_fps)
        video.audio = AudioArrayClip(audio_frames, self.audio_fps)
        return video
Example #4
    def write_record(self):
        image_clip = ImageSequenceClip(self.imgsRec, fps=15) 

        print(self.imgsRec[0].shape, self.imgsRec[0].dtype, self.imgsRec[0].max())
        audios = np.concatenate(self.wavsRec, axis = 1)
        audios = (np.transpose(audios) / 32768.).astype(np.float32)
        audioclip = AudioArrayClip(audios, fps=44100)
        print(audios.shape, audios.dtype, audios.max())

        print(len(self.imgsRec) / 15, audios.shape[0] / 44100)

        video_clip = image_clip.set_audio(audioclip)
        video_clip.write_videofile("result.mp4", fps=15, temp_audiofile="temp-audio.m4a", remove_temp=True, codec="libx264", audio_codec="aac")
Example #5
def speed_up(video_clip, speed):
    rate = 44100

    # Speed up video
    video_clip = video_clip.speedx(speed)

    # Determine pitch shift from speed
    shift = (1 - speed if speed >= 1 else (1 / speed) - 1) * 12

    # Fix audio pitch
    audio = video_clip.audio.to_soundarray(fps=rate).transpose()
    for i, channel in enumerate(audio):
        audio[i] = pitch_shift(channel, rate, shift)
    audio = audio.transpose()

    video_clip.audio = AudioArrayClip(audio, fps=rate)

    return video_clip
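
A minimal sketch of how speed_up might be used, assuming pitch_shift follows the older librosa.effects.pitch_shift positional signature (signal, sample_rate, n_steps) that the loop above relies on:

from moviepy.editor import VideoFileClip

clip = VideoFileClip("talk.mp4")
fast = speed_up(clip, 1.5)            # 1.5x faster, pitch shifted back down
fast.write_videofile("talk_fast.mp4")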
Example #6
def audio2wav(audio, duration=None, **kwargs):
    """
    The sound is converted into :epkg:`wav`
    and returned as an :epkg:`AudioArrayClip`.

    @param      audio           sound
    @param      duration        change the duration of the sound before converting it
    @param      kwargs          see `to_soundarray <https://zulko.github.io/moviepy/ref/AudioClip.html?
                                highlight=to_soundarray#moviepy.audio.AudioClip.AudioClip.to_soundarray>`_
    @return                     :epkg:`AudioArrayClip`
    """
    with AudioContext(audio) as audio_:
        if duration is not None:
            audio_ = audio_.set_duration(duration)
        wav = audio_.to_soundarray(**kwargs)
        fps = kwargs.get('fps', audio_.fps if hasattr(audio_, 'fps') else None)
        if fps is None:
            raise ValueError("fps cannot be None, 44100 is a proper value")
        return AudioArrayClip(wav, fps=fps)
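
A hedged usage sketch, assuming AudioContext (from the same project) accepts a file path; fps is forwarded to to_soundarray as the docstring describes:

clip = audio2wav("voice.mp3", duration=5, fps=44100)  # first 5 seconds as an AudioArrayClip
clip.write_audiofile("voice.wav", fps=44100)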
Example #7
def npy_to_mp4(im_list, filename, fps=4, audio=None):
    """
    
    :param im_list:
    :param filename:
    :param fps:
    :param audio: an array frames x samples, containing the audio samples per each frame
    :return:
    """
    if isinstance(im_list, np.ndarray):
        im_list = list(im_list)
    if filename[-4:] != '.mp4':
        filename = filename + '.mp4'

    save_dir = '/'.join(str.split(filename, '/')[:-1])

    if save_dir and not os.path.exists(save_dir):
        print('creating directory: ', save_dir)
        os.mkdir(save_dir)

    clip = mpy.ImageSequenceClip(im_list, fps=fps)

    if audio is not None:
        # moviepy always expects stereo audio for some reason, repeating second axis to emulate stereo.
        if len(audio.shape) == 2:
            samples_per_frame = audio.shape[1]
        else:
            samples_per_frame = audio.shape[0] / len(im_list)

        audio = audio.reshape(-1, 1).repeat(2, 1)

        audio_clip = AudioArrayClip(audio, fps=samples_per_frame * fps)
        clip = clip.set_audio(audio_clip)

    clip.write_videofile(filename,
                         temp_audiofile=filename + '.m4a',
                         remove_temp=True,
                         codec="libx264",
                         audio_codec="aac",
                         verbose=False,
                         logger=None)
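
A hedged sketch with synthetic data, assuming numpy is imported as np as in the function above; the shapes follow the docstring (audio given as frames x samples), so the resulting audio fps is samples_per_frame * fps:

frames = [np.random.randint(0, 255, (64, 64, 3), dtype=np.uint8) for _ in range(20)]
audio = np.random.uniform(-0.5, 0.5, (20, 11025))  # 11025 samples per frame -> 44100 Hz at fps=4
npy_to_mp4(frames, "demo", fps=4, audio=audio)      # ".mp4" is appended automatically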
Example #8
def test_audioclip_io():
    # Generate a random audio clip of 4.989 seconds at 44100 Hz,
    # and save it to a file.
    input_array = np.random.random((220000, 2)) * 1.98 - 0.99
    clip = AudioArrayClip(input_array, fps=44100)
    clip.write_audiofile(os.path.join(TMP_DIR, "random.wav"))
    # Load the clip.
    # The loaded clip will be slightly longer because the duration is rounded
    # up to 4.99 seconds.
    # Verify that the extra frames are all zero, and the remainder is identical
    # to the original signal.
    clip = AudioFileClip(os.path.join(TMP_DIR, "random.wav"))
    output_array = clip.to_soundarray()
    np.testing.assert_array_almost_equal(output_array[:len(input_array)],
                                         input_array,
                                         decimal=4)
    assert (output_array[len(input_array):] == 0).all()
Example #9
def merge_images_and_audio(images: DataList, audio: np.ndarray,
                           video_duration: float, sound_hz: int,
                           video_name: str):
    """
    Creates video with sound from image list and music.

    Args:
        images: List of images represented by a h x w x 3 numpy array.
        audio: A Numpy array representing the sound, of size Nx1 for mono, Nx2 for stereo.
        video_duration: Duration of the video in seconds (should be the same as the audio file).
        sound_hz: The sample rate of the audio, in Hz.
        video_name: The name of the resulting video file
    """
    # TODO: there is still a problem with the audio here: it should always
    # contain two channels, and the sample rate handling should work for both
    # mono and stereo.
    clip = ImageSequenceClip(images,
                             durations=[video_duration / len(images)] *
                             len(images))
    s = audio.reshape((len(audio), 2))  # transform it from (N) to (N, 2)
    audio = AudioArrayClip(s, sound_hz)
    clip = clip.set_audio(audio)
    clip.write_videofile(video_name, fps=len(images) / video_duration)
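
A hedged sketch of the mono case flagged in the TODO above: duplicating the single channel gives the (N, 2) shape that the reshape inside the function expects (numpy imported as np, as elsewhere in these examples):

images = [np.full((64, 64, 3), c, dtype=np.uint8) for c in (0, 128, 255)]  # three plain frames
mono = np.sin(2 * np.pi * 440.0 * np.arange(2 * 44100) / 44100)            # 2 s of a 440 Hz tone
stereo = np.column_stack([mono, mono])                                     # (N, 2), identical channels
merge_images_and_audio(images, stereo, 2.0, 44100, "tone.mp4")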
Example #10
    def file_save_as_button_clicked(self, widget):
        filename = FileOp.choose_file(self, "save_as", file_types="audio", filename=self.filename)
        if filename:
            clip = AudioArrayClip(self.audio_samples.samples.T, fps=self.audio_samples.sample_rate)
            clip.write_audiofile(filename)
            self.set_filename(filename)
Example #11
def generate_with_model(model, dataloader_test, params, epoch=0, verbose=True):
    res_dir = params['results_dir']
    number_samples = 0
    max_num_samples = 2
    for bs_test, d in enumerate(dataloader_test.keys()):
        data = dataloader_test[d]
        img = data
        with torch.no_grad():
            img = Variable(img).cuda()
        out, cnn_model, cnn_out, hidden_enc = model(img, None)  # target=None

        input_data_tmp = img.cpu().data.numpy()
        outputs_data_tmp = out.squeeze().cpu().data.numpy()

        input_reshaped = np.reshape(input_data_tmp, [
            -1,
            np.shape(input_data_tmp)[2],
            np.shape(input_data_tmp)[3],
            np.shape(input_data_tmp)[1]
        ])
        out_reshaped = flatten_audio_with_params(
            outputs_data_tmp,
            params['sequence_length'])  # params['audio_n_prediction'])

        if verbose:
            print("input_data_tmp: {}, outputs_data_tmp: {}".format(
                np.shape(input_data_tmp), np.shape(outputs_data_tmp)))
            print("input_reshaped: {}, out_reshaped: {}".format(
                np.shape(input_reshaped), np.shape(out_reshaped)))

        if number_samples < max_num_samples:
            # Save audio, 16KHz
            from scipy.io.wavfile import write
            test_gen_audio_scaled = np.int16(
                out_reshaped / np.max(np.abs(out_reshaped)) * 32767)
            write(
                '{}test_gen_ep{}_s{}.wav'.format(res_dir, epoch,
                                                 number_samples), 16000,
                test_gen_audio_scaled[0])

            scaled = input_reshaped

            from skimage import color
            frame_arr = []
            frame_arr2 = []
            for frame in scaled:
                frame = utils.normalize(frame, min=-1, max=1)
                frame_rgb = color.hsv2rgb(frame)
                frame_arr.append(frame_rgb * 255)
                frame_arr2.append(frame_rgb)
            if verbose:
                print("frame_arr: {}, min: {}, max: {}".format(
                    np.shape(frame_arr), np.amin(frame_arr),
                    np.amax(frame_arr)))
                print("frame_arr2: {}, min: {}, max: {}".format(
                    np.shape(frame_arr2), np.amin(frame_arr2),
                    np.amax(frame_arr2)))
            clip = ImageSequenceClip(np.array(frame_arr),
                                     fps=10)  # 3-second clip, .tolist()
            # set_audio returns a new clip, so keep the result
            clip = clip.set_audio(AudioArrayClip(test_gen_audio_scaled, fps=16000))
            clip.write_videofile(
                '{}test_rgbInputWithGenAudio_ep{}_s{}.mp4'.format(
                    res_dir, epoch, number_samples),
                fps=10)  # export as video
            number_samples += 1
Example #12
def test_with_model(model,
                    dataloader_label_test,
                    dataloader_test,
                    dataloader_audio_test,
                    params,
                    target_bin=True,
                    epoch=0,
                    verbose=False):
    correct_samples_cnn = 0
    euc_dist_cnn = 0
    num_samples = 0
    test_size = len(dataloader_test.keys())
    res_dir = params['results_dir']
    number_samples = 0
    max_num_samples = 2
    for bs_test, (d, l, a) in enumerate(
            zip(dataloader_test.keys(), dataloader_label_test.keys(),
                dataloader_audio_test.keys())):
        label = dataloader_label_test[l]
        data = dataloader_test[d]
        audio = dataloader_audio_test[a]
        # print("Shapes1 - label: {}, data: {}".format(np.shape(label), np.shape(data)))
        img = data
        lab = label
        lab_audio = audio
        target = audio.reshape(-1,
                               np.shape(audio)[-1], params['output_size']).to(
                                   params['gpu_num'][0])  # labels.to(device)
        with torch.no_grad():
            img = Variable(img).cuda()
        #out, cnn_model, cnn_out, hidden_enc = model(img, np.array([[SOS_token]]))  # target
        out, cnn_model, cnn_out, hidden_enc = model(img, target)  # target
        if not target_bin:
            cnn_out = cnn_out.squeeze()
        # print("Shapes3 - label: {}, out: {}, img: {}".format(np.shape(lab), np.shape(output), np.shape(img)))

        input_data_tmp = img.cpu().data.numpy()
        target_data_tmp = target.squeeze().cpu().data.numpy()
        outputs_data_tmp = out.squeeze().cpu().data.numpy()
        #print("input_data_tmp: {}, target_data_tmp: {}, outputs_data_tmp: {}".
        #      format(np.shape(input_data_tmp), np.shape(target_data_tmp), np.shape(outputs_data_tmp)))

        # input_reshaped = np.reshape(input_data, [np.shape(input_data)[1], np.shape(input_data)[0]])
        input_reshaped = np.reshape(input_data_tmp, [
            -1,
            np.shape(input_data_tmp)[2],
            np.shape(input_data_tmp)[3],
            np.shape(input_data_tmp)[1]
        ])

        target_reshaped = flatten_audio_with_params(
            target_data_tmp,
            params['sequence_length'])  #params['audio_n_prediction'])

        out_reshaped = flatten_audio_with_params(
            outputs_data_tmp,
            params['sequence_length'])  #params['audio_n_prediction'])

        #print("input_reshaped: {}, out_reshaped: {}, target_reshaped: {}".
        #      format(np.shape(input_reshaped), np.shape(out_reshaped), np.shape(target_reshaped)))

        if number_samples < max_num_samples:
            # Save audio, 16KHz
            from scipy.io.wavfile import write
            scaled = -1.0 + (1.0 - (-1.0)) * (
                target_reshaped - np.amin(target_reshaped)) / (
                    np.amax(target_reshaped) - np.amin(target_reshaped))
            test_target_audio_scaled = np.int16(scaled /
                                                np.max(np.abs(scaled)) * 32767)
            write(
                '{}test_target_ep{}_s{}.wav'.format(res_dir, epoch,
                                                    number_samples), 16000,
                test_target_audio_scaled[0])

            test_gen_audio_scaled = np.int16(
                out_reshaped / np.max(np.abs(out_reshaped)) * 32767)
            write(
                '{}test_gen_ep{}_s{}.wav'.format(res_dir, epoch,
                                                 number_samples), 16000,
                test_gen_audio_scaled[0])

            # TODO: save with target and generated audio (RGB and HSV versions, total 4 videos)
            # TODO: save as video
            #scaled = -1.0 + (1.0 - (-1.0)) * (input_reshaped - np.amin(input_reshaped)) / (
            #    np.amax(input_reshaped) - np.amin(input_reshaped))
            scaled = input_reshaped

            from skimage import color
            frame_arr = []
            frame_arr2 = []
            for frame in scaled:
                frame = utils.normalize(frame, min=-1, max=1)
                frame_rgb = color.hsv2rgb(frame)
                frame_arr.append(frame_rgb * 255)
                frame_arr2.append(frame_rgb)
            print("frame_arr: {}, min: {}, max: {}".format(
                np.shape(frame_arr), np.amin(frame_arr), np.amax(frame_arr)))
            print("frame_arr2: {}, min: {}, max: {}".format(
                np.shape(frame_arr2), np.amin(frame_arr2),
                np.amax(frame_arr2)))
            clip = ImageSequenceClip(np.array(frame_arr),
                                     fps=10)  # 3-second clip, .tolist()
            # set_audio returns a new clip, so keep the result
            clip = clip.set_audio(AudioArrayClip(test_target_audio_scaled, fps=16000))
            clip.write_videofile(
                '{}test_rgbInputWithTargetAudio_ep{}_s{}.mp4'.format(
                    res_dir, epoch, number_samples),
                fps=10)  # export as video
            # .set_audio(AudioClip)
            # TODO: transform from HSV to RGB and save it as video
            # img_rgb = matplotlib.colors.hsv_to_rgb(scaled)
            # clip = ImageSequenceClip(np.array(img_rgb), fps=10)  # 3-second clip, .tolist()
            # clip.set_audio(AudioArrayClip(test_target_audio_scaled, fps=16000))
            # clip.write_videofile('{}test_rgbInputWithTargetAudio2_ep{}_s{}.mp4'.format(res_dir, epoch, number_samples),
            #                      fps=10)  # export as video

            number_samples += 1
        '''
        for i in range(0, min(2, np.shape(target_data_tmp)[0])):
            input_data = input_data_tmp[i]
            target_data = target_data_tmp[i]
            outputs_data = outputs_data_tmp[i]
            print("input_data: {}, target_data: {}, outputs_data: {}".
                  format(np.shape(input_data), np.shape(target_data), np.shape(outputs_data)))

            # input_reshaped = np.reshape(input_data, [np.shape(input_data)[1], np.shape(input_data)[0]])
            input_reshaped = np.reshape(input_data, [np.shape(input_data)[1], np.shape(input_data)[2],
                                                     np.shape(input_data)[0]])

            target_reshaped = np.reshape(target_data, [np.shape(target_data)[1], np.shape(target_data)[0]])
            target_reshaped = flatten_audio_with_params(target_reshaped, params['sequence_length'],
                                                        params['audio_n_prediction'])

            out_reshaped = np.reshape(outputs_data, [np.shape(outputs_data)[1], np.shape(outputs_data)[0]])
            out_reshaped = flatten_audio_with_params(out_reshaped, params['sequence_length'],
                                                     params['audio_n_prediction'])

            # print(np.shape(out_reshaped))
            print("input_reshaped: {}, out_reshaped: {}, target_reshaped: {}".
                  format(np.shape(input_reshaped), np.shape(out_reshaped), np.shape(target_reshaped)))

            # Save audio, 16KHz
            from scipy.io.wavfile import write
            scaled = -1.0 + (1.0 - (-1.0)) * (target_reshaped - np.min(target_reshaped)) / (
                np.max(target_reshaped) - np.min(target_reshaped))
            test_target_audio_scaled = np.int16(scaled / np.max(np.abs(scaled)) * 32767)
            write('{}test_target_ep{}_{}.wav'.format(res_dir, epoch, i), 16000, test_target_audio_scaled[0])

            test_gen_audio_scaled = np.int16(out_reshaped / np.max(np.abs(out_reshaped)) * 32767)
            write('{}test_gen_ep{}_{}.wav'.format(res_dir, epoch, i), 16000, test_gen_audio_scaled[0])

            # TODO: save with target and generated audio (RGB and HSV versions, total 4 videos)
            # TODO: save as video
            scaled = -1.0 + (1.0 - (-1.0)) * (input_reshaped - np.amin(input_reshaped)) / (
                    np.amax(input_reshaped) - np.amin(input_reshaped))
            # imsave('{}test_input_ep{}_{}.jpg'.format(res_dir, epoch, i), scaled)
            def make_frame_rgb(t):
                """ returns a numpy array of the frame at time t """
                frame_for_time_t = scaled[t, :, :, :]
                return frame_for_time_t
            clip = VideoClip(make_frame_rgb, duration=3)  # 3-second clip
            # clip.set_audio(AudioFileClip(test_target_audio_scaled, fps=16000))
            clip.write_videofile('{}test_inputWithTargetAudio_ep{}_{}.mp4'.format(res_dir, epoch, i), fps=10)  # export as video
            # .set_audio(AudioClip)
            # TODO: transform from HSV to RGB and save it as video
            # img_rgb = matplotlib.colors.hsv_to_rgb(img_hsv.reshape([100, 100, 3]))
        '''

        # TODO: calculate NLL of generated and target audio

        # CNN accuracy
        pic = cnn_out.cpu().data
        gen_out = pic.numpy()
        lab = lab.numpy()
        # Performance evaluation by comparing target and obtained output
        for i, c in zip(lab, gen_out):
            # Accuracy
            if target_bin:
                i_max_idx = np.argmax(i)
                c_max_idx = np.argmax(c)
            else:
                i_max_idx = i
                c_max_idx = round(c)
            if i_max_idx == c_max_idx:
                correct_samples_cnn += 1

            # Euclidean distance:
            # print("Shapes = i: {}, c: {}".format(np.shape(i), np.shape(c)))
            euc_dist_cnn += np.linalg.norm(i - c)
            num_samples += 1
        if verbose:
            print("[Tested {}%] acc: {}%, euc {}".format(
                round(100 * (bs_test + 1) / test_size, 2),
                round(100 * correct_samples_cnn / num_samples, 2),
                round(euc_dist_cnn / num_samples, 2)))
    correct_samples_perc_cnn = 100 * correct_samples_cnn / num_samples
    euc_dist_mean_cnn = euc_dist_cnn / num_samples

    return correct_samples_perc_cnn, euc_dist_mean_cnn
Example #13
def prepare_audio(file_name, rate=44100, speedup=1.3):
    audio = AudioFileClip(file_name)
    data = audio.to_soundarray(fps=rate)
    data = speedup_audio(data, speedup)
    return AudioArrayClip(data, fps=rate)
Example #14
starting_times = (9, 25, 40, 52, 65, 77, 89, 121, 137, 157, 170, 184)
# times in seconds when each note starts in the input video
length = 8  # length of the generated clips
octaves = (0, 9)  # range of octaves
resolution = (600, 800)  # video resolution of the outputs

notes = ['A', 'A#', 'B', 'C', 'C#', 'D', 'D#', 'E', 'F', 'F#', 'G',
         'G#']  # name of the notes

src_clips = [VideoFileClip(input_file).\
             subclip(start, start + length).\
             volumex(0.6)
             for start in starting_times]
# this list contains the clips for each note in the input video

for n in range(octaves[0] * 12, octaves[1] * 12):
    note_name = f"{n // 12}{notes[(n % 12)]}"
    clip = src_clips[(n - 3) % 12].copy()
    # the '-3' is needed because the notes start with 'C' but during generation they start from 'A'
    trans = (((n - 3) // 12) - 5) * 12  # transposition offset
    clip.audio = AudioArrayClip(
        np.transpose(
            np.stack([
                pitch_shift(  # this function does the trick
                    np.transpose(clip.audio.to_soundarray())[channel],
                    48000,  # the sample rate of the source video
                    n_steps=trans) for channel in (0, 1)
            ])),  # (0, 1) because the source is stereo
        fps=48000)
    clip.write_videofile(f"{note_name}.mp4")
Example #15
def frames_to_audio(frames: np.ndarray, fps: float):
    return AudioArrayClip(frames, fps)
Example #16
    def __init__(
        self,
        song,
        path_video: str,
        artist_name: str,
        track_name: str,
        music_bpm: float,
        drop_beats=None,
        square=True,
    ):
        """Generic video object.
        This class contains all parameters common to all styles of videos to be generated.

        :param song: Song
        :param artist_name: Artist name
        :param track_name: Track name
        :param music_bpm: BPM of the music
        :param square: Set the size of the clip as "square"
        """

        # Size

        self.width = None
        self.height = None

        self.set_params()

        # Video

        self.main_clip = VideoFileClip(path_video, fps_source="fps")
        self.bpm_video = None
        self.drop_beats = drop_beats

        if square:

            self.main_clip = self.transform_squared_size(self.main_clip,
                                                         prop=0.8)

        self.background_clips = []

        # Song

        self.song = song
        self.audioclip = AudioArrayClip(self.song.waveform.reshape(-1, 2),
                                        fps=self.song.sr)

        # Rhythm

        self.music_bpm = music_bpm
        self.duration = 60 / self.music_bpm

        # Information to display

        self.artist_name = artist_name
        self.track_name = track_name

        self.text_assets = TextAssets(
            self.width,
            self.height,
            self.artist_name,
            self.track_name,
            self.duration,
            self.drop_beats,
        )