def correlate(clip_filename, audio, output_filename):
    match = re.search(r"^\w+\.(mp3|wav|flac|ogg)$", clip_filename, re.IGNORECASE)
    save_video = True
    if match:
        low_quality_sound = AudioFileClip(clip_filename)
        save_video = False
    else:
        # The file seems to be a video
        video_clip = VideoFileClip(clip_filename)
        low_quality_sound = video_clip.audio
    high_quality_sound = AudioFileClip(audio)
    audio_fps = max(low_quality_sound.fps, high_quality_sound.fps)
    lqsa = low_quality_sound.to_soundarray(nbytes=4, buffersize=1000, fps=audio_fps)
    hqsa = high_quality_sound.to_soundarray(nbytes=4, buffersize=1000, fps=audio_fps)
    sample_len = 10000
    sample_start = max(0, np.argmax(hqsa[:, 1]) - sample_len // 2)
    sample = hqsa[sample_start:sample_start + sample_len]
    correlation = np.correlate(lqsa[:, 1], sample[:, 1])
    offset = np.argmax(correlation) - sample_start
    good_sound = AudioArrayClip(mix(offset, lqsa, hqsa), fps=audio_fps)
    if save_video:
        video_clip.audio = good_sound
        video_clip.write_videofile(output_filename,
                                   codec='mpeg4',
                                   bitrate='4000000',
                                   audio_codec='pcm_s32le',
                                   audio_fps=audio_fps,
                                   # audio_bitrate='500000',
                                   preset='superslow',
                                   threads=4)
    else:
        good_sound.write_audiofile(output_filename, codec='pcm_s32le')
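# `correlate` above relies on a `mix(offset, lqsa, hqsa)` helper that is not shown in
# this snippet. The following is only a minimal sketch of what such a helper could do
# (an assumption, not the original implementation): keep the low-quality track's
# timeline and overwrite the overlapping region with the high-quality samples.
def mix(offset, lqsa, hqsa):
    result = lqsa.copy()
    if offset >= 0:
        end = min(len(result), offset + len(hqsa))
        result[offset:end] = hqsa[:end - offset]
    else:
        end = min(len(result), len(hqsa) + offset)
        result[:end] = hqsa[-offset:-offset + end]
    return result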
def splitshift(sound, n):
    """
    Split stereo channels and pitchshift each of them.
    Then combine them and return an AudioArrayClip of the values.
    pitchshift() returns int16, not float, so divide by 32768 (max val of int16).
    """
    sound1 = pitchshift(sound[:, 0], n)
    sound2 = pitchshift(sound[:, 1], n)
    combined = np.column_stack([sound1, sound2]).astype(float) / 32768
    return AudioArrayClip(combined, fps=44100)
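# `pitchshift` is not defined in this snippet; per the docstring it takes one channel
# and a semitone offset and returns int16 samples. A hypothetical sketch built on
# librosa (an assumption about the original helper, not its actual implementation):
import librosa
import numpy as np

def pitchshift(channel, n, rate=44100):
    # librosa works in float; shift by n semitones, then convert back to int16.
    shifted = librosa.effects.pitch_shift(channel.astype(np.float32), sr=rate, n_steps=n)
    return np.clip(shifted * 32768, -32768, 32767).astype(np.int16)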
def to_video(self) -> moviepy.editor.VideoFileClip:
    """Recreates video clip from self-contained list of full frames.

    Returns:
        moviepy.editor.VideoFileClip: Reconstructed video clip
    """
    video_frames = [f.video_frame for f in self.frames]
    afs = [af.audio_frames for af in self.frames]
    audio_frames = np.array([item for sublist in afs for item in sublist])
    video = moviepy.editor.ImageSequenceClip(video_frames, self.video_fps)
    video.audio = AudioArrayClip(audio_frames, self.audio_fps)
    return video
def write_record(self):
    image_clip = ImageSequenceClip(self.imgsRec, fps=15)
    print(self.imgsRec[0].shape, self.imgsRec[0].dtype, self.imgsRec[0].max())
    audios = np.concatenate(self.wavsRec, axis=1)
    audios = (np.transpose(audios) / 32768.).astype(np.float32)
    audioclip = AudioArrayClip(audios, fps=44100)
    print(audios.shape, audios.dtype, audios.max())
    print(len(self.imgsRec) / 15, audios.shape[0] / 44100)
    video_clip = image_clip.set_audio(audioclip)
    video_clip.write_videofile("result.mp4", fps=15, temp_audiofile="temp-audio.m4a",
                               remove_temp=True, codec="libx264", audio_codec="aac")
def speed_up(video_clip, speed):
    rate = 44100
    # Speed up video
    video_clip = video_clip.speedx(speed)
    # Determine pitch shift from speed
    shift = (1 - speed if speed >= 1 else (1 / speed) - 1) * 12
    # Fix audio pitch
    audio = video_clip.audio.to_soundarray(fps=rate).transpose()
    for i, channel in enumerate(audio):
        audio[i] = pitch_shift(channel, rate, shift)
    audio = audio.transpose()
    video_clip.audio = AudioArrayClip(audio, fps=rate)
    return video_clip
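# Hypothetical usage of `speed_up`, assuming `pitch_shift` above is
# librosa.effects.pitch_shift (or any (channel, rate, n_steps)-compatible function)
# and that the file names are placeholders:
from moviepy.editor import VideoFileClip

fast_clip = speed_up(VideoFileClip("input.mp4"), 1.5)
fast_clip.write_videofile("input_fast.mp4")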
def audio2wav(audio, duration=None, **kwargs):
    """
    The sound is converted into :epkg:`wav` and returned as an :epkg:`AudioArrayClip`.

    @param      audio       sound
    @param      duration    change the duration of the sound before converting it
    @param      kwargs      see `to_soundarray <https://zulko.github.io/moviepy/ref/AudioClip.html?highlight=to_soundarray#moviepy.audio.AudioClip.AudioClip.to_soundarray>`_
    @return                 :epkg:`AudioArrayClip`
    """
    with AudioContext(audio) as audio_:
        if duration is not None:
            audio_ = audio_.set_duration(duration)
        wav = audio_.to_soundarray(**kwargs)
        fps = kwargs.get('fps', audio_.fps if hasattr(audio_, 'fps') else None)
        if fps is None:
            raise ValueError("fps cannot be None, 44100 is a proper value")
        return AudioArrayClip(wav, fps=fps)
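# `AudioContext` is not part of this snippet. It is presumably a small context manager
# that accepts either a file path or an existing clip; a minimal sketch under that
# assumption (not the original implementation):
from contextlib import contextmanager
from moviepy.editor import AudioFileClip

@contextmanager
def AudioContext(audio):
    # Open a clip if a path was given, otherwise pass the clip through unchanged.
    clip = AudioFileClip(audio) if isinstance(audio, str) else audio
    try:
        yield clip
    finally:
        if isinstance(audio, str):
            clip.close()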
def npy_to_mp4(im_list, filename, fps=4, audio=None):
    """
    :param im_list: list (or array) of video frames
    :param filename: output .mp4 path
    :param fps: video frame rate
    :param audio: an array frames x samples, containing the audio samples per each frame
    :return:
    """
    if isinstance(im_list, np.ndarray):
        im_list = list(im_list)
    if filename[-4:] != '.mp4':
        filename = filename + '.mp4'

    save_dir = '/'.join(str.split(filename, '/')[:-1])
    if save_dir and not os.path.exists(save_dir):
        print('creating directory: ', save_dir)
        os.mkdir(save_dir)

    clip = mpy.ImageSequenceClip(im_list, fps=fps)
    if audio is not None:
        # moviepy always expects stereo audio for some reason, repeating second axis to emulate stereo.
        if len(audio.shape) == 2:
            samples_per_frame = audio.shape[1]
        else:
            samples_per_frame = audio.shape[0] / len(im_list)
        audio = audio.reshape(-1, 1).repeat(2, 1)
        audio_clip = AudioArrayClip(audio, fps=samples_per_frame * fps)
        clip = clip.set_audio(audio_clip)
    clip.write_videofile(filename, temp_audiofile=filename + '.m4a', remove_temp=True,
                         codec="libx264", audio_codec="aac", verbose=False, logger=None)
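# Hypothetical usage of `npy_to_mp4`: ten random frames with a 440 Hz tone split into
# per-frame audio chunks (all shapes and file names here are illustrative assumptions).
import numpy as np

frames = np.random.randint(0, 255, (10, 64, 64, 3), dtype=np.uint8)
samples_per_frame = 11025  # 44100 Hz audio / 4 fps video
t = np.arange(10 * samples_per_frame) / 44100.0
tone = 0.5 * np.sin(2 * np.pi * 440 * t)
npy_to_mp4(frames, 'demo_output', fps=4, audio=tone.reshape(10, samples_per_frame))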
def test_audioclip_io():
    # Generate a random audio clip of 4.989 seconds at 44100 Hz, and save it to a file.
    input_array = np.random.random((220000, 2)) * 1.98 - 0.99
    clip = AudioArrayClip(input_array, fps=44100)
    clip.write_audiofile(os.path.join(TMP_DIR, "random.wav"))
    # Load the clip.
    # The loaded clip will be slightly longer because the duration is rounded up to
    # 4.99 seconds. Verify that the extra frames are all zero, and the remainder is
    # identical to the original signal.
    clip = AudioFileClip(os.path.join(TMP_DIR, "random.wav"))
    output_array = clip.to_soundarray()
    np.testing.assert_array_almost_equal(
        output_array[:len(input_array)], input_array, decimal=4)
    assert (output_array[len(input_array):] == 0).all()
def merge_images_and_audio(images: DataList, audio: np.ndarray, video_duration: float,
                           sound_hz: int, video_name: str):
    """
    Creates video with sound from image list and music.

    Args:
        images: List of images represented by a h x w x 3 numpy array.
        audio: A Numpy array representing the sound, of size Nx1 for mono, Nx2 for stereo.
        video_duration: Duration of the video in seconds (should be the same as the audio file).
        sound_hz: The hz of the audio file.
        video_name: The name of the resulting video file.
    """
    # todo there is still a problem with the audio here
    # the audio should always contain two channels
    # then the hz should also work for mono and dual
    clip = ImageSequenceClip(images, durations=[video_duration / len(images)] * len(images))
    s = audio.reshape((len(audio), 2))  # transform it from (N) to (N, 2)
    audio = AudioArrayClip(s, sound_hz)
    clip = clip.set_audio(audio)
    clip.write_videofile(video_name, fps=len(images) / video_duration)
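# Hypothetical usage of `merge_images_and_audio`: two gray frames over one second of a
# stereo 440 Hz tone (values and shapes are illustrative assumptions; note that the
# reshape inside the function expects the audio to already hold two channels).
import numpy as np

sound_hz = 44100
t = np.arange(sound_hz) / sound_hz
stereo_tone = np.column_stack([0.4 * np.sin(2 * np.pi * 440 * t)] * 2)
gray = np.full((64, 64, 3), 128, dtype=np.uint8)
merge_images_and_audio([gray, gray], stereo_tone, 1.0, sound_hz, 'tone_video.mp4')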
def file_save_as_button_clicked(self, widget):
    filename = FileOp.choose_file(self, "save_as", file_types="audio", filename=self.filename)
    if filename:
        clip = AudioArrayClip(self.audio_samples.samples.T, fps=self.audio_samples.sample_rate)
        clip.write_audiofile(filename)
        self.set_filename(filename)
def generate_with_model(model, dataloader_test, params, epoch=0, verbose=True):
    res_dir = params['results_dir']
    number_samples = 0
    max_num_samples = 2
    for bs_test, d in enumerate(dataloader_test.keys()):
        data = dataloader_test[d]
        img = data
        with torch.no_grad():
            img = Variable(img).cuda()
            out, cnn_model, cnn_out, hidden_enc = model(img, None)  # target=None

        input_data_tmp = img.cpu().data.numpy()
        outputs_data_tmp = out.squeeze().cpu().data.numpy()
        input_reshaped = np.reshape(input_data_tmp, [
            -1,
            np.shape(input_data_tmp)[2],
            np.shape(input_data_tmp)[3],
            np.shape(input_data_tmp)[1]
        ])
        out_reshaped = flatten_audio_with_params(
            outputs_data_tmp, params['sequence_length'])  # params['audio_n_prediction']

        if verbose:
            print("input_data_tmp: {}, outputs_data_tmp: {}".format(
                np.shape(input_data_tmp), np.shape(outputs_data_tmp)))
            print("input_reshaped: {}, out_reshaped: {}".format(
                np.shape(input_reshaped), np.shape(out_reshaped)))

        if number_samples < max_num_samples:
            # Save audio, 16KHz
            from scipy.io.wavfile import write
            test_gen_audio_scaled = np.int16(
                out_reshaped / np.max(np.abs(out_reshaped)) * 32767)
            write('{}test_gen_ep{}_s{}.wav'.format(res_dir, epoch, number_samples),
                  16000, test_gen_audio_scaled[0])

            scaled = input_reshaped
            from skimage import color
            frame_arr = []
            frame_arr2 = []
            for frame in scaled:
                frame = utils.normalize(frame, min=-1, max=1)
                frame_rgb = color.hsv2rgb(frame)
                frame_arr.append(frame_rgb * 255)
                frame_arr2.append(frame_rgb)
            if verbose:
                print("frame_arr: {}, min: {}, max: {}".format(
                    np.shape(frame_arr), np.amin(frame_arr), np.amax(frame_arr)))
                print("frame_arr2: {}, min: {}, max: {}".format(
                    np.shape(frame_arr2), np.amin(frame_arr2), np.amax(frame_arr2)))

            clip = ImageSequenceClip(np.array(frame_arr), fps=10)  # 3-second clip, .tolist()
            # set_audio returns a new clip, so keep the result
            clip = clip.set_audio(AudioArrayClip(test_gen_audio_scaled, fps=16000))
            clip.write_videofile(
                '{}test_rgbInputWithGenAudio_ep{}_s{}.mp4'.format(
                    res_dir, epoch, number_samples),
                fps=10)  # export as video
            number_samples += 1
def test_with_model(model, dataloader_label_test, dataloader_test, dataloader_audio_test,
                    params, target_bin=True, epoch=0, verbose=False):
    correct_samples_cnn = 0
    euc_dist_cnn = 0
    num_samples = 0
    test_size = len(dataloader_test.keys())
    res_dir = params['results_dir']
    number_samples = 0
    max_num_samples = 2
    for bs_test, (d, l, a) in enumerate(
            zip(dataloader_test.keys(), dataloader_label_test.keys(),
                dataloader_audio_test.keys())):
        label = dataloader_label_test[l]
        data = dataloader_test[d]
        audio = dataloader_audio_test[a]
        # print("Shapes1 - label: {}, data: {}".format(np.shape(label), np.shape(data)))
        img = data
        lab = label
        lab_audio = audio
        target = audio.reshape(-1, np.shape(audio)[-1], params['output_size']).to(
            params['gpu_num'][0])  # labels.to(device)
        with torch.no_grad():
            img = Variable(img).cuda()
            # out, cnn_model, cnn_out, hidden_enc = model(img, np.array([[SOS_token]]))  # target
            out, cnn_model, cnn_out, hidden_enc = model(img, target)  # target
        if not target_bin:
            cnn_out = cnn_out.squeeze()
        # print("Shapes3 - label: {}, out: {}, img: {}".format(np.shape(lab), np.shape(output), np.shape(img)))

        input_data_tmp = img.cpu().data.numpy()
        target_data_tmp = target.squeeze().cpu().data.numpy()
        outputs_data_tmp = out.squeeze().cpu().data.numpy()
        # print("input_data_tmp: {}, target_data_tmp: {}, outputs_data_tmp: {}".
        #       format(np.shape(input_data_tmp), np.shape(target_data_tmp), np.shape(outputs_data_tmp)))

        # input_reshaped = np.reshape(input_data, [np.shape(input_data)[1], np.shape(input_data)[0]])
        input_reshaped = np.reshape(input_data_tmp, [
            -1,
            np.shape(input_data_tmp)[2],
            np.shape(input_data_tmp)[3],
            np.shape(input_data_tmp)[1]
        ])
        target_reshaped = flatten_audio_with_params(
            target_data_tmp, params['sequence_length'])  # params['audio_n_prediction']
        out_reshaped = flatten_audio_with_params(
            outputs_data_tmp, params['sequence_length'])  # params['audio_n_prediction']
        # print("input_reshaped: {}, out_reshaped: {}, target_reshaped: {}".
        #       format(np.shape(input_reshaped), np.shape(out_reshaped), np.shape(target_reshaped)))

        if number_samples < max_num_samples:
            # Save audio, 16KHz
            from scipy.io.wavfile import write
            scaled = -1.0 + (1.0 - (-1.0)) * (
                target_reshaped - np.amin(target_reshaped)) / (
                    np.amax(target_reshaped) - np.amin(target_reshaped))
            test_target_audio_scaled = np.int16(scaled / np.max(np.abs(scaled)) * 32767)
            write('{}test_target_ep{}_s{}.wav'.format(res_dir, epoch, number_samples),
                  16000, test_target_audio_scaled[0])
            test_gen_audio_scaled = np.int16(
                out_reshaped / np.max(np.abs(out_reshaped)) * 32767)
            write('{}test_gen_ep{}_s{}.wav'.format(res_dir, epoch, number_samples),
                  16000, test_gen_audio_scaled[0])

            # TODO: save with target and generated audio (RGB and HSV versions, total 4 videos)
            # TODO: save as video
            # scaled = -1.0 + (1.0 - (-1.0)) * (input_reshaped - np.amin(input_reshaped)) / (
            #     np.amax(input_reshaped) - np.amin(input_reshaped))
            scaled = input_reshaped
            from skimage import color
            frame_arr = []
            frame_arr2 = []
            for frame in scaled:
                frame = utils.normalize(frame, min=-1, max=1)
                frame_rgb = color.hsv2rgb(frame)
                frame_arr.append(frame_rgb * 255)
                frame_arr2.append(frame_rgb)
            print("frame_arr: {}, min: {}, max: {}".format(
                np.shape(frame_arr), np.amin(frame_arr), np.amax(frame_arr)))
            print("frame_arr2: {}, min: {}, max: {}".format(
                np.shape(frame_arr2), np.amin(frame_arr2), np.amax(frame_arr2)))

            clip = ImageSequenceClip(np.array(frame_arr), fps=10)  # 3-second clip, .tolist()
            # set_audio returns a new clip, so keep the result
            clip = clip.set_audio(AudioArrayClip(test_target_audio_scaled, fps=16000))
            clip.write_videofile(
                '{}test_rgbInputWithTargetAudio_ep{}_s{}.mp4'.format(
                    res_dir, epoch, number_samples),
                fps=10)  # export as video
            # .set_audio(AudioClip)

            # TODO: transform from HSV to RGB and save it as video
            # img_rgb = matplotlib.colors.hsv_to_rgb(scaled)
            # clip = ImageSequenceClip(np.array(img_rgb), fps=10)  # 3-second clip, .tolist()
            # clip.set_audio(AudioArrayClip(test_target_audio_scaled, fps=16000))
            # clip.write_videofile('{}test_rgbInputWithTargetAudio2_ep{}_s{}.mp4'.format(res_dir, epoch, number_samples),
            #                      fps=10)  # export as video
            number_samples += 1

        '''
        for i in range(0, min(2, np.shape(target_data_tmp)[0])):
            input_data = input_data_tmp[i]
            target_data = target_data_tmp[i]
            outputs_data = outputs_data_tmp[i]
            print("input_data: {}, target_data: {}, outputs_data: {}".
                  format(np.shape(input_data), np.shape(target_data), np.shape(outputs_data)))

            # input_reshaped = np.reshape(input_data, [np.shape(input_data)[1], np.shape(input_data)[0]])
            input_reshaped = np.reshape(input_data, [np.shape(input_data)[1],
                                                     np.shape(input_data)[2],
                                                     np.shape(input_data)[0]])
            target_reshaped = np.reshape(target_data, [np.shape(target_data)[1], np.shape(target_data)[0]])
            target_reshaped = flatten_audio_with_params(target_reshaped, params['sequence_length'],
                                                        params['audio_n_prediction'])
            out_reshaped = np.reshape(outputs_data, [np.shape(outputs_data)[1], np.shape(outputs_data)[0]])
            out_reshaped = flatten_audio_with_params(out_reshaped, params['sequence_length'],
                                                     params['audio_n_prediction'])
            # print(np.shape(out_reshaped))
            print("input_reshaped: {}, out_reshaped: {}, target_reshaped: {}".
                  format(np.shape(input_reshaped), np.shape(out_reshaped), np.shape(target_reshaped)))

            # Save audio, 16KHz
            from scipy.io.wavfile import write
            scaled = -1.0 + (1.0 - (-1.0)) * (target_reshaped - np.min(target_reshaped)) / (
                np.max(target_reshaped) - np.min(target_reshaped))
            test_target_audio_scaled = np.int16(scaled / np.max(np.abs(scaled)) * 32767)
            write('{}test_target_ep{}_{}.wav'.format(res_dir, epoch, i), 16000, test_target_audio_scaled[0])
            test_gen_audio_scaled = np.int16(out_reshaped / np.max(np.abs(out_reshaped)) * 32767)
            write('{}test_gen_ep{}_{}.wav'.format(res_dir, epoch, i), 16000, test_gen_audio_scaled[0])

            # TODO: save with target and generated audio (RGB and HSV versions, total 4 videos)
            # TODO: save as video
            scaled = -1.0 + (1.0 - (-1.0)) * (input_reshaped - np.amin(input_reshaped)) / (
                np.amax(input_reshaped) - np.amin(input_reshaped))
            # imsave('{}test_input_ep{}_{}.jpg'.format(res_dir, epoch, i), scaled)

            def make_frame_rgb(t):
                """ returns a numpy array of the frame at time t """
                frame_for_time_t = scaled[t, :, :, :]
                return frame_for_time_t

            clip = VideoClip(make_frame_rgb, duration=3)  # 3-second clip
            # clip.set_audio(AudioFileClip(test_target_audio_scaled, fps=16000))
            clip.write_videofile('{}test_inputWithTargetAudio_ep{}_{}.mp4'.format(res_dir, epoch, i),
                                 fps=10)  # export as video
            # .set_audio(AudioClip)

            # TODO: transform from HSV to RGB and save it as video
            # img_rgb = matplotlib.colors.hsv_to_rgb(img_hsv.reshape([100, 100, 3]))
        '''

        # TODO: calculate NLL of generated and target audio

        # CNN accuracy
        pic = cnn_out.cpu().data
        gen_out = pic.numpy()
        lab = lab.numpy()

        # Performance evaluation by comparing target and obtained output
        for i, c in zip(lab, gen_out):
            # Accuracy
            if target_bin:
                i_max_idx = np.argmax(i)
                c_max_idx = np.argmax(c)
            else:
                i_max_idx = i
                c_max_idx = round(c)
            if i_max_idx == c_max_idx:
                correct_samples_cnn += 1
            # Euclidean distance:
            # print("Shapes = i: {}, c: {}".format(np.shape(i), np.shape(c)))
            euc_dist_cnn += np.linalg.norm(i - c)
            num_samples += 1

        if verbose:
            print("[Tested {}%] acc: {}%, euc {}".format(
                round(100 * (bs_test + 1) / test_size, 2),
                round(100 * correct_samples_cnn / num_samples, 2),
                round(euc_dist_cnn / num_samples, 2)))

    correct_samples_perc_cnn = 100 * correct_samples_cnn / num_samples
    euc_dist_mean_cnn = euc_dist_cnn / num_samples
    return correct_samples_perc_cnn, euc_dist_mean_cnn
def prepare_audio(file_name, rate=44100, speedup=1.3):
    audio = AudioFileClip(file_name)
    data = audio.to_soundarray(fps=rate)
    data = speedup_audio(data, speedup)
    return AudioArrayClip(data, fps=rate)
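# `speedup_audio` is not defined in this snippet. A minimal sketch of one possible
# implementation (an assumption; it simply drops samples, which also raises the pitch):
import numpy as np

def speedup_audio(data, factor):
    indices = np.round(np.arange(0, len(data), factor)).astype(int)
    return data[indices[indices < len(data)]]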
starting_times = (9, 25, 40, 52, 65, 77, 89, 121, 137, 157, 170, 184)  # times in seconds when each note starts in the input video
length = 8  # length of the generated clips
octaves = (0, 9)  # range of octaves
resolution = (600, 800)  # video resolution of the outputs
notes = ['A', 'A#', 'B', 'C', 'C#', 'D', 'D#', 'E', 'F', 'F#', 'G', 'G#']  # names of the notes

# this list contains the clips for each note in the input video
src_clips = [VideoFileClip(input_file).subclip(start, start + length).volumex(0.6)
             for start in starting_times]

for n in range(octaves[0] * 12, octaves[1] * 12):
    note_name = f"{n // 12}{notes[n % 12]}"
    # the '-3' is because the notes start with 'C' but during the generation they start from 'A'
    clip = src_clips[(n - 3) % 12].copy()
    trans = (((n - 3) // 12) - 5) * 12  # transposition offset
    clip.audio = AudioArrayClip(
        np.transpose(
            np.stack([
                pitch_shift(  # this function does the trick
                    np.transpose(clip.audio.to_soundarray())[channel],
                    48000,  # the sampling rate of the source video
                    n_steps=trans)
                for channel in (0, 1)  # (0, 1) because the source is stereo
            ])),
        fps=48000)
    clip.write_videofile(f"{note_name}.mp4")
def frames_to_audio(frames: np.ndarray, fps: float):
    return AudioArrayClip(frames, fps)
def __init__(
    self,
    song,
    path_video: str,
    artist_name: str,
    track_name: str,
    music_bpm: float,
    drop_beats=None,
    square=True,
):
    """Generic video object.

    This class contains all parameters common to all styles of videos to be generated.

    :param song: Song
    :param path_video: Path to the source video file
    :param artist_name: Artist name
    :param track_name: Track name
    :param music_bpm: BPM of the music
    :param square: Set the size of the clip as "square"
    """
    # Size
    self.width = None
    self.height = None
    self.set_params()

    # Video
    self.main_clip = VideoFileClip(path_video, fps_source="fps")
    self.bpm_video = None
    self.drop_beats = drop_beats
    if square:
        self.main_clip = self.transform_squared_size(self.main_clip, prop=0.8)
    self.background_clips = []

    # Song
    self.song = song
    self.audioclip = AudioArrayClip(self.song.waveform.reshape(-1, 2), fps=self.song.sr)

    # Rhythm
    self.music_bpm = music_bpm
    self.duration = 60 / self.music_bpm

    # Information to display
    self.artist_name = artist_name
    self.track_name = track_name
    self.text_assets = TextAssets(
        self.width,
        self.height,
        self.artist_name,
        self.track_name,
        self.duration,
        self.drop_beats,
    )