def multimodal_analysis(video_path, output, tmp_dir, duration, output_path):
    # Extract frames from the video unless they have already been extracted.
    video_frames_dir = os.path.abspath(os.path.join(tmp_dir, 'video_frames'))
    if not os.path.isdir(video_frames_dir):
        extract_images_from_video(os.path.abspath(video_path), video_frames_dir)

    # Face-based (image) segmentation; `args.speakers` comes from the module-level
    # argparse namespace.
    generate_face_based_segmentation(
        output,
        video_frames_dir,
        os.path.abspath(output_path),
        args.speakers,
        os.path.abspath('models/shape_predictor_68_face_landmarks.dat'),
        os.path.abspath('models/dlib_face_recognition_resnet_model_v1.dat'),
        tmp_dir)

    # Fusion of the audio-based and image-based segmentations.
    mapping_face_to_voice = calculate_fusion(
        output,
        os.path.abspath(output_path),
        Util.read_audacity_labels(
            os.path.join(os.path.abspath(output_path), '%s.audio.txt' % output)),
        Util.read_audacity_labels(
            os.path.join(os.path.abspath(output_path), '%s.image.txt' % output)),
        duration,
        step=0.05,
        neighbours_before_after=40,
        times_greater=4)

    # Persist the face-to-voice mapping next to the label files.
    mapping_face_to_voice_json = os.path.join(
        os.path.abspath(output_path), output + ".mapping_face_to_voice.json")
    with open(mapping_face_to_voice_json, 'w') as f:
        json.dump(mapping_face_to_voice, f)

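# Usage sketch (assumption, not part of the original module): how multimodal_analysis
# might be driven for a single video. The paths, output name, and duration below are
# hypothetical placeholders; the audio-based segmentation is assumed to have already
# written '<output>.audio.txt' into the output directory, and `args.speakers` must be
# set by the module-level argument parser.
def _example_multimodal_analysis():
    video = 'videos/interview.mp4'   # hypothetical input video
    duration = 540.0                 # seconds, e.g. from librosa.get_duration
    multimodal_analysis(video, 'interview', 'tmp', duration, 'out')
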
def get_features(features, datasets_dir, pca=False):
    # Read precomputed features and ground-truth labels for the three datasets
    # (GTZAN, LabROSA, MIREX), each merged into a single combined wav file.
    timestamps_gtzan, feature_vectors_gtzan = Util.read_merged_features(
        datasets_dir + "/gtzan/gtzan_combined.wav", features)
    labels_gtzan = Util.read_audacity_labels(datasets_dir + "/gtzan/gtzan_combined.txt")
    X_gtzan, Y_gtzan, lbls_gtzan = Util.get_annotated_data_x_y(
        timestamps_gtzan, feature_vectors_gtzan, labels_gtzan)

    timestamps_labrosa, feature_vectors_labrosa = Util.read_merged_features(
        datasets_dir + "/labrosa/labrosa_combined.wav", features)
    labels_labrosa = Util.read_audacity_labels(datasets_dir + "/labrosa/labrosa_combined.txt")
    X_labrosa, Y_labrosa, lbls_labrosa = Util.get_annotated_data_x_y(
        timestamps_labrosa, feature_vectors_labrosa, labels_labrosa)

    timestamps_mirex, feature_vectors_mirex = Util.read_merged_features(
        datasets_dir + "/mirex/mirex_combined.wav", features)
    labels_mirex = Util.read_audacity_labels(datasets_dir + "/mirex/mirex_combined.txt")
    X_mirex, Y_mirex, lbls_mirex = Util.get_annotated_data_x_y(
        timestamps_mirex, feature_vectors_mirex, labels_mirex)

    # Standardise features using statistics computed over all three datasets and
    # persist the scaler for later use at prediction time.
    scaler = StandardScaler()
    scaler.fit(np.concatenate((X_labrosa, X_gtzan, X_mirex)))
    with open("pickled/scaler.pickle", 'wb') as f:  # pickle requires binary mode
        pickle.dump(scaler, f)
    X_gtzan = scaler.transform(X_gtzan)
    X_labrosa = scaler.transform(X_labrosa)
    X_mirex = scaler.transform(X_mirex)

    # Optionally reduce dimensionality to 20 principal components.
    if pca:
        pca = PCA(n_components=20)
        pca.fit(np.concatenate((X_labrosa, X_gtzan, X_mirex)))
        X_gtzan = pca.transform(X_gtzan)
        X_labrosa = pca.transform(X_labrosa)
        X_mirex = pca.transform(X_mirex)

    data = {
        "x_gtzan": X_gtzan, "y_gtzan": Y_gtzan, "labels_gtzan": labels_gtzan,
        "x_labrosa": X_labrosa, "y_labrosa": Y_labrosa, "labels_labrosa": labels_labrosa,
        "x_mirex": X_mirex, "y_mirex": Y_mirex, "labels_mirex": labels_mirex,
        "timestamps_gtzan": timestamps_gtzan,
        "timestamps_labrosa": timestamps_labrosa,
        "timestamps_mirex": timestamps_mirex
    }
    return data

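# Usage sketch (assumption, not from the original code): load features for all three
# datasets and fit a simple classifier on the GTZAN split, evaluating on LabROSA.
# The feature list ["mfcc"] and the scikit-learn model are illustrative only; the
# actual feature names accepted by Util.read_merged_features may differ.
def _example_get_features_usage():
    from sklearn.svm import SVC
    data = get_features(["mfcc"], "datasets", pca=True)
    clf = SVC()
    clf.fit(data["x_gtzan"], data["y_gtzan"])
    print(clf.score(data["x_labrosa"], data["y_labrosa"]))
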
def calculate_der(reference_filename, hypothesis_filename):
    # Build the reference annotation from the ground-truth Audacity labels.
    lbls = Util.read_audacity_labels(reference_filename)
    reference = Annotation()
    for lbl in lbls:
        reference[Segment(lbl.start_seconds, lbl.end_seconds)] = lbl.label

    # Build the hypothesis annotation, skipping non-speech segments.
    predicted_lbls = Util.read_audacity_labels(hypothesis_filename)
    hypothesis = Annotation()
    for lbl in predicted_lbls:
        if lbl.label != 'non_speech':
            hypothesis[Segment(lbl.start_seconds, lbl.end_seconds)] = lbl.label

    # Diarization error rate as implemented by pyannote.metrics.
    metric = DiarizationErrorRate()
    der = metric(reference, hypothesis)
    return der

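# Usage sketch (hypothetical file names): score a predicted segmentation against the
# ground truth. Both files are Audacity-style label files (one
# "start<TAB>end<TAB>label" line per segment) as read by Util.read_audacity_labels.
def _example_der_usage():
    der = calculate_der('static/lbls/reference/abc123.txt',
                        'static/lbls/fusion/abc123.txt')
    print('DER: %.3f' % der)
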
def x(youtube_video_id):
    job = Job.query.filter_by(video_id=youtube_video_id).first()
    if job is not None:
        try:
            job.start_time = datetime.utcnow()

            # Download the video (format 18 = 360p mp4) and its thumbnail.
            subprocess.check_call([
                'youtube-dl', '-f', '18', '--write-thumbnail', '-o',
                'videos/%(id)s.%(ext)s',
                'https://www.youtube.com/watch?v=%s' % youtube_video_id
            ])
            copyfile('videos/%s.jpg' % youtube_video_id,
                     'static/img/thumbs/%s.jpg' % youtube_video_id)
            set_state(State.VIDEO_DOWNLOADED, db, job)

            # Extract a 16 kHz mono wav track for the audio analysis steps.
            subprocess.check_call([
                'ffmpeg', '-y', '-i', 'videos/%s.mp4' % youtube_video_id,
                '-ar', '16000', '-ac', '1',
                'audios/%s.wav' % youtube_video_id
            ])
            set_state(State.AUDIO_EXTRACTED, db, job)

            # Render the spectrogram strip shown in the UI and record its width
            # and the audio duration on the job.
            y, sr = librosa.load("audios/%s.wav" % youtube_video_id, sr=16000)
            D = librosa.amplitude_to_db(librosa.stft(y), ref=np.max)
            D = np.flipud(D)
            D8 = (((D - D.min()) / (D.max() - D.min())) * 255.9).astype(np.uint8)
            img = Image.fromarray(D8)
            img = img.resize((D.shape[1], 64))
            img.save("static/img/waveforms/%s.jpg" % youtube_video_id)
            duration = librosa.get_duration(y=y, sr=sr)
            job.waveform_width = D.shape[1]
            job.duration = duration
            set_state(State.WAVEFORM_GENERATED, db, job)

            if duration > 600:
                raise Exception("Video duration greater than 10 min (600 sec)")

            # Audio-based segmentation.
            generate_audio_based_segmentation(
                os.path.abspath('audios/%s.wav' % youtube_video_id), 15, 20, 256,
                128, 0.2, os.path.abspath('models/weights.h5'),
                os.path.abspath('models/scaler.pickle'), 1024, 3, 1024,
                youtube_video_id, os.path.abspath('static/lbls/audio'),
                clusters=job.number_of_speakers)
            set_state(State.AUDIO_DATA_ANALYSED, db, job)

            # MFCC-based segmentation (same parameters, different output directory).
            steps.mfcc.generate_audio_based_segmentation(
                os.path.abspath('audios/%s.wav' % youtube_video_id), 15, 20, 256,
                128, 0.2, os.path.abspath('models/weights.h5'),
                os.path.abspath('models/scaler.pickle'), 1024, 3, 1024,
                youtube_video_id, os.path.abspath('static/lbls/mfcc'),
                clusters=job.number_of_speakers)
            set_state(State.MFCC_ANALYSED, db, job)

            # Face-based segmentation.
            extract_images_from_video(
                os.path.abspath('videos/%s.mp4' % youtube_video_id),
                os.path.abspath('video_frames'))
            generate_face_based_segmentation(
                youtube_video_id,
                os.path.abspath('video_frames/%s' % youtube_video_id),
                os.path.abspath('static/lbls/image'),
                job.number_of_speakers,
                os.path.abspath('models/shape_predictor_68_face_landmarks.dat'),
                os.path.abspath('models/dlib_face_recognition_resnet_model_v1.dat'))
            set_state(State.IMAGE_DATA_ANALYSED, db, job)

            # Fusion of the audio-based and face-based segmentations.
            mapping_face_to_voice = calculate_fusion(
                youtube_video_id,
                os.path.abspath('static/lbls/fusion'),
                Util.read_audacity_labels(
                    os.path.abspath('static/lbls/audio/%s.txt' % youtube_video_id)),
                Util.read_audacity_labels(
                    os.path.abspath('static/lbls/image/%s.txt' % youtube_video_id)),
                duration)
            job.mapping_face_to_voice = mapping_face_to_voice
            set_state(State.FUSION_APPLIED, db, job)

            job.end_time = datetime.utcnow()
            set_state(State.DONE, db, job)
        except Exception as e:
            print(e)
            job.end_time = datetime.utcnow()
            set_state(State.ERROR, db, job, str(e))
            raise e
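

# Invocation sketch (assumption, not from the original code): the function above reads
# like a background worker task keyed by the video id stored on the Job row. A one-off
# run would need the Flask application context so that Job.query and db are usable;
# `app` and the video id below are hypothetical placeholders.
#
#     with app.app_context():
#         x('dQw4w9WgXcQ')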