Example 1
def multimodal_analysis(video_path, output, tmp_dir, duration, output_path):

    video_frames_dir = os.path.abspath(os.path.join(tmp_dir, 'video_frames'))

    # extract frames only if a previous run has not already produced them
    if not os.path.isdir(video_frames_dir):
        extract_images_from_video(os.path.abspath(video_path),
                                  video_frames_dir)

    # face-based segmentation; args (providing .speakers) must exist at module level
    generate_face_based_segmentation(
        output, video_frames_dir,
        os.path.abspath(output_path), args.speakers,
        os.path.abspath('models/shape_predictor_68_face_landmarks.dat'),
        os.path.abspath('models/dlib_face_recognition_resnet_model_v1.dat'),
        tmp_dir)

    # fusion
    mapping_face_to_voice = calculate_fusion(
        output,
        os.path.abspath(output_path),
        Util.read_audacity_labels(
            os.path.join(os.path.abspath(output_path),
                         '%s.audio.txt' % output)),
        Util.read_audacity_labels(
            os.path.join(os.path.abspath(output_path),
                         '%s.image.txt' % output)),
        duration,
        step=0.05,
        neighbours_before_after=40,
        times_greater=4)

    mapping_face_to_voice_json = os.path.join(
        os.path.abspath(output_path), output + ".mapping_face_to_voice.json")
    with open(mapping_face_to_voice_json, 'w') as f:
        json.dump(mapping_face_to_voice, f)
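
All of these examples hinge on Util.read_audacity_labels, which reads an Audacity label-track export: a plain text file with one tab-separated row per segment (start seconds, end seconds, label). Below is a minimal reader sketch; the Label tuple and the function name are illustrative stand-ins, not the project's own Util implementation, though the field names match the attributes used in Example 3.

from collections import namedtuple

# Hypothetical stand-in for the project's label type; field names follow the
# attributes used in these examples (start_seconds, end_seconds, label).
Label = namedtuple("Label", ["start_seconds", "end_seconds", "label"])

def read_audacity_labels_sketch(path):
    """Parse an Audacity label export: start<TAB>end<TAB>label per line."""
    labels = []
    with open(path) as f:
        for line in f:
            line = line.strip()
            if not line:
                continue
            start, end, name = line.split("\t")
            labels.append(Label(float(start), float(end), name))
    return labels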
Example 2
def get_features(features, datasets_dir, pca=False):
    timestamps_gtzan, feature_vectors_gtzan = Util.read_merged_features(
        datasets_dir + "/gtzan/gtzan_combined.wav", features)
    labels_gtzan = Util.read_audacity_labels(datasets_dir +
                                             "/gtzan/gtzan_combined.txt")
    X_gtzan, Y_gtzan, lbls_gtzan = Util.get_annotated_data_x_y(
        timestamps_gtzan, feature_vectors_gtzan, labels_gtzan)

    timestamps_labrosa, feature_vectors_labrosa = Util.read_merged_features(
        datasets_dir + "/labrosa/labrosa_combined.wav", features)
    labels_labrosa = Util.read_audacity_labels(datasets_dir +
                                               "/labrosa/labrosa_combined.txt")
    X_labrosa, Y_labrosa, lbls_labrosa = Util.get_annotated_data_x_y(
        timestamps_labrosa, feature_vectors_labrosa, labels_labrosa)

    timestamps_mirex, feature_vectors_mirex = Util.read_merged_features(
        datasets_dir + "/mirex/mirex_combined.wav", features)
    labels_mirex = Util.read_audacity_labels(datasets_dir +
                                             "/mirex/mirex_combined.txt")
    X_mirex, Y_mirex, lbls_mirex = Util.get_annotated_data_x_y(
        timestamps_mirex, feature_vectors_mirex, labels_mirex)

    # fit one scaler on all three datasets and persist it for later reuse
    scaler = StandardScaler()
    scaler.fit(np.concatenate((X_labrosa, X_gtzan, X_mirex)))
    with open("pickled/scaler.pickle", 'wb') as f:
        pickle.dump(scaler, f)
    X_gtzan = scaler.transform(X_gtzan)
    X_labrosa = scaler.transform(X_labrosa)
    X_mirex = scaler.transform(X_mirex)

    if pca:
        # optional dimensionality reduction; fit one projection on the
        # scaled data from all three datasets and apply it to each
        pca_model = PCA(n_components=20)
        pca_model.fit(np.concatenate((X_labrosa, X_gtzan, X_mirex)))
        X_gtzan = pca_model.transform(X_gtzan)
        X_labrosa = pca_model.transform(X_labrosa)
        X_mirex = pca_model.transform(X_mirex)

    data = {
        "x_gtzan": X_gtzan,
        "y_gtzan": Y_gtzan,
        "labels_gtzan": labels_gtzan,
        "x_labrosa": X_labrosa,
        "y_labrosa": Y_labrosa,
        "labels_labrosa": labels_labrosa,
        "x_mirex": X_mirex,
        "y_mirex": Y_mirex,
        "labels_mirex": labels_mirex,
        "timestamps_gtzan": timestamps_gtzan,
        "timestamps_labrosa": timestamps_labrosa,
        "timestamps_mirex": timestamps_mirex
    }

    return data
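
A hedged usage sketch for get_features: the feature names and the datasets directory below are placeholders, assuming datasets_dir contains the gtzan/, labrosa/ and mirex/ subfolders read above.

# Placeholder feature names and directory; train on two corpora, hold out one.
data = get_features(["mfcc", "spectral_flux"], "datasets", pca=True)

X_train = np.concatenate((data["x_gtzan"], data["x_labrosa"]))
Y_train = np.concatenate((data["y_gtzan"], data["y_labrosa"]))
X_test, Y_test = data["x_mirex"], data["y_mirex"]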
Example 3
def calculate_der(reference_filename, hypothesis_filename):
    # build the reference annotation from the ground-truth label file
    lbls = Util.read_audacity_labels(reference_filename)
    reference = Annotation()
    for lbl in lbls:
        reference[Segment(lbl.start_seconds, lbl.end_seconds)] = lbl.label

    # build the hypothesis, dropping non-speech regions so that only
    # speaker segments are scored
    predicted_lbls = Util.read_audacity_labels(hypothesis_filename)
    hypothesis = Annotation()
    for lbl in predicted_lbls:
        if lbl.label != 'non_speech':
            hypothesis[Segment(lbl.start_seconds, lbl.end_seconds)] = lbl.label

    metric = DiarizationErrorRate()
    der = metric(reference, hypothesis)
    return der
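
The metric itself can be exercised without label files. A self-contained pyannote.metrics example with hand-built annotations; the segment times and speaker names are made up:

from pyannote.core import Annotation, Segment
from pyannote.metrics.diarization import DiarizationErrorRate

reference = Annotation()
reference[Segment(0.0, 5.0)] = "speaker_a"
reference[Segment(5.0, 9.0)] = "speaker_b"

hypothesis = Annotation()
hypothesis[Segment(0.0, 4.5)] = "spk1"  # labels need not match the reference:
hypothesis[Segment(4.5, 9.0)] = "spk2"  # DER finds an optimal speaker mapping

metric = DiarizationErrorRate()
print(metric(reference, hypothesis))  # error as a fraction of reference time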
Example 4
def x(youtube_video_id):
    # end-to-end pipeline for one video: download, extract audio, render a
    # spectrogram strip, run audio- and face-based segmentation, then fusion
    job = Job.query.filter_by(video_id=youtube_video_id).first()
    if job is not None:
        try:
            job.start_time = datetime.utcnow()
            # download the video (format 18 = MP4) and its thumbnail
            subprocess.check_call([
                'youtube-dl', '-f', '18', '--write-thumbnail', '-o',
                'videos/%(id)s.%(ext)s',
                'https://www.youtube.com/watch?v=%s' % youtube_video_id
            ])
            copyfile('videos/%s.jpg' % youtube_video_id,
                     'static/img/thumbs/%s.jpg' % youtube_video_id)
            set_state(State.VIDEO_DOWNLOADED, db, job)

            # extract a mono 16 kHz WAV track with ffmpeg
            subprocess.check_call([
                'ffmpeg', '-y', '-i',
                'videos/%s.mp4' % youtube_video_id, '-ar', '16000', '-ac', '1',
                'audios/%s.wav' % youtube_video_id
            ])
            set_state(State.AUDIO_EXTRACTED, db, job)

            # render the log-magnitude spectrogram as an 8-bit grayscale strip,
            # 64 px tall, one column per STFT frame
            y, sr = librosa.load("audios/%s.wav" % youtube_video_id, sr=16000)
            D = librosa.amplitude_to_db(librosa.stft(y), ref=np.max)
            D = np.flipud(D)  # low frequencies at the bottom of the image
            D8 = (((D - D.min()) / (D.max() - D.min())) * 255.9).astype(
                np.uint8)
            img = Image.fromarray(D8)
            img = img.resize((D.shape[1], 64))
            img.save("static/img/waveforms/%s.jpg" % youtube_video_id)
            duration = librosa.get_duration(y=y, sr=sr)
            job.waveform_width = D.shape[1]
            job.duration = duration
            set_state(State.WAVEFORM_GENERATED, db, job)

            if duration > 600:
                raise Exception("Video duration greater than 10 min (600 sec)")

            # audio based segmentation

            generate_audio_based_segmentation(
                os.path.abspath('audios/%s.wav' % youtube_video_id),
                15,
                20,
                256,
                128,
                0.2,
                os.path.abspath('models/weights.h5'),
                os.path.abspath('models/scaler.pickle'),
                1024,
                3,
                1024,
                youtube_video_id,
                os.path.abspath('static/lbls/audio'),
                clusters=job.number_of_speakers)
            set_state(State.AUDIO_DATA_ANALYSED, db, job)

            steps.mfcc.generate_audio_based_segmentation(
                os.path.abspath('audios/%s.wav' % youtube_video_id),
                15,
                20,
                256,
                128,
                0.2,
                os.path.abspath('models/weights.h5'),
                os.path.abspath('models/scaler.pickle'),
                1024,
                3,
                1024,
                youtube_video_id,
                os.path.abspath('static/lbls/mfcc'),
                clusters=job.number_of_speakers)

            set_state(State.MFCC_ANALYSED, db, job)

            # face based segmentation
            extract_images_from_video(
                os.path.abspath('videos/%s.mp4' % youtube_video_id),
                os.path.abspath('video_frames'))
            generate_face_based_segmentation(
                youtube_video_id,
                os.path.abspath('video_frames/%s' % youtube_video_id),
                os.path.abspath('static/lbls/image'), job.number_of_speakers,
                os.path.abspath(
                    'models/shape_predictor_68_face_landmarks.dat'),
                os.path.abspath(
                    'models/dlib_face_recognition_resnet_model_v1.dat'))
            set_state(State.IMAGE_DATA_ANALYSED, db, job)

            # fusion
            mapping_face_to_voice = calculate_fusion(
                youtube_video_id, os.path.abspath('static/lbls/fusion'),
                Util.read_audacity_labels(
                    os.path.abspath('static/lbls/audio/%s.txt' %
                                    youtube_video_id)),
                Util.read_audacity_labels(
                    os.path.abspath('static/lbls/image/%s.txt' %
                                    youtube_video_id)), duration)
            job.mapping_face_to_voice = mapping_face_to_voice
            set_state(State.FUSION_APPLIED, db, job)

            job.end_time = datetime.utcnow()
            set_state(State.DONE, db, job)

        except Exception as e:
            print(e)
            job.end_time = datetime.utcnow()
            set_state(State.ERROR, db, job, str(e))
            raise e
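
The spectrogram-strip step in Example 4 (librosa.load through img.save) also works in isolation. A minimal sketch using a synthetic sine wave instead of downloaded audio; the output filename is a placeholder:

import numpy as np
import librosa
from PIL import Image

sr = 16000
t = np.arange(0, 5.0, 1.0 / sr)           # 5 seconds of audio
y = 0.5 * np.sin(2 * np.pi * 440.0 * t)   # 440 Hz test tone

D = librosa.amplitude_to_db(np.abs(librosa.stft(y)), ref=np.max)
D = np.flipud(D)                          # low frequencies at the bottom
D8 = (((D - D.min()) / (D.max() - D.min())) * 255.9).astype(np.uint8)

img = Image.fromarray(D8).resize((D.shape[1], 64))
img.save("spectrogram_strip.jpg")
print(D.shape[1], "frames,", librosa.get_duration(y=y, sr=sr), "seconds")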