Beispiel #1
0
    def test_generate_labels_from_classifications(self):
        """Runs of identical per-frame classes must collapse into one label each.

        Each expected label starts at the timestamp of the first frame of a
        run and carries that run's class.
        """
        frame_classes = [
            1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
            0, 0, 1, 0, 1, 1, 0, 1, 1, 1, 1, 1,
        ]
        frame_times = [
            0.0, 0.6965986394557823, 1.3931972789115645, 2.089795918367347,
            2.786394557823129, 3.4829931972789114, 4.179591836734694,
            4.876190476190477, 5.572789115646258, 6.2693877551020405,
            6.965986394557823, 7.662585034013605, 8.359183673469389,
            9.05578231292517, 9.752380952380953, 10.448979591836734,
            11.145578231292516, 11.842176870748299, 12.538775510204081,
            13.235374149659863, 13.931972789115646, 14.628571428571428,
            15.32517006802721, 16.021768707482995,
        ]

        actual = Util.generate_labels_from_classifications(frame_classes, frame_times)

        # (start_seconds, end_seconds, label) for every expected run.
        # NOTE(review): the end of the third run (…836736) differs in the last
        # digit from the start of the fourth (…836734); presumably the end is
        # computed by adding the frame step instead of being read from the
        # timestamp list -- confirm against Util before "fixing" the literal.
        expected_runs = (
            (0.0, 1.3931972789115645, 1),
            (1.3931972789115645, 9.752380952380953, 0),
            (9.752380952380953, 10.448979591836736, 1),
            (10.448979591836734, 11.145578231292516, 0),
            (11.145578231292516, 12.538775510204081, 1),
            (12.538775510204081, 13.235374149659863, 0),
            (13.235374149659863, 16.718367346938777, 1),
        )
        expected = [AudacityLabel(start, end, cls) for start, end, cls in expected_runs]

        self.assertListEqual(expected, actual)
def calculate_fusion(youtube_video_id,
                     lbls_dir,
                     audio_lbls,
                     image_lbls,
                     duration,
                     step=0.1,
                     neighbours_before_after=6,
                     times_greater=2):
    """Fuse audio and image labels and write the result as Audacity labels.

    The audio and image labels are sampled into pairs every ``step`` seconds
    by ``create_pairs``, a face-to-voice mapping is detected and applied to
    the pairs, and the audio class of mismatching single-face pairs is
    replaced. The resulting labels (those whose label is not ``None``) are
    written to ``<lbls_dir>/<youtube_video_id>.fusion.txt``.

    Args:
        youtube_video_id: Identifier used as the output file's base name.
        lbls_dir: Directory the label file is written to.
        audio_lbls: Audio labels, passed through to ``create_pairs``.
        image_lbls: Image labels, passed through to ``create_pairs``.
        duration: Passed through to ``create_pairs``.
        step: Sampling step passed to ``create_pairs`` (default 0.1, i.e. 100ms).
        neighbours_before_after: Forwarded to ``find_nearest_neighbours_class``.
        times_greater: Forwarded to ``find_nearest_neighbours_class``.

    Returns:
        The mapping returned by ``detect_face_voice_mapping``.
    """
    pairs, timestamps = create_pairs(audio_lbls, image_lbls, duration, step)
    mapping_face_to_voice = detect_face_voice_mapping(pairs)
    pairs = apply_mapping_to_pairs(pairs, mapping_face_to_voice)

    for k, pair in enumerate(pairs):
        if pair.image_class is None:
            # No image information for this step: keep the audio class.
            continue

        # If only one face has been detected then assume it's the face of
        # the speaker; several comma-separated faces are ambiguous.
        single_face = len(pair.image_class.split(",")) == 1
        if not single_face or pair.audio_class == 'non_speech':
            continue
        if pair.image_class == pair.audio_class:
            continue

        # Audio and image disagree: take the class chosen from the
        # neighbouring pairs instead.
        pair.audio_class = find_nearest_neighbours_class(
            k,
            pairs,
            neighbours_before_after=neighbours_before_after,
            times_greater=times_greater)

    lbls = Util.generate_labels_from_classifications(
        [p.audio_class for p in pairs], timestamps)
    lbls = [lbl for lbl in lbls if lbl.label is not None]

    Util.write_audacity_labels(
        lbls, os.path.join(lbls_dir, youtube_video_id + ".fusion.txt"))
    return mapping_face_to_voice
Beispiel #3
0
def calculate_fusion(youtube_video_id, lbls_dir, audio_lbls, image_lbls, duration, step=0.1):
    """Fuse audio and image labels and write them as JSON and Audacity labels.

    The audio and image labels are sampled into pairs every ``step`` seconds
    by ``create_pairs``, a face-to-voice mapping is detected and applied to
    the pairs, and the audio class of mismatching single-face pairs is
    replaced by ``find_nearest_neighbours_class``. Labels whose label is not
    ``None`` are written to ``<lbls_dir>/<youtube_video_id>.json`` and
    ``<lbls_dir>/<youtube_video_id>.txt``.

    Args:
        youtube_video_id: Identifier used as the output files' base name.
        lbls_dir: Directory the output files are written to.
        audio_lbls: Audio labels, passed through to ``create_pairs``.
        image_lbls: Image labels, passed through to ``create_pairs``.
        duration: Passed through to ``create_pairs``.
        step: Sampling step passed to ``create_pairs`` (default 0.1, i.e. 100ms).

    Returns:
        The mapping returned by ``detect_face_voice_mapping``.
    """
    pairs, timestamps = create_pairs(audio_lbls, image_lbls, duration, step)
    mapping_face_to_voice = detect_face_voice_mapping(pairs)
    print(mapping_face_to_voice)
    pairs = apply_mapping_to_pairs(pairs, mapping_face_to_voice)
    print(pairs)

    for k, pair in enumerate(pairs):
        if pair.image_class is None:
            # No image information for this step: keep the audio class.
            continue
        classes = pair.image_class.split(",")
        # Only a single detected face is treated as evidence of the speaker.
        if len(classes) == 1 and pair.audio_class != 'non_speech':
            if pair.image_class != pair.audio_class:
                pair.audio_class = find_nearest_neighbours_class(k, pairs)

    print(pairs)

    lbls = Util.generate_labels_from_classifications([p.audio_class for p in pairs], timestamps)
    # Materialise the filtered labels: on Python 3 ``filter`` is a one-shot
    # iterator, so iterating it for the JSON dump would leave nothing for
    # write_audacity_labels below.
    lbls = [lbl for lbl in lbls if lbl.label is not None]

    json_lbls = [
        {
            "start_seconds": lbl.start_seconds,
            "end_seconds": lbl.end_seconds,
            "label": lbl.label
        }
        for lbl in lbls
    ]
    with open(os.path.join(lbls_dir, youtube_video_id + ".json"), 'w') as outfile:
        json.dump(json_lbls, outfile)

    Util.write_audacity_labels(lbls, os.path.join(lbls_dir, youtube_video_id + ".txt"))
    return mapping_face_to_voice
Beispiel #4
0
def generate_audio_based_segmentation(audio_file,
                                      w,
                                      h,
                                      embedding_size,
                                      lstm_nodes,
                                      dropout,
                                      weights_filename,
                                      scaler_filename,
                                      window_size,
                                      step,
                                      hop_size,
                                      youtube_video_id,
                                      lbls_dir,
                                      clusters=4,
                                      sr=16000):
    """Segment an audio file by clustering the MFCC frames marked as speech.

    Voice activity labels from ``Vad`` decide which MFCC frames are speech.
    Those frames are projected to two dimensions with t-SNE and clustered
    with a Gaussian mixture; each speech frame is then relabelled with its
    cluster id. The merged labels are written to
    ``<lbls_dir>/<youtube_video_id>.json`` and
    ``<lbls_dir>/<youtube_video_id>.txt``.

    When no frame is marked as speech the clustering step is skipped and the
    labels are still written (every frame stays ``'non_speech'``).

    Args:
        audio_file: Path of the audio file to segment.
        w, embedding_size, lstm_nodes, dropout, weights_filename,
        scaler_filename, step: Unused by this implementation; kept so the
            signature stays compatible with callers.
        h: Number of MFCC coefficients (``n_mfcc``).
        window_size: FFT size (``n_fft``) for the MFCC computation.
        hop_size: Hop length, in samples, between MFCC frames.
        youtube_video_id: Identifier used as the output files' base name.
        lbls_dir: Directory the output files are written to.
        clusters: Number of Gaussian mixture components.
        sr: Sample rate passed to ``librosa.load``.
    """
    vad = Vad()
    vad_lbls = vad.detect_voice_segments(audio_file)

    y, sr = librosa.load(audio_file, sr=sr)
    X = mfcc(y, sr=sr, n_mfcc=h, n_fft=window_size, hop_length=hop_size)

    timestamps = numpy.array([k * hop_size / sr for k in range(X.shape[1])])
    # Spacing between consecutive frames. Equal to timestamps[1] - timestamps[0]
    # but also defined when the audio yields fewer than two frames.
    window = hop_size / sr

    # A frame takes the label of the first VAD segment that contains both its
    # start and its end (start + window); everything else is non-speech.
    frame_predictions = numpy.array([
        next((lbl.label for lbl in vad_lbls
              if lbl.start_seconds <= timestamp <= lbl.end_seconds - window),
             'non_speech')
        for timestamp in timestamps
    ])

    speech_frames = numpy.where(frame_predictions == 'speech')[0]

    if speech_frames.size:
        # X holds one column per frame: select the speech columns, then put
        # frames on the first axis as the estimators expect.
        X_speech = X[:, speech_frames]
        print(X_speech.shape)
        X_speech = X_speech.transpose()
        print(X_speech.shape)

        tsne = TSNE(n_components=2, init='pca')
        two_dimensional = tsne.fit_transform(X_speech)

        clustering_algorithm = GaussianMixture(n_components=clusters,
                                               max_iter=1000,
                                               n_init=3)
        clustering_algorithm.fit(two_dimensional)
        predictions = clustering_algorithm.predict(two_dimensional)

        # Replace the generic 'speech' label with the cluster id; the string
        # array stores it as text.
        for k, speech_index in enumerate(speech_frames):
            frame_predictions[speech_index] = predictions[k]

    lbls = Util.generate_labels_from_classifications(frame_predictions,
                                                     timestamps)

    json_lbls = [
        {
            "start_seconds": lbl.start_seconds,
            "end_seconds": lbl.end_seconds,
            "label": lbl.label
        }
        for lbl in lbls
    ]
    with open(os.path.join(lbls_dir, youtube_video_id + ".json"),
              'w') as outfile:
        json.dump(json_lbls, outfile)

    Util.write_audacity_labels(
        lbls, os.path.join(lbls_dir, youtube_video_id + ".txt"))


# if __name__ == '__main__':
#     generate_audio_based_segmentation(
#         "/Users/nicktgr15/workspace/speaker_diarisation_poc/src/audios/Unamij6z1io.wav",
#         15, 20, 256, 128, 0.2,
#         os.path.abspath('models/weights.h5'),
#         os.path.abspath('models/scaler.pickle'),
#         1024, 3, 1024, "xxx",
#         "/Users/nicktgr15/workspace/speaker_diarisation_poc/src/static/lbls/mfcc",
#         clusters=4
#     )
Beispiel #5
0
def generate_face_based_segmentation(youtube_video_id, images_dir, lbls_dir,
                                     faces, predictor_path,
                                     face_rec_model_path):
    """Segment a video by clustering the faces found in its extracted frames.

    Every ``FRAMES_PER_STEP``-th ``*.jpg`` in ``images_dir`` (sorted by name)
    is scanned for faces with dlib. The face descriptors are clustered with
    KMeans, and each frame is labelled with the sorted, comma-separated
    cluster ids of the faces it contains. The merged labels are written to
    ``<lbls_dir>/<youtube_video_id>.json`` and
    ``<lbls_dir>/<youtube_video_id>.txt``.

    When no face is found in any frame, an empty ``.txt`` label file is
    written (no ``.json``) and the function returns early.

    Args:
        youtube_video_id: Identifier used as the output files' base name.
        images_dir: Directory containing the extracted ``*.jpg`` frames.
        lbls_dir: Directory the output files are written to.
        faces: Number of KMeans clusters.
        predictor_path: Path given to ``dlib.shape_predictor``.
        face_rec_model_path: Path given to ``dlib.face_recognition_model_v1``.
    """
    images_raw = glob(os.path.join(images_dir, "*.jpg"))
    images_raw.sort()
    images = images_raw[::FRAMES_PER_STEP]
    print(images)
    # Frame timestamps in seconds; the source frames are assumed to be 25 fps.
    timestamps = [i * (FRAMES_PER_STEP / 25.0) for i in range(len(images))]
    print(timestamps)

    detector = dlib.get_frontal_face_detector()
    sp = dlib.shape_predictor(predictor_path)
    facerec = dlib.face_recognition_model_v1(face_rec_model_path)

    embeddings = []
    embeddings_timestamps = []

    for frame_timestamp, image_path in zip(timestamps, images):
        print("Processing file: {}".format(image_path))
        img = io.imread(image_path)

        dets = detector(img, 1)
        print("Number of faces detected: {}".format(len(dets)))

        # One descriptor per detected face, tagged with the frame's timestamp.
        for d in dets:
            shape = sp(img, d)
            embeddings.append(facerec.compute_face_descriptor(img, shape))
            embeddings_timestamps.append(frame_timestamp)

    embeddings = numpy.array(embeddings)
    embeddings_timestamps = numpy.array(embeddings_timestamps)

    print(embeddings.shape)
    print(embeddings_timestamps.shape)

    if len(embeddings) == 0:
        Util.write_audacity_labels([],
                                   os.path.join(lbls_dir,
                                                youtube_video_id + ".txt"))
        return

    kmeans = KMeans(n_clusters=faces)
    kmeans.fit(embeddings)

    predictions = numpy.array(kmeans.labels_.tolist())
    df = pd.DataFrame({
        "timestamps": embeddings_timestamps.tolist(),
        "predictions": predictions
    })

    label_timestamps = []
    classes = []

    # Group by the column name, not a one-element list: with a list, newer
    # pandas versions yield 1-tuples as keys instead of the scalar timestamp.
    for key, group in df.groupby('timestamps'):
        label_timestamps.append(key)
        classes.append(",".join(
            str(i) for i in sorted(group['predictions'].tolist())))

    lbls = Util.generate_labels_from_classifications(classes, label_timestamps)
    json_lbls = [
        {
            "start_seconds": lbl.start_seconds,
            "end_seconds": lbl.end_seconds,
            "label": lbl.label
        }
        for lbl in lbls
    ]
    with open(os.path.join(lbls_dir, youtube_video_id + ".json"),
              'w') as outfile:
        json.dump(json_lbls, outfile)

    Util.write_audacity_labels(
        lbls, os.path.join(lbls_dir, youtube_video_id + ".txt"))


# if __name__ == '__main__':
#
#     extract_images_from_video("/Users/nicktgr15/workspace/speaker_diarisation_poc/src/videos/Unamij6z1io.mp4",
#                               "/Users/nicktgr15/workspace/speaker_diarisation_poc/src/video_frames")
#
#     generate_face_based_segmentation(
#         "Unamij6z1io",
#         "/Users/nicktgr15/workspace/speaker_diarisation_poc/src/video_frames/Unamij6z1io",
#         "/Users/nicktgr15/workspace/speaker_diarisation_poc/src/static/lbls/image",
#         4,
#         "/Users/nicktgr15/workspace/speaker_diarisation_poc/src/models/shape_predictor_68_face_landmarks.dat",
#         "/Users/nicktgr15/workspace/speaker_diarisation_poc/src/models/dlib_face_recognition_resnet_model_v1.dat"
#     )
def generate_audio_based_segmentation(audio_file,
                                      w,
                                      h,
                                      embedding_size,
                                      lstm_nodes,
                                      dropout,
                                      weights_filename,
                                      scaler_filename,
                                      window_size,
                                      step,
                                      hop_size,
                                      youtube_video_id,
                                      lbls_dir,
                                      clusters=4,
                                      sr=16000):
    """Segment an audio file by clustering embeddings of its speech frames.

    Voice activity labels from ``Vad`` decide which feature windows are
    speech. Those windows are scaled with the pickled scaler, embedded with
    the ``intermediate`` model returned by ``get_lstm_siamese`` and clustered
    with KMeans; each speech window is relabelled with its cluster id. The
    merged labels are written to ``<lbls_dir>/<youtube_video_id>.audio.txt``.

    When no window is marked as speech the embedding and clustering steps are
    skipped and the labels are still written (every window stays
    ``'non_speech'``).

    Args:
        audio_file: Path of the audio file to segment.
        w, h: Shape of one feature window; used to build the network and to
            reshape the features around scaling.
        embedding_size, lstm_nodes, dropout: Forwarded to ``get_lstm_siamese``.
        weights_filename: Weights file loaded into the model.
        scaler_filename: Pickle file holding the feature scaler.
        window_size, hop_size, step: Forwarded to ``FeatExtractorMFCC``.
        youtube_video_id: Identifier used as the output file's base name.
        lbls_dir: Directory the output file is written to.
        clusters: Number of KMeans clusters.
        sr: Sample rate forwarded to ``FeatExtractorMFCC``.
    """
    vad = Vad()
    vad_lbls = vad.detect_voice_segments(audio_file)

    # NOTE(review): pickle.load executes arbitrary code from the file; only
    # point scaler_filename at trusted artefacts.
    with open(scaler_filename, 'rb') as f:
        scaler = pickle.load(f)

    model, intermediate = get_lstm_siamese((w, h), embedding_size, lstm_nodes,
                                           dropout)
    model.load_weights(weights_filename)
    feature_extractor = FeatExtractorMFCC(window_size,
                                          hop_size,
                                          w,
                                          sr,
                                          h,
                                          step=step)
    X, timestamps = feature_extractor.extract(audio_file)
    timestamps = numpy.array(timestamps)
    # Spacing between consecutive feature windows.
    window = timestamps[1] - timestamps[0]

    # A window takes the label of the first VAD segment that contains both
    # its start and its end (start + window); everything else is non-speech.
    frame_predictions = numpy.array([
        next((lbl.label for lbl in vad_lbls
              if lbl.start_seconds <= timestamp <= lbl.end_seconds - window),
             'non_speech')
        for timestamp in timestamps
    ])
    print(frame_predictions.shape)
    print(timestamps.shape)

    speech_indices = numpy.where(frame_predictions == 'speech')

    if speech_indices[0].size:
        X_speech = X[speech_indices]

        # The scaler works on 2-D data: flatten the windows to rows of h
        # features, scale, then restore the (windows, w, h) layout.
        X = X_speech.reshape((X_speech.shape[0] * w, h))
        X = scaler.transform(X)
        X = X.reshape(-1, w, h)

        original_embeddings = intermediate.predict(X)

        clustering_algorithm = KMeans(n_clusters=clusters)
        predictions = clustering_algorithm.fit_predict(original_embeddings)

        # Replace the generic 'speech' label with the cluster id; the string
        # array stores it as text.
        for k, speech_index in enumerate(speech_indices[0]):
            frame_predictions[speech_index] = predictions[k]

    lbls = Util.generate_labels_from_classifications(frame_predictions,
                                                     timestamps)

    Util.write_audacity_labels(
        lbls, os.path.join(lbls_dir, youtube_video_id + ".audio.txt"))
Beispiel #7
0
def generate_face_based_segmentation(youtube_video_id, images_dir, lbls_dir,
                                     faces, predictor_path,
                                     face_rec_model_path, tmp_dir):
    """Segment a video by clustering the faces found in its extracted frames.

    Every ``FRAMES_PER_STEP``-th ``*.jpg`` in ``images_dir`` (sorted by name)
    is scanned for faces. The face descriptors are clustered with KMeans, and
    each frame is labelled with the sorted, comma-separated cluster ids of
    the faces it contains. The merged labels are written to
    ``<lbls_dir>/<youtube_video_id>.image.txt``.

    Descriptors and their timestamps are cached in ``tmp_dir`` as
    ``embeddings.npy`` and ``embeddings_timestamps.npy``; when both files
    exist they are loaded instead of re-running face detection.

    When no face is found, an empty label file is written and the function
    returns early.

    Args:
        youtube_video_id: Identifier used as the output file's base name.
        images_dir: Directory containing the extracted ``*.jpg`` frames.
        lbls_dir: Directory the output file is written to.
        faces: Number of KMeans clusters.
        predictor_path: Path given to ``dlib.shape_predictor``.
        face_rec_model_path: Path given to ``face_recognition_model_v1``.
        tmp_dir: Directory holding the cached embeddings.
    """
    images_raw = glob(os.path.join(images_dir, "*.jpg"))
    images_raw.sort()
    images = images_raw[::FRAMES_PER_STEP]
    print(images)
    # Frame timestamps in seconds; the source frames are assumed to be 25 fps.
    timestamps = [i * (FRAMES_PER_STEP / 25.0) for i in range(len(images))]
    print(timestamps)

    detector = get_frontal_face_detector()
    sp = dlib.shape_predictor(predictor_path)
    facerec = face_recognition_model_v1(face_rec_model_path)

    # NOTE(review): the cache file names are fixed, so a tmp_dir shared by
    # several videos would reuse another video's embeddings -- confirm that
    # callers pass a per-video tmp_dir.
    embeddings_pickle = os.path.join(tmp_dir, "embeddings.npy")
    embeddings_timestamps_pickle = os.path.join(tmp_dir,
                                                "embeddings_timestamps.npy")

    cache_available = (os.path.isfile(embeddings_pickle)
                       and os.path.isfile(embeddings_timestamps_pickle))

    if cache_available:
        embeddings = numpy.load(embeddings_pickle)
        embeddings_timestamps = numpy.load(embeddings_timestamps_pickle)
    else:
        embeddings = []
        embeddings_timestamps = []

        for frame_timestamp, image_path in zip(timestamps, images):
            print("Processing file: {}".format(image_path))
            img = io.imread(image_path)

            dets = detector(img, 1)
            print("Number of faces detected: {}".format(len(dets)))

            # One descriptor per detected face, tagged with the frame's
            # timestamp.
            for d in dets:
                shape = sp(img, d)
                embeddings.append(facerec.compute_face_descriptor(img, shape))
                embeddings_timestamps.append(frame_timestamp)

        embeddings = numpy.array(embeddings)
        embeddings_timestamps = numpy.array(embeddings_timestamps)
        numpy.save(embeddings_pickle, embeddings)
        numpy.save(embeddings_timestamps_pickle, embeddings_timestamps)

    print(embeddings.shape)
    print(embeddings_timestamps.shape)

    if len(embeddings) == 0:
        Util.write_audacity_labels([],
                                   os.path.join(
                                       lbls_dir,
                                       youtube_video_id + ".image.txt"))
        return

    kmeans = KMeans(n_clusters=faces)
    kmeans.fit(embeddings)

    predictions = numpy.array(kmeans.labels_.tolist())
    df = pd.DataFrame({
        "timestamps": embeddings_timestamps.tolist(),
        "predictions": predictions
    })

    label_timestamps = []
    classes = []

    # Group by the column name, not a one-element list: with a list, newer
    # pandas versions yield 1-tuples as keys instead of the scalar timestamp.
    for key, group in df.groupby('timestamps'):
        label_timestamps.append(key)
        classes.append(",".join(
            str(i) for i in sorted(group['predictions'].tolist())))

    lbls = Util.generate_labels_from_classifications(classes, label_timestamps)

    Util.write_audacity_labels(
        lbls, os.path.join(lbls_dir, youtube_video_id + ".image.txt"))
def generate_audio_based_segmentation(audio_file,
                                      w,
                                      h,
                                      embedding_size,
                                      lstm_nodes,
                                      dropout,
                                      weights_filename,
                                      scaler_filename,
                                      window_size,
                                      step,
                                      hop_size,
                                      youtube_video_id,
                                      lbls_dir,
                                      clusters=4,
                                      sr=16000):
    """Segment an audio file by clustering embeddings of its speech frames.

    Voice activity labels from ``Vad`` decide which feature windows are
    speech. Those windows are scaled with the pickled scaler and embedded
    with the ``intermediate`` model returned by ``get_lstm_siamese``. The
    embeddings are projected to two dimensions with t-SNE and clustered with
    a Gaussian mixture; each speech window is relabelled with its cluster id.
    The merged labels are written to ``<lbls_dir>/<youtube_video_id>.json``
    and ``<lbls_dir>/<youtube_video_id>.txt``.

    When no window is marked as speech the embedding and clustering steps are
    skipped and the labels are still written (every window stays
    ``'non_speech'``).

    Args:
        audio_file: Path of the audio file to segment.
        w, h: Shape of one feature window; used to build the network and to
            reshape the features around scaling.
        embedding_size, lstm_nodes, dropout: Forwarded to ``get_lstm_siamese``.
        weights_filename: Weights file loaded into the model.
        scaler_filename: Pickle file holding the feature scaler.
        window_size, hop_size, step: Forwarded to ``FeatExtractorMFCC``.
        youtube_video_id: Identifier used as the output files' base name.
        lbls_dir: Directory the output files are written to.
        clusters: Number of Gaussian mixture components.
        sr: Sample rate forwarded to ``FeatExtractorMFCC``.
    """
    vad = Vad()
    vad_lbls = vad.detect_voice_segments(audio_file)

    # NOTE(review): pickle.load executes arbitrary code from the file; only
    # point scaler_filename at trusted artefacts.
    with open(scaler_filename, 'rb') as f:
        scaler = pickle.load(f)

    model, intermediate = get_lstm_siamese((w, h), embedding_size, lstm_nodes,
                                           dropout)
    model.load_weights(weights_filename)
    feature_extractor = FeatExtractorMFCC(window_size,
                                          hop_size,
                                          w,
                                          sr,
                                          h,
                                          step=step)
    X, timestamps = feature_extractor.extract(audio_file)
    timestamps = numpy.array(timestamps)
    # Spacing between consecutive feature windows.
    window = timestamps[1] - timestamps[0]

    # A window takes the label of the first VAD segment that contains both
    # its start and its end (start + window); everything else is non-speech.
    frame_predictions = numpy.array([
        next((lbl.label for lbl in vad_lbls
              if lbl.start_seconds <= timestamp <= lbl.end_seconds - window),
             'non_speech')
        for timestamp in timestamps
    ])
    print(frame_predictions.shape)
    print(timestamps.shape)

    speech_indices = numpy.where(frame_predictions == 'speech')

    if speech_indices[0].size:
        X_speech = X[speech_indices]

        # The scaler works on 2-D data: flatten the windows to rows of h
        # features, scale, then restore the (windows, w, h) layout.
        X = X_speech.reshape((X_speech.shape[0] * w, h))
        X = scaler.transform(X)
        X = X.reshape(-1, w, h)

        original_embeddings = intermediate.predict(X)

        tsne = TSNE(n_components=2, init='pca')
        two_dimensional = tsne.fit_transform(original_embeddings)

        clustering_algorithm = GaussianMixture(n_components=clusters,
                                               max_iter=1000,
                                               n_init=3)
        clustering_algorithm.fit(two_dimensional)
        predictions = clustering_algorithm.predict(two_dimensional)

        # Replace the generic 'speech' label with the cluster id; the string
        # array stores it as text.
        for k, speech_index in enumerate(speech_indices[0]):
            frame_predictions[speech_index] = predictions[k]

    lbls = Util.generate_labels_from_classifications(frame_predictions,
                                                     timestamps)

    json_lbls = [
        {
            "start_seconds": lbl.start_seconds,
            "end_seconds": lbl.end_seconds,
            "label": lbl.label
        }
        for lbl in lbls
    ]
    with open(os.path.join(lbls_dir, youtube_video_id + ".json"),
              'w') as outfile:
        json.dump(json_lbls, outfile)

    Util.write_audacity_labels(
        lbls, os.path.join(lbls_dir, youtube_video_id + ".txt"))