Beispiel #1
0
    def test_get_annotated_data_x_y(self):
        """Check the samples, targets, classes and timestamps returned by
        ``Util.get_annotated_data_x_y`` for two annotation layouts."""
        # Scenario 1: eight frames of random features, two annotated regions.
        frame_times = [float(second) for second in range(8)]
        frame_features = np.random.rand(8, 10)
        annotations = [
            AudacityLabel(1.2, 2.5, "m"),
            AudacityLabel(4.5, 6.2, "s"),
        ]

        samples, targets, class_names, kept_times = Util.get_annotated_data_x_y(
            frame_times, frame_features, annotations)

        self.assertEqual(3, samples.shape[0])
        self.assertListEqual(["m", "s", "s"], targets)
        self.assertListEqual(["m", "s"], class_names)
        self.assertListEqual([0.0, 1.0, 2.0], kept_times)

        # Scenario 2: 25 frames, three regions with an unannotated gap
        # between the second and the third one.
        annotations = [
            AudacityLabel(1.0, 5.5, 'A'),
            AudacityLabel(5.5, 10.0, 'B'),
            AudacityLabel(15.0, 20.5, 'C'),
        ]
        frame_times = [float(i) for i in range(0, 25)]

        _, targets, class_names, kept_times = Util.get_annotated_data_x_y(
            frame_times, np.ones((25, 10)), annotations)

        self.assertListEqual(['A'] * 5 + ['B'] * 4 + ['C'] * 6, targets)
        self.assertListEqual([float(i) for i in range(15)], kept_times)
Beispiel #2
0
def multimodal_analysis(video_path, output, tmp_dir, duration, output_path):
    """Run face-based segmentation and audio/image fusion for one video.

    Frames are extracted into ``<tmp_dir>/video_frames`` unless that
    directory already exists. The face-to-voice mapping returned by
    ``calculate_fusion`` is written as JSON to
    ``<output_path>/<output>.mapping_face_to_voice.json``.

    NOTE(review): the speaker count is read from ``args.speakers``; ``args``
    is not a parameter, so it must exist at module level when this runs.
    """
    frames_dir = os.path.abspath(os.path.join(tmp_dir, 'video_frames'))
    if not os.path.isdir(frames_dir):
        extract_images_from_video(os.path.abspath(video_path), frames_dir)

    labels_dir = os.path.abspath(output_path)
    landmarks_model = os.path.abspath(
        'models/shape_predictor_68_face_landmarks.dat')
    recognition_model = os.path.abspath(
        'models/dlib_face_recognition_resnet_model_v1.dat')
    generate_face_based_segmentation(output, frames_dir, labels_dir,
                                     args.speakers, landmarks_model,
                                     recognition_model, tmp_dir)

    # fusion
    audio_labels = Util.read_audacity_labels(
        os.path.join(labels_dir, '%s.audio.txt' % output))
    image_labels = Util.read_audacity_labels(
        os.path.join(labels_dir, '%s.image.txt' % output))
    mapping_face_to_voice = calculate_fusion(output,
                                             labels_dir,
                                             audio_labels,
                                             image_labels,
                                             duration,
                                             step=0.05,
                                             neighbours_before_after=40,
                                             times_greater=4)

    mapping_file = os.path.join(labels_dir,
                                output + ".mapping_face_to_voice.json")
    with open(mapping_file, 'w') as f:
        json.dump(mapping_face_to_voice, f)
def main():
    """Command-line entry point: label the segments of one audio file.

    Steps, in order:
      1. convert ``--input-file`` with ffmpeg to a mono 22050 Hz 16-bit PCM
         ``temp.wav`` placed next to the input file;
      2. run ``yaafe`` with ``FEATURE_PLAN`` on that wav;
      3. detect segment boundaries from the combined feature peaks;
      4. classify every frame with the pickled model and turn the
         predictions into labelled segments;
      5. write ``annotated-segments.txt`` next to the input file;
      6. delete every ``*.csv`` in the input directory (presumably the yaafe
         outputs) and the temporary wav.

    Raises:
        subprocess.CalledProcessError: if ffmpeg or yaafe exits non-zero.
    """
    parser = argparse.ArgumentParser(
        description='Detect speech/music segments in an audio file and '
                    'write them as Audacity labels.')
    parser.add_argument('--input-file', dest='input_file', required=True)
    args = parser.parse_args()

    # os.path.join keeps the paths relative when the input file has no
    # directory component; plain "+" concatenation would point at "/".
    input_dir = os.path.dirname(args.input_file)
    temp_file = os.path.join(input_dir, "temp.wav")

    cmd = [
        "/usr/bin/ffmpeg", "-i", args.input_file, "-ar", "22050", "-ac", "1",
        "-acodec", "pcm_s16le", temp_file, "-y"
    ]
    subprocess.check_call(cmd)

    cmd = ["yaafe", "-c", FEATURE_PLAN, "-r", "22050", temp_file]
    subprocess.check_output(cmd)

    features1 = ["zcr", "flux", "spectral_rollof", "energy_stats"]
    features2 = ["mfcc_stats"]
    features3 = ["spectral_flatness_per_band"]
    features4 = features1 + features2 + features3

    feature_groups = [features1, features2, features3, features4]

    peaks, convolution_values, timestamps = feat.get_combined_peaks(
        temp_file, feature_groups, kernel_type="gaussian")
    detected_segments = kernel.calculate_segment_start_end_times_from_peak_positions(
        peaks, timestamps)

    timestamps, feature_vectors = feat.read_features(features4,
                                                     temp_file,
                                                     scale=True)

    # Pickle data is binary: text mode fails on Python 3 and can corrupt the
    # stream on platforms that translate newlines.
    with open("/opt/speech-music-discrimination/pickled/model.pickle",
              'rb') as f:
        trained_model = pickle.load(f)

    frame_level_predictions = trained_model.predict(feature_vectors)

    annotated_segments = Util.get_annotated_labels_from_predictions_and_sm_segments(
        frame_level_predictions, detected_segments, timestamps)

    # Merge, drop noisy labels, then merge again because the filtering can
    # leave same-class neighbours behind.
    annotated_segments = Util.combine_adjacent_labels_of_the_same_class(
        annotated_segments)
    annotated_segments = feat.filter_noisy_labels(annotated_segments)
    annotated_segments = Util.combine_adjacent_labels_of_the_same_class(
        annotated_segments)

    Util.write_audacity_labels(
        annotated_segments, os.path.join(input_dir, "annotated-segments.txt"))

    for csv_file in glob.glob(os.path.join(input_dir, "*.csv")):
        os.remove(csv_file)

    os.remove(temp_file)
Beispiel #4
0
    def combine_audio_segments(input_dir, output_name, output_dir, clear_output_dir):
        """Concatenate, per class, all files of ``input_dir`` into one wav.

        The files are grouped with ``WavEditor.get_files_grouped_by_class``
        and each group is joined by ``sox`` into
        ``<output_dir>/<output_name>_<class>.wav``. When ``clear_output_dir``
        is truthy the output directory is removed before being (re)created.
        """
        if clear_output_dir:
            Util.remove_dir(output_dir)
        Util.make_dir(output_dir)

        grouped = WavEditor.get_files_grouped_by_class(input_dir)
        for class_name, class_files in grouped.items():
            sources = list(class_files)
            target = os.path.join(output_dir, '%s_%s.wav' % (output_name, class_name))
            subprocess.check_call(['sox'] + sources + [target])
def main():
    """Build the combined MIREX speech/music file and its label track.

    Unzips the muspeak example archive, converts the mp3s (``mp3_to_wav``),
    cuts every wav into labelled segments, shuffles the speech (``*_s.wav``)
    and music (``*_m.wav``) segments with a fixed seed, writes the resulting
    labels to ``mirex_combined.txt`` and the concatenated audio to
    ``mirex_combined.wav``, then removes the working directories.

    Raises:
        subprocess.CalledProcessError: if unzip, soxi or sox exits non-zero.
    """
    subprocess.check_output(["unzip", "muspeak-mirex2015-detection-examples.zip", "-d",
                             "muspeak-mirex2015-detection-examples"])

    mp3_to_wav()

    wav_files = glob.glob("./muspeak-mirex2015-detection-examples/*.wav")

    for wav_file in wav_files:
        # Function-call form prints the same on Python 2 and is valid on 3.
        print(wav_file)
        label_file = wav_file.replace(".mp3.wav", ".csv")
        if not os.path.isfile(label_file):
            # Fall back to the "_v2" annotation file.
            label_file = label_file.replace(".csv", "_v2.csv")
        WavEditor.create_audio_segments(label_file, wav_file, "segments", True, ",", "f2", remove_overlapping=True)

    speech_wavs = glob.glob("./segments/*_s.wav")
    music_wavs = glob.glob("./segments/*_m.wav")

    # Maps segment path -> class ("s" speech, "m" music).
    all_files_dict = {}

    for f in speech_wavs:
        all_files_dict[f] = "s"

    for f in music_wavs:
        all_files_dict[f] = "m"

    random.seed(2222)
    # random.sample() requires a sequence; a dict view is rejected by
    # recent Python 3 releases, so materialise the keys first.
    all_files_random_keys = random.sample(list(all_files_dict), len(all_files_dict))

    last_seconds = 0
    files_to_concatenate = []

    labels = []
    for v in all_files_random_keys:
        # "soxi -D" prints the duration of the file in seconds.
        duration = float(subprocess.check_output(["soxi", "-D", v]).strip())
        segment_start_time = last_seconds
        segment_end_time = last_seconds + duration
        last_seconds += duration
        labels.append(AudacityLabel(segment_start_time, segment_end_time, all_files_dict[v]))
        files_to_concatenate.append(v)

    audacity_labels = Util.combine_adjacent_labels_of_the_same_class(labels)
    Util.write_audacity_labels(audacity_labels, "mirex_combined.txt")

    command = ["sox"] + files_to_concatenate + ["mirex_combined.wav"]
    subprocess.check_output(command)

    shutil.rmtree("./segments")
    shutil.rmtree("./muspeak-mirex2015-detection-examples")
Beispiel #6
0
def calculate_der(reference_filename, hypothesis_filename):
    """Return the diarization error rate of a hypothesis label file.

    Both files are read with ``Util.read_audacity_labels`` and converted to
    ``Annotation`` objects. Hypothesis labels named ``'non_speech'`` are
    left out before ``DiarizationErrorRate`` is evaluated.
    """
    reference_labels = Util.read_audacity_labels(reference_filename)
    reference = Annotation()
    for ref in reference_labels:
        reference[Segment(ref.start_seconds, ref.end_seconds)] = ref.label

    hypothesis_labels = Util.read_audacity_labels(hypothesis_filename)
    hypothesis = Annotation()
    for hyp in hypothesis_labels:
        if hyp.label == 'non_speech':
            continue
        hypothesis[Segment(hyp.start_seconds, hyp.end_seconds)] = hyp.label

    return DiarizationErrorRate()(reference, hypothesis)
Beispiel #7
0
def get_features(features, datasets_dir, pca=False):
    """Load, scale and optionally PCA-reduce the three annotated datasets.

    For each of ``gtzan``, ``labrosa`` and ``mirex`` the merged features of
    ``<datasets_dir>/<name>/<name>_combined.wav`` are paired with the labels
    in ``<name>_combined.txt``. A ``StandardScaler`` is fitted on all three
    datasets together, pickled to ``pickled/scaler.pickle`` and applied to
    each dataset.

    Args:
        features: feature names forwarded to ``Util.read_merged_features``.
        datasets_dir: directory containing the dataset sub-directories.
        pca: when truthy, additionally fit a 20-component PCA on the scaled
            data of all three datasets and apply it to each of them.

    Returns:
        dict with keys ``x_<name>``, ``y_<name>``, ``labels_<name>`` (the
        labels read from the annotation file) and ``timestamps_<name>``
        (the timestamps as read from the feature files).
    """
    timestamps_gtzan, labels_gtzan, X_gtzan, Y_gtzan = _load_annotated_dataset(
        datasets_dir, "gtzan", features)
    timestamps_labrosa, labels_labrosa, X_labrosa, Y_labrosa = _load_annotated_dataset(
        datasets_dir, "labrosa", features)
    timestamps_mirex, labels_mirex, X_mirex, Y_mirex = _load_annotated_dataset(
        datasets_dir, "mirex", features)

    scaler = StandardScaler()
    scaler.fit(np.concatenate((X_labrosa, X_gtzan, X_mirex)))
    # Pickle streams are binary; text mode breaks on Python 3.
    with open("pickled/scaler.pickle", 'wb') as f:
        pickle.dump(scaler, f)
    X_gtzan = scaler.transform(X_gtzan)
    X_labrosa = scaler.transform(X_labrosa)
    X_mirex = scaler.transform(X_mirex)

    if pca:
        # Separate name so the boolean parameter is not overwritten.
        reducer = PCA(n_components=20)
        reducer.fit(np.concatenate((X_labrosa, X_gtzan, X_mirex)))
        X_gtzan = reducer.transform(X_gtzan)
        X_labrosa = reducer.transform(X_labrosa)
        X_mirex = reducer.transform(X_mirex)

    return {
        "x_gtzan": X_gtzan,
        "y_gtzan": Y_gtzan,
        "labels_gtzan": labels_gtzan,
        "x_labrosa": X_labrosa,
        "y_labrosa": Y_labrosa,
        "labels_labrosa": labels_labrosa,
        "x_mirex": X_mirex,
        "y_mirex": Y_mirex,
        "labels_mirex": labels_mirex,
        "timestamps_gtzan": timestamps_gtzan,
        "timestamps_labrosa": timestamps_labrosa,
        "timestamps_mirex": timestamps_mirex
    }


def _load_annotated_dataset(datasets_dir, name, features):
    """Read one dataset; return (timestamps, labels, X, Y)."""
    timestamps, feature_vectors = Util.read_merged_features(
        datasets_dir + "/%s/%s_combined.wav" % (name, name), features)
    labels = Util.read_audacity_labels(
        datasets_dir + "/%s/%s_combined.txt" % (name, name))
    annotated = Util.get_annotated_data_x_y(timestamps, feature_vectors,
                                            labels)
    # Only the first two return values (X, Y) are used; indexing instead of
    # unpacking keeps this working if extra trailing values are returned.
    return timestamps, labels, annotated[0], annotated[1]
Beispiel #8
0
    def test_non_maximum_suppression(self):
        """Only the strongest peak of each cluster is kept; values are
        returned unchanged."""
        signal = [2, 2, 2, 3, 2, 2, 2, 4, 5, 4, 2, 2, 2]
        candidate_peaks = [2, 3, 4, 7, 8, 9]

        kept_peaks, returned_values = Util.non_maximum_suppression(
            candidate_peaks, signal)

        self.assertListEqual([3, 8], kept_peaks)
        self.assertListEqual(signal, returned_values)
Beispiel #9
0
    def test_get_annotation_time_shift(self):
        """Each label is paired with its gap-free counterpart and the shift
        that was applied to it."""

        def entry(old, new, shift):
            # One expected record: original label, shifted label, offset.
            return {
                'old_label': AudacityLabel(*old),
                'new_label': AudacityLabel(*new),
                'shift': shift
            }

        original_labels = [
            AudacityLabel(1.0, 5.0, 'A'),
            AudacityLabel(5.0, 10.0, 'B'),
            AudacityLabel(15.0, 20.0, 'B'),
        ]

        shifted = Util.get_annotation_time_shift(original_labels)

        expected = [
            entry((1.0, 5.0, 'A'), (0.0, 4.0, 'A'), 1.0),
            entry((5.0, 10.0, 'B'), (4.0, 9.0, 'B'), 1.0),
            entry((15.0, 20.0, 'B'), (9.0, 14.0, 'B'), 6.0),
        ]
        self.assertListEqual(expected, shifted)
Beispiel #10
0
    def test_get_unshifted_labels(self):
        """Labels predicted on the shifted timeline are mapped back to the
        original timeline."""

        def entry(old, new, shift):
            # One shift record: original label, shifted label, offset.
            return {
                'old_label': AudacityLabel(*old),
                'new_label': AudacityLabel(*new),
                'shift': shift
            }

        predictions = [
            AudacityLabel(1.0, 3.0, "A"),
            AudacityLabel(8.0, 12.0, "B"),
        ]
        shift_table = [
            entry((1.0, 5.0, 'A'), (0.0, 4.0, 'A'), 1.0),
            entry((5.0, 10.0, 'B'), (4.0, 9.0, 'B'), 1.0),
            entry((15.0, 20.0, 'B'), (9.0, 14.0, 'B'), 6.0),
        ]

        unshifted = Util.get_unshifted_labels(predictions, shift_table)

        expected = [AudacityLabel(2.0, 4.0, 'A'), AudacityLabel(9.0, 18.0, 'B')]
        self.assertListEqual(expected, unshifted)
Beispiel #11
0
    def test_get_unshifted_timestamps(self):
        """Each shifted timestamp gets the shift of the matching record
        added back."""

        def entry(old, new, shift):
            # One shift record: original label, shifted label, offset.
            return {
                'old_label': AudacityLabel(*old),
                'new_label': AudacityLabel(*new),
                'shift': shift
            }

        shift_table = [
            entry((1.0, 5.0, 'A'), (0.0, 4.0, 'A'), 1.0),
            entry((5.0, 10.0, 'B'), (4.0, 9.0, 'B'), 1.0),
            entry((15.0, 20.0, 'B'), (9.0, 14.0, 'B'), 6.0),
        ]
        shifted = [3.0, 4.0, 11.0]

        # The i-th timestamp is expected to use the i-th record's shift.
        expected = [
            moment + record['shift']
            for moment, record in zip(shifted, shift_table)
        ]

        actual = Util.get_unshifted_timestamps(shifted, shift_table)
        self.assertListEqual(expected, actual)
Beispiel #12
0
def read_features(features, wavfile, scale=False):
    """Read the merged feature vectors of ``wavfile``.

    Args:
        features: feature names forwarded to ``Util.read_merged_features``.
        wavfile: path of the wav file whose features are read.
        scale: when truthy, transform the vectors with the scaler pickled at
            ``/opt/speech-music-discrimination/pickled/scaler.pickle``.

    Returns:
        ``(timestamps, feature_vectors)``.
    """
    timestamps, feature_vectors = Util.read_merged_features(wavfile, features)
    if scale:
        # Pickle streams are binary; text mode fails on Python 3.
        with open("/opt/speech-music-discrimination/pickled/scaler.pickle",
                  'rb') as f:
            scaler = pickle.load(f)
        feature_vectors = scaler.transform(feature_vectors)
    return timestamps, feature_vectors
Beispiel #13
0
    def test_combine_peaks(self):
        """Peak positions of two detectors are merged in ascending order."""
        first_peaks, first_values = [3, 8], [2, 2, 2, 3, 2, 2, 2, 4, 5, 4, 2, 2, 2]
        second_peaks, second_values = [6], [2, 2, 2, 2, 2, 2, 7, 2, 2, 2, 2, 2, 2]

        merged_peaks, _ = Util.combine_peaks(first_peaks, first_values,
                                             second_peaks, second_values)

        self.assertListEqual([3, 6, 8], merged_peaks)
Beispiel #14
0
    def test_get_annotated_labels_from_predictions_and_sm_segments(self):
        """Each detected segment is labelled 'v' or 's' from the frame-level
        predictions that fall inside it."""
        frame_times = list(range(7))
        predictions = np.array(['v', 'v', 'v', 's', 's', 'v', 'v'])
        detected = [
            AudacityLabel(0, 2.5, '-'),
            AudacityLabel(2.5, 5.5, '-'),
        ]

        annotated = Util.get_annotated_labels_from_predictions_and_sm_segments(
            predictions, detected, frame_times)

        self.assertListEqual(['v', 's'], [segment.label for segment in annotated])
Beispiel #15
0
    def get_files_grouped_by_class(input_dir):
        """Return a dict mapping class name -> absolute paths of the entries
        of ``input_dir`` that ``Util.get_class_from_filename`` assigns to
        that class."""
        base_dir = os.path.abspath(input_dir)
        grouped = {}

        for entry in os.listdir(base_dir):
            entry_class = Util.get_class_from_filename(entry)
            grouped.setdefault(entry_class, []).append(os.path.join(base_dir, entry))

        return grouped
def calculate_fusion(youtube_video_id,
                     lbls_dir,
                     audio_lbls,
                     image_lbls,
                     duration,
                     step=0.1,
                     neighbours_before_after=6,
                     times_greater=2):
    """Fuse audio and image labels and write ``<id>.fusion.txt``.

    Audio/image pairs are built every ``step`` seconds (default 0.1, i.e.
    100 ms), the face-to-voice mapping is detected and applied, and every
    pair that shows exactly one face whose class disagrees with a speech
    audio class has its audio class replaced by the result of
    ``find_nearest_neighbours_class``.

    Args:
        youtube_video_id: prefix of the output file name.
        lbls_dir: directory the fusion label file is written to.
        audio_lbls: audio labels passed to ``create_pairs``.
        image_lbls: image labels passed to ``create_pairs``.
        duration: duration passed to ``create_pairs``.
        step: pair spacing passed to ``create_pairs``.
        neighbours_before_after: forwarded to
            ``find_nearest_neighbours_class``.
        times_greater: forwarded to ``find_nearest_neighbours_class``.

    Returns:
        The mapping returned by ``detect_face_voice_mapping``.
    """
    pairs, timestamps = create_pairs(audio_lbls, image_lbls, duration, step)
    mapping_face_to_voice = detect_face_voice_mapping(pairs)
    pairs = apply_mapping_to_pairs(pairs, mapping_face_to_voice)

    for index, pair in enumerate(pairs):
        if pair.image_class is None:
            # No image information for this time step.
            continue
        # If only one face has been detected, assume it is the speaker's.
        single_face = len(pair.image_class.split(",")) == 1
        if not single_face or pair.audio_class == 'non_speech':
            continue
        if pair.image_class == pair.audio_class:
            continue
        # Pairs are corrected in place, so later look-ups see the classes
        # already fixed for earlier time steps.
        pair.audio_class = find_nearest_neighbours_class(
            index,
            pairs,
            neighbours_before_after=neighbours_before_after,
            times_greater=times_greater)

    lbls = Util.generate_labels_from_classifications(
        [p.audio_class for p in pairs], timestamps)
    lbls = [lbl for lbl in lbls if lbl.label is not None]

    Util.write_audacity_labels(
        lbls, os.path.join(lbls_dir, youtube_video_id + ".fusion.txt"))
    return mapping_face_to_voice
Beispiel #17
0
    def test_split_data_based_on_annotation(self):
        """Rows are grouped under the class name their annotation index
        points to."""
        class_names = ["music", "speech"]
        annotation = [0, 0, 1]
        samples = np.array([
            [1, 2, 3, 4],
            [2, 3, 4, 5],
            [4, 5, 6, 7],
        ])

        split = Util.split_data_based_on_annotation(samples, annotation, class_names)

        self.assertEqual(split["music"].shape[0], 2)
        self.assertEqual(split["speech"].shape[0], 1)
Beispiel #18
0
    def test_get_annotated_data(self):
        """Frames inside a labelled region end up under that label."""
        frame_times = [float(second) for second in range(7)]
        frames = np.random.rand(7, 10)
        annotations = [
            AudacityLabel(1.2, 2.5, "m"),
            AudacityLabel(4.5, 6.0, "s"),
        ]

        by_class = Util.get_annotated_data(frame_times, frames, annotations)

        for class_name in ("m", "s"):
            self.assertTrue(class_name in by_class)
        # (frame index, class the frame must be filed under)
        for row, class_name in ((2, "m"), (5, "s"), (6, "s")):
            self.assertTrue(frames[row, :] in by_class[class_name])
Beispiel #19
0
    def create_audio_segments(labels, input_wav, output_dir, clear_output_dir, delimiter, format, remove_overlapping=False):
        """Cut ``input_wav`` into one wav file per label row.

        Rows come from ``WavEditor.get_rows(labels, delimiter)``. With
        ``format == "f1"`` a row is read as (start, end, label). With
        ``format == "f2"`` it is read as (start, duration, label), unless
        ``remove_overlapping`` is set: then the rows are first passed
        through ``WavEditor.get_non_overlapping_items`` and read as
        (start, end, label). Any other format logs an error and exits the
        process with status 1 when the first row is reached.

        Each segment is written to ``<output_dir>/<uuid4>_<label>.wav``.
        """
        if clear_output_dir:
            Util.remove_dir(output_dir)
        Util.make_dir(output_dir)

        rows = WavEditor.get_rows(labels, delimiter)
        overlaps_removed = format == "f2" and remove_overlapping
        if overlaps_removed:
            rows = WavEditor.get_non_overlapping_items(rows)

        for row in rows:
            if format == "f1" or overlaps_removed:
                start_time, end_time = row[0], row[1]
            elif format == "f2":
                # Second column is a duration: turn it into an end time.
                start_time = row[0]
                end_time = str(float(row[0]) + float(row[1]))
            else:
                logging.error("not supported file format")
                sys.exit(1)

            label = row[2]
            segment_file = "%s_%s.wav" % (str(uuid.uuid4()), label)
            WavEditor.create_audio_segment(start_time, end_time, input_wav,
                                           os.path.join(output_dir, segment_file))
Beispiel #20
0
def calculate_fusion(youtube_video_id, lbls_dir, audio_lbls, image_lbls, duration, step=0.1):
    """Fuse audio and image labels; write them as JSON and Audacity labels.

    Audio/image pairs are built every ``step`` seconds (default 0.1, i.e.
    100 ms), the face-to-voice mapping is detected and applied, and every
    pair that shows exactly one face whose class disagrees with a speech
    audio class has its audio class replaced by the result of
    ``find_nearest_neighbours_class``. The resulting labels are written to
    ``<lbls_dir>/<id>.json`` and ``<lbls_dir>/<id>.txt``.

    Returns:
        The mapping returned by ``detect_face_voice_mapping``.
    """
    pairs, timestamps = create_pairs(audio_lbls, image_lbls, duration, step)
    mapping_face_to_voice = detect_face_voice_mapping(pairs)
    print(mapping_face_to_voice)
    pairs = apply_mapping_to_pairs(pairs, mapping_face_to_voice)
    print(pairs)

    for k, pair in enumerate(pairs):
        if pair.image_class is None:
            # No image information for this time step.
            continue
        classes = pair.image_class.split(",")
        if len(classes) == 1 and pair.audio_class != 'non_speech':
            if pair.image_class != pair.audio_class:
                nearest_neighbour_class = find_nearest_neighbours_class(k, pairs)
                pair.audio_class = nearest_neighbour_class

    print(pairs)

    lbls = Util.generate_labels_from_classifications([p.audio_class for p in pairs], timestamps)
    # Must be a list: the labels are iterated twice below (JSON, then
    # Audacity). A lazy filter() object on Python 3 would be exhausted by
    # the first pass and the .txt file would come out empty.
    lbls = [lbl for lbl in lbls if lbl.label is not None]

    json_lbls = [
        {
            "start_seconds": lbl.start_seconds,
            "end_seconds": lbl.end_seconds,
            "label": lbl.label
        }
        for lbl in lbls
    ]
    with open(os.path.join(lbls_dir, youtube_video_id + ".json"), 'w') as outfile:
        json.dump(json_lbls, outfile)

    Util.write_audacity_labels(lbls, os.path.join(lbls_dir, youtube_video_id + ".txt"))
    return mapping_face_to_voice
Beispiel #21
0
    def test_combine_adjacent_labels_of_the_same_class(self):
        """Consecutive labels of one class collapse into a single label."""

        def as_labels(spans):
            # Build AudacityLabel objects from (start, end, class) tuples.
            return [AudacityLabel(start, end, cls) for start, end, cls in spans]

        fragmented = as_labels([
            (0, 10, "m"), (10, 20, "m"),
            (20, 21, "s"), (21, 22, "s"), (22, 23, "s"),
            (23, 30, "m"),
        ])
        merged = as_labels([(0, 20, "m"), (20, 23, "s"), (23, 30, "m")])

        result = Util.combine_adjacent_labels_of_the_same_class(fragmented)

        self.assertListEqual(merged, result)
Beispiel #22
0
    def test_generate_labels_from_classifications(self):
        """Runs of equal frame classifications become one label each."""
        frame_classes = [1, 1] + [0] * 12 + [1, 0, 1, 1, 0] + [1] * 5
        frame_times = [
            0.0,
            0.6965986394557823,
            1.3931972789115645,
            2.089795918367347,
            2.786394557823129,
            3.4829931972789114,
            4.179591836734694,
            4.876190476190477,
            5.572789115646258,
            6.2693877551020405,
            6.965986394557823,
            7.662585034013605,
            8.359183673469389,
            9.05578231292517,
            9.752380952380953,
            10.448979591836734,
            11.145578231292516,
            11.842176870748299,
            12.538775510204081,
            13.235374149659863,
            13.931972789115646,
            14.628571428571428,
            15.32517006802721,
            16.021768707482995,
        ]

        generated = Util.generate_labels_from_classifications(frame_classes, frame_times)

        # (start, end, class) of every expected label.
        expected_spans = [
            (0.0, 1.3931972789115645, 1),
            (1.3931972789115645, 9.752380952380953, 0),
            (9.752380952380953, 10.448979591836736, 1),
            (10.448979591836734, 11.145578231292516, 0),
            (11.145578231292516, 12.538775510204081, 1),
            (12.538775510204081, 13.235374149659863, 0),
            (13.235374149659863, 16.718367346938777, 1),
        ]
        expected = [AudacityLabel(start, end, cls) for start, end, cls in expected_spans]

        self.assertListEqual(expected, generated)
Beispiel #23
0
 def test_load_yaafe_csv_double_stats(self):
     """The double-stats fixture loads as 17 rows x 2 columns and ends at
     the expected timestamp."""
     times, matrix = Util.load_yaafe_csv(self.double_stats_csv)
     expected_shape = (17, 2)
     self.assertEqual(expected_shape, matrix.shape)
     expected_last = 334.3673469387755
     self.assertEqual(expected_last, times[-1])
Beispiel #24
0
 def test_load_yaafe_csv_stats(self):
     """The stats fixture loads as 17 rows x 2 columns and ends at the
     expected timestamp."""
     times, matrix = Util.load_yaafe_csv(self.stats_csv)
     expected_shape = (17, 2)
     self.assertEqual(expected_shape, matrix.shape)
     expected_last = 11.145578231292516
     self.assertEqual(expected_last, times[-1])
Beispiel #25
0
 def test_load_yaafe_csv_no_stats(self):
     """The no-stats fixture loads as 17 rows x 2 columns and ends at the
     expected timestamp."""
     times, matrix = Util.load_yaafe_csv(self.no_stats_csv)
     expected_shape = (17, 2)
     self.assertEqual(expected_shape, matrix.shape)
     expected_last = 0.37151927437641724
     self.assertEqual(expected_last, times[-1])
Beispiel #26
0
 def test_parse_yaafe_header_stats_derivate(self):
     """The parsed header exposes the sample rate and the effective step
     size of the stats-derivate fixture."""
     parsed = Util.parse_yaafe_header(self.stats_derivate)
     for expected, key in ((22050, 'samplerate'), (15360, 'effective_step_size')):
         self.assertEqual(expected, parsed[key])
Beispiel #27
0
    def test_calculate_classes_percentages(self):
        """14 of 24 frames are class 0 and 10 of 24 are class 1."""
        frame_classes = [1, 1] + [0] * 12 + [1, 0, 1, 1, 0] + [1] * 5

        shares = Util.calculate_classes_percentages(frame_classes)

        self.assertAlmostEqual(0.5833333333333334, shares[0])
        self.assertAlmostEqual(0.4166666666666667, shares[1])
# Register every music segment under class "m".
for f in music_wavs:
    all_files_dict[f] = "m"

# Fixed seed so the shuffled order is reproducible.
random.seed(1111)
# random.sample() requires a sequence; a dict view is rejected by recent
# Python 3 releases, so materialise the keys first.
all_files_random_keys = random.sample(list(all_files_dict), len(all_files_dict))

last_seconds = 0
files_to_concatenate = []

# One label per file, laid end to end on the timeline of the combined wav.
labels = []
for v in all_files_random_keys:
    # "soxi -D" prints the duration of the file in seconds.
    duration = float(subprocess.check_output(["soxi", "-D", v]).strip())
    segment_start_time = last_seconds
    segment_end_time = last_seconds + duration
    last_seconds += duration
    labels.append(AudacityLabel(segment_start_time, segment_end_time, all_files_dict[v]))
    files_to_concatenate.append(v)

audacity_labels = Util.combine_adjacent_labels_of_the_same_class(labels)
Util.write_audacity_labels(audacity_labels, "gtzan_combined.txt")

# sox concatenates all inputs into the last file name given.
command = ["sox"] + files_to_concatenate + ["gtzan_combined.wav"]
subprocess.check_output(command)

# Make the extracted tree writable before deleting it.
subprocess.call(['chmod', '-R', '777', './music_speech'])
shutil.rmtree("./music_speech")
Beispiel #29
0
def x(youtube_video_id):
    """Run the whole processing pipeline for the job of one YouTube video.

    Does nothing when no ``Job`` exists for ``youtube_video_id``. Otherwise
    the stages run in order, each followed by a ``set_state`` call:
    download (youtube-dl), audio extraction (ffmpeg, 16 kHz mono),
    spectrogram image, audio-based segmentation, MFCC-based segmentation,
    face-based segmentation and fusion. Videos longer than 600 seconds are
    rejected after the spectrogram stage.

    Any exception is printed, recorded on the job via ``State.ERROR`` and
    re-raised.
    """
    job = Job.query.filter_by(video_id=youtube_video_id).first()
    if job is None:
        # No job registered for this video id: nothing to process.
        return

    try:
        job.start_time = datetime.utcnow()

        # --- download -----------------------------------------------------
        video_url = 'https://www.youtube.com/watch?v=%s' % youtube_video_id
        subprocess.check_call([
            'youtube-dl', '-f', '18', '--write-thumbnail', '-o',
            'videos/%(id)s.%(ext)s', video_url
        ])
        copyfile('videos/%s.jpg' % youtube_video_id,
                 'static/img/thumbs/%s.jpg' % youtube_video_id)
        set_state(State.VIDEO_DOWNLOADED, db, job)

        # --- audio extraction ----------------------------------------------
        subprocess.check_call([
            'ffmpeg', '-y', '-i', 'videos/%s.mp4' % youtube_video_id,
            '-ar', '16000', '-ac', '1', 'audios/%s.wav' % youtube_video_id
        ])
        set_state(State.AUDIO_EXTRACTED, db, job)

        # --- spectrogram image ---------------------------------------------
        y, sr = librosa.load("audios/%s.wav" % youtube_video_id, sr=16000)
        D = np.flipud(librosa.amplitude_to_db(librosa.stft(y), ref=np.max))
        # Rescale the dB values to the 0..255 range of an 8-bit image.
        normalised = (D - D.min()) / (D.max() - D.min())
        D8 = (normalised * 255.9).astype(np.uint8)
        img = Image.fromarray(D8).resize((D.shape[1], 64))
        img.save("static/img/waveforms/%s.jpg" % youtube_video_id)
        duration = librosa.get_duration(y=y, sr=sr)
        job.waveform_width = D.shape[1]
        job.duration = duration
        set_state(State.WAVEFORM_GENERATED, db, job)

        if duration > 600:
            raise Exception("Video duration greated than 10 min (600 sec)")

        # --- audio based segmentation --------------------------------------
        generate_audio_based_segmentation(
            os.path.abspath('audios/%s.wav' % youtube_video_id),
            15, 20, 256, 128, 0.2,
            os.path.abspath('models/weights.h5'),
            os.path.abspath('models/scaler.pickle'),
            1024, 3, 1024,
            youtube_video_id,
            os.path.abspath('static/lbls/audio'),
            clusters=job.number_of_speakers)
        set_state(State.AUDIO_DATA_ANALYSED, db, job)

        # --- MFCC based segmentation (same arguments, other output dir) ----
        steps.mfcc.generate_audio_based_segmentation(
            os.path.abspath('audios/%s.wav' % youtube_video_id),
            15, 20, 256, 128, 0.2,
            os.path.abspath('models/weights.h5'),
            os.path.abspath('models/scaler.pickle'),
            1024, 3, 1024,
            youtube_video_id,
            os.path.abspath('static/lbls/mfcc'),
            clusters=job.number_of_speakers)
        set_state(State.MFCC_ANALYSED, db, job)

        # --- face based segmentation ---------------------------------------
        extract_images_from_video(
            os.path.abspath('videos/%s.mp4' % youtube_video_id),
            os.path.abspath('video_frames'))
        generate_face_based_segmentation(
            youtube_video_id,
            os.path.abspath('video_frames/%s' % youtube_video_id),
            os.path.abspath('static/lbls/image'),
            job.number_of_speakers,
            os.path.abspath('models/shape_predictor_68_face_landmarks.dat'),
            os.path.abspath('models/dlib_face_recognition_resnet_model_v1.dat'))
        set_state(State.IMAGE_DATA_ANALYSED, db, job)

        # --- fusion --------------------------------------------------------
        fusion_dir = os.path.abspath('static/lbls/fusion')
        audio_labels = Util.read_audacity_labels(
            os.path.abspath('static/lbls/audio/%s.txt' % youtube_video_id))
        image_labels = Util.read_audacity_labels(
            os.path.abspath('static/lbls/image/%s.txt' % youtube_video_id))
        mapping_face_to_voice = calculate_fusion(
            youtube_video_id, fusion_dir, audio_labels, image_labels, duration)
        job.mapping_face_to_voice = mapping_face_to_voice
        set_state(State.FUSION_APPLIED, db, job)

        job.end_time = datetime.utcnow()
        set_state(State.DONE, db, job)

    except Exception as e:
        # Record the failure on the job, then let the caller see it too.
        print(e)
        job.end_time = datetime.utcnow()
        set_state(State.ERROR, db, job, str(e))
        raise e
Beispiel #30
0
def generate_face_based_segmentation(youtube_video_id, images_dir, lbls_dir,
                                     faces, predictor_path,
                                     face_rec_model_path, tmp_dir):
    """Cluster the faces found in video frames and write ``<id>.image.txt``.

    Every ``FRAMES_PER_STEP``-th ``*.jpg`` of ``images_dir`` (sorted by
    name) is scanned for faces. Each face is turned into an embedding, the
    embeddings are clustered with KMeans into ``faces`` clusters, and for
    each timestamp the sorted, comma-joined cluster ids of its faces form
    the class used to generate the labels.

    Embeddings and their timestamps are cached in ``tmp_dir`` as
    ``embeddings.npy`` / ``embeddings_timestamps.npy`` and reused when both
    files exist.

    Args:
        youtube_video_id: prefix of the output label file.
        images_dir: directory holding the extracted frames.
        lbls_dir: directory the label file is written to.
        faces: number of KMeans clusters.
        predictor_path: path of the dlib shape predictor model.
        face_rec_model_path: path of the dlib face recognition model.
        tmp_dir: directory for the embedding cache.
    """
    images_raw = glob(os.path.join(images_dir, "*.jpg"))
    images_raw.sort()
    images = images_raw[::FRAMES_PER_STEP]
    print(images)
    # NOTE(review): the constant 25.0 presumably is the video frame rate
    # (fps) — confirm against the frame extraction step.
    timestamps = [i * (FRAMES_PER_STEP / 25.0) for i in range(0, len(images))]
    print(timestamps)

    detector = get_frontal_face_detector()
    sp = dlib.shape_predictor(predictor_path)
    facerec = face_recognition_model_v1(face_rec_model_path)

    embeddings_pickle = os.path.join(tmp_dir, "embeddings.npy")
    embeddings_timestamps_pickle = os.path.join(tmp_dir,
                                                "embeddings_timestamps.npy")

    cache_available = (os.path.isfile(embeddings_pickle)
                       and os.path.isfile(embeddings_timestamps_pickle))
    if cache_available:
        embeddings = numpy.load(embeddings_pickle)
        embeddings_timestamps = numpy.load(embeddings_timestamps_pickle)
    else:
        embeddings = []
        embeddings_timestamps = []

        for frame_no, f in enumerate(images):
            print("Processing file: {}".format(f))
            img = io.imread(f)

            dets = detector(img, 1)
            print("Number of faces detected: {}".format(len(dets)))

            # One embedding per detected face; several faces in one frame
            # share that frame's timestamp.
            for d in dets:
                shape = sp(img, d)
                embeddings.append(facerec.compute_face_descriptor(img, shape))
                embeddings_timestamps.append(timestamps[frame_no])

        embeddings = numpy.array(embeddings)
        embeddings_timestamps = numpy.array(embeddings_timestamps)
        numpy.save(embeddings_pickle, embeddings)
        numpy.save(embeddings_timestamps_pickle, embeddings_timestamps)

    print(embeddings.shape)
    print(embeddings_timestamps.shape)

    if len(embeddings) == 0:
        # No face anywhere: still produce an (empty) label file.
        Util.write_audacity_labels([],
                                   os.path.join(
                                       lbls_dir,
                                       youtube_video_id + ".image.txt"))
        return

    kmeans = KMeans(n_clusters=faces)
    kmeans.fit(embeddings)

    predictions = numpy.array(kmeans.labels_.tolist())
    df = pd.DataFrame({
        "timestamps": embeddings_timestamps.tolist(),
        "predictions": predictions
    })

    timestamps = []
    classes = []

    # Group by the column name, not a one-element list: with a list key
    # pandas 2.x yields each group key as a 1-tuple, which would put tuples
    # instead of numbers into ``timestamps``.
    for key, group in df.groupby('timestamps'):
        timestamps.append(key)
        classes.append(",".join(
            [str(i) for i in sorted(group['predictions'].tolist())]))

    lbls = Util.generate_labels_from_classifications(classes, timestamps)

    Util.write_audacity_labels(
        lbls, os.path.join(lbls_dir, youtube_video_id + ".image.txt"))
import itertools
import pickle
import numpy as np
from sac.util import Util
import os
import feat
from sklearn.svm import SVC

# Training script: fit an SVC on the concatenated training datasets and
# pickle the fitted model to "pickled/model.pickle".
DATASETS = os.path.abspath("../datasets")

# The feature plan is read here but not referenced again in this script; the
# call is kept so the script's behaviour is unchanged.
features = Util.read_feature_names_from_file(
    os.path.join(DATASETS, "featureplans/featureplan"))

# Feature groups; only the combined list (features4) is passed on.
features1 = ["zcr", "flux", "spectral_rollof", "energy_stats"]
features2 = ["mfcc_stats"]
features3 = ["spectral_flatness_per_band"]
features4 = features1 + features2 + features3

data = feat.get_features(features4, DATASETS, pca=False)

# Dataset names; `data` is indexed with "x_<name>" and "y_<name>" keys.
TRAIN = ["mirex", "labrosa", "gtzan"]

model = SVC()
# np.vstack needs a real sequence of arrays: passing a generator has been
# deprecated since NumPy 1.16 and is rejected by current releases.
X = np.vstack([data["x_" + i] for i in TRAIN])
Y = list(itertools.chain.from_iterable(data["y_" + i] for i in TRAIN))

model.fit(X, Y)

# pickle writes bytes, so the target must be opened in binary mode
# (text mode raises TypeError on Python 3).
with open("pickled/model.pickle", 'wb') as f:
    pickle.dump(model, f)
import itertools
import pickle
import numpy as np
from sac.util import Util
import os
import feat
from sklearn.svm import SVC

# Training script: fit an SVC on the concatenated training datasets and
# pickle the fitted model to "pickled/model.pickle".
DATASETS = os.path.abspath("../datasets")

# The feature plan is read here but not referenced again in this script; the
# call is kept so the script's behaviour is unchanged.
features = Util.read_feature_names_from_file(os.path.join(DATASETS, "featureplans/featureplan"))

# Feature groups; only the combined list (features4) is passed on.
features1 = ["zcr", "flux", "spectral_rollof", "energy_stats"]
features2 = ["mfcc_stats"]
features3 = ["spectral_flatness_per_band"]
features4 = features1 + features2 + features3

data = feat.get_features(features4, DATASETS, pca=False)

# Dataset names; `data` is indexed with "x_<name>" and "y_<name>" keys.
TRAIN = ["mirex", "labrosa", "gtzan"]

model = SVC()
# np.vstack needs a real sequence of arrays: passing a generator has been
# deprecated since NumPy 1.16 and is rejected by current releases.
X = np.vstack([data["x_" + i] for i in TRAIN])
Y = list(itertools.chain.from_iterable(data["y_" + i] for i in TRAIN))

model.fit(X, Y)

# pickle writes bytes, so the target must be opened in binary mode
# (text mode raises TypeError on Python 3).
with open("pickled/model.pickle", 'wb') as f:
    pickle.dump(model, f)
Beispiel #33
0
def generate_face_based_segmentation(youtube_video_id, images_dir, lbls_dir,
                                     faces, predictor_path,
                                     face_rec_model_path):
    """Cluster the faces found in sampled video frames and write label files.

    Every ``FRAMES_PER_STEP``-th ``*.jpg`` in ``images_dir`` (sorted by file
    name) is passed through dlib's frontal face detector. Each detected face
    is converted to a face descriptor, the descriptors are clustered with
    KMeans, and the sorted cluster ids seen at each timestamp are joined with
    commas to form that timestamp's class.

    Output files in ``lbls_dir``:
      * ``<youtube_video_id>.json`` - list of ``start_seconds`` /
        ``end_seconds`` / ``label`` dicts,
      * ``<youtube_video_id>.txt`` - the same labels in Audacity format.

    If no face is detected in any frame, only an empty ``.txt`` label file is
    written and the function returns early.

    :param youtube_video_id: base name used for the output files
    :param images_dir: directory containing the extracted ``*.jpg`` frames
    :param lbls_dir: directory the label files are written to
    :param faces: number of KMeans clusters
    :param predictor_path: path handed to ``dlib.shape_predictor``
    :param face_rec_model_path: path handed to
        ``dlib.face_recognition_model_v1``
    :return: None
    """
    images_raw = sorted(glob(os.path.join(images_dir, "*.jpg")))
    # Keep one frame out of every FRAMES_PER_STEP.
    images = images_raw[::FRAMES_PER_STEP]
    print(images)
    # NOTE(review): the 25.0 divisor presumably is the video frame rate in
    # frames per second - confirm against the frame extraction step.
    timestamps = [i * (FRAMES_PER_STEP / 25.0) for i in range(len(images))]
    print(timestamps)

    detector = dlib.get_frontal_face_detector()
    sp = dlib.shape_predictor(predictor_path)
    facerec = dlib.face_recognition_model_v1(face_rec_model_path)

    embeddings = []
    embeddings_timestamps = []

    for frame_timestamp, image_path in zip(timestamps, images):
        print("Processing file: {}".format(image_path))
        img = io.imread(image_path)

        dets = detector(img, 1)
        print("Number of faces detected: {}".format(len(dets)))

        # One descriptor per detected face, tagged with its frame's timestamp.
        for det in dets:
            shape = sp(img, det)
            embeddings.append(facerec.compute_face_descriptor(img, shape))
            embeddings_timestamps.append(frame_timestamp)

    embeddings = numpy.array(embeddings)
    embeddings_timestamps = numpy.array(embeddings_timestamps)

    print(embeddings.shape)
    print(embeddings_timestamps.shape)

    if len(embeddings) == 0:
        # Nothing to cluster: emit an empty label file and stop.
        Util.write_audacity_labels([],
                                   os.path.join(lbls_dir,
                                                youtube_video_id + ".txt"))
        return

    kmeans = KMeans(n_clusters=faces)
    kmeans.fit(embeddings)

    df = pd.DataFrame({
        "timestamps": embeddings_timestamps.tolist(),
        "predictions": kmeans.labels_.tolist()
    })

    timestamps = []
    classes = []

    # Group by the column name, not a one-element list: with a list, pandas
    # 2.x yields 1-tuples as keys, which would put tuples instead of floats
    # into `timestamps`.
    for key, group in df.groupby('timestamps'):
        timestamps.append(key)
        classes.append(",".join(
            str(i) for i in sorted(group['predictions'].tolist())))

    lbls = Util.generate_labels_from_classifications(classes, timestamps)
    json_lbls = [{
        "start_seconds": lbl.start_seconds,
        "end_seconds": lbl.end_seconds,
        "label": lbl.label
    } for lbl in lbls]
    with open(os.path.join(lbls_dir, youtube_video_id + ".json"),
              'w') as outfile:
        json.dump(json_lbls, outfile)

    Util.write_audacity_labels(
        lbls, os.path.join(lbls_dir, youtube_video_id + ".txt"))


# if __name__ == '__main__':
#
#     extract_images_from_video("/Users/nicktgr15/workspace/speaker_diarisation_poc/src/videos/Unamij6z1io.mp4",
#                               "/Users/nicktgr15/workspace/speaker_diarisation_poc/src/video_frames")
#
#     generate_face_based_segmentation(
#         "Unamij6z1io",
#         "/Users/nicktgr15/workspace/speaker_diarisation_poc/src/video_frames/Unamij6z1io",
#         "/Users/nicktgr15/workspace/speaker_diarisation_poc/src/static/lbls/image",
#         4,
#         "/Users/nicktgr15/workspace/speaker_diarisation_poc/src/models/shape_predictor_68_face_landmarks.dat",
#         "/Users/nicktgr15/workspace/speaker_diarisation_poc/src/models/dlib_face_recognition_resnet_model_v1.dat"
#     )
def generate_audio_based_segmentation(audio_file,
                                      w,
                                      h,
                                      embedding_size,
                                      lstm_nodes,
                                      dropout,
                                      weights_filename,
                                      scaler_filename,
                                      window_size,
                                      step,
                                      hop_size,
                                      youtube_video_id,
                                      lbls_dir,
                                      clusters=4,
                                      sr=16000):
    """Segment ``audio_file`` by clustering embeddings of its speech frames.

    Steps, in order:
      1. run voice activity detection on the file,
      2. load the pickled scaler and the siamese LSTM weights,
      3. extract MFCC frames and their timestamps,
      4. give each frame the label of the first VAD segment that fully
         contains it, or ``'non_speech'`` if there is none,
      5. scale the frames labelled ``'speech'``, embed them with the
         intermediate model and cluster the embeddings with KMeans,
      6. overwrite the ``'speech'`` entries with their cluster id and write
         the labels to ``<lbls_dir>/<youtube_video_id>.audio.txt``.

    NOTE(review): ``scaler_filename`` is unpickled without validation - make
    sure it only ever points at a trusted file.

    :param audio_file: path of the audio file to segment
    :param w: first dimension of the ``(w, h)`` model input shape
    :param h: second dimension of the ``(w, h)`` model input shape
    :param embedding_size: forwarded to ``get_lstm_siamese``
    :param lstm_nodes: forwarded to ``get_lstm_siamese``
    :param dropout: forwarded to ``get_lstm_siamese``
    :param weights_filename: weights file loaded into the model
    :param scaler_filename: pickled scaler applied to the speech frames
    :param window_size: forwarded to ``FeatExtractorMFCC``
    :param step: forwarded to ``FeatExtractorMFCC`` as ``step``
    :param hop_size: forwarded to ``FeatExtractorMFCC``
    :param youtube_video_id: base name of the output label file
    :param lbls_dir: directory the label file is written to
    :param clusters: number of KMeans clusters
    :param sr: forwarded to ``FeatExtractorMFCC``
    :return: None
    """
    voice_segments = Vad().detect_voice_segments(audio_file)
    with open(scaler_filename, 'rb') as scaler_file:
        scaler = pickle.load(scaler_file)

    model, intermediate = get_lstm_siamese((w, h), embedding_size, lstm_nodes,
                                           dropout)
    model.load_weights(weights_filename)
    extractor = FeatExtractorMFCC(window_size, hop_size, w, sr, h, step=step)
    X, timestamps = extractor.extract(audio_file)
    timestamps = numpy.array(timestamps)
    # Spacing between consecutive frame timestamps.
    window = timestamps[1] - timestamps[0]

    def vad_label_at(timestamp):
        # The frame must end inside the segment, hence the `- window`.
        for segment in voice_segments:
            if segment.start_seconds <= timestamp <= segment.end_seconds - window:
                return segment.label
        return 'non_speech'

    frame_predictions = numpy.array([vad_label_at(t) for t in timestamps])
    print(frame_predictions.shape)
    print(timestamps.shape)

    speech_indices = numpy.where(frame_predictions == 'speech')

    # Flatten the speech frames to rows of `h` values for the scaler, then
    # restore the (frames, w, h) layout for the model.
    speech_frames = X[speech_indices]
    flattened = speech_frames.reshape((speech_frames.shape[0] * w, h))
    X = scaler.transform(flattened).reshape(-1, w, h)

    original_embeddings = intermediate.predict(X)

    predictions = KMeans(n_clusters=clusters).fit_predict(original_embeddings)

    # Replace each 'speech' entry with the cluster id of that frame.
    for frame_index, cluster_id in zip(speech_indices[0], predictions):
        frame_predictions[frame_index] = cluster_id

    lbls = Util.generate_labels_from_classifications(frame_predictions,
                                                     timestamps)

    output_path = os.path.join(lbls_dir, youtube_video_id + ".audio.txt")
    Util.write_audacity_labels(lbls, output_path)
    all_files_dict[f] = "m"

# Shuffle the files reproducibly (fixed seed), then concatenate them into one
# recording with a matching Audacity label track.
random.seed(1111)
# random.sample requires a sequence: a dict keys view raises TypeError on
# Python 3.11+, so materialise the keys as a list first.
all_files_random_keys = random.sample(list(all_files_dict),
                                      len(all_files_dict))

last_seconds = 0
files_to_concatenate = []

labels = []
for v in all_files_random_keys:
    # `soxi -D` prints the duration of the file in seconds.
    duration = float(subprocess.check_output(["soxi", "-D", v]).strip())
    segment_start_time = last_seconds
    segment_end_time = last_seconds + duration
    last_seconds += duration
    labels.append(
        AudacityLabel(segment_start_time, segment_end_time, all_files_dict[v]))
    files_to_concatenate.append(v)

audacity_labels = Util.combine_adjacent_labels_of_the_same_class(labels)
Util.write_audacity_labels(audacity_labels, "gtzan_combined.txt")

# sox <input files...> <output file> concatenates the inputs in order.
command = ["sox"] + files_to_concatenate + ["gtzan_combined.wav"]
subprocess.check_output(command)

# Open up permissions before deleting so the removal cannot fail on
# read-only entries.
subprocess.call(['chmod', '-R', '777', './music_speech'])
shutil.rmtree("./music_speech")
Beispiel #36
0
 def test_parse_yaafe_header_double_stats(self):
     """Check the header fields parsed from the double-stats CSV fixture."""
     parsed = Util.parse_yaafe_header(self.double_stats_csv)
     expected_fields = (
         ('samplerate', 22050),
         ('effective_step_size', 460800),
     )
     for field, expected in expected_fields:
         self.assertEqual(expected, parsed[field])
Beispiel #37
0
def generate_audio_based_segmentation(audio_file,
                                      w,
                                      h,
                                      embedding_size,
                                      lstm_nodes,
                                      dropout,
                                      weights_filename,
                                      scaler_filename,
                                      window_size,
                                      step,
                                      hop_size,
                                      youtube_video_id,
                                      lbls_dir,
                                      clusters=4,
                                      sr=16000):
    """Segment ``audio_file`` by clustering the raw MFCCs of its speech frames.

    Steps, in order:
      1. run voice activity detection on the file,
      2. load the audio and compute an MFCC matrix with ``h`` coefficients,
      3. give each MFCC frame the label of the first VAD segment that fully
         contains it, or ``'non_speech'`` if there is none,
      4. project the frames labelled ``'speech'`` to two dimensions with
         t-SNE and cluster the projection with a Gaussian mixture,
      5. overwrite the ``'speech'`` entries with their cluster id and write
         the labels to ``<youtube_video_id>.json`` and
         ``<youtube_video_id>.txt`` in ``lbls_dir``.

    This variant does not read ``w``, ``embedding_size``, ``lstm_nodes``,
    ``dropout``, ``weights_filename``, ``scaler_filename`` or ``step``; they
    are accepted so the signature matches the embedding-based variants.

    :param audio_file: path of the audio file to segment
    :param h: number of MFCC coefficients (``n_mfcc``)
    :param window_size: FFT size (``n_fft``)
    :param hop_size: hop length between MFCC frames, in samples
    :param youtube_video_id: base name of the output label files
    :param lbls_dir: directory the label files are written to
    :param clusters: number of Gaussian mixture components
    :param sr: sample rate passed to ``librosa.load``
    :return: None
    """
    voice_segments = Vad().detect_voice_segments(audio_file)

    samples, sr = librosa.load(audio_file, sr=sr)
    mfcc_matrix = mfcc(samples,
                       sr=sr,
                       n_mfcc=h,
                       n_fft=window_size,
                       hop_length=hop_size)

    # One timestamp (in seconds) per MFCC column.
    timestamps = numpy.array(
        [k * hop_size / sr for k in range(0, mfcc_matrix.shape[1])])
    window = timestamps[1] - timestamps[0]

    def vad_label_at(timestamp):
        # The frame must end inside the segment, hence the `- window`.
        for segment in voice_segments:
            if segment.start_seconds <= timestamp <= segment.end_seconds - window:
                return segment.label
        return 'non_speech'

    frame_predictions = numpy.array([vad_label_at(t) for t in timestamps])

    speech_indices = numpy.where(frame_predictions == 'speech')

    # Select the speech columns, then flip to one row per frame.
    speech_columns = mfcc_matrix[:, speech_indices[0]]
    print(speech_columns.shape)
    X_speech = speech_columns.transpose()
    print(X_speech.shape)

    clustering_algorithm = GaussianMixture(n_components=clusters,
                                           max_iter=1000,
                                           n_init=3)

    two_dimensional = TSNE(n_components=2, init='pca').fit_transform(X_speech)

    predictions = clustering_algorithm.fit(two_dimensional).predict(
        two_dimensional)

    # Replace each 'speech' entry with the cluster id of that frame.
    for frame_index, cluster_id in zip(speech_indices[0], predictions):
        frame_predictions[frame_index] = cluster_id

    lbls = Util.generate_labels_from_classifications(frame_predictions,
                                                     timestamps)

    json_lbls = [{
        "start_seconds": lbl.start_seconds,
        "end_seconds": lbl.end_seconds,
        "label": lbl.label
    } for lbl in lbls]
    json_path = os.path.join(lbls_dir, youtube_video_id + ".json")
    with open(json_path, 'w') as outfile:
        json.dump(json_lbls, outfile)

    Util.write_audacity_labels(
        lbls, os.path.join(lbls_dir, youtube_video_id + ".txt"))


# if __name__ == '__main__':
#     generate_audio_based_segmentation(
#         "/Users/nicktgr15/workspace/speaker_diarisation_poc/src/audios/Unamij6z1io.wav",
#         15, 20, 256, 128, 0.2,
#         os.path.abspath('models/weights.h5'),
#         os.path.abspath('models/scaler.pickle'),
#         1024, 3, 1024, "xxx",
#         "/Users/nicktgr15/workspace/speaker_diarisation_poc/src/static/lbls/mfcc",
#         clusters=4
#     )
def generate_audio_based_segmentation(audio_file,
                                      w,
                                      h,
                                      embedding_size,
                                      lstm_nodes,
                                      dropout,
                                      weights_filename,
                                      scaler_filename,
                                      window_size,
                                      step,
                                      hop_size,
                                      youtube_video_id,
                                      lbls_dir,
                                      clusters=4,
                                      sr=16000):
    """Segment ``audio_file`` by clustering embeddings of its speech frames.

    Steps, in order:
      1. run voice activity detection on the file,
      2. load the pickled scaler and the siamese LSTM weights,
      3. extract MFCC frames and their timestamps,
      4. give each frame the label of the first VAD segment that fully
         contains it, or ``'non_speech'`` if there is none,
      5. scale the frames labelled ``'speech'`` and embed them with the
         intermediate model,
      6. project the embeddings to two dimensions with t-SNE and cluster the
         projection with a Gaussian mixture,
      7. overwrite the ``'speech'`` entries with their cluster id and write
         the labels to ``<youtube_video_id>.json`` and
         ``<youtube_video_id>.txt`` in ``lbls_dir``.

    NOTE(review): ``scaler_filename`` is unpickled without validation - make
    sure it only ever points at a trusted file.

    :param audio_file: path of the audio file to segment
    :param w: first dimension of the ``(w, h)`` model input shape
    :param h: second dimension of the ``(w, h)`` model input shape
    :param embedding_size: forwarded to ``get_lstm_siamese``
    :param lstm_nodes: forwarded to ``get_lstm_siamese``
    :param dropout: forwarded to ``get_lstm_siamese``
    :param weights_filename: weights file loaded into the model
    :param scaler_filename: pickled scaler applied to the speech frames
    :param window_size: forwarded to ``FeatExtractorMFCC``
    :param step: forwarded to ``FeatExtractorMFCC`` as ``step``
    :param hop_size: forwarded to ``FeatExtractorMFCC``
    :param youtube_video_id: base name of the output label files
    :param lbls_dir: directory the label files are written to
    :param clusters: number of Gaussian mixture components
    :param sr: forwarded to ``FeatExtractorMFCC``
    :return: None
    """
    voice_segments = Vad().detect_voice_segments(audio_file)
    with open(scaler_filename, 'rb') as scaler_file:
        scaler = pickle.load(scaler_file)

    model, intermediate = get_lstm_siamese((w, h), embedding_size, lstm_nodes,
                                           dropout)
    model.load_weights(weights_filename)
    extractor = FeatExtractorMFCC(window_size, hop_size, w, sr, h, step=step)
    X, timestamps = extractor.extract(audio_file)
    timestamps = numpy.array(timestamps)
    # Spacing between consecutive frame timestamps.
    window = timestamps[1] - timestamps[0]

    def vad_label_at(timestamp):
        # The frame must end inside the segment, hence the `- window`.
        for segment in voice_segments:
            if segment.start_seconds <= timestamp <= segment.end_seconds - window:
                return segment.label
        return 'non_speech'

    frame_predictions = numpy.array([vad_label_at(t) for t in timestamps])
    print(frame_predictions.shape)
    print(timestamps.shape)

    speech_indices = numpy.where(frame_predictions == 'speech')

    # Flatten the speech frames to rows of `h` values for the scaler, then
    # restore the (frames, w, h) layout for the model.
    speech_frames = X[speech_indices]
    flattened = speech_frames.reshape((speech_frames.shape[0] * w, h))
    X = scaler.transform(flattened).reshape(-1, w, h)

    original_embeddings = intermediate.predict(X)

    clustering_algorithm = GaussianMixture(n_components=clusters,
                                           max_iter=1000,
                                           n_init=3)

    two_dimensional = TSNE(n_components=2,
                           init='pca').fit_transform(original_embeddings)

    predictions = clustering_algorithm.fit(two_dimensional).predict(
        two_dimensional)

    # Replace each 'speech' entry with the cluster id of that frame.
    for frame_index, cluster_id in zip(speech_indices[0], predictions):
        frame_predictions[frame_index] = cluster_id

    lbls = Util.generate_labels_from_classifications(frame_predictions,
                                                     timestamps)

    json_lbls = [{
        "start_seconds": lbl.start_seconds,
        "end_seconds": lbl.end_seconds,
        "label": lbl.label
    } for lbl in lbls]
    json_path = os.path.join(lbls_dir, youtube_video_id + ".json")
    with open(json_path, 'w') as outfile:
        json.dump(json_lbls, outfile)

    Util.write_audacity_labels(
        lbls, os.path.join(lbls_dir, youtube_video_id + ".txt"))