def test_get_annotated_data_x_y(self):
    timestamps = [0.0, 1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0]
    data = np.random.rand(8, 10)
    labels = [
        AudacityLabel(1.2, 2.5, "m"),
        AudacityLabel(4.5, 6.2, "s")
    ]
    x, y, classes, timestamps = Util.get_annotated_data_x_y(timestamps, data, labels)
    self.assertEqual(3, x.shape[0])
    self.assertListEqual(["m", "s", "s"], y)
    self.assertListEqual(["m", "s"], classes)
    self.assertListEqual([0.0, 1.0, 2.0], timestamps)

    labels = [
        AudacityLabel(1.0, 5.5, 'A'),
        AudacityLabel(5.5, 10.0, 'B'),
        AudacityLabel(15.0, 20.5, 'C')
    ]
    X, y, classes, new_timestamps = Util.get_annotated_data_x_y(
        [float(i) for i in range(0, 25)], np.ones((25, 10)), labels)
    self.assertListEqual(['A', 'A', 'A', 'A', 'A', 'B', 'B', 'B', 'B', 'C', 'C', 'C', 'C', 'C', 'C'], y)
    self.assertListEqual([0.0, 1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0, 9.0, 10.0, 11.0, 12.0, 13.0, 14.0],
                         new_timestamps)
def multimodal_analysis(video_path, output, tmp_dir, duration, output_path):
    video_frames_dir = os.path.abspath(os.path.join(tmp_dir, 'video_frames'))

    if not os.path.isdir(video_frames_dir):
        extract_images_from_video(os.path.abspath(video_path), video_frames_dir)

    generate_face_based_segmentation(
        output,
        os.path.abspath(os.path.join(tmp_dir, 'video_frames')),
        os.path.abspath(output_path),
        args.speakers,
        os.path.abspath('models/shape_predictor_68_face_landmarks.dat'),
        os.path.abspath('models/dlib_face_recognition_resnet_model_v1.dat'),
        tmp_dir)

    # fusion
    mapping_face_to_voice = calculate_fusion(
        output,
        os.path.abspath(output_path),
        Util.read_audacity_labels(
            os.path.join(os.path.abspath(output_path), '%s.audio.txt' % output)),
        Util.read_audacity_labels(
            os.path.join(os.path.abspath(output_path), '%s.image.txt' % output)),
        duration,
        step=0.05,
        neighbours_before_after=40,
        times_greater=4)

    mapping_face_to_voice_json = os.path.join(
        os.path.abspath(output_path), output + ".mapping_face_to_voice.json")

    with open(mapping_face_to_voice_json, 'w') as f:
        json.dump(mapping_face_to_voice, f)
def main():
    parser = argparse.ArgumentParser(description='Speech/music discrimination on an input audio file.')
    parser.add_argument('--input-file', dest='input_file', required=True)
    args = parser.parse_args()

    input_dir = os.path.split(args.input_file)[0]
    temp_file = input_dir + "/temp.wav"

    cmd = [
        "/usr/bin/ffmpeg", "-i", args.input_file,
        "-ar", "22050", "-ac", "1", "-acodec", "pcm_s16le",
        temp_file, "-y"
    ]
    subprocess.check_call(cmd)

    cmd = ["yaafe", "-c", FEATURE_PLAN, "-r", "22050", temp_file]
    subprocess.check_output(cmd)

    features1 = ["zcr", "flux", "spectral_rollof", "energy_stats"]
    features2 = ["mfcc_stats"]
    features3 = ["spectral_flatness_per_band"]
    features4 = features1 + features2 + features3

    FEATURE_GROUPS = [features1, features2, features3, features4]

    peaks, convolution_values, timestamps = feat.get_combined_peaks(
        temp_file, FEATURE_GROUPS, kernel_type="gaussian")
    detected_segments = kernel.calculate_segment_start_end_times_from_peak_positions(
        peaks, timestamps)

    timestamps, feature_vectors = feat.read_features(features4, temp_file, scale=True)

    with open("/opt/speech-music-discrimination/pickled/model.pickle", 'rb') as f:
        trained_model = pickle.load(f)

    frame_level_predictions = trained_model.predict(feature_vectors)

    annotated_segments = Util.get_annotated_labels_from_predictions_and_sm_segments(
        frame_level_predictions, detected_segments, timestamps)
    annotated_segments = Util.combine_adjacent_labels_of_the_same_class(annotated_segments)
    annotated_segments = feat.filter_noisy_labels(annotated_segments)
    annotated_segments = Util.combine_adjacent_labels_of_the_same_class(annotated_segments)

    Util.write_audacity_labels(annotated_segments, input_dir + "/annotated-segments.txt")

    for f in glob.glob(input_dir + "/*.csv"):
        os.remove(f)
    os.remove(temp_file)
def combine_audio_segments(input_dir, output_name, output_dir, clear_output_dir):
    if clear_output_dir:
        Util.remove_dir(output_dir)
        Util.make_dir(output_dir)

    files_grouped_by_class = WavEditor.get_files_grouped_by_class(input_dir)

    for class_name in files_grouped_by_class:
        input_wavs = []
        for k, file in enumerate(files_grouped_by_class[class_name]):
            input_wavs.append(file)
        cmd = ['sox'] + input_wavs + [os.path.join(output_dir, '%s_%s.wav' % (output_name, class_name))]
        subprocess.check_call(cmd)
def main():
    subprocess.check_output(["unzip", "muspeak-mirex2015-detection-examples.zip",
                             "-d", "muspeak-mirex2015-detection-examples"])
    mp3_to_wav()

    wav_files = glob.glob("./muspeak-mirex2015-detection-examples/*.wav")

    for wav_file in wav_files:
        print(wav_file)
        label_file = wav_file.replace(".mp3.wav", ".csv")
        if not os.path.isfile(label_file):
            label_file = label_file.replace(".csv", "_v2.csv")
        WavEditor.create_audio_segments(label_file, wav_file, "segments", True, ",", "f2",
                                        remove_overlapping=True)

    speech_wavs = glob.glob("./segments/*_s.wav")
    music_wavs = glob.glob("./segments/*_m.wav")

    all_files_dict = {}
    for f in speech_wavs:
        all_files_dict[f] = "s"
    for f in music_wavs:
        all_files_dict[f] = "m"

    random.seed(2222)
    all_files_random_keys = random.sample(all_files_dict.keys(), len(all_files_dict.keys()))

    last_seconds = 0
    files_to_concatenate = []
    labels = []

    for v in all_files_random_keys:
        duration = float(subprocess.check_output(["soxi", "-D", v]).strip())
        segment_start_time = last_seconds
        segment_end_time = last_seconds + duration
        last_seconds += duration
        labels.append(AudacityLabel(segment_start_time, segment_end_time, all_files_dict[v]))
        files_to_concatenate.append(v)

    audacity_labels = Util.combine_adjacent_labels_of_the_same_class(labels)
    Util.write_audacity_labels(audacity_labels, "mirex_combined.txt")

    command = []
    command.append("sox")
    command.extend(files_to_concatenate)
    command.append("mirex_combined.wav")

    subprocess.check_output(command)

    shutil.rmtree("./segments")
    shutil.rmtree("./muspeak-mirex2015-detection-examples")
def calculate_der(reference_filename, hypothesis_filename):
    lbls = Util.read_audacity_labels(reference_filename)

    reference = Annotation()
    for lbl in lbls:
        reference[Segment(lbl.start_seconds, lbl.end_seconds)] = lbl.label

    predicted_lbls = Util.read_audacity_labels(hypothesis_filename)
    hypothesis = Annotation()
    for lbl in predicted_lbls:
        if lbl.label != 'non_speech':
            hypothesis[Segment(lbl.start_seconds, lbl.end_seconds)] = lbl.label

    metric = DiarizationErrorRate()
    der = metric(reference, hypothesis)
    return der
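# Hedged usage sketch for calculate_der, not part of the original module. The two paths below
# are illustrative only (they are not shipped with the repo); they are assumed to be
# Audacity-style label files readable by Util.read_audacity_labels. 'non_speech' segments in
# the hypothesis file are ignored by calculate_der itself.
if __name__ == '__main__':
    der = calculate_der("labels/reference.txt", "labels/hypothesis.txt")
    print("DER: %.3f" % der)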
def get_features(features, datasets_dir, pca=False):
    timestamps_gtzan, feature_vectors_gtzan = Util.read_merged_features(
        datasets_dir + "/gtzan/gtzan_combined.wav", features)
    labels_gtzan = Util.read_audacity_labels(datasets_dir + "/gtzan/gtzan_combined.txt")
    X_gtzan, Y_gtzan, lbls_gtzan = Util.get_annotated_data_x_y(
        timestamps_gtzan, feature_vectors_gtzan, labels_gtzan)

    timestamps_labrosa, feature_vectors_labrosa = Util.read_merged_features(
        datasets_dir + "/labrosa/labrosa_combined.wav", features)
    labels_labrosa = Util.read_audacity_labels(datasets_dir + "/labrosa/labrosa_combined.txt")
    X_labrosa, Y_labrosa, lbls_labrosa = Util.get_annotated_data_x_y(
        timestamps_labrosa, feature_vectors_labrosa, labels_labrosa)

    timestamps_mirex, feature_vectors_mirex = Util.read_merged_features(
        datasets_dir + "/mirex/mirex_combined.wav", features)
    labels_mirex = Util.read_audacity_labels(datasets_dir + "/mirex/mirex_combined.txt")
    X_mirex, Y_mirex, lbls_mirex = Util.get_annotated_data_x_y(
        timestamps_mirex, feature_vectors_mirex, labels_mirex)

    scaler = StandardScaler()
    scaler.fit(np.concatenate((X_labrosa, X_gtzan, X_mirex)))

    with open("pickled/scaler.pickle", 'wb') as f:
        pickle.dump(scaler, f)

    X_gtzan = scaler.transform(X_gtzan)
    X_labrosa = scaler.transform(X_labrosa)
    X_mirex = scaler.transform(X_mirex)

    if pca:
        pca = PCA(n_components=20)
        pca.fit(np.concatenate((X_labrosa, X_gtzan, X_mirex)))
        X_gtzan = pca.transform(X_gtzan)
        X_labrosa = pca.transform(X_labrosa)
        X_mirex = pca.transform(X_mirex)

    data = {
        "x_gtzan": X_gtzan, "y_gtzan": Y_gtzan, "labels_gtzan": labels_gtzan,
        "x_labrosa": X_labrosa, "y_labrosa": Y_labrosa, "labels_labrosa": labels_labrosa,
        "x_mirex": X_mirex, "y_mirex": Y_mirex, "labels_mirex": labels_mirex,
        "timestamps_gtzan": timestamps_gtzan,
        "timestamps_labrosa": timestamps_labrosa,
        "timestamps_mirex": timestamps_mirex
    }

    return data
def test_non_maximum_suppression(self):
    peaks = [2, 3, 4, 7, 8, 9]
    values = [2, 2, 2, 3, 2, 2, 2, 4, 5, 4, 2, 2, 2]
    p, v = Util.non_maximum_suppression(peaks, values)
    self.assertListEqual([3, 8], p)
    self.assertListEqual(values, v)
def test_get_annotation_time_shift(self):
    labelsA = [
        AudacityLabel(1.0, 5.0, 'A'),
        AudacityLabel(5.0, 10.0, 'B'),
        AudacityLabel(15.0, 20.0, 'B')
    ]
    new_labels = Util.get_annotation_time_shift(labelsA)
    expected = [
        {
            'old_label': AudacityLabel(1.0, 5.0, 'A'),
            'new_label': AudacityLabel(0.0, 4.0, 'A'),
            'shift': 1.0
        },
        {
            'old_label': AudacityLabel(5.0, 10.0, 'B'),
            'new_label': AudacityLabel(4.0, 9.0, 'B'),
            'shift': 1.0
        },
        {
            'old_label': AudacityLabel(15.0, 20.0, 'B'),
            'new_label': AudacityLabel(9.0, 14.0, 'B'),
            'shift': 6.0
        }
    ]
    self.assertListEqual(expected, new_labels)
def test_get_unshifted_labels(self):
    predicted_lbls = [
        AudacityLabel(1.0, 3.0, "A"),
        AudacityLabel(8.0, 12.0, "B")
    ]
    shifted_unshifted_labels = [
        {
            'old_label': AudacityLabel(1.0, 5.0, 'A'),
            'new_label': AudacityLabel(0.0, 4.0, 'A'),
            'shift': 1.0
        },
        {
            'old_label': AudacityLabel(5.0, 10.0, 'B'),
            'new_label': AudacityLabel(4.0, 9.0, 'B'),
            'shift': 1.0
        },
        {
            'old_label': AudacityLabel(15.0, 20.0, 'B'),
            'new_label': AudacityLabel(9.0, 14.0, 'B'),
            'shift': 6.0
        }
    ]
    lbls = Util.get_unshifted_labels(predicted_lbls, shifted_unshifted_labels)
    self.assertListEqual([AudacityLabel(2.0, 4.0, 'A'), AudacityLabel(9.0, 18.0, 'B')], lbls)
def test_get_unshifted_timestamps(self):
    lbls = [
        {
            'old_label': AudacityLabel(1.0, 5.0, 'A'),
            'new_label': AudacityLabel(0.0, 4.0, 'A'),
            'shift': 1.0
        },
        {
            'old_label': AudacityLabel(5.0, 10.0, 'B'),
            'new_label': AudacityLabel(4.0, 9.0, 'B'),
            'shift': 1.0
        },
        {
            'old_label': AudacityLabel(15.0, 20.0, 'B'),
            'new_label': AudacityLabel(9.0, 14.0, 'B'),
            'shift': 6.0
        }
    ]
    shifted_timestamps = [3.0, 4.0, 11.0]
    expected_unshifted_timestamps = [
        shifted_timestamps[0] + lbls[0]['shift'],
        shifted_timestamps[1] + lbls[1]['shift'],
        shifted_timestamps[2] + lbls[2]['shift']
    ]
    unshifted_timestamps = Util.get_unshifted_timestamps(shifted_timestamps, lbls)
    self.assertListEqual(expected_unshifted_timestamps, unshifted_timestamps)
def read_features(features, wavfile, scale=False):
    timestamps, feature_vectors = Util.read_merged_features(wavfile, features)
    if scale:
        with open("/opt/speech-music-discrimination/pickled/scaler.pickle", 'rb') as f:
            scaler = pickle.load(f)
        feature_vectors = scaler.transform(feature_vectors)
    return timestamps, feature_vectors
def test_combine_peaks(self):
    a_peaks = [3, 8]
    a_peak_values = [2, 2, 2, 3, 2, 2, 2, 4, 5, 4, 2, 2, 2]
    b_peaks = [6]
    b_peak_values = [2, 2, 2, 2, 2, 2, 7, 2, 2, 2, 2, 2, 2]
    p, v = Util.combine_peaks(a_peaks, a_peak_values, b_peaks, b_peak_values)
    self.assertListEqual([3, 6, 8], p)
def test_get_annotated_labels_from_predictions_and_sm_segments(self):
    timestamps = [0, 1, 2, 3, 4, 5, 6]
    segments = [
        AudacityLabel(0, 2.5, '-'),
        AudacityLabel(2.5, 5.5, '-')
    ]
    frame_level_predictions = np.array(['v', 'v', 'v', 's', 's', 'v', 'v'])
    labels = Util.get_annotated_labels_from_predictions_and_sm_segments(
        frame_level_predictions, segments, timestamps)
    self.assertListEqual(['v', 's'], [l.label for l in labels])
def get_files_grouped_by_class(input_dir):
    input_dir = os.path.abspath(input_dir)
    files = os.listdir(input_dir)
    files_grouped_by_class = {}
    for file in files:
        file_class = Util.get_class_from_filename(file)
        if file_class not in files_grouped_by_class:
            files_grouped_by_class[file_class] = []
        files_grouped_by_class[file_class].append(os.path.join(input_dir, file))
    return files_grouped_by_class
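# Hedged usage sketch, not part of the original class. Segment files produced elsewhere in this
# repo by create_audio_segments are named "<uuid>_<label>.wav", so grouping such a directory is
# expected to return one list of absolute paths per label; the "segments" directory below is
# only an illustrative path.
if __name__ == '__main__':
    files_by_class = WavEditor.get_files_grouped_by_class("segments")
    for class_name in files_by_class:
        print("%s: %d files" % (class_name, len(files_by_class[class_name])))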
def calculate_fusion(youtube_video_id, lbls_dir, audio_lbls, image_lbls, duration,
                     step=0.1,  # 100ms
                     neighbours_before_after=6, times_greater=2):
    pairs, timestamps = create_pairs(audio_lbls, image_lbls, duration, step)

    mapping_face_to_voice = detect_face_voice_mapping(pairs)
    # print(mapping_face_to_voice)
    pairs = apply_mapping_to_pairs(pairs, mapping_face_to_voice)
    # print(pairs)

    seconds_of_mismatch = 0
    for k, pair in enumerate(pairs):
        if pair.image_class is None:  # when image is None
            continue
        classes = pair.image_class.split(",")
        # if only one face has been detected then assume it's the face of the speaker
        if len(classes) == 1 and pair.audio_class != 'non_speech':
            if pair.image_class != pair.audio_class:
                seconds_of_mismatch += step
                # print("%s != %s" % (pair.image_class, pair.audio_class))
                nearest_neighbour_class = find_nearest_neighbours_class(
                    k, pairs,
                    neighbours_before_after=neighbours_before_after,
                    times_greater=times_greater)
                pair.audio_class = nearest_neighbour_class

    lbls = Util.generate_labels_from_classifications(
        [p.audio_class for p in pairs], timestamps)
    lbls = list(filter(lambda x: x.label is not None, lbls))

    Util.write_audacity_labels(
        lbls, os.path.join(lbls_dir, youtube_video_id + ".fusion.txt"))

    return mapping_face_to_voice
def test_split_data_based_on_annotation(self):
    X = np.array([
        [1, 2, 3, 4],
        [2, 3, 4, 5],
        [4, 5, 6, 7]
    ])
    Y = [0, 0, 1]
    classes = ["music", "speech"]
    data = Util.split_data_based_on_annotation(X, Y, classes)
    self.assertEqual(data["music"].shape[0], 2)
    self.assertEqual(data["speech"].shape[0], 1)
def test_get_annotated_data(self):
    timestamps = [0.0, 1.0, 2.0, 3.0, 4.0, 5.0, 6.0]
    data = np.random.rand(7, 10)
    labels = [
        AudacityLabel(1.2, 2.5, "m"),
        AudacityLabel(4.5, 6.0, "s")
    ]
    annotated_data = Util.get_annotated_data(timestamps, data, labels)
    self.assertTrue("m" in annotated_data)
    self.assertTrue("s" in annotated_data)
    self.assertTrue(data[2, :] in annotated_data["m"])
    self.assertTrue(data[5, :] in annotated_data["s"])
    self.assertTrue(data[6, :] in annotated_data["s"])
def create_audio_segments(labels, input_wav, output_dir, clear_output_dir, delimiter, format,
                          remove_overlapping=False):
    if clear_output_dir:
        Util.remove_dir(output_dir)
        Util.make_dir(output_dir)

    rows = WavEditor.get_rows(labels, delimiter)

    if format == "f2" and remove_overlapping:
        rows = WavEditor.get_non_overlapping_items(rows)

    for k, row in enumerate(rows):
        if format == "f1" or (format == "f2" and remove_overlapping):
            start_time = row[0]
            end_time = row[1]
        elif format == "f2":
            start_time = row[0]
            end_time = str(float(row[0]) + float(row[1]))
        else:
            logging.error("unsupported file format")
            sys.exit(1)
        label = row[2]
        filename = str(uuid.uuid4())
        WavEditor.create_audio_segment(start_time, end_time, input_wav,
                                       os.path.join(output_dir, "%s_%s.wav" % (filename, label)))
def calculate_fusion(youtube_video_id, lbls_dir, audio_lbls, image_lbls, duration, step=0.1):  # 100ms
    pairs, timestamps = create_pairs(audio_lbls, image_lbls, duration, step)

    mapping_face_to_voice = detect_face_voice_mapping(pairs)
    print(mapping_face_to_voice)
    pairs = apply_mapping_to_pairs(pairs, mapping_face_to_voice)
    print(pairs)

    for k, pair in enumerate(pairs):
        if pair.image_class is None:  # when image is None
            continue
        classes = pair.image_class.split(",")
        if len(classes) == 1 and pair.audio_class != 'non_speech':
            if pair.image_class != pair.audio_class:
                # print("%s != %s" % (pair.image_class, pair.audio_class))
                nearest_neighbour_class = find_nearest_neighbours_class(k, pairs)
                pair.audio_class = nearest_neighbour_class

    print(pairs)

    lbls = Util.generate_labels_from_classifications([p.audio_class for p in pairs], timestamps)
    lbls = list(filter(lambda x: x.label is not None, lbls))

    json_lbls = []
    for lbl in lbls:
        json_lbls.append({
            "start_seconds": lbl.start_seconds,
            "end_seconds": lbl.end_seconds,
            "label": lbl.label
        })

    with open(os.path.join(lbls_dir, youtube_video_id + ".json"), 'w') as outfile:
        json.dump(json_lbls, outfile)

    Util.write_audacity_labels(lbls, os.path.join(lbls_dir, youtube_video_id + ".txt"))

    return mapping_face_to_voice
def test_combine_adjacent_labels_of_the_same_class(self):
    input_labels = [
        AudacityLabel(0, 10, "m"),
        AudacityLabel(10, 20, "m"),
        AudacityLabel(20, 21, "s"),
        AudacityLabel(21, 22, "s"),
        AudacityLabel(22, 23, "s"),
        AudacityLabel(23, 30, "m")
    ]
    expected_labels = [
        AudacityLabel(0, 20, "m"),
        AudacityLabel(20, 23, "s"),
        AudacityLabel(23, 30, "m"),
    ]
    actual_labels = Util.combine_adjacent_labels_of_the_same_class(input_labels)
    self.assertListEqual(expected_labels, actual_labels)
def test_generate_labels_from_classifications(self):
    classifications = [1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 1, 1, 0, 1, 1, 1, 1, 1]
    timestamps = [0.0, 0.6965986394557823, 1.3931972789115645, 2.089795918367347,
                  2.786394557823129, 3.4829931972789114, 4.179591836734694, 4.876190476190477,
                  5.572789115646258, 6.2693877551020405, 6.965986394557823, 7.662585034013605,
                  8.359183673469389, 9.05578231292517, 9.752380952380953, 10.448979591836734,
                  11.145578231292516, 11.842176870748299, 12.538775510204081, 13.235374149659863,
                  13.931972789115646, 14.628571428571428, 15.32517006802721, 16.021768707482995]
    labels = Util.generate_labels_from_classifications(classifications, timestamps)
    expected_labels = [
        AudacityLabel(0.0, 1.3931972789115645, 1),
        AudacityLabel(1.3931972789115645, 9.752380952380953, 0),
        AudacityLabel(9.752380952380953, 10.448979591836736, 1),
        AudacityLabel(10.448979591836734, 11.145578231292516, 0),
        AudacityLabel(11.145578231292516, 12.538775510204081, 1),
        AudacityLabel(12.538775510204081, 13.235374149659863, 0),
        AudacityLabel(13.235374149659863, 16.718367346938777, 1)
    ]
    self.assertListEqual(expected_labels, labels)
def test_load_yaafe_csv_double_stats(self):
    timestamps, features = Util.load_yaafe_csv(self.double_stats_csv)
    self.assertEqual((17, 2), features.shape)
    self.assertEqual(334.3673469387755, timestamps[-1])
def test_load_yaafe_csv_stats(self):
    timestamps, features = Util.load_yaafe_csv(self.stats_csv)
    self.assertEqual((17, 2), features.shape)
    self.assertEqual(11.145578231292516, timestamps[-1])
def test_load_yaafe_csv_no_stats(self):
    timestamps, features = Util.load_yaafe_csv(self.no_stats_csv)
    self.assertEqual((17, 2), features.shape)
    self.assertEqual(0.37151927437641724, timestamps[-1])
def test_parse_yaafe_header_stats_derivate(self):
    header = Util.parse_yaafe_header(self.stats_derivate)
    self.assertEqual(22050, header['samplerate'])
    self.assertEqual(15360, header['effective_step_size'])
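# Added cross-check (a hedged sketch, not part of the original suite): assuming consecutive
# yaafe timestamps are spaced by effective_step_size / samplerate, the 17-frame fixtures above
# should end at index 16 times that spacing. With the step sizes asserted in
# test_parse_yaafe_header_stats_derivate (15360) and test_parse_yaafe_header_double_stats
# (460800) at 22050 Hz, this reproduces the last timestamps asserted in
# test_load_yaafe_csv_stats and test_load_yaafe_csv_double_stats.
def test_effective_step_size_is_consistent_with_timestamps(self):
    self.assertAlmostEqual(11.145578231292516, 16 * 15360 / 22050.0)
    self.assertAlmostEqual(334.3673469387755, 16 * 460800 / 22050.0)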
def test_calculate_classes_percentages(self):
    classifications = [1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 1, 1, 0, 1, 1, 1, 1, 1]
    percentages = Util.calculate_classes_percentages(classifications)
    self.assertAlmostEqual(0.5833333333333334, percentages[0])
    self.assertAlmostEqual(0.4166666666666667, percentages[1])
for f in music_wavs:
    all_files_dict[f] = "m"

random.seed(1111)
all_files_random_keys = random.sample(all_files_dict.keys(), len(all_files_dict.keys()))

last_seconds = 0
files_to_concatenate = []
labels = []

for v in all_files_random_keys:
    duration = float(subprocess.check_output(["soxi", "-D", v]).strip())
    segment_start_time = last_seconds
    segment_end_time = last_seconds + duration
    last_seconds += duration
    labels.append(AudacityLabel(segment_start_time, segment_end_time, all_files_dict[v]))
    files_to_concatenate.append(v)

audacity_labels = Util.combine_adjacent_labels_of_the_same_class(labels)
Util.write_audacity_labels(audacity_labels, "gtzan_combined.txt")

command = []
command.append("sox")
command.extend(files_to_concatenate)
command.append("gtzan_combined.wav")

subprocess.check_output(command)

subprocess.call(['chmod', '-R', '777', './music_speech'])
shutil.rmtree("./music_speech")
def x(youtube_video_id):
    job = Job.query.filter_by(video_id=youtube_video_id).first()
    if job is not None:
        try:
            job.start_time = datetime.utcnow()

            subprocess.check_call([
                'youtube-dl', '-f', '18', '--write-thumbnail',
                '-o', 'videos/%(id)s.%(ext)s',
                'https://www.youtube.com/watch?v=%s' % youtube_video_id
            ])
            copyfile('videos/%s.jpg' % youtube_video_id,
                     'static/img/thumbs/%s.jpg' % youtube_video_id)
            set_state(State.VIDEO_DOWNLOADED, db, job)

            subprocess.check_call([
                'ffmpeg', '-y', '-i', 'videos/%s.mp4' % youtube_video_id,
                '-ar', '16000', '-ac', '1',
                'audios/%s.wav' % youtube_video_id
            ])
            set_state(State.AUDIO_EXTRACTED, db, job)

            y, sr = librosa.load("audios/%s.wav" % youtube_video_id, sr=16000)
            D = librosa.amplitude_to_db(librosa.stft(y), ref=np.max)
            D = np.flipud(D)
            D8 = (((D - D.min()) / (D.max() - D.min())) * 255.9).astype(np.uint8)
            img = Image.fromarray(D8)
            img = img.resize((D.shape[1], 64))
            img.save("static/img/waveforms/%s.jpg" % youtube_video_id)

            duration = librosa.get_duration(y=y, sr=sr)

            job.waveform_width = D.shape[1]
            job.duration = duration
            set_state(State.WAVEFORM_GENERATED, db, job)

            if duration > 600:
                raise Exception("Video duration greater than 10 min (600 sec)")

            # audio based segmentation
            generate_audio_based_segmentation(
                os.path.abspath('audios/%s.wav' % youtube_video_id),
                15, 20, 256, 128, 0.2,
                os.path.abspath('models/weights.h5'),
                os.path.abspath('models/scaler.pickle'),
                1024, 3, 1024, youtube_video_id,
                os.path.abspath('static/lbls/audio'),
                clusters=job.number_of_speakers)
            set_state(State.AUDIO_DATA_ANALYSED, db, job)

            steps.mfcc.generate_audio_based_segmentation(
                os.path.abspath('audios/%s.wav' % youtube_video_id),
                15, 20, 256, 128, 0.2,
                os.path.abspath('models/weights.h5'),
                os.path.abspath('models/scaler.pickle'),
                1024, 3, 1024, youtube_video_id,
                os.path.abspath('static/lbls/mfcc'),
                clusters=job.number_of_speakers)
            set_state(State.MFCC_ANALYSED, db, job)

            # face based segmentation
            extract_images_from_video(
                os.path.abspath('videos/%s.mp4' % youtube_video_id),
                os.path.abspath('video_frames'))
            generate_face_based_segmentation(
                youtube_video_id,
                os.path.abspath('video_frames/%s' % youtube_video_id),
                os.path.abspath('static/lbls/image'),
                job.number_of_speakers,
                os.path.abspath('models/shape_predictor_68_face_landmarks.dat'),
                os.path.abspath('models/dlib_face_recognition_resnet_model_v1.dat'))
            set_state(State.IMAGE_DATA_ANALYSED, db, job)

            # fusion
            mapping_face_to_voice = calculate_fusion(
                youtube_video_id,
                os.path.abspath('static/lbls/fusion'),
                Util.read_audacity_labels(
                    os.path.abspath('static/lbls/audio/%s.txt' % youtube_video_id)),
                Util.read_audacity_labels(
                    os.path.abspath('static/lbls/image/%s.txt' % youtube_video_id)),
                duration)
            job.mapping_face_to_voice = mapping_face_to_voice
            set_state(State.FUSION_APPLIED, db, job)

            job.end_time = datetime.utcnow()
            set_state(State.DONE, db, job)
        except Exception as e:
            print(e)
            job.end_time = datetime.utcnow()
            set_state(State.ERROR, db, job, str(e))
            raise e
def generate_face_based_segmentation(youtube_video_id, images_dir, lbls_dir, faces,
                                     predictor_path, face_rec_model_path, tmp_dir):
    images_raw = glob(os.path.join(images_dir, "*.jpg"))
    images_raw.sort()
    # images_raw = images_raw[0:100]

    images = [
        images_raw[i] for i in range(0, len(images_raw), FRAMES_PER_STEP)
    ]
    print(images)

    timestamps = [i * (FRAMES_PER_STEP / 25.0) for i in range(0, len(images))]
    print(timestamps)

    detector = get_frontal_face_detector()
    sp = dlib.shape_predictor(predictor_path)
    facerec = face_recognition_model_v1(face_rec_model_path)

    embeddings = []
    embeddings_timestamps = []
    landmarks_parts = []
    landmarks_rect = []

    embeddings_pickle = os.path.join(tmp_dir, "embeddings.npy")
    embeddings_timestamps_pickle = os.path.join(tmp_dir, "embeddings_timestamps.npy")

    if not os.path.isfile(embeddings_pickle) or not os.path.isfile(embeddings_timestamps_pickle):
        for frame_no, f in enumerate(images):
            print("Processing file: {}".format(f))
            img = io.imread(f)
            dets = detector(img, 1)
            print("Number of faces detected: {}".format(len(dets)))
            for k, d in enumerate(dets):
                shape = sp(img, d)
                face_descriptor = facerec.compute_face_descriptor(img, shape)
                embeddings.append(face_descriptor)
                embeddings_timestamps.append(timestamps[frame_no])
                landmarks_parts.append(shape.parts())
                landmarks_rect.append(shape.rect)

        embeddings = numpy.array(embeddings)
        embeddings_timestamps = numpy.array(embeddings_timestamps)

        numpy.save(embeddings_pickle, embeddings)
        numpy.save(embeddings_timestamps_pickle, embeddings_timestamps)
    else:
        embeddings = numpy.load(embeddings_pickle)
        embeddings_timestamps = numpy.load(embeddings_timestamps_pickle)

    print(embeddings.shape)
    print(embeddings_timestamps.shape)

    if len(embeddings) == 0:
        Util.write_audacity_labels([], os.path.join(lbls_dir, youtube_video_id + ".image.txt"))
        return

    kmeans = KMeans(n_clusters=faces)
    kmeans.fit(embeddings)
    predictions = numpy.array(kmeans.labels_.tolist())

    df = pd.DataFrame({
        "timestamps": embeddings_timestamps.tolist(),
        "predictions": predictions
    })

    timestamps = []
    classes = []

    for key, group in df.groupby(['timestamps']):
        timestamps.append(key)
        classes.append(",".join([str(i) for i in sorted(group['predictions'].tolist())]))

    lbls = Util.generate_labels_from_classifications(classes, timestamps)
    Util.write_audacity_labels(lbls, os.path.join(lbls_dir, youtube_video_id + ".image.txt"))
import itertools
import os
import pickle

import numpy as np
from sklearn.svm import SVC

import feat
from sac.util import Util

DATASETS = os.path.abspath("../datasets")

features = Util.read_feature_names_from_file(
    os.path.join(DATASETS, "featureplans/featureplan"))

features1 = ["zcr", "flux", "spectral_rollof", "energy_stats"]
features2 = ["mfcc_stats"]
features3 = ["spectral_flatness_per_band"]
features4 = features1 + features2 + features3

data = feat.get_features(features4, DATASETS, pca=False)

TRAIN = ["mirex", "labrosa", "gtzan"]

model = SVC()

X = np.vstack([data["x_" + i] for i in TRAIN])
Y = list(itertools.chain.from_iterable([data["y_" + i] for i in TRAIN]))

model.fit(X, Y)

with open("pickled/model.pickle", 'wb') as f:
    pickle.dump(model, f)
def generate_face_based_segmentation(youtube_video_id, images_dir, lbls_dir, faces,
                                     predictor_path, face_rec_model_path):
    images_raw = glob(os.path.join(images_dir, "*.jpg"))
    images_raw.sort()
    # images_raw = images_raw[0:100]

    images = [
        images_raw[i] for i in range(0, len(images_raw), FRAMES_PER_STEP)
    ]
    print(images)

    timestamps = [i * (FRAMES_PER_STEP / 25.0) for i in range(0, len(images))]
    print(timestamps)

    detector = dlib.get_frontal_face_detector()
    sp = dlib.shape_predictor(predictor_path)
    facerec = dlib.face_recognition_model_v1(face_rec_model_path)

    embeddings = []
    embeddings_timestamps = []
    landmarks_parts = []
    landmarks_rect = []

    for frame_no, f in enumerate(images):
        print("Processing file: {}".format(f))
        img = io.imread(f)
        dets = detector(img, 1)
        print("Number of faces detected: {}".format(len(dets)))
        for k, d in enumerate(dets):
            shape = sp(img, d)
            face_descriptor = facerec.compute_face_descriptor(img, shape)
            embeddings.append(face_descriptor)
            embeddings_timestamps.append(timestamps[frame_no])
            landmarks_parts.append(shape.parts())
            landmarks_rect.append(shape.rect)

    embeddings = numpy.array(embeddings)
    embeddings_timestamps = numpy.array(embeddings_timestamps)

    print(embeddings.shape)
    print(embeddings_timestamps.shape)

    if len(embeddings) == 0:
        Util.write_audacity_labels([], os.path.join(lbls_dir, youtube_video_id + ".txt"))
        return

    kmeans = KMeans(n_clusters=faces)
    kmeans.fit(embeddings)
    predictions = numpy.array(kmeans.labels_.tolist())

    df = pd.DataFrame({
        "timestamps": embeddings_timestamps.tolist(),
        "predictions": predictions
    })

    timestamps = []
    classes = []

    for key, group in df.groupby(['timestamps']):
        timestamps.append(key)
        classes.append(",".join([str(i) for i in sorted(group['predictions'].tolist())]))

    lbls = Util.generate_labels_from_classifications(classes, timestamps)

    json_lbls = []
    for lbl in lbls:
        json_lbls.append({
            "start_seconds": lbl.start_seconds,
            "end_seconds": lbl.end_seconds,
            "label": lbl.label
        })

    with open(os.path.join(lbls_dir, youtube_video_id + ".json"), 'w') as outfile:
        json.dump(json_lbls, outfile)

    Util.write_audacity_labels(lbls, os.path.join(lbls_dir, youtube_video_id + ".txt"))


# if __name__ == '__main__':
#     extract_images_from_video("/Users/nicktgr15/workspace/speaker_diarisation_poc/src/videos/Unamij6z1io.mp4",
#                               "/Users/nicktgr15/workspace/speaker_diarisation_poc/src/video_frames")
#     generate_face_based_segmentation(
#         "Unamij6z1io",
#         "/Users/nicktgr15/workspace/speaker_diarisation_poc/src/video_frames/Unamij6z1io",
#         "/Users/nicktgr15/workspace/speaker_diarisation_poc/src/static/lbls/image",
#         4,
#         "/Users/nicktgr15/workspace/speaker_diarisation_poc/src/models/shape_predictor_68_face_landmarks.dat",
#         "/Users/nicktgr15/workspace/speaker_diarisation_poc/src/models/dlib_face_recognition_resnet_model_v1.dat"
#     )
def generate_audio_based_segmentation(audio_file, w, h, embedding_size, lstm_nodes, dropout,
                                      weights_filename, scaler_filename, window_size, step,
                                      hop_size, youtube_video_id, lbls_dir, clusters=4, sr=16000):
    vad = Vad()
    vad_lbls = vad.detect_voice_segments(audio_file)

    with open(scaler_filename, 'rb') as f:
        scaler = pickle.load(f)

    model, intermediate = get_lstm_siamese((w, h), embedding_size, lstm_nodes, dropout)
    model.load_weights(weights_filename)

    feature_extractor = FeatExtractorMFCC(window_size, hop_size, w, sr, h, step=step)
    X, timestamps = feature_extractor.extract(audio_file)
    timestamps = numpy.array(timestamps)

    window = timestamps[1] - timestamps[0]

    frame_predictions = []
    for k, timestamp in enumerate(timestamps):
        found = False
        for lbl in vad_lbls:
            if lbl.start_seconds <= timestamp <= lbl.end_seconds - window:
                # need the window end to fit in the label
                frame_predictions.append(lbl.label)
                found = True
                break
        if not found:
            frame_predictions.append('non_speech')

    frame_predictions = numpy.array(frame_predictions)
    print(frame_predictions.shape)
    print(timestamps.shape)

    speech_indices = numpy.where(frame_predictions == 'speech')
    X_speech = X[speech_indices]

    X = X_speech.reshape((X_speech.shape[0] * w, h))
    X = scaler.transform(X)
    X = X.reshape(-1, w, h)

    original_embeddings = intermediate.predict(X)

    clustering_algorithm = KMeans(n_clusters=clusters)
    reducted_embeddings = original_embeddings
    predictions = clustering_algorithm.fit_predict(reducted_embeddings)

    for k, speech_index in enumerate(speech_indices[0]):
        frame_predictions[speech_index] = predictions[k]

    lbls = Util.generate_labels_from_classifications(frame_predictions, timestamps)
    Util.write_audacity_labels(lbls, os.path.join(lbls_dir, youtube_video_id + ".audio.txt"))
def test_parse_yaafe_header_double_stats(self):
    header = Util.parse_yaafe_header(self.double_stats_csv)
    self.assertEqual(22050, header['samplerate'])
    self.assertEqual(460800, header['effective_step_size'])
def generate_audio_based_segmentation(audio_file, w, h, embedding_size, lstm_nodes, dropout,
                                      weights_filename, scaler_filename, window_size, step,
                                      hop_size, youtube_video_id, lbls_dir, clusters=4, sr=16000):
    vad = Vad()
    vad_lbls = vad.detect_voice_segments(audio_file)

    # model.load_weights(weights_filename)
    # feature_extractor = FeatExtractorMFCC(window_size, hop_size, w, sr, h, step=step)
    # X, timestamps = feature_extractor.extract(audio_file)
    # timestamps = numpy.array(timestamps)

    y, sr = librosa.load(audio_file, sr=sr)
    X = mfcc(y, sr=sr, n_mfcc=h, n_fft=window_size, hop_length=hop_size)
    timestamps = [k * hop_size / sr for k in range(0, X.shape[1])]
    timestamps = numpy.array(timestamps)

    window = timestamps[1] - timestamps[0]

    frame_predictions = []
    for k, timestamp in enumerate(timestamps):
        found = False
        for lbl in vad_lbls:
            if lbl.start_seconds <= timestamp <= lbl.end_seconds - window:
                # need the window end to fit in the label
                frame_predictions.append(lbl.label)
                found = True
                break
        if not found:
            frame_predictions.append('non_speech')

    frame_predictions = numpy.array(frame_predictions)

    speech_indices = numpy.where(frame_predictions == 'speech')
    X_speech = X[:, speech_indices]
    X_speech = X_speech.reshape((X_speech.shape[0], X_speech.shape[2]))
    print(X_speech.shape)
    X_speech = X_speech.transpose()
    print(X_speech.shape)

    timestamps_speech = timestamps[speech_indices]

    # Util.write_audacity_labels(Util.generate_labels_from_classifications(frame_predictions, timestamps),
    #                            "vad_preds_quant.txt")

    # X = X_speech.reshape((X_speech.shape[0] * w, h))
    # X = scaler.transform(X)
    # X = X.reshape(-1, w, h)
    # original_embeddings = intermediate.predict(X)

    clustering_algorithm = GaussianMixture(n_components=clusters, max_iter=1000, n_init=3)

    # if visualise:
    #     embeddings, y, classes, new_timestamps = Util.get_annotated_data_x_y(timestamps_speech,
    #                                                                          original_embeddings, lbls_fixed)
    #     le = preprocessing.LabelEncoder()
    #     y = le.fit_transform(y)
    #
    #     tsne = TSNE()
    #     two_dimensional = tsne.fit_transform(embeddings)
    #
    #     # pca = PCA(n_components=2)
    #     # two_dimensional = pca.fit_transform(embeddings)
    #
    #     # pca2 = PCA(n_components=20)
    #     # pca_embeddings = pca2.fit_transform(embeddings)
    #
    #     clustering_algorithm.fit(two_dimensional)
    #     predictions = clustering_algorithm.predict(two_dimensional)
    #
    #     # kmeans = KMeans(n_clusters=CLUSTERS)
    #     # kmeans.fit(embeddings)
    #
    #     plt.figure(figsize=(10, 6))
    #     plt.scatter(two_dimensional[:, 0], two_dimensional[:, 1], c=y, marker='.')
    #
    #     plt.figure(figsize=(10, 6))
    #     plt.scatter(two_dimensional[:, 0], two_dimensional[:, 1], c=predictions, marker='.')
    # else:

    tsne = TSNE(n_components=2, init='pca')
    two_dimensional = tsne.fit_transform(X_speech)

    # original_embeddings = scale((original_embeddings))
    # pca = PCA(n_components=2)
    # two_dimensional = pca.fit_transform(original_embeddings)
    # pca2 = PCA(n_components=3)
    # pca_embeddings = pca2.fit_transform(original_embeddings)

    clustering_algorithm.fit(two_dimensional)
    predictions = clustering_algorithm.predict(two_dimensional)

    # kmeans = KMeans(n_clusters=CLUSTERS)
    # kmeans.fit(two_dimensional)
    # plt.figure(figsize=(10, 10))
    # plt.imshow(calculate_similarity_matrix(two_dimensional, metric='euclidean'), cmap='gray')
    # plt.figure(figsize=(10, 6))
    # plt.scatter(two_dimensional[:, 0], two_dimensional[:, 1], c=predictions, marker='.')
    # plt.figure(figsize=(10, 6))
    # plt.scatter(two_dimensional[:, 0], pca_embeddings[:, 1], c=kmeans.labels_.tolist(), marker='.')
    # predictions = kmeans.labels_.tolist()

    for k, speech_index in enumerate(speech_indices[0]):
        frame_predictions[speech_index] = predictions[k]

    lbls = Util.generate_labels_from_classifications(frame_predictions, timestamps)

    json_lbls = []
    for lbl in lbls:
        json_lbls.append({
            "start_seconds": lbl.start_seconds,
            "end_seconds": lbl.end_seconds,
            "label": lbl.label
        })

    with open(os.path.join(lbls_dir, youtube_video_id + ".json"), 'w') as outfile:
        json.dump(json_lbls, outfile)

    Util.write_audacity_labels(lbls, os.path.join(lbls_dir, youtube_video_id + ".txt"))


# if __name__ == '__main__':
#     generate_audio_based_segmentation(
#         "/Users/nicktgr15/workspace/speaker_diarisation_poc/src/audios/Unamij6z1io.wav",
#         15, 20, 256, 128, 0.2,
#         os.path.abspath('models/weights.h5'),
#         os.path.abspath('models/scaler.pickle'),
#         1024, 3, 1024, "xxx",
#         "/Users/nicktgr15/workspace/speaker_diarisation_poc/src/static/lbls/mfcc",
#         clusters=4
#     )
def generate_audio_based_segmentation(audio_file, w, h, embedding_size, lstm_nodes, dropout,
                                      weights_filename, scaler_filename, window_size, step,
                                      hop_size, youtube_video_id, lbls_dir, clusters=4, sr=16000):
    vad = Vad()
    vad_lbls = vad.detect_voice_segments(audio_file)

    with open(scaler_filename, 'rb') as f:
        scaler = pickle.load(f)

    model, intermediate = get_lstm_siamese((w, h), embedding_size, lstm_nodes, dropout)
    model.load_weights(weights_filename)

    feature_extractor = FeatExtractorMFCC(window_size, hop_size, w, sr, h, step=step)
    X, timestamps = feature_extractor.extract(audio_file)
    timestamps = numpy.array(timestamps)

    window = timestamps[1] - timestamps[0]

    frame_predictions = []
    for k, timestamp in enumerate(timestamps):
        found = False
        for lbl in vad_lbls:
            if lbl.start_seconds <= timestamp <= lbl.end_seconds - window:
                # need the window end to fit in the label
                frame_predictions.append(lbl.label)
                found = True
                break
        if not found:
            frame_predictions.append('non_speech')

    frame_predictions = numpy.array(frame_predictions)
    print(frame_predictions.shape)
    print(timestamps.shape)

    speech_indices = numpy.where(frame_predictions == 'speech')
    X_speech = X[speech_indices]
    timestamps_speech = timestamps[speech_indices]

    # Util.write_audacity_labels(Util.generate_labels_from_classifications(frame_predictions, timestamps),
    #                            "vad_preds_quant.txt")

    X = X_speech.reshape((X_speech.shape[0] * w, h))
    X = scaler.transform(X)
    X = X.reshape(-1, w, h)

    original_embeddings = intermediate.predict(X)

    clustering_algorithm = GaussianMixture(n_components=clusters, max_iter=1000, n_init=3)

    # if visualise:
    #     embeddings, y, classes, new_timestamps = Util.get_annotated_data_x_y(timestamps_speech,
    #                                                                          original_embeddings, lbls_fixed)
    #     le = preprocessing.LabelEncoder()
    #     y = le.fit_transform(y)
    #
    #     tsne = TSNE()
    #     two_dimensional = tsne.fit_transform(embeddings)
    #
    #     # pca = PCA(n_components=2)
    #     # two_dimensional = pca.fit_transform(embeddings)
    #
    #     # pca2 = PCA(n_components=20)
    #     # pca_embeddings = pca2.fit_transform(embeddings)
    #
    #     clustering_algorithm.fit(two_dimensional)
    #     predictions = clustering_algorithm.predict(two_dimensional)
    #
    #     # kmeans = KMeans(n_clusters=CLUSTERS)
    #     # kmeans.fit(embeddings)
    #
    #     plt.figure(figsize=(10, 6))
    #     plt.scatter(two_dimensional[:, 0], two_dimensional[:, 1], c=y, marker='.')
    #
    #     plt.figure(figsize=(10, 6))
    #     plt.scatter(two_dimensional[:, 0], two_dimensional[:, 1], c=predictions, marker='.')
    # else:

    tsne = TSNE(n_components=2, init='pca')
    two_dimensional = tsne.fit_transform(original_embeddings)

    # original_embeddings = scale((original_embeddings))
    # pca = PCA(n_components=2)
    # two_dimensional = pca.fit_transform(original_embeddings)
    # pca2 = PCA(n_components=3)
    # pca_embeddings = pca2.fit_transform(original_embeddings)

    clustering_algorithm.fit(two_dimensional)
    predictions = clustering_algorithm.predict(two_dimensional)

    # kmeans = KMeans(n_clusters=CLUSTERS)
    # kmeans.fit(two_dimensional)
    # plt.figure(figsize=(10, 10))
    # plt.imshow(calculate_similarity_matrix(two_dimensional, metric='euclidean'), cmap='gray')
    # plt.figure(figsize=(10, 6))
    # plt.scatter(two_dimensional[:, 0], two_dimensional[:, 1], c=predictions, marker='.')
    # plt.figure(figsize=(10, 6))
    # plt.scatter(two_dimensional[:, 0], pca_embeddings[:, 1], c=kmeans.labels_.tolist(), marker='.')
    # predictions = kmeans.labels_.tolist()

    for k, speech_index in enumerate(speech_indices[0]):
        frame_predictions[speech_index] = predictions[k]

    lbls = Util.generate_labels_from_classifications(frame_predictions, timestamps)

    json_lbls = []
    for lbl in lbls:
        json_lbls.append({
            "start_seconds": lbl.start_seconds,
            "end_seconds": lbl.end_seconds,
            "label": lbl.label
        })

    with open(os.path.join(lbls_dir, youtube_video_id + ".json"), 'w') as outfile:
        json.dump(json_lbls, outfile)

    Util.write_audacity_labels(lbls, os.path.join(lbls_dir, youtube_video_id + ".txt"))