def main():
    parser = argparse.ArgumentParser(description='Process some integers.')
    parser.add_argument('--input-file', dest='input_file', required=True)
    args = parser.parse_args()

    input_dir = os.path.split(args.input_file)[0]
    temp_file = input_dir + "/temp.wav"

    # resample the input to 22050 Hz mono 16-bit PCM before feature extraction
    cmd = [
        "/usr/bin/ffmpeg", "-i", args.input_file, "-ar", "22050", "-ac", "1",
        "-acodec", "pcm_s16le", temp_file, "-y"
    ]
    subprocess.check_call(cmd)

    # extract low-level features with yaafe
    cmd = ["yaafe", "-c", FEATURE_PLAN, "-r", "22050", temp_file]
    subprocess.check_output(cmd)

    features1 = ["zcr", "flux", "spectral_rollof", "energy_stats"]
    features2 = ["mfcc_stats"]
    features3 = ["spectral_flatness_per_band"]
    features4 = features1 + features2 + features3

    FEATURE_GROUPS = [features1, features2, features3, features4]

    peaks, convolution_values, timestamps = feat.get_combined_peaks(
        temp_file, FEATURE_GROUPS, kernel_type="gaussian")
    detected_segments = kernel.calculate_segment_start_end_times_from_peak_positions(
        peaks, timestamps)

    timestamps, feature_vectors = feat.read_features(features4, temp_file, scale=True)

    # pickled models have to be opened in binary mode
    with open("/opt/speech-music-discrimination/pickled/model.pickle", 'rb') as f:
        trained_model = pickle.load(f)

    frame_level_predictions = trained_model.predict(feature_vectors)

    annotated_segments = Util.get_annotated_labels_from_predictions_and_sm_segments(
        frame_level_predictions, detected_segments, timestamps)
    annotated_segments = Util.combine_adjacent_labels_of_the_same_class(annotated_segments)
    annotated_segments = feat.filter_noisy_labels(annotated_segments)
    annotated_segments = Util.combine_adjacent_labels_of_the_same_class(annotated_segments)

    Util.write_audacity_labels(annotated_segments, input_dir + "/annotated-segments.txt")

    # clean up the intermediate feature files and the temporary wav
    for f in glob.glob(input_dir + "/*.csv"):
        os.remove(f)
    os.remove(temp_file)
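# Illustrative sketch (not part of the project code): feat.get_combined_peaks() is assumed to
# convolve a per-feature novelty curve with a Gaussian kernel and keep local maxima as candidate
# segment boundaries. The helper below is a minimal, self-contained approximation of that idea;
# the function name and parameters are hypothetical.
import numpy
from scipy.signal import find_peaks


def gaussian_peak_candidates(novelty, kernel_size=31, sigma=5.0, min_distance=10):
    """Smooth a 1-D novelty curve with a Gaussian kernel and return indices of local maxima."""
    x = numpy.arange(kernel_size) - kernel_size // 2
    kernel = numpy.exp(-0.5 * (x / sigma) ** 2)
    kernel /= kernel.sum()
    smoothed = numpy.convolve(novelty, kernel, mode="same")
    peaks, _ = find_peaks(smoothed, distance=min_distance)
    return peaks, smoothed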
def main():
    subprocess.check_output(["unzip", "muspeak-mirex2015-detection-examples.zip",
                             "-d", "muspeak-mirex2015-detection-examples"])

    mp3_to_wav()

    wav_files = glob.glob("./muspeak-mirex2015-detection-examples/*.wav")

    for wav_file in wav_files:
        print(wav_file)
        label_file = wav_file.replace(".mp3.wav", ".csv")
        if not os.path.isfile(label_file):
            label_file = label_file.replace(".csv", "_v2.csv")
        WavEditor.create_audio_segments(label_file, wav_file, "segments", True, ",", "f2",
                                        remove_overlapping=True)

    speech_wavs = glob.glob("./segments/*_s.wav")
    music_wavs = glob.glob("./segments/*_m.wav")

    all_files_dict = {}

    for f in speech_wavs:
        all_files_dict[f] = "s"
    for f in music_wavs:
        all_files_dict[f] = "m"

    random.seed(2222)
    # random.sample needs a sequence, so materialise the keys first
    all_files_random_keys = random.sample(list(all_files_dict.keys()), len(all_files_dict))

    last_seconds = 0
    files_to_concatenate = []
    labels = []

    for v in all_files_random_keys:
        duration = float(subprocess.check_output(["soxi", "-D", v]).strip())
        segment_start_time = last_seconds
        segment_end_time = last_seconds + duration
        last_seconds += duration
        labels.append(AudacityLabel(segment_start_time, segment_end_time, all_files_dict[v]))
        files_to_concatenate.append(v)

    audacity_labels = Util.combine_adjacent_labels_of_the_same_class(labels)
    Util.write_audacity_labels(audacity_labels, "mirex_combined.txt")

    command = []
    command.append("sox")
    command.extend(files_to_concatenate)
    command.append("mirex_combined.wav")

    subprocess.check_output(command)

    shutil.rmtree("./segments")
    shutil.rmtree("./muspeak-mirex2015-detection-examples")
def calculate_fusion(youtube_video_id, lbls_dir, audio_lbls, image_lbls, duration,
                     step=0.1, neighbours_before_after=6, times_greater=2):  # 100ms
    pairs, timestamps = create_pairs(audio_lbls, image_lbls, duration, step)
    mapping_face_to_voice = detect_face_voice_mapping(pairs)
    # print(mapping_face_to_voice)
    pairs = apply_mapping_to_pairs(pairs, mapping_face_to_voice)
    # print(pairs)

    seconds_of_mismatch = 0
    for k, pair in enumerate(pairs):
        if pair.image_class is None:  # when image is None
            continue
        classes = pair.image_class.split(",")
        # if only one face has been detected then assume it's the face of the speaker
        if len(classes) == 1 and pair.audio_class != 'non_speech':
            if pair.image_class != pair.audio_class:
                seconds_of_mismatch += step
                # print("%s != %s" % (pair.image_class, pair.audio_class))
                nearest_neighbour_class = find_nearest_neighbours_class(
                    k, pairs, neighbours_before_after=neighbours_before_after,
                    times_greater=times_greater)
                pair.audio_class = nearest_neighbour_class

    lbls = Util.generate_labels_from_classifications([p.audio_class for p in pairs], timestamps)
    lbls = list(filter(lambda x: x.label is not None, lbls))

    Util.write_audacity_labels(lbls, os.path.join(lbls_dir, youtube_video_id + ".fusion.txt"))

    return mapping_face_to_voice
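# Illustrative sketch (hypothetical, not the project's find_nearest_neighbours_class): the
# smoothing above is assumed to look at a window of pairs around index k and fall back to the
# dominant audio class in that neighbourhood when it clearly outweighs the alternatives. The
# interpretation of times_greater as a majority margin is an assumption.
from collections import Counter


def majority_class_in_neighbourhood(k, pairs, neighbours_before_after=6, times_greater=2):
    """Return the dominant audio class around index k, or the current class if there is no
    sufficiently clear majority."""
    start = max(0, k - neighbours_before_after)
    end = min(len(pairs), k + neighbours_before_after + 1)
    neighbourhood = [p.audio_class for i, p in enumerate(pairs[start:end], start)
                     if i != k and p.audio_class is not None]
    if not neighbourhood:
        return pairs[k].audio_class
    counts = Counter(neighbourhood).most_common()
    top_class, top_count = counts[0]
    runner_up = counts[1][1] if len(counts) > 1 else 0
    # accept the majority only if it is at least `times_greater` times the runner-up
    if top_count >= times_greater * max(runner_up, 1):
        return top_class
    return pairs[k].audio_class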
def calculate_fusion(youtube_video_id, lbls_dir, audio_lbls, image_lbls, duration, step=0.1):  # 100ms
    pairs, timestamps = create_pairs(audio_lbls, image_lbls, duration, step)
    mapping_face_to_voice = detect_face_voice_mapping(pairs)
    print(mapping_face_to_voice)
    pairs = apply_mapping_to_pairs(pairs, mapping_face_to_voice)
    print(pairs)

    for k, pair in enumerate(pairs):
        if pair.image_class is None:  # when image is None
            continue
        classes = pair.image_class.split(",")
        if len(classes) == 1 and pair.audio_class != 'non_speech':
            if pair.image_class != pair.audio_class:
                # print("%s != %s" % (pair.image_class, pair.audio_class))
                nearest_neighbour_class = find_nearest_neighbours_class(k, pairs)
                pair.audio_class = nearest_neighbour_class

    print(pairs)

    lbls = Util.generate_labels_from_classifications([p.audio_class for p in pairs], timestamps)
    # materialise the filter so the labels can be iterated for JSON and written out below
    lbls = list(filter(lambda x: x.label is not None, lbls))

    json_lbls = []
    for lbl in lbls:
        json_lbls.append({
            "start_seconds": lbl.start_seconds,
            "end_seconds": lbl.end_seconds,
            "label": lbl.label
        })

    with open(os.path.join(lbls_dir, youtube_video_id + ".json"), 'w') as outfile:
        json.dump(json_lbls, outfile)

    Util.write_audacity_labels(lbls, os.path.join(lbls_dir, youtube_video_id + ".txt"))

    return mapping_face_to_voice
def generate_audio_based_segmentation(audio_file, w, h, embedding_size, lstm_nodes, dropout,
                                      weights_filename, scaler_filename, window_size, step,
                                      hop_size, youtube_video_id, lbls_dir, clusters=4, sr=16000):
    vad = Vad()
    vad_lbls = vad.detect_voice_segments(audio_file)

    # model.load_weights(weights_filename)
    # feature_extractor = FeatExtractorMFCC(window_size, hop_size, w, sr, h, step=step)
    # X, timestamps = feature_extractor.extract(audio_file)
    # timestamps = numpy.array(timestamps)

    y, sr = librosa.load(audio_file, sr=sr)
    X = mfcc(y, sr=sr, n_mfcc=h, n_fft=window_size, hop_length=hop_size)
    timestamps = [k * hop_size / sr for k in range(0, X.shape[1])]
    timestamps = numpy.array(timestamps)
    window = timestamps[1] - timestamps[0]

    # frame-level VAD decisions: 'speech' / 'non_speech' per MFCC frame
    frame_predictions = []
    for k, timestamp in enumerate(timestamps):
        found = False
        for lbl in vad_lbls:
            if lbl.start_seconds <= timestamp <= lbl.end_seconds - window:  # need the window end to fit in the label
                frame_predictions.append(lbl.label)
                found = True
                break
        if not found:
            frame_predictions.append('non_speech')

    frame_predictions = numpy.array(frame_predictions)

    speech_indices = numpy.where(frame_predictions == 'speech')
    X_speech = X[:, speech_indices]
    X_speech = X_speech.reshape((X_speech.shape[0], X_speech.shape[2]))
    print(X_speech.shape)
    X_speech = X_speech.transpose()
    print(X_speech.shape)
    timestamps_speech = timestamps[speech_indices]

    # Util.write_audacity_labels(Util.generate_labels_from_classifications(frame_predictions, timestamps),
    #                            "vad_preds_quant.txt")

    # X = X_speech.reshape((X_speech.shape[0] * w, h))
    # X = scaler.transform(X)
    # X = X.reshape(-1, w, h)
    # original_embeddings = intermediate.predict(X)

    clustering_algorithm = GaussianMixture(n_components=clusters, max_iter=1000, n_init=3)

    # if visualise:
    #
    #     embeddings, y, classes, new_timestamps = Util.get_annotated_data_x_y(timestamps_speech, original_embeddings,
    #                                                                          lbls_fixed)
    #     le = preprocessing.LabelEncoder()
    #     y = le.fit_transform(y)
    #
    #     tsne = TSNE()
    #     two_dimensional = tsne.fit_transform(embeddings)
    #
    #     # pca = PCA(n_components=2)
    #     # two_dimensional = pca.fit_transform(embeddings)
    #
    #     # pca2 = PCA(n_components=20)
    #     # pca_embeddings = pca2.fit_transform(embeddings)
    #
    #     clustering_algorithm.fit(two_dimensional)
    #     predictions = clustering_algorithm.predict(two_dimensional)
    #
    #     # kmeans = KMeans(n_clusters=CLUSTERS)
    #     # kmeans.fit(embeddings)
    #
    #     plt.figure(figsize=(10, 6))
    #     plt.scatter(two_dimensional[:, 0], two_dimensional[:, 1], c=y, marker='.')
    #
    #     plt.figure(figsize=(10, 6))
    #     plt.scatter(two_dimensional[:, 0], two_dimensional[:, 1], c=predictions, marker='.')
    #
    # else:

    # cluster a 2-D t-SNE projection of the speech MFCC frames with a GMM
    tsne = TSNE(n_components=2, init='pca')
    two_dimensional = tsne.fit_transform(X_speech)

    # original_embeddings = scale((original_embeddings))
    # pca = PCA(n_components=2)
    # two_dimensional = pca.fit_transform(original_embeddings)
    # pca2 = PCA(n_components=3)
    # pca_embeddings = pca2.fit_transform(original_embeddings)

    clustering_algorithm.fit(two_dimensional)
    predictions = clustering_algorithm.predict(two_dimensional)

    # kmeans = KMeans(n_clusters=CLUSTERS)
    # kmeans.fit(two_dimensional)

    # plt.figure(figsize=(10, 10))
    # plt.imshow(calculate_similarity_matrix(two_dimensional, metric='euclidean'), cmap='gray')
    # plt.figure(figsize=(10, 6))
    # plt.scatter(two_dimensional[:, 0], two_dimensional[:, 1], c=predictions, marker='.')
    # plt.figure(figsize=(10, 6))
    # plt.scatter(two_dimensional[:, 0], pca_embeddings[:, 1], c=kmeans.labels_.tolist(), marker='.')
    # predictions = kmeans.labels_.tolist()

    # overwrite the speech frames with their cluster ids (speaker labels)
    for k, speech_index in enumerate(speech_indices[0]):
        frame_predictions[speech_index] = predictions[k]

    lbls = Util.generate_labels_from_classifications(frame_predictions, timestamps)

    json_lbls = []
    for lbl in lbls:
        json_lbls.append({
            "start_seconds": lbl.start_seconds,
            "end_seconds": lbl.end_seconds,
            "label": lbl.label
        })

    with open(os.path.join(lbls_dir, youtube_video_id + ".json"), 'w') as outfile:
        json.dump(json_lbls, outfile)

    Util.write_audacity_labels(lbls, os.path.join(lbls_dir, youtube_video_id + ".txt"))


# if __name__ == '__main__':
#     generate_audio_based_segmentation(
#         "/Users/nicktgr15/workspace/speaker_diarisation_poc/src/audios/Unamij6z1io.wav",
#         15, 20, 256, 128, 0.2,
#         os.path.abspath('models/weights.h5'),
#         os.path.abspath('models/scaler.pickle'),
#         1024, 3, 1024, "xxx",
#         "/Users/nicktgr15/workspace/speaker_diarisation_poc/src/static/lbls/mfcc",
#         clusters=4
#     )
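# Illustrative sketch (hypothetical, not the project's Util.generate_labels_from_classifications):
# the call above is assumed to collapse runs of consecutive frames that share a class into single
# (start_seconds, end_seconds, label) segments, roughly as follows.
def collapse_frames_to_segments(frame_classes, frame_timestamps):
    """Group consecutive identical frame classes into (start, end, label) tuples."""
    if len(frame_classes) == 0:
        return []
    frame_length = frame_timestamps[1] - frame_timestamps[0] if len(frame_timestamps) > 1 else 0.0
    segments = []
    start = frame_timestamps[0]
    current = frame_classes[0]
    for cls, ts in zip(frame_classes[1:], frame_timestamps[1:]):
        if cls != current:
            segments.append((start, ts, current))
            start = ts
            current = cls
    segments.append((start, frame_timestamps[-1] + frame_length, current))
    return segments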
def generate_face_based_segmentation(youtube_video_id, images_dir, lbls_dir, faces,
                                     predictor_path, face_rec_model_path):
    images_raw = glob(os.path.join(images_dir, "*.jpg"))
    images_raw.sort()
    # images_raw = images_raw[0:100]
    images = [images_raw[i] for i in range(0, len(images_raw), FRAMES_PER_STEP)]
    print(images)
    timestamps = [i * (FRAMES_PER_STEP / 25.0) for i in range(0, len(images))]
    print(timestamps)

    detector = dlib.get_frontal_face_detector()
    sp = dlib.shape_predictor(predictor_path)
    facerec = dlib.face_recognition_model_v1(face_rec_model_path)

    embeddings = []
    embeddings_timestamps = []
    landmarks_parts = []
    landmarks_rect = []

    for frame_no, f in enumerate(images):
        print("Processing file: {}".format(f))
        img = io.imread(f)
        dets = detector(img, 1)
        print("Number of faces detected: {}".format(len(dets)))
        for k, d in enumerate(dets):
            shape = sp(img, d)
            face_descriptor = facerec.compute_face_descriptor(img, shape)
            embeddings.append(face_descriptor)
            embeddings_timestamps.append(timestamps[frame_no])
            landmarks_parts.append(shape.parts())
            landmarks_rect.append(shape.rect)

    embeddings = numpy.array(embeddings)
    embeddings_timestamps = numpy.array(embeddings_timestamps)

    print(embeddings.shape)
    print(embeddings_timestamps.shape)

    if len(embeddings) == 0:
        Util.write_audacity_labels([], os.path.join(lbls_dir, youtube_video_id + ".txt"))
        return

    kmeans = KMeans(n_clusters=faces)
    kmeans.fit(embeddings)
    predictions = numpy.array(kmeans.labels_.tolist())

    df = pd.DataFrame({
        "timestamps": embeddings_timestamps.tolist(),
        "predictions": predictions
    })

    timestamps = []
    classes = []
    for key, group in df.groupby(['timestamps']):
        timestamps.append(key)
        classes.append(",".join([str(i) for i in sorted(group['predictions'].tolist())]))

    lbls = Util.generate_labels_from_classifications(classes, timestamps)

    json_lbls = []
    for lbl in lbls:
        json_lbls.append({
            "start_seconds": lbl.start_seconds,
            "end_seconds": lbl.end_seconds,
            "label": lbl.label
        })

    with open(os.path.join(lbls_dir, youtube_video_id + ".json"), 'w') as outfile:
        json.dump(json_lbls, outfile)

    Util.write_audacity_labels(lbls, os.path.join(lbls_dir, youtube_video_id + ".txt"))


# if __name__ == '__main__':
#
#     # extract_images_from_video("/Users/nicktgr15/workspace/speaker_diarisation_poc/src/videos/Unamij6z1io.mp4",
#     #                           "/Users/nicktgr15/workspace/speaker_diarisation_poc/src/video_frames")
#
#     generate_face_based_segmentation(
#         "Unamij6z1io",
#         "/Users/nicktgr15/workspace/speaker_diarisation_poc/src/video_frames/Unamij6z1io",
#         "/Users/nicktgr15/workspace/speaker_diarisation_poc/src/static/lbls/image",
#         4,
#         "/Users/nicktgr15/workspace/speaker_diarisation_poc/src/models/shape_predictor_68_face_landmarks.dat",
#         "/Users/nicktgr15/workspace/speaker_diarisation_poc/src/models/dlib_face_recognition_resnet_model_v1.dat"
#     )
    all_files_dict[f] = "m"

random.seed(1111)
# random.sample needs a sequence, so materialise the keys first
all_files_random_keys = random.sample(list(all_files_dict.keys()), len(all_files_dict))

last_seconds = 0
files_to_concatenate = []
labels = []

for v in all_files_random_keys:
    duration = float(subprocess.check_output(["soxi", "-D", v]).strip())
    segment_start_time = last_seconds
    segment_end_time = last_seconds + duration
    last_seconds += duration
    labels.append(AudacityLabel(segment_start_time, segment_end_time, all_files_dict[v]))
    files_to_concatenate.append(v)

audacity_labels = Util.combine_adjacent_labels_of_the_same_class(labels)
Util.write_audacity_labels(audacity_labels, "gtzan_combined.txt")

command = []
command.append("sox")
command.extend(files_to_concatenate)
command.append("gtzan_combined.wav")

subprocess.check_output(command)

subprocess.call(['chmod', '-R', '777', './music_speech'])
shutil.rmtree("./music_speech")
    all_files_dict[f] = "m"

random.seed(1111)
# random.sample needs a sequence, so materialise the keys first
all_files_random_keys = random.sample(list(all_files_dict.keys()), len(all_files_dict))

last_seconds = 0
files_to_concatenate = []
labels = []

for v in all_files_random_keys:
    duration = float(subprocess.check_output(["soxi", "-D", v]).strip())
    segment_start_time = last_seconds
    segment_end_time = last_seconds + duration
    last_seconds += duration
    labels.append(AudacityLabel(segment_start_time, segment_end_time, all_files_dict[v]))
    files_to_concatenate.append(v)

audacity_labels = Util.combine_adjacent_labels_of_the_same_class(labels)
Util.write_audacity_labels(audacity_labels, "labrosa_combined.txt")

command = []
command.append("sox")
command.extend(files_to_concatenate)
command.append("labrosa_combined.wav")

subprocess.check_output(command)

subprocess.call(['chmod', '-R', '777', './music-speech'])
shutil.rmtree("./music-speech")
def generate_audio_based_segmentation(audio_file, w, h, embedding_size, lstm_nodes, dropout,
                                      weights_filename, scaler_filename, window_size, step,
                                      hop_size, youtube_video_id, lbls_dir, clusters=4, sr=16000):
    vad = Vad()
    vad_lbls = vad.detect_voice_segments(audio_file)

    with open(scaler_filename, 'rb') as f:
        scaler = pickle.load(f)

    model, intermediate = get_lstm_siamese((w, h), embedding_size, lstm_nodes, dropout)
    model.load_weights(weights_filename)

    feature_extractor = FeatExtractorMFCC(window_size, hop_size, w, sr, h, step=step)
    X, timestamps = feature_extractor.extract(audio_file)
    timestamps = numpy.array(timestamps)
    window = timestamps[1] - timestamps[0]

    frame_predictions = []
    for k, timestamp in enumerate(timestamps):
        found = False
        for lbl in vad_lbls:
            if lbl.start_seconds <= timestamp <= lbl.end_seconds - window:  # need the window end to fit in the label
                frame_predictions.append(lbl.label)
                found = True
                break
        if not found:
            frame_predictions.append('non_speech')

    frame_predictions = numpy.array(frame_predictions)
    print(frame_predictions.shape)
    print(timestamps.shape)

    speech_indices = numpy.where(frame_predictions == 'speech')
    X_speech = X[speech_indices]

    X = X_speech.reshape((X_speech.shape[0] * w, h))
    X = scaler.transform(X)
    X = X.reshape(-1, w, h)

    original_embeddings = intermediate.predict(X)

    clustering_algorithm = KMeans(n_clusters=clusters)
    reducted_embeddings = original_embeddings
    predictions = clustering_algorithm.fit_predict(reducted_embeddings)

    for k, speech_index in enumerate(speech_indices[0]):
        frame_predictions[speech_index] = predictions[k]

    lbls = Util.generate_labels_from_classifications(frame_predictions, timestamps)

    Util.write_audacity_labels(lbls, os.path.join(lbls_dir, youtube_video_id + ".audio.txt"))
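# Illustrative sketch (assumption): the reshape / scale / reshape step above applies a scaler that
# was fitted on individual h-dimensional MFCC frames, so each (w, h) window is flattened to frames
# before scaling and restored afterwards. Hypothetical standalone example with a StandardScaler:
import numpy
from sklearn.preprocessing import StandardScaler

windows = numpy.random.rand(100, 15, 20)  # (n_windows, w, h)
frame_scaler = StandardScaler().fit(windows.reshape(-1, windows.shape[2]))
scaled_windows = frame_scaler.transform(windows.reshape(-1, windows.shape[2])).reshape(windows.shape)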
for f in music_wavs:
    all_files_dict[f] = "m"

random.seed(1111)
# random.sample needs a sequence, so materialise the keys first
all_files_random_keys = random.sample(list(all_files_dict.keys()), len(all_files_dict))

last_seconds = 0
files_to_concatenate = []
labels = []

for v in all_files_random_keys:
    duration = float(subprocess.check_output(["soxi", "-D", v]).strip())
    segment_start_time = last_seconds
    segment_end_time = last_seconds + duration
    last_seconds += duration
    labels.append(AudacityLabel(segment_start_time, segment_end_time, all_files_dict[v]))
    files_to_concatenate.append(v)

audacity_labels = Util.combine_adjacent_labels_of_the_same_class(labels)
Util.write_audacity_labels(audacity_labels, "gtzan_combined.txt")

command = []
command.append("sox")
command.extend(files_to_concatenate)
command.append("gtzan_combined.wav")

subprocess.check_output(command)

subprocess.call(['chmod', '-R', '777', './music_speech'])
shutil.rmtree("./music_speech")
def generate_face_based_segmentation(youtube_video_id, images_dir, lbls_dir, faces,
                                     predictor_path, face_rec_model_path, tmp_dir):
    images_raw = glob(os.path.join(images_dir, "*.jpg"))
    images_raw.sort()
    # images_raw = images_raw[0:100]
    images = [images_raw[i] for i in range(0, len(images_raw), FRAMES_PER_STEP)]
    print(images)
    timestamps = [i * (FRAMES_PER_STEP / 25.0) for i in range(0, len(images))]
    print(timestamps)

    detector = get_frontal_face_detector()
    sp = dlib.shape_predictor(predictor_path)
    facerec = face_recognition_model_v1(face_rec_model_path)

    embeddings = []
    embeddings_timestamps = []
    landmarks_parts = []
    landmarks_rect = []

    embeddings_pickle = os.path.join(tmp_dir, "embeddings.npy")
    embeddings_timestamps_pickle = os.path.join(tmp_dir, "embeddings_timestamps.npy")

    if not os.path.isfile(embeddings_pickle) or not os.path.isfile(embeddings_timestamps_pickle):
        for frame_no, f in enumerate(images):
            print("Processing file: {}".format(f))
            img = io.imread(f)
            dets = detector(img, 1)
            print("Number of faces detected: {}".format(len(dets)))
            for k, d in enumerate(dets):
                shape = sp(img, d)
                face_descriptor = facerec.compute_face_descriptor(img, shape)
                embeddings.append(face_descriptor)
                embeddings_timestamps.append(timestamps[frame_no])
                landmarks_parts.append(shape.parts())
                landmarks_rect.append(shape.rect)

        embeddings = numpy.array(embeddings)
        embeddings_timestamps = numpy.array(embeddings_timestamps)
        numpy.save(embeddings_pickle, embeddings)
        numpy.save(embeddings_timestamps_pickle, embeddings_timestamps)
    else:
        embeddings = numpy.load(embeddings_pickle)
        embeddings_timestamps = numpy.load(embeddings_timestamps_pickle)

    print(embeddings.shape)
    print(embeddings_timestamps.shape)

    if len(embeddings) == 0:
        Util.write_audacity_labels([], os.path.join(lbls_dir, youtube_video_id + ".image.txt"))
        return

    kmeans = KMeans(n_clusters=faces)
    kmeans.fit(embeddings)
    predictions = numpy.array(kmeans.labels_.tolist())

    df = pd.DataFrame({
        "timestamps": embeddings_timestamps.tolist(),
        "predictions": predictions
    })

    timestamps = []
    classes = []
    for key, group in df.groupby(['timestamps']):
        timestamps.append(key)
        classes.append(",".join([str(i) for i in sorted(group['predictions'].tolist())]))

    lbls = Util.generate_labels_from_classifications(classes, timestamps)

    Util.write_audacity_labels(lbls, os.path.join(lbls_dir, youtube_video_id + ".image.txt"))
def generate_audio_based_segmentation(audio_file, w, h, embedding_size, lstm_nodes, dropout,
                                      weights_filename, scaler_filename, window_size, step,
                                      hop_size, youtube_video_id, lbls_dir, clusters=4, sr=16000):
    vad = Vad()
    vad_lbls = vad.detect_voice_segments(audio_file)

    with open(scaler_filename, 'rb') as f:
        scaler = pickle.load(f)

    model, intermediate = get_lstm_siamese((w, h), embedding_size, lstm_nodes, dropout)
    model.load_weights(weights_filename)

    feature_extractor = FeatExtractorMFCC(window_size, hop_size, w, sr, h, step=step)
    X, timestamps = feature_extractor.extract(audio_file)
    timestamps = numpy.array(timestamps)
    window = timestamps[1] - timestamps[0]

    frame_predictions = []
    for k, timestamp in enumerate(timestamps):
        found = False
        for lbl in vad_lbls:
            if lbl.start_seconds <= timestamp <= lbl.end_seconds - window:  # need the window end to fit in the label
                frame_predictions.append(lbl.label)
                found = True
                break
        if not found:
            frame_predictions.append('non_speech')

    frame_predictions = numpy.array(frame_predictions)
    print(frame_predictions.shape)
    print(timestamps.shape)

    speech_indices = numpy.where(frame_predictions == 'speech')
    X_speech = X[speech_indices]
    timestamps_speech = timestamps[speech_indices]

    # Util.write_audacity_labels(Util.generate_labels_from_classifications(frame_predictions, timestamps),
    #                            "vad_preds_quant.txt")

    X = X_speech.reshape((X_speech.shape[0] * w, h))
    X = scaler.transform(X)
    X = X.reshape(-1, w, h)

    original_embeddings = intermediate.predict(X)

    clustering_algorithm = GaussianMixture(n_components=clusters, max_iter=1000, n_init=3)

    # if visualise:
    #
    #     embeddings, y, classes, new_timestamps = Util.get_annotated_data_x_y(timestamps_speech, original_embeddings,
    #                                                                          lbls_fixed)
    #     le = preprocessing.LabelEncoder()
    #     y = le.fit_transform(y)
    #
    #     tsne = TSNE()
    #     two_dimensional = tsne.fit_transform(embeddings)
    #
    #     # pca = PCA(n_components=2)
    #     # two_dimensional = pca.fit_transform(embeddings)
    #
    #     # pca2 = PCA(n_components=20)
    #     # pca_embeddings = pca2.fit_transform(embeddings)
    #
    #     clustering_algorithm.fit(two_dimensional)
    #     predictions = clustering_algorithm.predict(two_dimensional)
    #
    #     # kmeans = KMeans(n_clusters=CLUSTERS)
    #     # kmeans.fit(embeddings)
    #
    #     plt.figure(figsize=(10, 6))
    #     plt.scatter(two_dimensional[:, 0], two_dimensional[:, 1], c=y, marker='.')
    #
    #     plt.figure(figsize=(10, 6))
    #     plt.scatter(two_dimensional[:, 0], two_dimensional[:, 1], c=predictions, marker='.')
    #
    # else:

    tsne = TSNE(n_components=2, init='pca')
    two_dimensional = tsne.fit_transform(original_embeddings)

    # original_embeddings = scale((original_embeddings))
    # pca = PCA(n_components=2)
    # two_dimensional = pca.fit_transform(original_embeddings)
    # pca2 = PCA(n_components=3)
    # pca_embeddings = pca2.fit_transform(original_embeddings)

    clustering_algorithm.fit(two_dimensional)
    predictions = clustering_algorithm.predict(two_dimensional)

    # kmeans = KMeans(n_clusters=CLUSTERS)
    # kmeans.fit(two_dimensional)

    # plt.figure(figsize=(10, 10))
    # plt.imshow(calculate_similarity_matrix(two_dimensional, metric='euclidean'), cmap='gray')
    # plt.figure(figsize=(10, 6))
    # plt.scatter(two_dimensional[:, 0], two_dimensional[:, 1], c=predictions, marker='.')
    # plt.figure(figsize=(10, 6))
    # plt.scatter(two_dimensional[:, 0], pca_embeddings[:, 1], c=kmeans.labels_.tolist(), marker='.')
    # predictions = kmeans.labels_.tolist()

    for k, speech_index in enumerate(speech_indices[0]):
        frame_predictions[speech_index] = predictions[k]

    lbls = Util.generate_labels_from_classifications(frame_predictions, timestamps)

    json_lbls = []
    for lbl in lbls:
        json_lbls.append({
            "start_seconds": lbl.start_seconds,
            "end_seconds": lbl.end_seconds,
            "label": lbl.label
        })

    with open(os.path.join(lbls_dir, youtube_video_id + ".json"), 'w') as outfile:
        json.dump(json_lbls, outfile)

    Util.write_audacity_labels(lbls, os.path.join(lbls_dir, youtube_video_id + ".txt"))
for f in music_wavs:
    all_files_dict[f] = "m"

random.seed(1111)
# random.sample needs a sequence, so materialise the keys first
all_files_random_keys = random.sample(list(all_files_dict.keys()), len(all_files_dict))

last_seconds = 0
files_to_concatenate = []
labels = []

for v in all_files_random_keys:
    duration = float(subprocess.check_output(["soxi", "-D", v]).strip())
    segment_start_time = last_seconds
    segment_end_time = last_seconds + duration
    last_seconds += duration
    labels.append(AudacityLabel(segment_start_time, segment_end_time, all_files_dict[v]))
    files_to_concatenate.append(v)

audacity_labels = Util.combine_adjacent_labels_of_the_same_class(labels)
Util.write_audacity_labels(audacity_labels, "labrosa_combined.txt")

command = []
command.append("sox")
command.extend(files_to_concatenate)
command.append("labrosa_combined.wav")

subprocess.check_output(command)

subprocess.call(['chmod', '-R', '777', './music-speech'])
shutil.rmtree("./music-speech")