def perform_clustering(embeddings, time_stamps, speakers, audio_rttm_map, out_rttm_dir):
    """
    Performs spectral clustering on embeddings with time stamps generated from VAD output.

    Args:
        embeddings (dict): embeddings keyed by unique_id
        time_stamps (dict): list of time-stamp lines for each audio recording
        speakers (dict): number of speakers for each audio recording
        audio_rttm_map (dict): AUDIO_RTTM_MAP mapping each unique id to its audio file path and rttm path
        out_rttm_dir (str): path to write predicted rttms to

    Returns:
        all_reference (list[Annotation]): reference annotations for score calculation
        all_hypothesis (list[Annotation]): hypothesis annotations for score calculation
    """
    all_hypothesis = []
    all_reference = []
    no_references = False

    for uniq_key in embeddings.keys():
        num_speakers = speakers[uniq_key]
        if num_speakers >= 2:
            emb = np.asarray(embeddings[uniq_key])

            cluster_method = SpectralClusterer(min_clusters=num_speakers, max_clusters=num_speakers)
            cluster_labels = cluster_method.predict(emb)

            # Append a speaker tag to each VAD time-stamp line
            lines = time_stamps[uniq_key]
            assert len(cluster_labels) == len(lines)
            for idx, label in enumerate(cluster_labels):
                tag = 'speaker_' + str(label)
                lines[idx] += tag

            cont_stamps = get_contiguous_stamps(lines)
            labels = merge_stamps(cont_stamps)
            if out_rttm_dir:
                labels_to_rttmfile(labels, uniq_key, out_rttm_dir)
            hypothesis = labels_to_pyannote_object(labels)
            all_hypothesis.append(hypothesis)

            # Collect references only while every recording has an rttm file
            rttm_file = audio_rttm_map[uniq_key]['rttm_path']
            if os.path.exists(rttm_file) and not no_references:
                ref_labels = rttm_to_labels(rttm_file)
                reference = labels_to_pyannote_object(ref_labels)
                all_reference.append(reference)
            else:
                no_references = True
                all_reference = []

    return all_reference, all_hypothesis
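# Hedged usage sketch for perform_clustering above. The helper functions
# (get_contiguous_stamps, merge_stamps, labels_to_rttmfile,
# labels_to_pyannote_object, rttm_to_labels) come from the surrounding
# NeMo-style speaker utils, numpy as np is assumed imported, and the dict
# shapes below are assumptions inferred from the docstring, not real data.
embeddings = {'rec1': np.random.rand(20, 192).tolist()}                      # 20 segment embeddings
time_stamps = {'rec1': [f'{0.5 * i} {0.5 * i + 1.5} ' for i in range(20)]}   # "start end " lines
speakers = {'rec1': 2}
audio_rttm_map = {'rec1': {'rttm_path': '/path/to/rec1.rttm'}}
all_ref, all_hyp = perform_clustering(
    embeddings, time_stamps, speakers, audio_rttm_map, out_rttm_dir='pred_rttms')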
def cluster_seg(order_list, min_cluster, max_cluster, n_features, n_mfcc):
    """
    Clusters small audio files based on their feature vectors (MFCCs) for speaker verification.

    Args:
        order_list: list of split audio files
        min_cluster: minimum number of clusters
        max_cluster: maximum number of clusters
        n_features: number of feature frames kept per MFCC coefficient
        n_mfcc: number of MFCC coefficients

    Returns:
        labels: speaker tag for each audio file.
    """
    # Create an empty matrix with fixed size (n_samples, n_mfcc * n_features)
    n_feat = np.zeros((len(order_list), n_features * n_mfcc))

    for idx, audio in enumerate(order_list):
        data, sr = li.load(audio, sr=16000)
        # Calculate MFCC features
        mfccs = np.array(li.feature.mfcc(y=data, sr=sr, n_mfcc=n_mfcc))
        # Resize MFCCs to a fixed shape and flatten into one row
        mfccs = np.resize(mfccs, (n_mfcc, n_features)).flatten()
        n_feat[idx, :] = mfccs
    print(n_feat.shape)

    # Apply spectral clustering to group audio files by their MFCC feature vectors
    clusterer = SpectralClusterer(min_clusters=min_cluster, max_clusters=max_cluster)
    labels = clusterer.predict(n_feat)
    labels = [str(x) for x in labels]

    # Convert cluster labels to speaker tags
    speaker_tag = {}
    n = 1
    speaker_tag[labels[0]] = f'Speaker {n}: '
    for s in labels:
        if s not in speaker_tag:
            n += 1
            speaker_tag[s] = f'Speaker {n}: '
    labels = [speaker_tag[i] for i in labels]
    return labels
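# Hedged usage sketch for cluster_seg above; the imports are the ones the
# function body assumes (librosa as li, numpy as np, spectralcluster), and the
# file list, cluster bounds, and feature sizes are hypothetical.
import glob
import librosa as li
import numpy as np
from spectralcluster import SpectralClusterer

order_list = sorted(glob.glob('audio_files/split/*.wav'))
labels = cluster_seg(order_list, min_cluster=2, max_cluster=4, n_features=100, n_mfcc=13)
for tag, path in zip(labels, order_list):
    print(tag, path)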
def signalLabelPrediction(cont_embeds):
    # Finding the optimum number of clusters c via the elbow of the WCSS curve
    wcss = []
    r = range(1, 15)
    for k in r:
        km = KMeans(n_clusters=k)
        km = km.fit(cont_embeds)
        wcss.append(km.inertia_)
    kn = KneeLocator(list(r), wcss, S=1.0, curve='convex', direction='decreasing')
    c = kn.knee

    # Passing the cluster count to spectral clustering to get the labels
    clusterer = SpectralClusterer(
        min_clusters=c,
        max_clusters=100,
        p_percentile=0.90,
        gaussian_blur_sigma=1)
    labels = clusterer.predict(cont_embeds)
    return labels
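# Hedged usage sketch for signalLabelPrediction above; the imports are the ones
# the function body assumes, and the random matrix is a stand-in for real
# d-vector embeddings.
import numpy as np
from sklearn.cluster import KMeans
from kneed import KneeLocator
from spectralcluster import SpectralClusterer

cont_embeds = np.random.rand(120, 256)  # 120 windows x 256-dim embeddings
labels = signalLabelPrediction(cont_embeds)
print(labels[:10])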
def clustering(wav):
    encoder = VoiceEncoder("cpu")
    # Create d-vectors (partial embeddings) over the utterance
    _, cont_embeds, wav_splits = encoder.embed_utterance(
        wav, return_partials=True, rate=16)

    clusterer = SpectralClusterer(
        min_clusters=2,
        max_clusters=100,
        p_percentile=0.90,
        gaussian_blur_sigma=1)
    labels = clusterer.predict(cont_embeds)

    # Midpoint time (in seconds) of each partial-embedding window
    times = [((s.start + s.stop) / 2) / sampling_rate for s in wav_splits]
    labelling = []
    start_time = 0
    for i, time in enumerate(times):
        if i > 0 and labels[i] != labels[i - 1]:
            labelling.append(('speaker ' + str(labels[i - 1]), start_time, time))
            start_time = time
        if i == len(times) - 1:
            labelling.append(('speaker ' + str(labels[i]), start_time, time))
    return labelling
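# Hedged end-to-end sketch for clustering() above; 'interview.wav' is a
# hypothetical input, and the imports are the ones the function body assumes.
from resemblyzer import VoiceEncoder, preprocess_wav, sampling_rate
from spectralcluster import SpectralClusterer

wav = preprocess_wav('interview.wav')
for speaker, start, end in clustering(wav):
    print(f'{speaker}: {start:.1f}s - {end:.1f}s')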
clusterer = SpectralClusterer(
    min_clusters=2,
    max_clusters=3,
    p_percentile=0.99,
    gaussian_blur_sigma=3,
    thresholding_soft_multiplier=0.5,
    stop_eigenvalue=1e-4)

# NOTE: stock spectralcluster's predict() returns only the label array;
# also unpacking embeddings and centers assumes a locally modified predict().
labels, embeddings, centers = clusterer.predict(X_cluster)

#### SAVING RESULTS
np.save(dir_audio + 'labels.npy', labels)
with open(dir_audio + 'order_pred.txt', "w") as f:
    for s in order_pred:
        f.write(str(s) + "\n")

#### PLOTTING CLUSTERS
plt.figure(1)
plt.title('Spectral cluster prediction')
plt.scatter(embeddings[:, 0], embeddings[:, 1], c=labels, s=50, cmap='viridis')
plt.scatter(centers[:, 0], centers[:, 1], c='red', s=200, alpha=0.5)
plt.ylabel('Embeddings 1')
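# If only the stock library is available, a minimal sketch of an equivalent
# 2-D scatter: predict() yields labels alone, and PCA (an assumption, not part
# of the original) projects X_cluster down for plotting.
from sklearn.decomposition import PCA

labels_only = clusterer.predict(X_cluster)
emb_2d = PCA(n_components=2).fit_transform(X_cluster)
plt.scatter(emb_2d[:, 0], emb_2d[:, 1], c=labels_only, s=50, cmap='viridis')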
# AutoTune is constructed here but disabled below (autotune=None);
# pass autotune=autotune to the clusterer to enable hyperparameter search.
autotune = AutoTune(
    p_percentile_min=0.60,
    p_percentile_max=0.95,
    init_search_step=0.01,
    search_level=3)

icassp2018_clusterer = SpectralClusterer(
    min_clusters=2,
    max_clusters=18,
    autotune=None,
    laplacian_type=None,
    refinement_options=icassp2018_refinement_options,
    custom_dist="cosine")

for idx, sample_id in enumerate(sample_ids):
    labels = icassp2018_clusterer.predict(sequences[idx])
    print('Predicted labels: ', sample_id, f' {idx + 1}/{len(sample_ids)}')

    # Build a pyannote Annotation from the predicted per-segment labels
    annotation = Annotation()
    annotation.uri = sample_id
    for jdx, speaker_id in enumerate(labels):
        segment_interval = intervals[idx][jdx]
        annotation[Segment(segment_interval[0], segment_interval[1])] = speaker_id

    rttm_file = '{}/{}.rttm'.format(rttm_dir, sample_id)
    with open(rttm_file, 'w') as file:
        annotation.support().write_rttm(file)
    # rttm_file_collar = '{}/rttm_colar/{}.rttm'.format(rttm_dir, sample_id)
    # with open(rttm_file_collar, 'w') as file:
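# A minimal sketch of how icassp2018_refinement_options can be built with
# spectralcluster's v3 API; the values follow the project README and are
# assumptions, not necessarily the settings used in the snippet above.
from spectralcluster import (RefinementOptions, ThresholdType,
                             ICASSP2018_REFINEMENT_SEQUENCE)

icassp2018_refinement_options = RefinementOptions(
    gaussian_blur_sigma=1,
    p_percentile=0.95,
    thresholding_soft_multiplier=0.01,
    thresholding_type=ThresholdType.RowMax,
    refinement_sequence=ICASSP2018_REFINEMENT_SEQUENCE)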
encoder = VoiceEncoder("cpu") _, cont_embeds, wav_splits = encoder.embed_utterance(wav, return_partials=True, rate=16) print(cont_embeds.shape) #################################################################################################### # D vector clustering clusterer = SpectralClusterer( min_clusters=2, max_clusters=100, p_percentile=0.90, gaussian_blur_sigma=1) labels = clusterer.predict(cont_embeds) #################################################################################################### # Create Continuous Segments def create_labelling(labels, wav_splits): from resemblyzer import sampling_rate times = [((s.start + s.stop) / 2) / sampling_rate for s in wav_splits] labelling = [] start_time = 0 for i, time in enumerate(times): if i > 0 and labels[i] != labels[i - 1]: temp = [str(labels[i - 1]), start_time, time] labelling.append(tuple(temp))
# Grid search over SpectralClusterer hyperparameters
for p_percentile in p_percentiles:
    for gaussian_blur_sigma in gaussian_blur_sigmas:
        for thresholding_soft_multiplier in thresholding_soft_multipliers:
            for stop_eigenvalue in stop_eigenvalues:
                # print('Values: p_percentile=' + str(p_percentile)
                #       + ' gaussian_blur_sigma=' + str(gaussian_blur_sigma)
                #       + ' thresholding_soft_multiplier=' + str(thresholding_soft_multiplier)
                #       + ' stop_eigenvalue=' + str(stop_eigenvalue))
                clusterer = SpectralClusterer(
                    min_clusters=2,
                    max_clusters=10,
                    p_percentile=p_percentile,
                    gaussian_blur_sigma=gaussian_blur_sigma,
                    thresholding_soft_multiplier=thresholding_soft_multiplier,
                    stop_eigenvalue=stop_eigenvalue)
                labels = clusterer.predict(X_cluster)

                speaker_1 = []
                speaker_2 = []
                speaker = []
                for i in range(len(labels)):
                    # Recover the segment's position and window index from its file name
                    pos = int(order_list[i][8:12]) - 1
                    num = int(order_list[i][13:17]) - 1
                    ini, final = times[pos].split('\t')
                    # Keep only windows that end before the segment boundary
                    if round(float(ini) + (num * (window_t * (1 - overlap)) + window_t), 4) < float(final):
                        interval = [
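# A hedged sketch of the hyperparameter grids the loop above iterates over;
# the exact values searched in the original are unknown, so these are
# illustrative placeholders only.
p_percentiles = [0.90, 0.95, 0.99]
gaussian_blur_sigmas = [1, 2, 3]
thresholding_soft_multipliers = [0.01, 0.5]
stop_eigenvalues = [1e-2, 1e-4]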
def get_full_translation(audio_file_name):
    def mp3_to_wav(file):
        # Convert mp3 input to wav; wav input is passed through unchanged
        if file.split('.')[1] == 'mp3':
            sound = AudioSegment.from_mp3(file)
            sound.export(f"{file.split('.')[0]}.wav", format="wav")

    mp3_to_wav(audio_file_name)
    wav_fpath = f"{audio_file_name.split('.')[0]}.wav"

    # Preprocess the wav file (trims long silences and normalizes volume)
    wav = preprocess_wav(wav_fpath)

    # Create a voice encoder object so we can embed the audio on the CPU
    encoder = VoiceEncoder("cpu")
    _, cont_embeds, wav_splits = encoder.embed_utterance(wav, return_partials=True, rate=16)

    # Create a cluster object
    clusterer = SpectralClusterer(
        min_clusters=2,
        max_clusters=100,
        p_percentile=0.90,
        gaussian_blur_sigma=1)

    # Label the speaker that is speaking in each time window
    labels = clusterer.predict(cont_embeds)

    def create_labelling(labels, wav_splits):
        times = [((s.start + s.stop) / 2) / sampling_rate for s in wav_splits]
        labelling = []
        start_time = 0
        for i, time in enumerate(times):
            if i > 0 and labels[i] != labels[i - 1]:
                labelling.append((str(labels[i - 1]), start_time, time))
                start_time = time
            if i == len(times) - 1:
                labelling.append((str(labels[i]), start_time, time))
        return labelling

    labelling = create_labelling(labels, wav_splits)
    print(labelling)

    # Split the wav file into one clip per diarized segment
    def split_audio(file_path):
        speakers = []
        n = 0
        newAud = AudioSegment.from_wav(file_path)
        for label, start_s, end_s in labelling:
            n += 1
            # Convert segment boundaries from seconds to milliseconds
            start = int(start_s * 1000)
            end = int(end_s * 1000)
            # Slice out the segment and export it as its own wav file
            newAudio = newAud[start:end]
            newAudio.export(f'audio_files/split/SplitAudio_{n}.wav', format="wav")
            speakers.append(label)
        return n, speakers

    n, segment_speakers = split_audio(wav_fpath)

    def get_translation():
        entire_text = []
        for num in range(n):
            r = sr.Recognizer()
            harvard = sr.AudioFile(f'audio_files/split/SplitAudio_{num + 1}.wav')
            with harvard as source:
                r.adjust_for_ambient_noise(source)
                r.enable_separate_recognition_per_channel = True
                audio = r.record(source)
            text = r.recognize_google(audio, language='en-US')
            # Tag each transcript with the speaker of that segment
            entire_text.append(f'Speaker {segment_speakers[num]}: ' + str(text))
        return entire_text

    translation = get_translation()
    return render_template('translation.html', translation=translation)
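# Hedged sketch of the Flask wiring this view function assumes; the route name,
# upload handling, and save path are guesses, not part of the original snippet.
from flask import Flask, request

app = Flask(__name__)

@app.route('/translate', methods=['POST'])
def translate():
    f = request.files['audio']
    f.save(f'audio_files/{f.filename}')
    return get_full_translation(f'audio_files/{f.filename}')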
def spectral_cluster(
    vad_results,
    speaker_vector,
    min_clusters: int = None,
    max_clusters: int = None,
    p_percentile: float = 0.95,
    gaussian_blur_sigma=1.0,
    norm_function: Callable = l2_normalize,
    log_distance_metric: str = None,
    return_embedding=False,
    **kwargs,
):
    """
    Speaker diarization using SpectralCluster, https://github.com/wq2012/SpectralCluster

    Parameters
    ----------
    vad_results: List[Tuple[Frame, label]]
        results from VAD.
    speaker_vector: callable
        speaker vector object.
    min_clusters: int, optional (default=None)
        minimal number of clusters allowed (only effective if not None).
    max_clusters: int, optional (default=None)
        maximal number of clusters allowed (only effective if not None).
        Can be used together with min_clusters to fix the number of clusters.
    p_percentile: float, optional (default=0.95)
        the p-percentile for the row-wise thresholding.
    gaussian_blur_sigma: float, optional (default=1.0)
        sigma value of the Gaussian blur operation.
    norm_function: Callable, optional (default=malaya_speech.utils.dist.l2_normalize)
        normalization function for speaker vectors.
    log_distance_metric: str, optional (default=None)
        post-distance norm in log-scale metrics.
    return_embedding: bool, optional (default=False)
        if True, also return the speaker-vector matrix.

    Returns
    -------
    result : List[Tuple[Frame, label]]
    """
    try:
        from spectralcluster import SpectralClusterer
    except ModuleNotFoundError:
        raise ModuleNotFoundError(
            'spectralcluster not installed. Please install it by `pip install spectralcluster` and try again.'
        )

    clusterer = SpectralClusterer(
        min_clusters=min_clusters,
        max_clusters=max_clusters,
        p_percentile=p_percentile,
        gaussian_blur_sigma=gaussian_blur_sigma,
        **kwargs,
    )

    # Collect a speaker vector for every VAD frame marked as speech, remembering
    # where each vector came from so cluster labels can be mapped back.
    speakers, activities, mapping = [], [], {}
    for no, result in enumerate(vad_results):
        if result[1]:
            speakers.append('got')
            mapping[len(activities)] = no
            vector = speaker_vector([result[0]])[0]
            activities.append(vector)
        else:
            speakers.append('not a speaker')

    activities = np.array(activities)
    if norm_function:
        activities = norm_function(activities)
    if log_distance_metric:
        activities = compute_log_dist_matrix(activities, log_distance_metric)

    cluster_labels = clusterer.predict(activities)
    for k, v in mapping.items():
        speakers[v] = f'speaker {cluster_labels[k]}'

    results = []
    for no, result in enumerate(vad_results):
        results.append((result[0], speakers[no]))

    if return_embedding:
        return results, activities
    else:
        return results
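# Hedged usage sketch for spectral_cluster above. In malaya_speech, vad_results
# come from a VAD pipeline and speaker_vector is a speaker-embedding model; both
# are faked here with random stand-ins so only the call shape is shown.
import numpy as np

class FakeFrame:  # stand-in for malaya_speech's Frame type (assumption)
    def __init__(self, array):
        self.array = array

rng = np.random.default_rng(0)
vad_results = [(FakeFrame(rng.standard_normal(4000)), i % 3 != 0) for i in range(30)]
speaker_vector = lambda frames: [rng.standard_normal(256) for _ in frames]

results = spectral_cluster(
    vad_results, speaker_vector,
    min_clusters=2, max_clusters=4, norm_function=None)
print(results[:5])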
def Diarization(path, filename):
    # Only wav files are supported; the mp3-to-wav conversion was removed.
    # if path.split(".")[1] != 'wav':
    #     return {"msg": "Only Wav files are Supported"}
    audio_file_path = path
    wav_fpath = Path(audio_file_path)
    wav = preprocess_wav(wav_fpath)

    encoder = VoiceEncoder("cpu")
    _, cont_embeds, wav_splits = encoder.embed_utterance(wav, return_partials=True, rate=16)

    clusterer = SpectralClusterer(
        min_clusters=1,
        max_clusters=100,
        p_percentile=0.90,
        gaussian_blur_sigma=1)
    labels = clusterer.predict(cont_embeds)

    def create_labelling(labels, wav_splits):
        times = [((s.start + s.stop) / 2) / sampling_rate for s in wav_splits]
        labelling = []
        start_time = 0
        for i, time in enumerate(times):
            if i > 0 and labels[i] != labels[i - 1]:
                labelling.append((str(labels[i - 1]), start_time, time))
                start_time = time
            if i == len(times) - 1:
                labelling.append((str(labels[i]), start_time, time))
        return labelling

    labelling = create_labelling(labels, wav_splits)

    # Group the (start, end) intervals by speaker
    dd = defaultdict(list)
    for speaker, start, end in labelling:
        dd[speaker].append([start, end])

    # Export one wav clip per interval, grouped by speaker
    transcript_list = defaultdict(list)
    split_audio_path = defaultdict(list)
    source_audio = AudioSegment.from_wav(audio_file_path)
    for speaker in dd.keys():
        for duration in dd[speaker]:
            l = len(split_audio_path[speaker])
            t1 = duration[0] * 1000  # pydub works in milliseconds
            t2 = duration[1] * 1000
            newAudio = source_audio[t1:t2]
            save_path = './Audio/AudioD/user_' + str(speaker) + str(l) + '.wav'
            newAudio.export(save_path, format="wav")  # export as a wav file
            split_audio_path[speaker].append(save_path)

    # Transcribe each clip with Google Speech Recognition
    import speech_recognition as sr
    r = sr.Recognizer()
    for speaker in split_audio_path.keys():
        for path in split_audio_path[speaker]:
            with sr.AudioFile(path) as source:
                audio = r.record(source)  # extract audio data from the file
            try:
                transcript = r.recognize_google(audio)
                print("Transcription: " + transcript)
                transcript_list[speaker].append(transcript)
            except sr.UnknownValueError:  # speech is unintelligible
                print("Could not understand audio")

    return {"intervals": dd, "list": transcript_list}
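# Hedged usage sketch for Diarization above; the file path is hypothetical, the
# ./Audio/AudioD/ directory must already exist, and resemblyzer (VoiceEncoder,
# preprocess_wav, sampling_rate), pydub's AudioSegment, pathlib's Path, and
# collections' defaultdict are assumed imported at module level.
result = Diarization('./Audio/meeting.wav', 'meeting')
print(result['intervals'])  # per-speaker [start, end] intervals in seconds
print(result['list'])       # per-speaker Google Speech transcripts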
args = parser.parse_args()
audio_path = Path(args.audio)
wav = preprocess_wav(audio_path)

# Use the default (GPU) encoder for low partial rates; fall back to CPU for high rates
encoder = VoiceEncoder() if args.rate <= 4 else VoiceEncoder("cpu")
_, cont_embeds, wav_splits = encoder.embed_utterance(
    wav,
    return_partials=True,
    rate=args.rate,
)

clusterer = SpectralClusterer(
    min_clusters=args.num,
    p_percentile=0.91,
)
labels = clusterer.predict(cont_embeds)

# Midpoint (in seconds) of each partial-embedding window
times = np.array([(s.start + s.stop) / 2 / sampling_rate for s in wav_splits])
if args.interactive:
    interactive_diarization(times, labels, wav, labels.max() + 1)
else:
    time_intervals = get_time_intervals(times, labels)
    log_speaker_diary(time_intervals)
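# A hedged sketch of the argparse parser the fragment above consumes; the flag
# names are inferred from the attributes used (audio, rate, num, interactive)
# and the defaults are assumptions.
import argparse

parser = argparse.ArgumentParser(description='Speaker diarization demo')
parser.add_argument('--audio', required=True, help='path to the input audio file')
parser.add_argument('--rate', type=int, default=16, help='partial-embedding rate (Hz)')
parser.add_argument('--num', type=int, default=2, help='minimum number of speakers')
parser.add_argument('--interactive', action='store_true', help='show the interactive plot')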