def get_embedding(audio_path):
    # Voice-activity detection: keep only the speech segments of the file
    times, segs = VAD_chunk(2, audio_path)
    if not segs:
        print('No voice activity detected')
        return None
    # Merge adjacent segments and extract STFT frames
    concat_seg = concat_segs(times, segs)
    STFT_frames = get_STFTs(concat_seg)
    STFT_frames = np.stack(STFT_frames, axis=2)
    STFT_frames = torch.tensor(np.transpose(STFT_frames, axes=(2, 1, 0)))
    # Frame-level speaker embeddings from the pretrained encoder
    embeddings = encoder(STFT_frames)
    return embeddings
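# Usage sketch (not from the original source): assumes `encoder` is a pretrained
# SpeechEmbedder-style model already loaded in eval mode elsewhere in this module,
# and 'sample.wav' is a hypothetical input path.
if __name__ == '__main__':
    emb = get_embedding('sample.wav')  # hypothetical file
    if emb is not None:
        print('frame-level embedding tensor:', tuple(emb.shape))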
def dvector_make(file, embedder_net):
    # Voice-activity detection, then STFT feature extraction
    times, segs = VAD_chunk(2, file)
    if not segs:
        print('No voice activity detected')
        return None
    concat_seg = concat_segs(times, segs)
    STFT_frames = get_STFTs(concat_seg)
    STFT_frames = np.stack(STFT_frames, axis=2)
    STFT_frames = torch.tensor(np.transpose(STFT_frames, axes=(2, 1, 0)))
    # Frame-level embeddings, aligned into fixed-length windows
    embeddings = embedder_net(STFT_frames)
    aligned_embeddings = np.array(align_embeddings(embeddings.detach().numpy()))
    # Average over windows and L2-normalise to get one d-vector per file
    dvector = np.mean(aligned_embeddings, axis=0)
    dvector = dvector / np.linalg.norm(dvector, 2)
    return dvector
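# Usage sketch (not from the original source): score two utterances by comparing their
# d-vectors. Assumes `embedder_net` is a loaded SpeechEmbedder in eval mode and the
# .wav paths are hypothetical.
if __name__ == '__main__':
    dv_a = dvector_make('speaker_a.wav', embedder_net)
    dv_b = dvector_make('speaker_b.wav', embedder_net)
    if dv_a is not None and dv_b is not None:
        # Both d-vectors are L2-normalised, so their dot product is the cosine similarity
        print('cosine similarity:', float(np.dot(dv_a, dv_b)))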
def feature_embeddings(model_path):
    # Relies on a module-level `embedder_net` and the accumulators below;
    # `model_path` is not used directly here.
    global train_sequence, train_cluster_id, label, count, train_saved
    for i, folder in enumerate(audio_path):
        print("&" * 10, folder, "&" * 100)
        for file in os.listdir(folder):
            print(file)
            if file[-4:] == '.wav':
                times, segs = VAD_chunk(2, folder + '/' + file)
                if not segs:
                    print('No voice activity detected')
                    continue
                concat_seg = concat_segs(times, segs)
                STFT_frames = get_STFTs(concat_seg)
                STFT_frames = np.stack(STFT_frames, axis=2)
                STFT_frames = torch.tensor(np.transpose(STFT_frames, axes=(2, 1, 0)))
                embeddings = embedder_net(STFT_frames)
                # Align frame-level embeddings into fixed windows and collect them
                aligned_embeddings = align_embeddings(embeddings.detach().numpy())
                train_sequence.append(aligned_embeddings)
                for embedding in aligned_embeddings:
                    train_cluster_id.append(str(label))
                count = count + 1
                if count % 100 == 0:
                    print('Processed {0}/{1} files'.format(count, len(audio_path)))
        # One cluster label per speaker folder
        label = label + 1
        if not train_saved and i > train_speaker_num:
            train_sequence = np.concatenate(train_sequence, axis=0)
            train_cluster_id = np.asarray(train_cluster_id)
            np.save('train_sequence', train_sequence)
            np.save('train_cluster_id', train_cluster_id)
            train_saved = True
            train_sequence = []
            train_cluster_id = []
10) * 9  # split total data 90% train and 10% test
embedder_net = SpeechEmbedder()
embedder_net.load_state_dict(torch.load(hp.model.model_path))
embedder_net.eval()
train_sequence = []
train_cluster_id = []
label = 0
count = 0
train_saved = False
for i, folder in enumerate(audio_path):
    for file in os.listdir(folder):
        print('folder')
        if file[-4:] == '.WAV':
            times, segs = VAD_chunk(2, folder + '/' + file)
            if segs == []:
                print('No voice activity detected')
                continue
            concat_seg = concat_segs(times, segs)
            STFT_frames = get_STFTs(concat_seg)
            STFT_frames = np.stack(STFT_frames, axis=2)
            STFT_frames = torch.tensor(np.transpose(STFT_frames, axes=(2, 1, 0)))
            embeddings = embedder_net(STFT_frames)
            aligned_embeddings = align_embeddings(embeddings.detach().numpy())
            train_sequence.append(aligned_embeddings)
            for embedding in aligned_embeddings:
                train_cluster_id.append(str(label))
            count = count + 1
            if count % 100 == 0:
all_speakers.append(speaker)

# Get duration
if debug:
    print('\n==== DEBUG ====')
    print(fpath)
try:
    duration = str(timedelta(seconds=get_duration(filename=fpath)))
    if debug:
        print('File duration: {}'.format(duration))
except:
    print('UNABLE TO GET DURATION')
    raise

# Chunk into segments with speech audio
times, segs = VAD_chunk(2, fpath)
if segs == []:
    print('No voice activity detected')
    continue
if debug:
    print('{} - {:,} segments'.format(datetime.now() - start, len(segs)))

# Short-term Fourier Transform
concat_seg = concat_segs(times, segs)
if debug:
    print('{} - Concatenated segments'.format(datetime.now() - start))
STFT_frames = get_STFTs(concat_seg)
if debug:
    print('{} - Got STFT frames'.format(datetime.now() - start))
embedder_net.eval()
embedder_net = embedder_net.cuda()
train_sequence = []
train_cluster_id = []
label = 0
count = 0
ns = 0  # files with no voice activity
nf = 0  # files with no STFT frames
train_saved = False
for i, folder in enumerate(tqdm.tqdm(audio_path, desc="meeting", position=0)):
    for file in tqdm.tqdm(os.listdir(folder), desc="Segment", position=1):
        if file.split('.')[-1] == 'wav':
            # 0 is least aggressive about filtering out non-speech, 3 is the most
            times, segs = VAD_chunk(hp.data.aggressiveness, os.path.join(folder, file))
            if segs == []:
                ns += 1
                continue
            concat_seg = concat_segs(times, segs)
            STFT_frames = get_STFTs(concat_seg)
            if len(STFT_frames) == 0:
                nf += 1
                continue
            STFT_frames = np.stack(STFT_frames, axis=2)
            STFT_frames = torch.tensor(np.transpose(STFT_frames, axes=(2, 1, 0)))
            embeddings = embedder_net(STFT_frames.cuda())
            aligned_embeddings = align_embeddings(embeddings.cpu().detach().numpy())
            train_sequence.append(aligned_embeddings)
            for embedding in aligned_embeddings:
                train_cluster_id.append(str(label))
        use_label = judge_dict[spkr_name[:-15]]
    else:
        judge_dict[spkr_name[:-15]] = cnt
        use_label = judge_dict[spkr_name[:-15]]
        cnt += 1
else:
    use_label = label
    label += 1
spkr_file_lst = []
spkr_sequence = []
spkr_cluster_lst = []
for file in os.listdir(folder + '/' + spkr_name):
    if file[-4:] == '.wav':
        times, segs = VAD_chunk(2, folder + '/' + spkr_name + '/' + file)
        # Bad .wav detection
        if segs == []:
            rm_pthlst.append(folder + '/' + file)
            continue
        concat_seg = concat_segs(times, segs)
        if len(concat_seg) < min_va:
            rm_pthlst.append(folder + '/' + file)
            continue
        STFT_frames = get_STFTs(concat_seg)
        STFT_frames = np.stack(STFT_frames, axis=2)
        STFT_frames = torch.tensor(np.transpose(STFT_frames, axes=(2, 1, 0)))
def two_person_diarization(call_file):
    """
    Diarization of a call between 2 speakers.
    Input: path to a call recording.
    Output: two dictionaries keyed by "FIRST SPEAKER" / "SECOND SPEAKER" -
    the first maps to lists of [start, end] timestamps in seconds,
    the second to the corresponding embeddings.
    """
    # Create embeddings from speech segments and cluster them
    seg_times, speech_segs = VAD_chunk(2, call_file)
    assert speech_segs != [], "No voice activity detected."
    all_embeddings = []
    embedding_times = []
    for speech_seg, seg_time in zip(speech_segs, seg_times):
        STFT_frames = signal_processing.split_segment_to_frames(speech_seg)
        if not STFT_frames:
            # Not enough frames, continue to next segment
            continue
        STFT_frames = np.stack(STFT_frames, axis=1)
        embeddings = speaker_verification_lstm_model.extract_embedding(STFT_frames)
        # Calculate a timestamp for each embedding within the segment
        delta_t = (seg_time[1] - seg_time[0]) / embeddings.shape[0]
        times_start = np.linspace(seg_time[0], seg_time[1], num=embeddings.shape[0])
        times_end = np.linspace(seg_time[0] + delta_t, seg_time[1] + delta_t,
                                num=embeddings.shape[0])
        for idx, embedding in enumerate(embeddings):
            all_embeddings.append(embedding)
            embedding_times.append([times_start[idx], times_end[idx]])

    # Use K-Means to separate the two speakers' embeddings
    kmeans_emb = KMeans(n_clusters=2, random_state=0).fit(all_embeddings)
    # Get the cluster label of each embedding
    labels = kmeans_emb.predict(all_embeddings)

    # Distance of every embedding from its cluster center
    distances_from_centers = [[], []]
    for embedding, label in zip(all_embeddings, labels):
        distances_from_centers[label].append(
            scipy.spatial.distance.euclidean(
                embedding, kmeans_emb.cluster_centers_[label]))

    # Keep only the most certain embeddings: those closer than the median distance
    median_dist = [statistics.median(distances_from_centers[0]),
                   statistics.median(distances_from_centers[1])]

    # Create lists of times and embeddings per speaker
    first_speaker_label = labels[0]
    speaker0_times = []
    speaker1_times = []
    speaker0_embeddings = []
    speaker1_embeddings = []
    for idx, label in enumerate(labels):
        if scipy.spatial.distance.euclidean(
                all_embeddings[idx],
                kmeans_emb.cluster_centers_[label]) < median_dist[label]:
            if label == first_speaker_label:
                speaker0_times.append(embedding_times[idx])
                speaker0_embeddings.append(all_embeddings[idx])
            else:
                speaker1_times.append(embedding_times[idx])
                speaker1_embeddings.append(all_embeddings[idx])

    # Build dictionaries to return
    return {
        "FIRST SPEAKER": speaker0_times,
        "SECOND SPEAKER": speaker1_times
    }, {
        "FIRST SPEAKER": speaker0_embeddings,
        "SECOND SPEAKER": speaker1_embeddings
    }
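# Usage sketch (not from the original source): 'call.wav' is a hypothetical two-speaker
# recording. The function returns per-speaker timestamps and embeddings, keyed by
# "FIRST SPEAKER" / "SECOND SPEAKER".
if __name__ == '__main__':
    speaker_times, speaker_embeddings = two_person_diarization('call.wav')
    for speaker, turns in speaker_times.items():
        for start, end in turns:
            print('{}: {:.2f}s - {:.2f}s'.format(speaker, start, end))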