def test_fit_concatenated_and_predict_single_label(self):
    """Train and test model while training data has single label.

    Training data have already been concatenated.
    """
    model_args, training_args, inference_args = uisrnn.parse_arguments()
    model_args.rnn_depth = 1
    model_args.rnn_hidden_size = 8
    model_args.observation_dim = 16
    training_args.learning_rate = 0.01
    training_args.train_iteration = 50
    inference_args.test_iteration = 1

    # generate fake training data, assume already concatenated
    train_sequence = np.random.rand(1000, model_args.observation_dim)
    train_cluster_id = np.array(['A'] * 1000)

    model = uisrnn.UISRNN(model_args)

    # training
    model.fit(train_sequence, train_cluster_id, training_args)

    # testing, where data has less variation than training
    test_sequence = np.random.rand(10, model_args.observation_dim) / 10.0
    predicted_label = model.predict(test_sequence, inference_args)
    self.assertListEqual([0] * 10, predicted_label)
def test_fit_list_and_predict_single_label(self):
    """Train and test model while training data has single label.

    Training data are not concatenated.
    """
    model_args, training_args, inference_args = uisrnn.parse_arguments()
    model_args.enable_cuda = False
    model_args.rnn_depth = 1
    model_args.rnn_hidden_size = 8
    model_args.observation_dim = 16
    training_args.learning_rate = 0.01
    training_args.train_iteration = 50
    inference_args.test_iteration = 1

    # generate fake training data, as a list
    train_sequences = [
        np.random.rand(100, model_args.observation_dim),
        np.random.rand(200, model_args.observation_dim),
        np.random.rand(300, model_args.observation_dim)]
    train_cluster_ids = [
        np.array(['A'] * 100),
        np.array(['A'] * 200),
        np.array(['A'] * 300)]

    model = uisrnn.UISRNN(model_args)

    # training
    model.fit(train_sequences, train_cluster_ids, training_args)

    # testing, where data has less variation than training
    test_sequence = np.random.rand(10, model_args.observation_dim) / 10.0
    predicted_label = model.predict(test_sequence, inference_args)
    self.assertListEqual([0] * 10, predicted_label)
def main(): """The main function.""" # Retrieve arguments model_args, training_args, \ inference_args, data_args = uisrnn.parse_arguments() # Run experiment diarization_experiment(model_args, training_args, inference_args, data_args)
def test_save_and_load(self):
    """Save model and load it."""
    model_args, _, _ = uisrnn.parse_arguments()
    model_args.observation_dim = 16
    model_args.transition_bias = 0.5
    model_args.sigma2 = 0.05
    model = uisrnn.UISRNN(model_args)
    temp_file_path = tempfile.mktemp()
    model.save(temp_file_path)
    model.load(temp_file_path)
    self.assertEqual(0.5, model.transition_bias)
def main(): """The main function.""" model_args, training_args, inference_args = uisrnn.parse_arguments() model_args.observation_dim = 512 model_args.rnn_depth = 1 model_args.rnn_hidden_size = 512 training_args.enforce_cluster_id_uniqueness = True training_args.batch_size = 30 training_args.learning_rate = 1e-4 training_args.train_iteration = 10 training_args.num_permutations = 20 # training_args.grad_max_norm = 5.0 training_args.learning_rate_half_life = 1000 diarization_experiment(model_args, training_args, inference_args)
def diarize(segments, sr=16000, win_len=400, hop_len=160,
            embedding_per_sec=1.0, overlap_rate=0.1):
    logger.debug("[Speaker diarization] Initializing models")

    # Initialize ghostvlad
    toolkits.initialize_GPU(Expando({"gpu": ""}))
    ghostvlad_model = model.vggvox_resnet2d_icassp(
        input_dim=(257, None, 1),
        num_class=5994,
        mode="eval",
        args=Expando({
            "net": "resnet34s",
            "loss": "softmax",
            "vlad_cluster": 8,
            "ghost_cluster": 2,
            "bottleneck_dim": 512,
            "aggregation_mode": "gvlad",
        }))
    ghostvlad_model.load_weights("ghostvlad/pretrained/weights.h5",
                                 by_name=True)

    # Initialize uisrnn
    sys.argv = sys.argv[:1]
    model_args, _, inference_args = uisrnn.parse_arguments()
    model_args.observation_dim = 512
    uisrnn_model = uisrnn.UISRNN(model_args)
    uisrnn_model.load("uisrnn/pretrained/saved_model.uisrnn_benchmark")

    logger.debug("[Speaker diarization] Calculating utterance features")
    utterances_spec = prepare_ghostvlad_data(segments, sr, win_len, hop_len,
                                             embedding_per_sec, overlap_rate)
    feats = []
    for spec in utterances_spec:
        spec = np.expand_dims(np.expand_dims(spec, 0), -1)
        v = ghostvlad_model.predict(spec)
        feats += [v]
    feats = np.array(feats)[:, 0, :].astype(float)

    logger.debug("[Speaker diarization] Clustering utterance features")
    labels = uisrnn_model.predict(feats, inference_args)

    logger.debug("[Speaker diarization] Tagging segments speakers")
    embedding_duration = (1 / embedding_per_sec) * (1.0 - overlap_rate)
    labels_count = len(labels)
    current = 0
    for segment in segments:
        begin_index = math.floor(current / embedding_duration)
        current += segment.end - segment.begin
        end_index = math.ceil(current / embedding_duration)
        segment_labels = [labels[index] for index in
                          range(begin_index, min(end_index, labels_count))]
        if len(segment_labels) > 0:
            segment.speaker = max(segment_labels, key=segment_labels.count)
        else:
            segment.speaker = 999
    return segments
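A minimal usage sketch for diarize() follows. The Segment type here is hypothetical (not part of the original code); it only mirrors the begin/end/speaker attributes the function reads and writes above.

# Hypothetical usage sketch for diarize(); Segment is an assumption that
# matches the attributes the function touches (begin/end in seconds).
from dataclasses import dataclass

@dataclass
class Segment:
    begin: float          # segment start, in seconds
    end: float            # segment end, in seconds
    speaker: int = -1     # filled in by diarize()

segments = [Segment(0.0, 2.5), Segment(2.5, 6.0), Segment(6.0, 9.5)]
for seg in diarize(segments, sr=16000):
    print(f"{seg.begin:5.1f}s - {seg.end:5.1f}s -> speaker {seg.speaker}")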
def test_fit_with_wrong_dim(self):
    """Training data has wrong dimension."""
    model_args, training_args, _ = uisrnn.parse_arguments()
    model_args.rnn_depth = 1
    model_args.rnn_hidden_size = 8
    model_args.observation_dim = 16
    training_args.learning_rate = 0.01
    training_args.train_iteration = 5

    # generate fake data
    train_sequence = np.random.rand(1000, 18)
    train_cluster_id = np.array(['A'] * 1000)

    model = uisrnn.UISRNN(model_args)

    # training
    with self.assertRaises(ValueError):
        model.fit(train_sequence, train_cluster_id, training_args)
def test_predict_with_wrong_dim(self):
    """Testing data has wrong dimension."""
    model_args, training_args, inference_args = uisrnn.parse_arguments()
    model_args.enable_cuda = False
    model_args.rnn_depth = 1
    model_args.rnn_hidden_size = 8
    model_args.observation_dim = 16
    training_args.learning_rate = 0.01
    training_args.train_iteration = 50

    # generate fake data
    train_sequence = np.random.rand(1000, model_args.observation_dim)
    train_cluster_id = np.array(['A'] * 1000)

    model = uisrnn.UISRNN(model_args)

    # training
    model.fit(train_sequence, train_cluster_id, training_args)

    # testing
    test_sequence = np.random.rand(10, 18)
    with self.assertRaises(ValueError):
        model.predict(test_sequence, inference_args)
# from matplotlib import cm
# from time import sleep, perf_counter as timer
# from umap import UMAP
# import matplotlib.pyplot as plt

sys.path.append("Resemblyzer")
from resemblyzer import preprocess_wav, VoiceEncoder, sampling_rate  # noqa

# %%
# Load file
wav = preprocess_wav("Resemblyzer/audio_data/X2zqiX6yL3I.mp3")

# %%
# Audio features
encoder = VoiceEncoder("cpu")
_, cont_embeds, wav_splits = encoder.embed_utterance(wav,
                                                     return_partials=True,
                                                     rate=5)

# %%
# Load UIS-RNN model
sys.argv = ['dummy']  # keep uisrnn's argparse from consuming notebook args
model_args, training_args, inference_args = uisrnn.parse_arguments()
model = uisrnn.UISRNN(model_args)
model.load('uis-rnn/saved_model.uisrnn')

# %%
# Testing
test_sequence = cont_embeds.astype(float)
predictions = model.predict(test_sequence, inference_args)

# %%
params = {
    'dim': (257, None, 1),
    'nfft': 512,
    'spec_len': 250,
    'win_length': 400,
    'hop_length': 160,
    'n_classes': 5994,
    'sampling_rate': 16000,
    'normalize': True,
}

network_eval = spkModel.vggvox_resnet2d_icassp(
    input_dim=params['dim'],
    num_class=params['n_classes'],
    mode='eval',
    args=args)
network_eval.load_weights(args.resume, by_name=True)

model_args, _, inference_args = uisrnn.parse_arguments()
model_args.observation_dim = 512
uisrnnModel = uisrnn.UISRNN(model_args)
uisrnnModel.load(SAVED_MODEL_NAME)

while True:
    print("Start speaking")
    myrecording = sd.rec(int(seconds * fs), samplerate=fs, channels=2)
    print(type(myrecording))
    sd.wait()  # wait until recording is finished
    print("Finished recording")
    write('wavs/output12.wav', fs, myrecording)
    initial_time = datetime.now()
    main(r'wavs/output12.wav', embedding_per_second=1.2, overlap_rate=0.4)
    print("Time taken for execution: " + str(datetime.now() - initial_time))
    num = int(input("Enter 1 to continue and 0 to exit: "))
    if num == 0:  # exit when the user enters 0
        break
def main(wav_path, embedding_per_second=1.0, overlap_rate=0.5):
    # gpu configuration
    toolkits.initialize_GPU(args)

    params = {
        'dim': (257, None, 1),
        'nfft': 512,
        'spec_len': 250,
        'win_length': 400,
        'hop_length': 160,
        'n_classes': 5994,
        'sampling_rate': 16000,
        'normalize': True,
    }

    network_eval = spkModel.vggvox_resnet2d_icassp(
        input_dim=params['dim'],
        num_class=params['n_classes'],
        mode='eval',
        args=args)
    network_eval.load_weights(args.resume, by_name=True)

    model_args, _, inference_args = uisrnn.parse_arguments()
    model_args.observation_dim = 512
    uisrnnModel = uisrnn.UISRNN(model_args)
    uisrnnModel.load(SAVED_MODEL_NAME)

    specs, intervals = load_data(wav_path,
                                 embedding_per_second=embedding_per_second,
                                 overlap_rate=overlap_rate)
    mapTable, keys = genMap(intervals)

    feats = []
    for spec in specs:
        spec = np.expand_dims(np.expand_dims(spec, 0), -1)
        v = network_eval.predict(spec)
        feats += [v]
    feats = np.array(feats)[:, 0, :].astype(float)  # [splits, embedding dim]

    # (Alternatives tried here and left disabled: uisrnnModel.predict on
    # feats, and KMeans with silhouette scores to pick the cluster count.)
    clusterer = SpectralClusterer(min_clusters=2,
                                  max_clusters=100,
                                  p_percentile=0.95,
                                  gaussian_blur_sigma=1)
    predicted_label = clusterer.predict(feats)

    global no_speakers
    no_speakers = len(set(predicted_label))

    time_spec_rate = 1000 * (1.0 / embedding_per_second) * (
        1.0 - overlap_rate)  # speaker embedding every ?ms
    center_duration = int(1000 * (1.0 / embedding_per_second) // 2)
    speakerSlice = arrangeResult(predicted_label, time_spec_rate)

    # time map to original wav (contains mute)
    for spk, timeDicts in speakerSlice.items():
        for tid, timeDict in enumerate(timeDicts):
            s = 0
            e = 0
            for i, key in enumerate(keys):
                if s != 0 and e != 0:
                    break
                if s == 0 and key > timeDict['start']:
                    offset = timeDict['start'] - keys[i - 1]
                    s = mapTable[keys[i - 1]] + offset
                if e == 0 and key > timeDict['stop']:
                    offset = timeDict['stop'] - keys[i - 1]
                    e = mapTable[keys[i - 1]] + offset
            speakerSlice[spk][tid]['start'] = s
            speakerSlice[spk][tid]['stop'] = e

    for spk, timeDicts in speakerSlice.items():
        print('========= ' + str(spk) + ' =========')
        for timeDict in timeDicts:
            s = fmtTime(timeDict['start'])  # change point moves to the center of the slice
            e = fmtTime(timeDict['stop'])
            print(s + ' ==> ' + e)

    p = PlotDiar(map=speakerSlice, wav=wav_path, gui=True, size=(25, 6))
    p.draw()
    p.plot.show()
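The interval-mapping loop above reappears verbatim in every main() variant below. A refactoring sketch follows; the helper name is hypothetical, and it assumes the same speakerSlice/mapTable/keys structures produced by arrangeResult and genMap.

def map_to_original_time(speaker_slice, map_table, keys):
    """Sketch: shift each {'start', 'stop'} dict from embedding time back to
    original-wav time via the silence map, mirroring the loop above."""
    for spk, time_dicts in speaker_slice.items():
        for tid, time_dict in enumerate(time_dicts):
            s, e = 0, 0
            for i, key in enumerate(keys):
                if s != 0 and e != 0:
                    break
                if s == 0 and key > time_dict['start']:
                    s = map_table[keys[i - 1]] + (time_dict['start'] - keys[i - 1])
                if e == 0 and key > time_dict['stop']:
                    e = map_table[keys[i - 1]] + (time_dict['stop'] - keys[i - 1])
            speaker_slice[spk][tid]['start'] = s
            speaker_slice[spk][tid]['stop'] = e
    return speaker_slice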
def main(wav_path, embedding_per_second=1.0, overlap_rate=0.5):
    # gpu configuration
    toolkits.initialize_GPU(args)

    params = {
        'dim': (257, None, 1),
        'nfft': 512,
        'spec_len': 250,
        'win_length': 400,
        'hop_length': 160,
        'n_classes': 5994,
        'sampling_rate': 16000,
        'normalize': True,
    }

    network_eval = spkModel.vggvox_resnet2d_icassp(
        input_dim=params['dim'],
        num_class=params['n_classes'],
        mode='eval',
        args=args)
    network_eval.load_weights(args.resume, by_name=True)

    model_args, _, inference_args = uisrnn.parse_arguments()
    model_args.observation_dim = 512
    uisrnnModel = uisrnn.UISRNN(model_args)
    uisrnnModel.load(SAVED_MODEL_NAME)

    specs, intervals = load_data(wav_path,
                                 embedding_per_second=embedding_per_second,
                                 overlap_rate=overlap_rate)
    mapTable, keys = genMap(intervals)

    feats = []
    for spec in specs:
        spec = np.expand_dims(np.expand_dims(spec, 0), -1)
        v = network_eval.predict(spec)
        feats += [v]
    feats = np.array(feats)[:, 0, :].astype(float)  # [splits, embedding dim]

    predicted_label = uisrnnModel.predict(feats, inference_args)

    time_spec_rate = 1000 * (1.0 / embedding_per_second) * (
        1.0 - overlap_rate)  # speaker embedding every ?ms
    center_duration = int(1000 * (1.0 / embedding_per_second) // 2)
    speakerSlice = arrangeResult(predicted_label, time_spec_rate)

    # time map to original wav (contains mute)
    for spk, timeDicts in speakerSlice.items():
        for tid, timeDict in enumerate(timeDicts):
            s = 0
            e = 0
            for i, key in enumerate(keys):
                if s != 0 and e != 0:
                    break
                if s == 0 and key > timeDict['start']:
                    offset = timeDict['start'] - keys[i - 1]
                    s = mapTable[keys[i - 1]] + offset
                if e == 0 and key > timeDict['stop']:
                    offset = timeDict['stop'] - keys[i - 1]
                    e = mapTable[keys[i - 1]] + offset
            speakerSlice[spk][tid]['start'] = s
            speakerSlice[spk][tid]['stop'] = e

    with open("myfile.txt", "w+") as file1:
        for spk, timeDicts in speakerSlice.items():
            file1.write("person" + "\n")
            for timeDict in timeDicts:
                s = fmtTime(timeDict['start'])  # change point moves to the center of the slice
                e = fmtTime(timeDict['stop'])
                print(s + ' ==> ' + e)
                file1.write(s + " -- ")
                file1.write(e + "\n")
def test_four_clusters(self):
    """Four clusters on vertices of a square."""
    label_to_center = {
        'A': np.array([0.0, 0.0]),
        'B': np.array([0.0, 1.0]),
        'C': np.array([1.0, 0.0]),
        'D': np.array([1.0, 1.0]),
    }

    # generate training data
    train_cluster_id = ['A'] * 400 + ['B'] * 300 + ['C'] * 200 + ['D'] * 100
    random.shuffle(train_cluster_id)
    train_sequence = _generate_random_sequence(train_cluster_id,
                                               label_to_center,
                                               sigma=0.01)
    train_sequences = [
        train_sequence[:100, :],
        train_sequence[100:300, :],
        train_sequence[300:600, :],
        train_sequence[600:, :]]
    train_cluster_ids = [
        train_cluster_id[:100],
        train_cluster_id[100:300],
        train_cluster_id[300:600],
        train_cluster_id[600:]]

    # generate testing data
    test_cluster_id = ['A'] * 10 + ['B'] * 20 + ['C'] * 30 + ['D'] * 40
    random.shuffle(test_cluster_id)
    test_sequence = _generate_random_sequence(test_cluster_id,
                                              label_to_center,
                                              sigma=0.01)

    # construct model
    model_args, training_args, inference_args = uisrnn.parse_arguments()
    model_args.enable_cuda = True  # for prince
    model_args.rnn_depth = 2
    model_args.rnn_hidden_size = 8
    model_args.observation_dim = 2
    model_args.verbosity = 3
    training_args.learning_rate = 0.01
    training_args.train_iteration = 200
    training_args.enforce_cluster_id_uniqueness = False
    inference_args.test_iteration = 2

    model = uisrnn.UISRNN(model_args)

    verbose = True
    if verbose:
        print("Training prints")
        print('TYPES(seq, id):', type(train_sequences), type(train_cluster_ids))
        print('emb shape:', np.shape(train_sequences))
        print('label shape:', np.shape(train_sequences[0]))
        print('flat label:', np.shape(train_cluster_ids[0]))
        print('*' * 10, '\n\n')

    # run training, and save the model
    model.fit(train_sequences, train_cluster_ids, training_args)
    temp_file_path = tempfile.mktemp()
    model.save(temp_file_path)

    # run testing
    predicted_label = model.predict(test_sequence, inference_args)
    if verbose:
        print("Prediction prints")
        print(type(predicted_label))
        print('*' * 10, '\n\n')

    # run evaluation
    model.logger.print(
        3, 'Asserting the equivalence between'
        '\nGround truth: {}\nPredicted: {}'.format(test_cluster_id,
                                                   predicted_label))
    accuracy = uisrnn.compute_sequence_match_accuracy(predicted_label,
                                                      test_cluster_id)
    self.assertEqual(1.0, accuracy)

    # load new model
    loaded_model = uisrnn.UISRNN(model_args)
    loaded_model.load(temp_file_path)

    # run testing with loaded model
    predicted_label = loaded_model.predict(test_sequence, inference_args)

    # run evaluation with loaded model
    model.logger.print(
        3, 'Asserting the equivalence between'
        '\nGround truth: {}\nPredicted: {}'.format(test_cluster_id,
                                                   predicted_label))
    accuracy = uisrnn.compute_sequence_match_accuracy(predicted_label,
                                                      test_cluster_id)
    self.assertEqual(1.0, accuracy)

    # keep training from loaded model on a subset of training data
    transition_bias_1 = model.transition_bias
    training_args.learning_rate = 0.001
    training_args.train_iteration = 50
    model.fit(train_sequence[:100, :], train_cluster_id[:100], training_args)
    transition_bias_2 = model.transition_bias
    self.assertNotAlmostEqual(transition_bias_1, transition_bias_2)
    model.logger.print(
        3, 'Asserting transition_bias changed from {} to {}'.format(
            transition_bias_1, transition_bias_2))

    # run evaluation
    model.logger.print(
        3, 'Asserting the equivalence between'
        '\nGround truth: {}\nPredicted: {}'.format(test_cluster_id,
                                                   predicted_label))
    accuracy = uisrnn.compute_sequence_match_accuracy(predicted_label,
                                                      test_cluster_id)
    self.assertEqual(1.0, accuracy)
def main(wav_path, embedding_per_second=1.0, overlap_rate=0.5):
    # wall-clock timing (time.clock() was removed in Python 3.8)
    start_time = time.perf_counter()

    # gpu configuration
    session = toolkits.initialize_GPU(args)

    params = {
        'dim': (257, None, 1),
        'nfft': 512,
        'spec_len': 250,
        'win_length': 400,
        'hop_length': 160,
        'n_classes': 5994,
        'sampling_rate': 16000,
        'normalize': True,
    }

    network_eval = spkModel.vggvox_resnet2d_icassp(
        input_dim=params['dim'],
        num_class=params['n_classes'],
        mode='eval',
        args=args)
    network_eval.load_weights(args.resume, by_name=True)

    model_args, _, inference_args = uisrnn.parse_arguments()
    model_args.observation_dim = 512
    uisrnnModel = uisrnn.UISRNN(model_args)
    uisrnnModel.load(SAVED_MODEL_NAME)

    specs, intervals = load_data(wav_path,
                                 embedding_per_second=embedding_per_second,
                                 overlap_rate=overlap_rate)
    mapTable, keys = genMap(intervals)

    feats = []
    for spec in specs:
        spec = np.expand_dims(np.expand_dims(spec, 0), -1)
        v = network_eval.predict(spec)
        feats += [v]
    feats = np.array(feats)[:, 0, :].astype(float)  # [splits, embedding dim]

    predicted_label = uisrnnModel.predict(feats, inference_args)

    time_spec_rate = 1000 * (1.0 / embedding_per_second) * (
        1.0 - overlap_rate)  # speaker embedding every ?ms
    center_duration = int(1000 * (1.0 / embedding_per_second) // 2)
    speakerSlice = arrangeResult(predicted_label, time_spec_rate)

    # time map to original wav (contains mute)
    for spk, timeDicts in speakerSlice.items():
        for tid, timeDict in enumerate(timeDicts):
            s = 0
            e = 0
            for i, key in enumerate(keys):
                if s != 0 and e != 0:
                    break
                if s == 0 and key > timeDict['start']:
                    offset = timeDict['start'] - keys[i - 1]
                    s = mapTable[keys[i - 1]] + offset
                if e == 0 and key > timeDict['stop']:
                    offset = timeDict['stop'] - keys[i - 1]
                    e = mapTable[keys[i - 1]] + offset
            speakerSlice[spk][tid]['start'] = s
            speakerSlice[spk][tid]['stop'] = e

    for spk, timeDicts in speakerSlice.items():
        print('========= ' + str(spk) + ' =========')
        for timeDict in timeDicts:
            s = fmtTime(timeDict['start'])  # change point moves to the center of the slice
            e = fmtTime(timeDict['stop'])
            print(s + ' ==> ' + e)

    end_time = time.perf_counter()
    print("Total Time: %.2f s." % (end_time - start_time))

    p = PlotDiar(map=speakerSlice, wav=wav_path, gui=True, size=(25, 6))
    p.draw()
    p.plot.show()
def test_four_clusters(self):
    """Four clusters on vertices of a square."""
    label_to_center = {
        'A': np.array([0.0, 0.0]),
        'B': np.array([0.0, 1.0]),
        'C': np.array([1.0, 0.0]),
        'D': np.array([1.0, 1.0]),
    }

    # generate training data
    train_cluster_id = ['A'] * 400 + ['B'] * 300 + ['C'] * 200 + ['D'] * 100
    random.shuffle(train_cluster_id)
    train_sequence = _generate_random_sequence(train_cluster_id,
                                               label_to_center,
                                               sigma=0.01)

    # generate testing data
    test_cluster_id = ['A'] * 10 + ['B'] * 20 + ['C'] * 30 + ['D'] * 40
    random.shuffle(test_cluster_id)
    test_sequence = _generate_random_sequence(test_cluster_id,
                                              label_to_center,
                                              sigma=0.01)

    # construct model
    model_args, training_args, inference_args = uisrnn.parse_arguments()
    model_args.rnn_depth = 2
    model_args.rnn_hidden_size = 8
    model_args.observation_dim = 2
    model_args.verbosity = 3
    training_args.learning_rate = 0.01
    training_args.learning_rate_half_life = 50
    training_args.train_iteration = 200
    inference_args.test_iteration = 2
    model = uisrnn.UISRNN(model_args)

    # run training, and save the model
    model.fit(train_sequence, train_cluster_id, training_args)
    temp_file_path = tempfile.mktemp()
    model.save(temp_file_path)

    # run testing
    predicted_label = model.predict(test_sequence, inference_args)

    # run evaluation
    model.logger.print(
        3, 'Asserting the equivalence between'
        '\nGround truth: {}\nPredicted: {}'.format(test_cluster_id,
                                                   predicted_label))
    accuracy = uisrnn.compute_sequence_match_accuracy(predicted_label,
                                                      test_cluster_id)
    self.assertEqual(1.0, accuracy)

    # load new model
    loaded_model = uisrnn.UISRNN(model_args)
    loaded_model.load(temp_file_path)

    # run testing with loaded model
    predicted_label = loaded_model.predict(test_sequence, inference_args)

    # run evaluation with loaded model
    model.logger.print(
        3, 'Asserting the equivalence between'
        '\nGround truth: {}\nPredicted: {}'.format(test_cluster_id,
                                                   predicted_label))
    accuracy = uisrnn.compute_sequence_match_accuracy(predicted_label,
                                                      test_cluster_id)
    self.assertEqual(1.0, accuracy)

    # keep training from loaded model on a subset of training data
    transition_bias_1 = model.transition_bias
    training_args.learning_rate = 0.001
    training_args.train_iteration = 50
    model.fit(train_sequence[:100, :], train_cluster_id[:100], training_args)
    transition_bias_2 = model.transition_bias
    self.assertNotAlmostEqual(transition_bias_1, transition_bias_2)
    model.logger.print(
        3, 'Asserting transition_bias changed from {} to {}'.format(
            transition_bias_1, transition_bias_2))

    # run evaluation
    model.logger.print(
        3, 'Asserting the equivalence between'
        '\nGround truth: {}\nPredicted: {}'.format(test_cluster_id,
                                                   predicted_label))
    accuracy = uisrnn.compute_sequence_match_accuracy(predicted_label,
                                                      test_cluster_id)
    self.assertEqual(1.0, accuracy)
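These tests score predictions with uisrnn.compute_sequence_match_accuracy, which compares two label sequences under the best matching between their cluster IDs. A small illustration of the expected behavior (the exact value is an assumption from that best-match semantics):

# The metric is invariant to cluster naming: 'A'/'B' vs. 0/1 still matches.
acc = uisrnn.compute_sequence_match_accuracy(['A', 'A', 'B', 'B'],
                                             [0, 0, 1, 1])
print(acc)  # expected: 1.0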
def main(wav_path, check, embedding_per_second=1.0, overlap_rate=0.5):
    # gpu configuration
    toolkits.initialize_GPU(args)

    params = {
        'dim': (257, None, 1),
        'nfft': 512,
        'spec_len': 250,
        'win_length': 400,
        'hop_length': 160,
        'n_classes': 5994,
        'sampling_rate': 16000,
        'normalize': True,
    }

    network_eval = spkModel.vggvox_resnet2d_icassp(
        input_dim=params['dim'],
        num_class=params['n_classes'],
        mode='eval',
        args=args)
    network_eval.load_weights(args.resume, by_name=True)

    model_args, _, inference_args = uisrnn.parse_arguments()
    model_args.observation_dim = 512
    uisrnnModel = uisrnn.UISRNN(model_args)
    uisrnnModel.load(SAVED_MODEL_NAME)

    specs, intervals = load_data(wav_path,
                                 embedding_per_second=embedding_per_second,
                                 overlap_rate=overlap_rate)
    mapTable, keys = genMap(intervals)

    if check != '':
        specs1, interval1 = load_data(check,
                                      embedding_per_second=1.2,
                                      overlap_rate=0.4)
        mapTable1, keys1 = genMap(interval1)

    feats = []
    for spec in specs:
        spec = np.expand_dims(np.expand_dims(spec, 0), -1)
        v = network_eval.predict(spec)
        feats += [v]
    featss = np.array(feats)[:, 0, :].astype(float)

    predicted_label = uisrnnModel.predict(featss, inference_args)
    total_speaker = len(set(predicted_label))

    global no_speakers
    no_speakers = len(set(predicted_label))
    print("predicted_label: %s" % predicted_label)
    print('total no of speakers', no_speakers)

    time_spec_rate = 1000 * (1.0 / embedding_per_second) * (
        1.0 - overlap_rate)  # speaker embedding every ?ms

    if check != '':
        # append the check utterance's embeddings and re-cluster
        for spec1 in specs1:
            spec1 = np.expand_dims(np.expand_dims(spec1, 0), -1)
            v = network_eval.predict(spec1)
            feats += [v]
        featss = np.array(feats)[:, 0, :].astype(float)  # [splits, embedding dim]
        predicted_label2 = uisrnnModel.predict(featss, inference_args)
        check_speaker = len(set(predicted_label2))
        print("predicted_label2: %s" % predicted_label2)
        print('same Speaker' if total_speaker == check_speaker
              else 'not the same speaker')
        if total_speaker == check_speaker:
            print('speaker detected as ' + str(predicted_label2[-1]))

        speakerSlice2 = arrangeResult(predicted_label2, time_spec_rate)
        # time map to original wav (contains mute)
        for spk, timeDicts in speakerSlice2.items():
            for tid, timeDict in enumerate(timeDicts):
                s = 0
                e = 0
                for i, key in enumerate(keys):
                    if s != 0 and e != 0:
                        break
                    if s == 0 and key > timeDict['start']:
                        offset = timeDict['start'] - keys[i - 1]
                        s = mapTable[keys[i - 1]] + offset
                    if e == 0 and key > timeDict['stop']:
                        offset = timeDict['stop'] - keys[i - 1]
                        e = mapTable[keys[i - 1]] + offset
                speakerSlice2[spk][tid]['start'] = s
                speakerSlice2[spk][tid]['stop'] = e
        for spk, timeDicts in speakerSlice2.items():
            print('========= ' + str(spk) + ' =========')
            for timeDict in timeDicts:
                s = fmtTime(timeDict['start'])  # change point moves to the center of the slice
                e = fmtTime(timeDict['stop'])
                print(s + ' ==> ' + e)

    center_duration = int(1000 * (1.0 / embedding_per_second) // 2)
    speakerSlice = arrangeResult(predicted_label, time_spec_rate)

    # time map to original wav (contains mute)
    for spk, timeDicts in speakerSlice.items():
        for tid, timeDict in enumerate(timeDicts):
            s = 0
            e = 0
            for i, key in enumerate(keys):
                if s != 0 and e != 0:
                    break
                if s == 0 and key > timeDict['start']:
                    offset = timeDict['start'] - keys[i - 1]
                    s = mapTable[keys[i - 1]] + offset
                if e == 0 and key > timeDict['stop']:
                    offset = timeDict['stop'] - keys[i - 1]
                    e = mapTable[keys[i - 1]] + offset
            speakerSlice[spk][tid]['start'] = s
            speakerSlice[spk][tid]['stop'] = e

    for spk, timeDicts in speakerSlice.items():
        print('========= ' + str(spk) + ' =========')
        for timeDict in timeDicts:
            s = fmtTime(timeDict['start'])  # change point moves to the center of the slice
            e = fmtTime(timeDict['stop'])
            print(s + ' ==> ' + e)
def main(): """The main function.""" model_args, training_args, inference_args = uisrnn.parse_arguments() diarization_experiment(model_args, training_args, inference_args)
def main(wav_path, embedding_per_second=1.0, overlap_rate=0.5):
    # gpu configuration
    toolkits.initialize_GPU(args)

    params = {
        'dim': (257, None, 1),
        'nfft': 512,
        'spec_len': 250,
        'win_length': 400,
        'hop_length': 160,
        'n_classes': 5994,
        'sampling_rate': 16000,
        'normalize': True,
    }

    network_eval = spkModel.vggvox_resnet2d_icassp(
        input_dim=params['dim'],
        num_class=params['n_classes'],
        mode='eval',
        args=args)
    network_eval.load_weights(args.resume, by_name=True)

    model_args, _, inference_args = uisrnn.parse_arguments()
    model_args.observation_dim = 512
    uisrnnModel = uisrnn.UISRNN(model_args)
    uisrnnModel.load(SAVED_MODEL_NAME)

    specs, intervals = load_data(wav_path,
                                 embedding_per_second=embedding_per_second,
                                 overlap_rate=overlap_rate)
    mapTable, keys = genMap(intervals)

    feats = []
    for spec in specs:
        spec = np.expand_dims(np.expand_dims(spec, 0), -1)
        v = network_eval.predict(spec)
        feats += [v]
    feats = np.array(feats)[:, 0, :].astype(float)  # [splits, embedding dim]

    predicted_label = uisrnnModel.predict(feats, inference_args)

    time_spec_rate = 1000 * (1.0 / embedding_per_second) * (
        1.0 - overlap_rate)  # speaker embedding every ?ms
    center_duration = int(1000 * (1.0 / embedding_per_second) // 2)
    speakerSlice = arrangeResult(predicted_label, time_spec_rate)

    # time map to original wav (contains mute)
    for spk, timeDicts in speakerSlice.items():
        for tid, timeDict in enumerate(timeDicts):
            s = 0
            e = 0
            for i, key in enumerate(keys):
                if s != 0 and e != 0:
                    break
                if s == 0 and key > timeDict['start']:
                    offset = timeDict['start'] - keys[i - 1]
                    s = mapTable[keys[i - 1]] + offset
                if e == 0 and key > timeDict['stop']:
                    offset = timeDict['stop'] - keys[i - 1]
                    e = mapTable[keys[i - 1]] + offset
            speakerSlice[spk][tid]['start'] = s
            speakerSlice[spk][tid]['stop'] = e

    # p = PlotDiar(map=speakerSlice, wav=wav_path, gui=True, size=(25, 6))
    # p.draw()
    # p.plot.show()

    speech_r = speech_reg.Recognizer()
    sound = AudioSegment.from_wav(wav_path)
    for spk in speakerSlice.keys():
        print('========= ' + str(spk) + ' =========')
        for item_dict in speakerSlice[spk]:
            audio_seg = sound[item_dict['start']:item_dict['stop']]
            s = fmtTime(item_dict['start'])  # change point moves to the center of the slice
            e = fmtTime(item_dict['stop'])
            print(s + ' ==> ' + e)

            filename = ('speaker' + str(spk) + '-' +
                        str(item_dict['start'] / 1000) + '-' +
                        str(item_dict['stop'] / 1000) + '.wav')
            audio_seg.export(filename, format="wav")
            audio = speech_reg.AudioFile(filename)
            # words=speech_reg.AudioData(audio_seg,sample_rate=fs,sample_width=2)
            with audio as source:
                words = speech_r.record(source)
            try:
                res = speech_r.recognize_google(words)
            except speech_reg.UnknownValueError:
                try:
                    res = speech_r.recognize_sphinx(words)
                except speech_reg.UnknownValueError:
                    res = ''
            item_dict.update({'content': res})
            print(res)
    return speakerSlice
def main(wav_path, embedding_per_second=1.0, n_classes=5994,
         overlap_rate=0.5, plot_results=True):
    # gpu configuration
    toolkits.initialize_GPU(args)

    params = {
        'dim': (257, None, 1),
        'nfft': 512,
        'spec_len': 250,
        'win_length': 400,
        'hop_length': 160,
        'n_classes': n_classes,  # wire the parameter through instead of hard-coding
        'sampling_rate': 16000,
        'normalize': True,
    }

    network_eval = spkModel.vggvox_resnet2d_icassp(
        input_dim=params['dim'],
        num_class=params['n_classes'],
        mode='eval',
        args=args)
    network_eval.load_weights(args.resume, by_name=True)

    # must stay uncommented: model_args and inference_args are used below
    model_args, _, inference_args = uisrnn.parse_arguments()
    model_args.observation_dim = 512
    uisrnnModel = uisrnn.UISRNN(model_args)
    uisrnnModel.load(SAVED_MODEL_NAME)

    specs, intervals = load_data(wav_path,
                                 embedding_per_second=embedding_per_second,
                                 overlap_rate=overlap_rate)
    mapTable, keys = genMap(intervals)
    print('intervals', intervals, len(intervals))
    print('mapTable', mapTable, len(mapTable))
    print('keys', keys, len(keys))

    feats = []
    for spec in specs:
        spec = np.expand_dims(np.expand_dims(spec, 0), -1)
        v = network_eval.predict(spec)
        feats += [v]
    feats = np.array(feats)[:, 0, :].astype(float)  # [splits, embedding dim]

    predicted_label = uisrnnModel.predict(feats, inference_args)
    print(feats.shape)
    print(inference_args)
    print('predicted_label', predicted_label)

    time_spec_rate = 1000 * (1.0 / embedding_per_second) * (
        1.0 - overlap_rate)  # speaker embedding every ?ms
    print('time_spec_rate', time_spec_rate)
    center_duration = int(1000 * (1.0 / embedding_per_second) // 2)
    speakerSlice = arrangeResult(predicted_label, time_spec_rate)
    print('speakerSlice', speakerSlice)

    # time map to original wav (contains mute)
    for spk, timeDicts in speakerSlice.items():
        for tid, timeDict in enumerate(timeDicts):
            s = 0
            e = 0
            for i, key in enumerate(keys):
                if s != 0 and e != 0:
                    break
                if s == 0 and key > timeDict['start']:
                    offset = timeDict['start'] - keys[i - 1]
                    s = mapTable[keys[i - 1]] + offset
                if e == 0 and key > timeDict['stop']:
                    offset = timeDict['stop'] - keys[i - 1]
                    e = mapTable[keys[i - 1]] + offset
            speakerSlice[spk][tid]['start'] = s
            speakerSlice[spk][tid]['stop'] = e

    speaker_assignments = []
    for spk, timeDicts in speakerSlice.items():
        speaker = str(spk)
        print('========= ' + str(spk) + ' =========')
        for timeDict in timeDicts:
            start = fmtTime(timeDict['start'])  # change point moves to the center of the slice
            end = fmtTime(timeDict['stop'])
            print(start + ' ==> ' + end)
            speaker_assignments.append((start, end, speaker, wav_path))

    if plot_results:
        p = PlotDiar(map=speakerSlice, wav=wav_path, gui=True, size=(25, 6))
        p.draw()
        p.plot.show()

    return feats, predicted_label, intervals, speaker_assignments, time_spec_rate
def main(wav_path, embedding_per_second=1.0, overlap_rate=0.5,
         exportFile=None, expectedSpeakers=2):
    # gpu configuration
    toolkits.initialize_GPU(args)

    params = {
        'dim': (257, None, 1),
        'nfft': 512,
        'spec_len': 250,
        'win_length': 400,
        'hop_length': 160,
        'n_classes': 5994,
        'sampling_rate': 16000,
        'normalize': True,
    }

    network_eval = spkModel.vggvox_resnet2d_icassp(
        input_dim=params['dim'],
        num_class=params['n_classes'],
        mode='eval',
        args=args)
    network_eval.load_weights(args.resume, by_name=True)

    model_args, _, inference_args = uisrnn.parse_arguments()
    model_args.observation_dim = 512
    uisrnnModel = uisrnn.UISRNN(model_args)
    uisrnnModel.load(SAVED_MODEL_NAME)

    specs, intervals = load_data(wav_path,
                                 embedding_per_second=embedding_per_second,
                                 overlap_rate=overlap_rate)
    mapTable, keys = genMap(intervals)

    feats = []
    for spec in specs:
        spec = np.expand_dims(np.expand_dims(spec, 0), -1)
        v = network_eval.predict(spec)
        feats += [v]
    feats = np.array(feats)[:, 0, :].astype(float)  # [splits, embedding dim]

    predicted_label = uisrnnModel.predict(feats, inference_args)

    time_spec_rate = 1000 * (1.0 / embedding_per_second) * (
        1.0 - overlap_rate)  # speaker embedding every ?ms
    center_duration = int(1000 * (1.0 / embedding_per_second) // 2)
    speakerSlice = arrangeResult(predicted_label, time_spec_rate)

    # time map to original wav (contains mute)
    for spk, timeDicts in speakerSlice.items():
        for tid, timeDict in enumerate(timeDicts):
            s = 0
            e = 0
            for i, key in enumerate(keys):
                if s != 0 and e != 0:
                    break
                if s == 0 and key > timeDict['start']:
                    offset = timeDict['start'] - keys[i - 1]
                    s = mapTable[keys[i - 1]] + offset
                if e == 0 and key > timeDict['stop']:
                    offset = timeDict['stop'] - keys[i - 1]
                    e = mapTable[keys[i - 1]] + offset
            speakerSlice[spk][tid]['start'] = s
            speakerSlice[spk][tid]['stop'] = e

    n_speakers = len(speakerSlice)
    print('N-Speakers:', n_speakers)

    global speaker_final
    speaker_final = [pdb.empty()] * n_speakers

    for spk, timeDicts in speakerSlice.items():
        print('========= ' + str(spk) + ' =========')
        for timeDict in timeDicts:
            s = timeDict['start']
            e = timeDict['stop']
            diarization_try(wav_path, s / 1000, e / 1000, spk)
            s = fmtTime(s)  # change point moves to the center of the slice
            e = fmtTime(e)
            print(s + ' ==> ' + e)

    # Find the top-n speakers by total duration
    speaker_final.sort(key=lambda speaker: speaker.duration_seconds,
                       reverse=True)
    speaker_final = speaker_final[:expectedSpeakers]

    # Export the files
    for itr, speaker in enumerate(speaker_final):
        write_path = exportFile + "_speaker" + str(itr) + ".wav"
        speaker.export(write_path, format="wav")
    del speaker_final
def main(wav_path, embedding_per_second=1.0, overlap_rate=0.5,
         retain_audio_clip=False):
    # gpu configuration
    toolkits.initialize_GPU(args)

    params = {
        'dim': (257, None, 1),
        'nfft': 512,
        'spec_len': 250,
        'win_length': 400,
        'hop_length': 160,
        'n_classes': 5994,
        'sampling_rate': 16000,
        'normalize': True,
    }

    network_eval = spkModel.vggvox_resnet2d_icassp(
        input_dim=params['dim'],
        num_class=params['n_classes'],
        mode='eval',
        args=args)
    network_eval.load_weights(args.resume, by_name=True)

    model_args, _, inference_args = uisrnn.parse_arguments()
    model_args.observation_dim = 512
    uisrnnModel = uisrnn.UISRNN(model_args)
    uisrnnModel.load(SAVED_MODEL_NAME)

    specs, intervals = load_data(wav_path,
                                 embedding_per_second=embedding_per_second,
                                 overlap_rate=overlap_rate)
    mapTable, keys = genMap(intervals)

    feats = []
    for spec in specs:
        spec = np.expand_dims(np.expand_dims(spec, 0), -1)
        v = network_eval.predict(spec)
        feats += [v]
    feats = np.array(feats)[:, 0, :].astype(float)  # [splits, embedding dim]

    predicted_label = uisrnnModel.predict(feats, inference_args)

    time_spec_rate = 1000 * (1.0 / embedding_per_second) * (
        1.0 - overlap_rate)  # speaker embedding every ?ms
    center_duration = int(1000 * (1.0 / embedding_per_second) // 2)
    speakerSlice = arrangeResult(predicted_label, time_spec_rate)

    # time map to original wav (contains mute)
    for spk, timeDicts in speakerSlice.items():
        for tid, timeDict in enumerate(timeDicts):
            s = 0
            e = 0
            for i, key in enumerate(keys):
                if s != 0 and e != 0:
                    break
                if s == 0 and key > timeDict['start']:
                    offset = timeDict['start'] - keys[i - 1]
                    s = mapTable[keys[i - 1]] + offset
                if e == 0 and key > timeDict['stop']:
                    offset = timeDict['stop'] - keys[i - 1]
                    e = mapTable[keys[i - 1]] + offset
            speakerSlice[spk][tid]['start'] = s
            speakerSlice[spk][tid]['stop'] = e

    for spk, timeDicts in speakerSlice.items():
        for timeDict in timeDicts:
            get_transcript(str(spk), timeDict['start'], timeDict['stop'])

    result = print_transcipt()
    try:
        # open the transcript file once so the handle always exists and
        # every line lands in a single file
        with open(os.path.join(dir_name, 'FinalTranscript.txt'), 'a') as file:
            for item in result:
                start = fmtTime(item[1])
                end = fmtTime(item[2])
                transcription = (f"{start} ==> {end}: "
                                 f"[Speaker : {item[0]}] : {item[3]}")
                print(transcription)
                file.write(transcription)
    except Exception as exp:
        print(f"Failed in main() while writing to file with exception {exp}")

    if not retain_audio_clip:
        shutil.rmtree(dir_name)
    else:
        print(f'Audio files of transcriptions can be found in {dir_name} folder')

    p = PlotDiar(map=speakerSlice, wav=wav_path, gui=True, size=(25, 6))
    p.draw()
    p.plot.show()
    return result