def test_fit_list_and_predict_single_label(self):
    """Fit on a list of single-label sequences, then predict.

    The training sequences are passed as a list (they are not concatenated)
    and every observation carries the same label 'A'. The prediction for a
    short test sequence is expected to contain a single cluster id.
    """
    model_args, training_args, inference_args = uisrnn.parse_arguments()

    # Small model configuration, with CUDA disabled.
    model_args.enable_cuda = False
    model_args.rnn_depth = 1
    model_args.rnn_hidden_size = 8
    model_args.observation_dim = 16
    training_args.learning_rate = 0.01
    training_args.train_iteration = 50
    inference_args.test_iteration = 1

    # Fake training data: three separate sequences, all labelled 'A'.
    sequence_lengths = (100, 200, 300)
    train_sequences = [
        np.random.rand(length, model_args.observation_dim)
        for length in sequence_lengths
    ]
    train_cluster_ids = [
        np.array(['A'] * length) for length in sequence_lengths
    ]

    model = uisrnn.UISRNN(model_args)
    model.fit(train_sequences, train_cluster_ids, training_args)

    # Test data is scaled down, so it has less variation than training data.
    test_sequence = np.random.rand(10, model_args.observation_dim) / 10.0
    predicted_label = model.predict(test_sequence, inference_args)
    expected_label = [0] * 10
    self.assertListEqual(expected_label, predicted_label)
def test_fit_concatenated_and_predict_single_label(self):
    """Fit on one concatenated single-label sequence, then predict.

    The training data is a single, already concatenated sequence in which
    every observation carries the label 'A'. The prediction for a short test
    sequence is expected to contain a single cluster id.
    """
    model_args, training_args, inference_args = uisrnn.parse_arguments()

    # Small model configuration.
    model_args.rnn_depth = 1
    model_args.rnn_hidden_size = 8
    model_args.observation_dim = 16
    training_args.learning_rate = 0.01
    training_args.train_iteration = 50
    inference_args.test_iteration = 1

    # Fake training data, assumed to be concatenated already.
    num_observations = 1000
    train_sequence = np.random.rand(
        num_observations, model_args.observation_dim)
    train_cluster_id = np.array(['A'] * num_observations)

    model = uisrnn.UISRNN(model_args)
    model.fit(train_sequence, train_cluster_id, training_args)

    # Test data is scaled down, so it has less variation than training data.
    test_sequence = np.random.rand(10, model_args.observation_dim) / 10.0
    predicted_label = model.predict(test_sequence, inference_args)
    expected_label = [0] * 10
    self.assertListEqual(expected_label, predicted_label)
def diarization_experiment(model_args, training_args, inference_args): """Experiment pipeline. Load data --> train model --> test model --> output result Args: model_args: model configurations training_args: training configurations inference_args: inference configurations """ predicted_labels = [] test_record = [] train_data = np.load('./ghostvlad/training_data1.npz') train_sequence = train_data['train_sequence'] train_cluster_id = train_data['train_cluster_id'] train_sequence_list = [ seq.astype(float) + 0.00001 for seq in train_sequence ] train_cluster_id_list = [ np.array(cid).astype(str) for cid in train_cluster_id ] model = uisrnn.UISRNN(model_args) #model.load(SAVED_MODEL_NAME) # training model.fit(train_sequence_list, train_cluster_id_list, training_args) model.save(SAVED_MODEL_NAME + str(2)) '''
def diarization_experiment(model_args, training_args, inference_args): """Experiment pipeline. Load data --> train model --> test model --> output result Args: model_args: model configurations training_args: training configurations inference_args: inference configurations """ predicted_labels = [] test_record = [] train_data = np.load('./training_data.npz') test_data = np.load('./data/testing_data.npz') train_sequence = train_data['train_sequence'] train_cluster_id = train_data['train_cluster_id'] test_sequences = test_data['test_sequences'] test_cluster_ids = test_data['test_cluster_ids'] model = uisrnn.UISRNN(model_args) print("train_sequence = {}".format(train_sequence.shape)) print("train_cluster_id = {}".format(train_cluster_id.shape)) print(train_cluster_id[:1000]) '''
def test_save_and_load(self):
    """Save a model to disk, load it back and check transition_bias."""
    import os  # local import: only needed to build the temporary file path

    model_args, _, _ = uisrnn.parse_arguments()
    model_args.observation_dim = 16
    model_args.transition_bias = 0.5
    model_args.sigma2 = 0.05
    model = uisrnn.UISRNN(model_args)

    # tempfile.mktemp() is deprecated (the returned name can be claimed by
    # another process before it is used) and the file was never removed.
    # A temporary directory gives a private path and cleans up after itself.
    with tempfile.TemporaryDirectory() as temp_dir:
        temp_file_path = os.path.join(temp_dir, 'model.uisrnn')
        model.save(temp_file_path)
        model.load(temp_file_path)

    self.assertEqual(0.5, model.transition_bias)
def diarization_experiment(model_args, training_args, inference_args,
                           isLoaded=True):
    """Experiment pipeline.

    Load data --> train (or load) model --> test model --> output result

    Args:
      model_args: model configurations
      training_args: training configurations
      inference_args: inference configurations
      isLoaded: when True (default) the model is loaded from
        SAVED_MODEL_NAME; when False it is trained and then saved there.
    """
    archive = np.load('./ghostvlad/training_data_100.npz', allow_pickle=True)
    train_sequence = archive['train_sequence']
    train_cluster_id = archive['train_cluster_id']

    # A small constant is added to every observation; ids become strings.
    train_sequence_list = [seq.astype(float) + 1e-5 for seq in train_sequence]
    train_cluster_id_list = [
        np.array(cid).astype(str) for cid in train_cluster_id
    ]

    # The evaluation data is the second-to-last training sequence.
    test_sequences = train_sequence_list[-2:-1]
    test_cluster_ids = [ids.tolist() for ids in train_cluster_id_list[-2:-1]]

    model = uisrnn.UISRNN(model_args)
    if isLoaded:
        # Skip training and reuse the previously saved model.
        model.load(SAVED_MODEL_NAME)
    else:
        model.fit(train_sequence_list, train_cluster_id_list, training_args)
        model.save(SAVED_MODEL_NAME)

    # NOTE(review): the original one-line layout does not show whether the
    # evaluation below was meant to run only for a loaded model or in both
    # cases; here it runs in both cases - confirm.
    predicted_labels = []
    test_record = []
    for sequence, truth in zip(test_sequences, test_cluster_ids):
        predicted_label = model.predict(sequence, inference_args)
        predicted_labels.append(predicted_label)
        accuracy = uisrnn.compute_sequence_match_accuracy(
            truth, predicted_label)
        test_record.append((accuracy, len(truth)))
        print('Ground truth labels:')
        print(truth)
        print('Predicted labels:')
        print(predicted_label)
        print('-' * 80)

    output_string = uisrnn.output_result(
        model_args, training_args, test_record)
    print('Finished diarization experiment')
    print(output_string)
def diarization_experiment(model_args, training_args, inference_args):
    """Run the toy diarization experiment end to end.

    Load data --> train model --> test model --> output result

    Args:
      model_args: model configurations
      training_args: training configurations
      inference_args: inference configurations
    """
    train_data = np.load('./data/toy_training_data.npz', allow_pickle=True)
    test_data = np.load('./data/toy_testing_data.npz', allow_pickle=True)
    train_sequence = train_data['train_sequence']
    train_cluster_id = train_data['train_cluster_id']
    test_sequences = test_data['test_sequences'].tolist()
    test_cluster_ids = test_data['test_cluster_ids'].tolist()

    model = uisrnn.UISRNN(model_args)

    # Training.
    # If a model has been saved previously, training can be skipped by
    # calling this instead:
    #   model.load(SAVED_MODEL_NAME)
    model.fit(train_sequence, train_cluster_id, training_args)
    model.save(SAVED_MODEL_NAME)

    # Testing.
    # uisrnn.parallel_predict can be tried to speed this up with GPU, but
    # it is a beta feature which is not thoroughly tested, so proceed with
    # caution.
    predicted_cluster_ids = []
    test_record = []
    for sequence, truth in zip(test_sequences, test_cluster_ids):
        prediction = model.predict(sequence, inference_args)
        predicted_cluster_ids.append(prediction)
        accuracy = uisrnn.compute_sequence_match_accuracy(truth, prediction)
        test_record.append((accuracy, len(truth)))
        print('Ground truth labels:')
        print(truth)
        print('Predicted labels:')
        print(prediction)
        print('-' * 80)

    output_string = uisrnn.output_result(
        model_args, training_args, test_record)
    print('Finished diarization experiment')
    print(output_string)
def diarize(segments, sr=16000, win_len=400, hop_len=160,
            embedding_per_sec=1.0, overlap_rate=0.1):
    """Tag every segment with a speaker label.

    A GhostVLAD network turns the audio of `segments` into embeddings, a
    pretrained UIS-RNN model clusters the embeddings, and each segment
    receives the most frequent label among the embeddings that fall inside
    its time span.

    Args:
      segments: re-iterable collection of segment objects exposing `begin`
        and `end`; a `speaker` attribute is set on each of them.
      sr: forwarded to prepare_ghostvlad_data.
      win_len: forwarded to prepare_ghostvlad_data.
      hop_len: forwarded to prepare_ghostvlad_data.
      embedding_per_sec: forwarded to prepare_ghostvlad_data; also used to
        derive how much time one embedding label covers.
      overlap_rate: forwarded to prepare_ghostvlad_data; also used to derive
        how much time one embedding label covers.

    Returns:
      The `segments` argument itself, with `speaker` set on every segment.
      Segments that no embedding covers get the speaker value 999.
    """
    unknown_speaker = 999  # label used when no embedding covers a segment

    logger.debug("[Speaker diarization] Initializing models")
    # Initialize ghostvlad
    toolkits.initialize_GPU(Expando({"gpu": ""}))
    ghostvlad_model = model.vggvox_resnet2d_icassp(
        input_dim=(257, None, 1),
        num_class=5994,
        mode="eval",
        args=Expando({
            "net": "resnet34s",
            "loss": "softmax",
            "vlad_cluster": 8,
            "ghost_cluster": 2,
            "bottleneck_dim": 512,
            "aggregation_mode": "gvlad",
        }))
    ghostvlad_model.load_weights("ghostvlad/pretrained/weights.h5",
                                 by_name=True)

    # Initialize uisrnn. parse_arguments() is called with only the program
    # name in sys.argv; the caller's argv is restored afterwards so that this
    # function no longer truncates it permanently.
    saved_argv = sys.argv
    sys.argv = saved_argv[:1]
    try:
        model_args, _, inference_args = uisrnn.parse_arguments()
    finally:
        sys.argv = saved_argv
    model_args.observation_dim = 512
    uisrnn_model = uisrnn.UISRNN(model_args)
    uisrnn_model.load("uisrnn/pretrained/saved_model.uisrnn_benchmark")

    logger.debug("[Speaker diarization] Calculating utterance features")
    utterances_spec = prepare_ghostvlad_data(
        segments, sr, win_len, hop_len, embedding_per_sec, overlap_rate)
    feats = []
    for spec in utterances_spec:
        # Add leading and trailing singleton axes before calling the network.
        spec = np.expand_dims(np.expand_dims(spec, 0), -1)
        feats.append(ghostvlad_model.predict(spec))

    if not feats:
        # Without any embedding the [:, 0, :] indexing below would raise;
        # fall back to the "unknown" label for every segment instead.
        for segment in segments:
            segment.speaker = unknown_speaker
        return segments

    feats = np.array(feats)[:, 0, :].astype(float)

    logger.debug("[Speaker diarization] Clustering utterance features")
    labels = uisrnn_model.predict(feats, inference_args)

    logger.debug("[Speaker diarization] Tagging segments speakers")
    # Time step between consecutive labels, in the unit of segment.begin/end.
    embedding_duration = (1 / embedding_per_sec) * (1.0 - overlap_rate)
    labels_count = len(labels)
    current = 0  # accumulated duration of the segments processed so far
    for segment in segments:
        begin_index = math.floor(current / embedding_duration)
        current += segment.end - segment.begin
        end_index = math.ceil(current / embedding_duration)
        segment_labels = [
            labels[index]
            for index in range(begin_index, min(end_index, labels_count))
        ]
        if segment_labels:
            # Majority vote over the labels inside the segment.
            segment.speaker = max(segment_labels, key=segment_labels.count)
        else:
            segment.speaker = unknown_speaker
    return segments
def diarization_experiment(model_args, training_args, inference_args):
    """Experiment pipeline.

    Load data --> train model --> test model --> output result

    Args:
      model_args: model configurations
      training_args: training configurations
      inference_args: inference configurations
    """
    predicted_cluster_ids = []
    test_record = []

    # The test arrays are converted with .tolist() into per-utterance items,
    # i.e. they are handled as object arrays. NumPy >= 1.16.3 refuses to load
    # those unless allow_pickle=True is passed. Only use this on trusted
    # files: unpickling can execute arbitrary code.
    train_data = np.load('./data/toy_training_data.npz', allow_pickle=True)
    test_data = np.load('./data/toy_testing_data.npz', allow_pickle=True)
    train_sequence = train_data['train_sequence']
    train_cluster_id = train_data['train_cluster_id']
    test_sequences = test_data['test_sequences'].tolist()
    test_cluster_ids = test_data['test_cluster_ids'].tolist()

    model = uisrnn.UISRNN(model_args)

    # training
    model.fit(train_sequence, train_cluster_id, training_args)
    model.save(SAVED_MODEL_NAME)
    # we can also skip training by calling:
    # model.load(SAVED_MODEL_NAME)

    # testing
    for test_sequence, test_cluster_id in zip(test_sequences,
                                              test_cluster_ids):
        predicted_cluster_id = model.predict(test_sequence, inference_args)
        predicted_cluster_ids.append(predicted_cluster_id)
        accuracy = uisrnn.compute_sequence_match_accuracy(
            test_cluster_id, predicted_cluster_id)
        # Each record pairs the accuracy with the sequence length.
        test_record.append((accuracy, len(test_cluster_id)))
        print('Ground truth labels:')
        print(test_cluster_id)
        print('Predicted labels:')
        print(predicted_cluster_id)
        print('-' * 80)

    output_string = uisrnn.output_result(
        model_args, training_args, test_record)
    print('Finished diarization experiment')
    print(output_string)
def test_fit_with_wrong_dim(self):
    """fit() must raise ValueError when the data width is wrong."""
    model_args, training_args, _ = uisrnn.parse_arguments()

    # Small model configuration; the model expects 16-dim observations.
    model_args.rnn_depth = 1
    model_args.rnn_hidden_size = 8
    model_args.observation_dim = 16
    training_args.learning_rate = 0.01
    training_args.train_iteration = 5

    # Fake data with 18 columns, which does not match observation_dim.
    num_observations = 1000
    train_sequence = np.random.rand(num_observations, 18)
    train_cluster_id = np.array(['A'] * num_observations)

    model = uisrnn.UISRNN(model_args)
    self.assertRaises(
        ValueError, model.fit, train_sequence, train_cluster_id,
        training_args)
def test_predict_with_wrong_dim(self):
    """predict() must raise ValueError when the data width is wrong."""
    model_args, training_args, inference_args = uisrnn.parse_arguments()

    # Small model configuration, with CUDA disabled.
    model_args.enable_cuda = False
    model_args.rnn_depth = 1
    model_args.rnn_hidden_size = 8
    model_args.observation_dim = 16
    training_args.learning_rate = 0.01
    training_args.train_iteration = 50

    # Fake training data with the dimension the model expects.
    num_observations = 1000
    train_sequence = np.random.rand(
        num_observations, model_args.observation_dim)
    train_cluster_id = np.array(['A'] * num_observations)

    model = uisrnn.UISRNN(model_args)
    model.fit(train_sequence, train_cluster_id, training_args)

    # Test data with 18 columns, which does not match observation_dim.
    test_sequence = np.random.rand(10, 18)
    self.assertRaises(
        ValueError, model.predict, test_sequence, inference_args)
def process(wav_path, embedding_per_second=1.0, overlap_rate=0.5,
            after_shift=0, output_seg=False, show=False,
            segment_fn='output.seg', args=None):
    """Diarize one audio file and return its speaker segments.

    Loads the speaker-embedding network and the UIS-RNN model, predicts one
    label per embedding, groups the labels into time slices per speaker, maps
    the slice boundaries back through the table built by genMap(), and
    optionally writes and/or plots the result. Timing information is printed
    along the way.

    Args:
      wav_path: path of the audio file; passed to load_data and PlotDiar.
      embedding_per_second: forwarded to load_data; also used to compute the
        time step between consecutive labels.
      overlap_rate: forwarded to load_data; also used to compute the time
        step between consecutive labels.
      after_shift: offset added to the start and end of every returned
        segment.
      output_seg: when True, the segments are written to `segment_fn`.
      show: when True, the result is displayed with PlotDiar.
      segment_fn: path of the tab-separated segment file.
      args: network configuration object; a default one is built when None.

    Returns:
      A list of (start, end, speaker) tuples. Start and end are the mapped
      slice boundaries divided by 1000, plus `after_shift`.
    """
    if args is None:
        # NOTE(review): the original one-line layout does not show where
        # this branch ends; all default settings are assumed to belong to
        # it - confirm.
        args = Args()
        args.gpu = ''
        args.resume = os.path.join(BASE_DIR,
                                   'ghostvlad/pretrained/weights.h5')
        args.data_path = '4persons'
        # set up network configuration.
        args.net = 'resnet34s'  # choices: 'resnet34s', 'resnet34l'
        args.ghost_cluster = 2
        args.vlad_cluster = 8
        args.bottleneck_dim = 512
        args.aggregation_mode = 'gvlad'  # choices: 'avg', 'vlad', 'gvlad'
        # set up learning rate, training loss and optimizer.
        args.loss = 'softmax'  # choices: 'softmax', 'amsoftmax'
        args.test_type = 'normal'  # choices: 'normal', 'hard', 'extend'

    # gpu configuration
    toolkits.initialize_GPU(args)

    params = {
        'dim': (257, None, 1),
        'nfft': 512,
        'spec_len': 250,
        'win_length': 400,
        'hop_length': 160,
        'n_classes': 5994,
        'sampling_rate': 16000,
        'normalize': True,
    }

    t0 = time.time()
    network_eval = spkModel.vggvox_resnet2d_icassp(
        input_dim=params['dim'],
        num_class=params['n_classes'],
        mode='eval',
        args=args)
    network_eval.load_weights(args.resume, by_name=True)

    # The UIS-RNN configuration is built by hand instead of through
    # uisrnn.parse_arguments().
    model_args = Args()
    model_args.observation_dim = 512
    model_args.rnn_hidden_size = 512
    model_args.rnn_depth = 1
    model_args.rnn_dropout = 0.2
    model_args.transition_bias = None
    model_args.crp_alpha = 1.0
    model_args.sigma2 = None
    model_args.verbosity = 2

    inference_args = Args()
    inference_args.beam_size = 10
    inference_args.look_ahead = 1
    inference_args.test_iteration = 2

    uisrnnModel = uisrnn.UISRNN(model_args)
    uisrnnModel.load(SAVED_MODEL_NAME)
    td = time.time() - t0
    print('Load model time:', td)

    print('Loading data...')
    t0 = time.time()
    # load_data also computes the embeddings here (network_eval is passed
    # in); the first returned value is not needed by this function.
    _, intervals, feats = load_data(
        wav_path,
        embedding_per_second=embedding_per_second,
        overlap_rate=overlap_rate,
        network_eval=network_eval)
    mapTable, keys = genMap(intervals)
    td = time.time() - t0
    print('Load data time:', td)

    print('Generating feats...')
    t0 = time.time()
    feats = np.array(feats)[:, 0, :].astype(float)  # [splits, embedding dim]
    td = time.time() - t0
    print('Load feat time:', td)

    print('inference_args:', inference_args)
    print('running uisrnn.predict...')
    t0 = time.time()
    predicted_label = uisrnnModel.predict(feats, inference_args)
    td = time.time() - t0
    print('Load uisrnn.predict time:', td)

    t0 = time.time()
    # speaker embedding every ?ms
    time_spec_rate = (1000 * (1.0 / embedding_per_second)
                      * (1.0 - overlap_rate))
    speakerSlice = arrangeResult(predicted_label, time_spec_rate)
    td = time.time() - t0
    print('Load arrangeResult time:', td)

    t0 = time.time()
    # time map to origin wav (contains mute)
    for spk, timeDicts in speakerSlice.items():
        for tid, timeDict in enumerate(timeDicts):
            # 0 doubles as the "not found yet" marker for both boundaries.
            s = 0
            e = 0
            for i, key in enumerate(keys):
                if s != 0 and e != 0:
                    break
                if s == 0 and key > timeDict['start']:
                    offset = timeDict['start'] - keys[i - 1]
                    s = mapTable[keys[i - 1]] + offset
                if e == 0 and key > timeDict['stop']:
                    offset = timeDict['stop'] - keys[i - 1]
                    e = mapTable[keys[i - 1]] + offset
            speakerSlice[spk][tid]['start'] = s
            speakerSlice[spk][tid]['stop'] = e
    # Measure this step; the original printed the previous step's duration.
    td = time.time() - t0
    print('Load speakerSlicing time:', td)

    audacity_segments = []
    for spk, timeDicts in speakerSlice.items():
        for timeDict in timeDicts:
            # NOTE(review): presumably milliseconds -> seconds - confirm.
            s = timeDict['start'] / 1000. + after_shift
            e = timeDict['stop'] / 1000. + after_shift
            audacity_segments.append((s, e, spk))

    if output_seg:
        with open(segment_fn, 'w') as fout:
            for s, e, label in audacity_segments:
                # Write the label of this row; the original wrote the stale
                # loop variable `spk` (the last speaker) on every row.
                fout.write('%s\t%s\t%s\n'
                           % (round(s, 6), round(e, 6), label))

    if show:
        p = PlotDiar(map=speakerSlice, wav=wav_path, gui=True, size=(25, 6))
        p.draw()
        p.plot.show()

    return audacity_segments
# from matplotlib import cm # from time import sleep, perf_counter as timer # from umap import UMAP # import matplotlib.pyplot as plt sys.path.append("Resemblyzer") from resemblyzer import preprocess_wav, VoiceEncoder, sampling_rate # noqa # %% # Load file wav = preprocess_wav("Resemblyzer/audio_data/X2zqiX6yL3I.mp3") # %% # Audio features encoder = VoiceEncoder("cpu") _, cont_embeds, wav_splits = encoder.embed_utterance(wav, return_partials=True, rate=5) # %% # Load UIS-RNN model sys.argv = ['dummy'] model_args, training_args, inference_args = uisrnn.parse_arguments() model = uisrnn.UISRNN(model_args) model.load('uis-rnn/saved_model.uisrnn') # %% # Testing test_sequence = cont_embeds.astype(float) predictions = model.predict(test_sequence, inference_args) # %%
def diarization_experiment(model_args, training_args, inference_args):
    """Experiment pipeline.

    Load data --> load saved model --> test model --> output result
    (the training step is currently commented out).

    Args:
      model_args: model configurations
      training_args: training configurations
      inference_args: inference configurations
    """
    predicted_labels = []
    test_record = []

    # allow_pickle=True is passed directly. The previous version swapped
    # np.load for a wrapper and restored it afterwards, which changed global
    # state and left np.load patched whenever the load raised. Only use
    # allow_pickle on trusted files: unpickling can execute arbitrary code.
    train_data = np.load('./ghostvlad/training_data.npz', allow_pickle=True)
    train_sequence = train_data['train_sequence']
    train_cluster_id = train_data['train_cluster_id']

    # A small constant is added to every observation; ids become strings.
    train_sequence_list = [
        seq.astype(float) + 0.00001 for seq in train_sequence
    ]
    train_cluster_id_list = [
        np.array(cid).astype(str) for cid in train_cluster_id
    ]

    model = uisrnn.UISRNN(model_args)

    # training
    # model.fit(train_sequence_list, train_cluster_id_list, training_args)
    # model.save(SAVED_MODEL_NAME)

    # testing
    # we can also skip training by calling:
    model.load(SAVED_MODEL_NAME)
    # NOTE(review): test_sequences and test_cluster_ids are not defined in
    # this function; unless they exist at module level the loop below raises
    # NameError - confirm where the test data should come from.
    for test_sequence, test_cluster_id in zip(test_sequences,
                                              test_cluster_ids):
        predicted_label = model.predict(test_sequence, inference_args)
        predicted_labels.append(predicted_label)
        accuracy = uisrnn.compute_sequence_match_accuracy(
            test_cluster_id, predicted_label)
        # Each record pairs the accuracy with the sequence length.
        test_record.append((accuracy, len(test_cluster_id)))
        print('Ground truth labels:')
        print(test_cluster_id)
        print('Predicted labels:')
        print(predicted_label)
        print('-' * 80)

    output_string = uisrnn.output_result(
        model_args, training_args, test_record)
    print('Finished diarization experiment')
    print(output_string)
def _embed_spectrograms(network, specs):
    """Return one network prediction per spectrogram, in input order."""
    embeddings = []
    for spec in specs:
        # Add leading and trailing singleton axes before calling the network.
        spec = np.expand_dims(np.expand_dims(spec, 0), -1)
        embeddings.append(network.predict(spec))
    return embeddings


def _remap_to_original_timeline(speaker_slices, map_table, keys):
    """Rewrite slice boundaries in place using the genMap() table."""
    for spk, time_dicts in speaker_slices.items():
        for tid, time_dict in enumerate(time_dicts):
            # 0 doubles as the "not found yet" marker for both boundaries.
            s = 0
            e = 0
            for i, key in enumerate(keys):
                if s != 0 and e != 0:
                    break
                if s == 0 and key > time_dict['start']:
                    offset = time_dict['start'] - keys[i - 1]
                    s = map_table[keys[i - 1]] + offset
                if e == 0 and key > time_dict['stop']:
                    offset = time_dict['stop'] - keys[i - 1]
                    e = map_table[keys[i - 1]] + offset
            speaker_slices[spk][tid]['start'] = s
            speaker_slices[spk][tid]['stop'] = e


def _print_speaker_slices(speaker_slices):
    """Print every speaker followed by its formatted time ranges."""
    for spk, time_dicts in speaker_slices.items():
        print('========= ' + str(spk) + ' =========')
        for time_dict in time_dicts:
            s = fmtTime(time_dict['start'])
            e = fmtTime(time_dict['stop'])
            print(s + ' ==> ' + e)


def main(wav_path, check, embedding_per_second=1.0, overlap_rate=0.5):
    """Diarize `wav_path` and optionally compare it with a second file.

    The number of predicted speakers is stored in the module-level variable
    `no_speakers`, and the time ranges of each speaker are printed. When
    `check` is not empty, the embeddings of that file are appended to those
    of `wav_path`, the prediction is repeated on the combined embeddings,
    and the function prints whether the speaker count stayed the same.

    Args:
      wav_path: path of the main audio file.
      check: path of the second audio file, or '' to skip the comparison.
      embedding_per_second: forwarded to load_data for `wav_path`; also used
        to compute the time step between consecutive labels.
      overlap_rate: forwarded to load_data for `wav_path`; also used to
        compute the time step between consecutive labels.
    """
    global no_speakers

    # gpu configuration (`args` is a module-level configuration object)
    toolkits.initialize_GPU(args)

    params = {
        'dim': (257, None, 1),
        'nfft': 512,
        'spec_len': 250,
        'win_length': 400,
        'hop_length': 160,
        'n_classes': 5994,
        'sampling_rate': 16000,
        'normalize': True,
    }

    network_eval = spkModel.vggvox_resnet2d_icassp(
        input_dim=params['dim'],
        num_class=params['n_classes'],
        mode='eval',
        args=args)
    network_eval.load_weights(args.resume, by_name=True)

    model_args, _, inference_args = uisrnn.parse_arguments()
    model_args.observation_dim = 512
    uisrnnModel = uisrnn.UISRNN(model_args)
    uisrnnModel.load(SAVED_MODEL_NAME)

    specs, intervals = load_data(wav_path,
                                 embedding_per_second=embedding_per_second,
                                 overlap_rate=overlap_rate)
    mapTable, keys = genMap(intervals)
    if check != '':
        # The second file is loaded with fixed settings, independent of the
        # function arguments.
        specs1, interval1 = load_data(check, embedding_per_second=1.2,
                                      overlap_rate=0.4)
        # NOTE(review): the table of the second file is computed but not
        # used; the remapping below uses the table of `wav_path` - confirm.
        mapTable1, keys1 = genMap(interval1)

    feats = _embed_spectrograms(network_eval, specs)
    featss = np.array(feats)[:, 0, :].astype(float)
    predicted_label = uisrnnModel.predict(featss, inference_args)
    total_speaker = len(set(predicted_label))
    print("predicted_label: %s" % predicted_label)
    no_speakers = total_speaker
    print('total no of speakers', no_speakers)

    # speaker embedding every ?ms
    time_spec_rate = (1000 * (1.0 / embedding_per_second)
                      * (1.0 - overlap_rate))

    # NOTE(review): the original one-line layout does not show where this
    # branch ends; the whole comparison section is assumed to belong to it
    # - confirm.
    if check != '':
        feats += _embed_spectrograms(network_eval, specs1)
        featss = np.array(feats)[:, 0, :].astype(float)
        # [splits, embedding dim]
        print("=====================")
        print(feats)
        print(featss)
        print("=====================")
        predicted_label2 = uisrnnModel.predict(featss, inference_args)
        check_speaker = len(set(predicted_label2))
        print("predicted_label2: %s" % predicted_label2)
        same_count = total_speaker == check_speaker
        print('same Speaker' if same_count else 'not the same speaker')
        print('speaker detected as ' + str(predicted_label2[-1])
              if same_count else '')

        speakerSlice2 = arrangeResult(predicted_label2, time_spec_rate)
        print("=============speakerSlice2===============")
        # time map to origin wav (contains mute)
        _remap_to_original_timeline(speakerSlice2, mapTable, keys)
        _print_speaker_slices(speakerSlice2)
        print("=============speakerSlice2===============")

    speakerSlice = arrangeResult(predicted_label, time_spec_rate)
    # time map to origin wav (contains mute)
    _remap_to_original_timeline(speakerSlice, mapTable, keys)
    _print_speaker_slices(speakerSlice)
def test_four_clusters(self):
    """Four clusters on vertices of a square.

    Trains on a list of sequences, checks the prediction accuracy of the
    trained model and of a model re-loaded from disk, then keeps training on
    a subset and checks that transition_bias changed.
    """
    import os  # local import: only needed to build the temporary file path

    label_to_center = {
        'A': np.array([0.0, 0.0]),
        'B': np.array([0.0, 1.0]),
        'C': np.array([1.0, 0.0]),
        'D': np.array([1.0, 1.0]),
    }

    # generate training data
    train_cluster_id = (
        ['A'] * 400 + ['B'] * 300 + ['C'] * 200 + ['D'] * 100)
    random.shuffle(train_cluster_id)
    train_sequence = _generate_random_sequence(
        train_cluster_id, label_to_center, sigma=0.01)
    # The same data split into four chunks of 100, 200, 300 and 400 items.
    train_sequences = [
        train_sequence[:100, :],
        train_sequence[100:300, :],
        train_sequence[300:600, :],
        train_sequence[600:, :],
    ]
    train_cluster_ids = [
        train_cluster_id[:100],
        train_cluster_id[100:300],
        train_cluster_id[300:600],
        train_cluster_id[600:],
    ]

    # generate testing data
    test_cluster_id = ['A'] * 10 + ['B'] * 20 + ['C'] * 30 + ['D'] * 40
    random.shuffle(test_cluster_id)
    test_sequence = _generate_random_sequence(
        test_cluster_id, label_to_center, sigma=0.01)

    # construct model
    model_args, training_args, inference_args = uisrnn.parse_arguments()
    # NOTE(review): CUDA is forced on ("for prince" in the original comment,
    # presumably a specific machine) - confirm this is intended for all
    # environments the test runs in.
    model_args.enable_cuda = True
    model_args.rnn_depth = 2
    model_args.rnn_hidden_size = 8
    model_args.observation_dim = 2
    model_args.verbosity = 3
    training_args.learning_rate = 0.01
    training_args.train_iteration = 200
    training_args.enforce_cluster_id_uniqueness = False
    inference_args.test_iteration = 2

    model = uisrnn.UISRNN(model_args)
    verbose = True
    if verbose:
        print("Training prints")
        print('TYPES(seq, id):', type(train_sequences),
              type(train_cluster_ids))
        # np.shape() on this ragged list of arrays raises ValueError on
        # NumPy >= 1.24; the outer length is reported explicitly instead.
        print('emb shape:', (len(train_sequences),))
        print('label shape:', np.shape(train_sequences[0]))
        print('flat label:', np.shape(train_cluster_ids[0]))
        print('*' * 10, '\n\n')

    # run training, and save the model
    model.fit(train_sequences, train_cluster_ids, training_args)
    # tempfile.mktemp() is deprecated and the file was never removed; use a
    # private temporary directory that is deleted when the test finishes.
    temp_dir = tempfile.TemporaryDirectory()
    self.addCleanup(temp_dir.cleanup)
    temp_file_path = os.path.join(temp_dir.name, 'model.uisrnn')
    model.save(temp_file_path)

    # run testing
    predicted_label = model.predict(test_sequence, inference_args)
    if verbose:
        print("Prediction prints")
        print(type(predicted_label))
        print('*' * 10, '\n\n')

    # run evaluation
    model.logger.print(
        3, 'Asserting the equivalence between'
        '\nGround truth: {}\nPredicted: {}'.format(
            test_cluster_id, predicted_label))
    accuracy = uisrnn.compute_sequence_match_accuracy(
        predicted_label, test_cluster_id)
    self.assertEqual(1.0, accuracy)

    # load new model
    loaded_model = uisrnn.UISRNN(model_args)
    loaded_model.load(temp_file_path)

    # run testing with loaded model
    predicted_label = loaded_model.predict(test_sequence, inference_args)

    # run evaluation with loaded model
    model.logger.print(
        3, 'Asserting the equivalence between'
        '\nGround truth: {}\nPredicted: {}'.format(
            test_cluster_id, predicted_label))
    accuracy = uisrnn.compute_sequence_match_accuracy(
        predicted_label, test_cluster_id)
    self.assertEqual(1.0, accuracy)

    # keep training on a subset of training data
    # NOTE(review): training continues on `model`, not on `loaded_model`.
    transition_bias_1 = model.transition_bias
    training_args.learning_rate = 0.001
    training_args.train_iteration = 50
    model.fit(train_sequence[:100, :], train_cluster_id[:100],
              training_args)
    transition_bias_2 = model.transition_bias
    self.assertNotAlmostEqual(transition_bias_1, transition_bias_2)
    model.logger.print(
        3, 'Asserting transition_bias changed from {} to {}'.format(
            transition_bias_1, transition_bias_2))

    # run evaluation
    # NOTE(review): no new prediction is made after the extra training, so
    # this re-checks the prediction of the loaded model.
    model.logger.print(
        3, 'Asserting the equivalence between'
        '\nGround truth: {}\nPredicted: {}'.format(
            test_cluster_id, predicted_label))
    accuracy = uisrnn.compute_sequence_match_accuracy(
        predicted_label, test_cluster_id)
    self.assertEqual(1.0, accuracy)
def main(wav_path, embedding_per_second=1.0, overlap_rate=0.5):
    """Diarize `wav_path` with spectral clustering, then print and plot.

    Embeddings are computed with the speaker network and clustered with
    SpectralClusterer. The number of predicted speakers is stored in the
    module-level variable `no_speakers`, the time ranges of each speaker are
    printed, and the result is shown with PlotDiar.

    Args:
      wav_path: path of the audio file.
      embedding_per_second: forwarded to load_data; also used to compute the
        time step between consecutive labels.
      overlap_rate: forwarded to load_data; also used to compute the time
        step between consecutive labels.
    """
    global no_speakers

    # gpu configuration (`args` is a module-level configuration object)
    toolkits.initialize_GPU(args)

    params = {
        'dim': (257, None, 1),
        'nfft': 512,
        'spec_len': 250,
        'win_length': 400,
        'hop_length': 160,
        'n_classes': 5994,
        'sampling_rate': 16000,
        'normalize': True,
    }

    network_eval = spkModel.vggvox_resnet2d_icassp(
        input_dim=params['dim'],
        num_class=params['n_classes'],
        mode='eval',
        args=args)
    network_eval.load_weights(args.resume, by_name=True)

    # NOTE(review): the UIS-RNN model is still built and loaded, but the
    # labels below come from SpectralClusterer. The load is kept so that the
    # observable behavior (including its file requirement) is unchanged.
    model_args, _, inference_args = uisrnn.parse_arguments()
    model_args.observation_dim = 512
    uisrnnModel = uisrnn.UISRNN(model_args)
    uisrnnModel.load(SAVED_MODEL_NAME)

    specs, intervals = load_data(wav_path,
                                 embedding_per_second=embedding_per_second,
                                 overlap_rate=overlap_rate)
    mapTable, keys = genMap(intervals)

    feats = []
    for spec in specs:
        # Add leading and trailing singleton axes before calling the network.
        spec = np.expand_dims(np.expand_dims(spec, 0), -1)
        feats.append(network_eval.predict(spec))
    feats = np.array(feats)[:, 0, :].astype(float)  # [splits, embedding dim]

    # Disabled experiments that used to live here as commented-out code
    # (second input file, KMeans with silhouette score, t-SNE plot, UIS-RNN
    # prediction) were removed; see version control for their history.
    clusterer = SpectralClusterer(min_clusters=2,
                                  max_clusters=100,
                                  p_percentile=0.95,
                                  gaussian_blur_sigma=1)
    predicted_label = clusterer.predict(feats)
    no_speakers = len(set(predicted_label))

    # speaker embedding every ?ms
    time_spec_rate = (1000 * (1.0 / embedding_per_second)
                      * (1.0 - overlap_rate))
    speakerSlice = arrangeResult(predicted_label, time_spec_rate)

    # time map to origin wav (contains mute)
    for spk, timeDicts in speakerSlice.items():
        for tid, timeDict in enumerate(timeDicts):
            # 0 doubles as the "not found yet" marker for both boundaries.
            s = 0
            e = 0
            for i, key in enumerate(keys):
                if s != 0 and e != 0:
                    break
                if s == 0 and key > timeDict['start']:
                    offset = timeDict['start'] - keys[i - 1]
                    s = mapTable[keys[i - 1]] + offset
                if e == 0 and key > timeDict['stop']:
                    offset = timeDict['stop'] - keys[i - 1]
                    e = mapTable[keys[i - 1]] + offset
            speakerSlice[spk][tid]['start'] = s
            speakerSlice[spk][tid]['stop'] = e

    for spk, timeDicts in speakerSlice.items():
        print('========= ' + str(spk) + ' =========')
        for timeDict in timeDicts:
            s = fmtTime(timeDict['start'])
            e = fmtTime(timeDict['stop'])
            print(s + ' ==> ' + e)

    p = PlotDiar(map=speakerSlice, wav=wav_path, gui=True, size=(25, 6))
    p.draw()
    p.plot.show()
def main(wav_path, embedding_per_second=1.0, overlap_rate=0.5):
    """Diarize `wav_path` and save the result next to it.

    Embeddings are computed with the GVlad network, labelled with the saved
    UIS-RNN model, grouped into time slices per speaker, remapped through the
    table built by genMap(), and passed to saveAudacity together with the
    path `wav_path + ".txt"`.

    Args:
      wav_path: path of the audio file.
      embedding_per_second: forwarded to load_data; also used to compute the
        time step between consecutive labels.
      overlap_rate: forwarded to load_data; also used to compute the time
        step between consecutive labels.
    """
    # gpu configuration
    # toolkits.initialize_GPU(args)

    params = dict(
        dim=(257, None, 1),
        nfft=512,
        spec_len=250,
        win_length=400,
        hop_length=160,
        n_classes=5994,
        sampling_rate=16000,
        normalize=True,
    )

    # `args` is a module-level configuration object.
    network_eval = spkModel.GVladModel(input_dim=params['dim'],
                                       num_class=params['n_classes'],
                                       mode='eval',
                                       args=args)
    network_eval.load_weights(args.resume, by_name=True)

    model_args, _, inference_args = uisrnn.parse_arguments()
    model_args.observation_dim = 512
    uisrnnModel = uisrnn.UISRNN(model_args)
    uisrnnModel.load(SAVED_MODEL_NAME)

    specs, intervals = load_data(wav_path,
                                 embedding_per_second=embedding_per_second,
                                 overlap_rate=overlap_rate)
    mapTable, keys = genMap(intervals)

    # One embedding per spectrogram; a small constant is added to each.
    feats = []
    for spectrogram in specs:
        batch = np.expand_dims(np.expand_dims(spectrogram, 0), -1)
        embedding = network_eval.predict(batch)
        feats.append(embedding + 0.00001)
    feats = np.array(feats)[:, 0, :].astype(float)  # [splits, embedding dim]
    predicted_label = uisrnnModel.predict(feats, inference_args)

    # speaker embedding every ?ms
    time_spec_rate = (1000 * (1.0 / embedding_per_second)
                      * (1.0 - overlap_rate))
    center_duration = int(1000 * (1.0 / embedding_per_second) // 2)
    speakerSlice = arrangeResult(predicted_label, time_spec_rate)

    # time map to origin wav (contains mute)
    for spk, timeDicts in speakerSlice.items():
        for tid, timeDict in enumerate(timeDicts):
            # 0 doubles as the "not found yet" marker for both boundaries.
            start = 0
            stop = 0
            for pos, key in enumerate(keys):
                if start != 0 and stop != 0:
                    break
                previous_key = keys[pos - 1]
                if start == 0 and key > timeDict['start']:
                    start = (mapTable[previous_key]
                             + (timeDict['start'] - previous_key))
                if stop == 0 and key > timeDict['stop']:
                    stop = (mapTable[previous_key]
                            + (timeDict['stop'] - previous_key))
            speakerSlice[spk][tid]['start'] = start
            speakerSlice[spk][tid]['stop'] = stop

    saveAudacity(wav_path + ".txt", speakerSlice)
def diarization_experiment(model_args, training_args, inference_args):
    """Run the toy diarization experiment end to end.

    Load data --> train model (logging losses to TensorBoard) --> save model
    --> predict the test sequences in a worker pool --> print the result.

    Args:
      model_args: model configurations
      training_args: training configurations
      inference_args: inference configurations
    """
    # ---- data ----
    training_npz = np.load('./data/toy_training_data.npz', allow_pickle=True)
    testing_npz = np.load('./data/toy_testing_data.npz', allow_pickle=True)
    train_sequence = training_npz['train_sequence']
    train_cluster_id = training_npz['train_cluster_id']
    test_sequences = testing_npz['test_sequences'].tolist()
    test_cluster_ids = testing_npz['test_cluster_ids'].tolist()

    # ---- model and tensorboard writer ----
    model = uisrnn.UISRNN(model_args)
    # model.load(SAVED_MODEL_NAME)  # to load a checkpoint
    writer = SummaryWriter()

    # ---- training ----
    for _ in range(training_args.epochs):
        stats = model.fit(train_sequence, train_cluster_id, training_args)
        # every entry of stats pairs a dict of named losses with an iteration
        for losses, iteration in stats:
            for name, value in losses.items():
                writer.add_scalar('loss/' + name, value, iteration)

    # save the model
    model.save(SAVED_MODEL_NAME)

    # ---- testing ----
    predicted_cluster_ids = []
    test_record = []

    # predict sequences in parallel
    model.rnn_model.share_memory()
    pool = mp.Pool(NUM_WORKERS, maxtasksperchild=None)
    predict_one = partial(model.predict, args=inference_args)
    pred_gen = pool.imap(func=predict_one, iterable=test_sequences)

    # collect and score predictions
    for idx, predicted in enumerate(pred_gen):
        truth = test_cluster_ids[idx]
        accuracy = uisrnn.compute_sequence_match_accuracy(truth, predicted)
        predicted_cluster_ids.append(predicted)
        test_record.append((accuracy, len(truth)))
        print('Ground truth labels:')
        print(truth)
        print('Predicted labels:')
        print(predicted)
        print('-' * 80)

    # release the pool and the tensorboard writer
    pool.close()
    writer.close()

    print('Finished diarization experiment')
    summary = uisrnn.output_result(model_args, training_args, test_record)
    print(summary)
def test_four_clusters(self):
    """Four clusters on vertices of a square.

    Trains a model, checks prediction accuracy, checks that a saved and
    re-loaded model predicts equally well, then continues training and
    checks that ``transition_bias`` changes.
    """
    import os  # local import: only needed to build the checkpoint path

    label_to_center = {
        'A': np.array([0.0, 0.0]),
        'B': np.array([0.0, 1.0]),
        'C': np.array([1.0, 0.0]),
        'D': np.array([1.0, 1.0]),
    }

    # generate training data
    train_cluster_id = ['A'] * 400 + ['B'] * 300 + ['C'] * 200 + ['D'] * 100
    random.shuffle(train_cluster_id)
    train_sequence = _generate_random_sequence(
        train_cluster_id, label_to_center, sigma=0.01)

    # generate testing data
    test_cluster_id = ['A'] * 10 + ['B'] * 20 + ['C'] * 30 + ['D'] * 40
    random.shuffle(test_cluster_id)
    test_sequence = _generate_random_sequence(
        test_cluster_id, label_to_center, sigma=0.01)

    # construct model
    model_args, training_args, inference_args = uisrnn.parse_arguments()
    model_args.rnn_depth = 2
    model_args.rnn_hidden_size = 8
    model_args.observation_dim = 2
    model_args.verbosity = 3
    training_args.learning_rate = 0.01
    training_args.learning_rate_half_life = 50
    training_args.train_iteration = 200
    inference_args.test_iteration = 2

    model = uisrnn.UISRNN(model_args)

    # run training
    model.fit(train_sequence, train_cluster_id, training_args)

    # tempfile.mktemp() is deprecated and race-prone, and it never removes
    # the file; a temporary directory gives a private path that is cleaned
    # up when the block exits.
    with tempfile.TemporaryDirectory() as temp_dir:
        temp_file_path = os.path.join(temp_dir, 'model.uisrnn')
        # save the model
        model.save(temp_file_path)

        # run testing
        predicted_label = model.predict(test_sequence, inference_args)

        # run evaluation
        model.logger.print(
            3, 'Asserting the equivalence between'
            '\nGround truth: {}\nPredicted: {}'.format(
                test_cluster_id, predicted_label))
        accuracy = uisrnn.compute_sequence_match_accuracy(
            predicted_label, test_cluster_id)
        self.assertEqual(1.0, accuracy)

        # load new model
        loaded_model = uisrnn.UISRNN(model_args)
        loaded_model.load(temp_file_path)

    # run testing with loaded model
    predicted_label = loaded_model.predict(test_sequence, inference_args)

    # run evaluation with loaded model
    model.logger.print(
        3, 'Asserting the equivalence between'
        '\nGround truth: {}\nPredicted: {}'.format(
            test_cluster_id, predicted_label))
    accuracy = uisrnn.compute_sequence_match_accuracy(
        predicted_label, test_cluster_id)
    self.assertEqual(1.0, accuracy)

    # keep training on a subset of training data
    # NOTE(review): this continues training `model`, not `loaded_model`.
    transition_bias_1 = model.transition_bias
    training_args.learning_rate = 0.001
    training_args.train_iteration = 50
    model.fit(train_sequence[:100, :], train_cluster_id[:100], training_args)
    transition_bias_2 = model.transition_bias
    self.assertNotAlmostEqual(transition_bias_1, transition_bias_2)
    model.logger.print(
        3, 'Asserting transition_bias changed from {} to {}'.format(
            transition_bias_1, transition_bias_2))

    # run evaluation
    # NOTE(review): predicted_label is not recomputed after the extra
    # training, so this repeats the previous check - confirm the intent.
    model.logger.print(
        3, 'Asserting the equivalence between'
        '\nGround truth: {}\nPredicted: {}'.format(
            test_cluster_id, predicted_label))
    accuracy = uisrnn.compute_sequence_match_accuracy(
        predicted_label, test_cluster_id)
    self.assertEqual(1.0, accuracy)
def run_experiment(train_sequence, train_cluster_id, test_sequence,
                   test_cluster_id, model_args, training_args, inference_args,
                   exp_name):
    """Train (or load) a UIS-RNN model, score it on the test set, return accuracy.

    A model stored at ``<training_args.out_dir>/<exp_name>`` is reused unless
    ``training_args.overwrite`` is set or loading fails; otherwise a new
    model is fitted and saved there. The per-sequence test records are
    pickled to ``<out_dir>/<exp_name>_test.pkl``.

    Args:
        train_sequence: Training observations, passed to ``model.fit``.
        train_cluster_id: Training labels, passed to ``model.fit``.
        test_sequence: Iterable of test sequences.
        test_cluster_id: Ground-truth labels, one entry per test sequence.
        model_args: Model configuration.
        training_args: Training configuration; ``debug``, ``out_dir`` and
            ``overwrite`` are read here.
        inference_args: Inference configuration.
        exp_name: Experiment name used for the output file names.

    Returns:
        Unweighted mean of the per-sequence accuracies.
    """
    started_at = datetime.now()
    verbose = training_args.debug
    if verbose:
        print('\n\n===== DEBUG MODE =====\n\n')

    def log_debug(message):
        # Re-reads the flag on every call, like the rest of the debug output.
        if training_args.debug:
            print(message)

    # Create model class
    model = uisrnn.UISRNN(model_args)
    print('{} - Created {} model with {:,} params:'.format(
        datetime.now() - started_at, model.__class__.__name__,
        count_parameters(model.rnn_model)))
    print(model.rnn_model)

    # Training: reuse an existing checkpoint when allowed and loadable.
    model_loc = os.path.join(training_args.out_dir, exp_name)
    needs_training = True
    if not training_args.overwrite and os.path.exists(model_loc):
        try:
            model.load(model_loc)
            print('{} - Loaded trained model from {}'.format(
                datetime.now() - started_at,
                model_loc,
            ))
            needs_training = False
        except Exception as e:
            print('Unable to load model from {}:\n{}'.format(model_loc, e))

    if needs_training:
        model.fit(train_sequence, train_cluster_id, training_args)
        print('{} - Trained model!'.format(datetime.now() - started_at))
        model.save(model_loc)
        print('{} - Saved model to {}'.format(datetime.now() - started_at,
                                              model_loc))

    # Testing
    predicted_cluster_ids = []
    test_record = []
    with torch.no_grad():
        labelled = enumerate(zip(test_sequence, test_cluster_id))
        for _, (test_seq, test_cluster) in tqdm(labelled,
                                                total=len(test_cluster_id)):
            log_debug('Test seq ({}) shape: {}'.format(
                test_seq.__class__.__name__, test_seq.shape))
            log_debug('Test cluster ({}): {}'.format(
                test_cluster.__class__.__name__, test_cluster))

            predicted_cluster_id = model.predict(test_seq, inference_args)
            log_debug('Predicted cluster ID: {}, class {}'.format(
                predicted_cluster_id,
                predicted_cluster_id.__class__.__name__))
            predicted_cluster_ids.append(predicted_cluster_id)

            accuracy = uisrnn.compute_sequence_match_accuracy(
                test_cluster.tolist(), predicted_cluster_id)
            # We are getting accuracy per batch
            test_record.append((accuracy, len(test_cluster)))

            log_debug('Gold labels: {}'.format(list(test_cluster)))
            log_debug('Pred labels: {}'.format(list(predicted_cluster_id)))
            log_debug('-' * 80)

    # Output
    output_string = uisrnn.output_result(model_args, training_args,
                                         test_record)
    print('Finished diarization experiment')
    print(output_string)

    record_path = os.path.join(training_args.out_dir,
                               '{}_test.pkl'.format(exp_name))
    with open(record_path, 'wb') as wf:
        pickle.dump(test_record, wf)

    accuracy_array, _ = zip(*test_record)
    return np.mean(accuracy_array)
def main(wav_path, embedding_per_second=1.0, overlap_rate=0.5):
    """Diarize a wav file, print the segments and optionally dump them as JSON.

    Pipeline: compute one embedding per spectrogram slice with the VGG
    speaker model, label the embeddings with a pre-trained UIS-RNN model,
    turn the labels into per-speaker time slices and map the slice
    boundaries back onto the timeline of the original recording. When
    ``args.out_path`` is set, a ``{speaker: [(start, stop), ...]}`` mapping
    (boundaries divided by 1000) is written there as JSON.

    Uses ``args``, ``SAVED_MODEL_NAME``, ``load_data``, ``genMap``,
    ``arrangeResult`` and ``fmtTime`` from the enclosing module.

    Args:
        wav_path: Path of the recording; forwarded to ``load_data``.
        embedding_per_second: Forwarded to ``load_data``; together with
            ``overlap_rate`` it determines the step between embeddings.
        overlap_rate: Forwarded to ``load_data``.
    """
    # gpu configuration
    toolkits.initialize_GPU(args)

    params = {
        'dim': (257, None, 1),
        'nfft': 512,
        'spec_len': 250,
        'win_length': 400,
        'hop_length': 160,
        'n_classes': 5994,
        'sampling_rate': 16000,
        'normalize': True,
    }

    network_eval = spkModel.vggvox_resnet2d_icassp(
        input_dim=params['dim'],
        num_class=params['n_classes'],
        mode='eval',
        args=args)
    network_eval.load_weights(args.resume, by_name=True)

    model_args, _, inference_args = uisrnn.parse_arguments()
    model_args.observation_dim = 512
    uisrnnModel = uisrnn.UISRNN(model_args)
    uisrnnModel.load(SAVED_MODEL_NAME)

    specs, intervals = load_data(wav_path,
                                 embedding_per_second=embedding_per_second,
                                 overlap_rate=overlap_rate)
    mapTable, keys = genMap(intervals)

    feats = []
    for spec in specs:
        spec = np.expand_dims(np.expand_dims(spec, 0), -1)
        v = network_eval.predict(spec)
        feats += [v]

    feats = np.array(feats)[:, 0, :].astype(float)  # [splits, embedding dim]
    predicted_label = uisrnnModel.predict(feats, inference_args)

    # speaker embedding every ?ms
    time_spec_rate = 1000 * (1.0 / embedding_per_second) * (1.0 - overlap_rate)
    speakerSlice = arrangeResult(predicted_label, time_spec_rate)

    def _to_original_time(t):
        """Map ``t`` back to the original wav timeline (which contains mute)."""
        for i, key in enumerate(keys):
            if key > t:
                prev_key = keys[i - 1]
                return mapTable[prev_key] + (t - prev_key)
        # No split point lies beyond t: keep the historical fallback of 0.
        return 0

    # The previous implementation used 0 as its "not found yet" marker, so a
    # boundary that legitimately mapped to 0 was recomputed against the next
    # split point. Resolving each boundary independently avoids that.
    for spk, timeDicts in speakerSlice.items():
        for tid, timeDict in enumerate(timeDicts):
            s = _to_original_time(timeDict['start'])
            e = _to_original_time(timeDict['stop'])
            speakerSlice[spk][tid]['start'] = s
            speakerSlice[spk][tid]['stop'] = e

    for_json = {}
    for spk, timeDicts in speakerSlice.items():
        print('========= ' + str(spk) + ' =========')
        for_json[str(spk)] = []
        for timeDict in timeDicts:
            s = timeDict['start']
            e = timeDict['stop']
            for_json[str(spk)] += [(s / 1000, e / 1000)]
            s = fmtTime(s)  # change point moves to the center of the slice
            e = fmtTime(e)
            print(s + ' ==> ' + e)

    if args.out_path:
        print('about to try', for_json)
        with open(args.out_path, "w+", encoding='utf-8') as f:
            f.write(json.dumps(for_json))
def main(wav_path,
         embedding_per_second=1.0,
         overlap_rate=0.5,
         exportFile=None,
         expectedSpeakers=2):
    """Diarize a wav file and export the longest speakers as separate wavs.

    Pipeline: compute one embedding per spectrogram slice with the VGG
    speaker model, label the embeddings with a pre-trained UIS-RNN model,
    turn the labels into per-speaker time slices and map the slice
    boundaries back onto the timeline of the original recording. Every
    slice is passed to ``diarization_try``; afterwards the module-level
    ``speaker_final`` list is sorted by ``duration_seconds`` and its first
    ``expectedSpeakers`` entries are exported as
    ``<exportFile>_speaker<i>.wav``.

    Uses ``args``, ``SAVED_MODEL_NAME``, ``pdb``, ``load_data``, ``genMap``,
    ``arrangeResult``, ``diarization_try`` and ``fmtTime`` from the
    enclosing module. The global ``speaker_final`` is created here and
    deleted again before returning.

    Args:
        wav_path: Path of the recording; forwarded to ``load_data``.
        embedding_per_second: Forwarded to ``load_data``; together with
            ``overlap_rate`` it determines the step between embeddings.
        overlap_rate: Forwarded to ``load_data``.
        exportFile: Prefix of the exported files. When ``None`` the prefix
            is ``wav_path`` without its extension.
        expectedSpeakers: Number of speakers to export.
    """
    import os  # local import: only needed for the default export prefix

    global speaker_final

    # gpu configuration
    toolkits.initialize_GPU(args)

    params = {
        'dim': (257, None, 1),
        'nfft': 512,
        'spec_len': 250,
        'win_length': 400,
        'hop_length': 160,
        'n_classes': 5994,
        'sampling_rate': 16000,
        'normalize': True,
    }

    network_eval = spkModel.vggvox_resnet2d_icassp(
        input_dim=params['dim'],
        num_class=params['n_classes'],
        mode='eval',
        args=args)
    network_eval.load_weights(args.resume, by_name=True)

    model_args, _, inference_args = uisrnn.parse_arguments()
    model_args.observation_dim = 512
    uisrnnModel = uisrnn.UISRNN(model_args)
    uisrnnModel.load(SAVED_MODEL_NAME)

    specs, intervals = load_data(wav_path,
                                 embedding_per_second=embedding_per_second,
                                 overlap_rate=overlap_rate)
    mapTable, keys = genMap(intervals)

    feats = []
    for spec in specs:
        spec = np.expand_dims(np.expand_dims(spec, 0), -1)
        v = network_eval.predict(spec)
        feats += [v]

    feats = np.array(feats)[:, 0, :].astype(float)  # [splits, embedding dim]
    predicted_label = uisrnnModel.predict(feats, inference_args)

    # speaker embedding every ?ms
    time_spec_rate = 1000 * (1.0 / embedding_per_second) * (1.0 - overlap_rate)
    speakerSlice = arrangeResult(predicted_label, time_spec_rate)

    def _to_original_time(t):
        """Map ``t`` back to the original wav timeline (which contains mute)."""
        for i, key in enumerate(keys):
            if key > t:
                prev_key = keys[i - 1]
                return mapTable[prev_key] + (t - prev_key)
        # No split point lies beyond t: keep the historical fallback of 0.
        return 0

    # The previous implementation used 0 as its "not found yet" marker, so a
    # boundary that legitimately mapped to 0 was recomputed against the next
    # split point. Resolving each boundary independently avoids that.
    for spk, timeDicts in speakerSlice.items():
        for tid, timeDict in enumerate(timeDicts):
            s = _to_original_time(timeDict['start'])
            e = _to_original_time(timeDict['stop'])
            speakerSlice[spk][tid]['start'] = s
            speakerSlice[spk][tid]['stop'] = e

    n_speakers = len(speakerSlice)
    print('N-SPeakers:', n_speakers)
    speaker_final = [pdb.empty()] * n_speakers

    for spk, timeDicts in speakerSlice.items():
        print('========= ' + str(spk) + ' =========')
        for timeDict in timeDicts:
            s = timeDict['start']
            e = timeDict['stop']
            diarization_try(wav_path, s / 1000, e / 1000, spk)
            s = fmtTime(s)  # change point moves to the center of the slice
            e = fmtTime(e)
            print(s + ' ==> ' + e)

    # Find the Top n Speakers
    speaker_final.sort(key=lambda speaker: speaker.duration_seconds,
                       reverse=True)
    speaker_final = speaker_final[0:expectedSpeakers]

    # Export the Files. The default exportFile=None used to crash on string
    # concatenation; fall back to the input path without its extension.
    if exportFile is None:
        exportFile = os.path.splitext(wav_path)[0]
    for itr, speaker in enumerate(speaker_final):
        write_path = exportFile + "_speaker" + str(itr) + ".wav"
        speaker.export(write_path, format="wav")

    del speaker_final
def diarization_experiment(model_args, training_args, inference_args):
    """Evaluate a saved UIS-RNN checkpoint on a slice of the test data.

    Load data --> load checkpoint (training is skipped) --> test model
    --> output result

    Args:
      model_args: model configurations
      training_args: training configurations
      inference_args: inference configurations
    """
    predicted_cluster_ids = []
    test_record = []

    # The training arrays are still loaded although training is skipped
    # below in favour of the saved checkpoint.
    orig_train_sequences = np.load('data/train_sequence.npy').astype(
        np.float64)
    orig_train_cluster_ids = np.array(np.load('data/train_cluster_id.npy'))

    orig_test_sequences = np.load('data/test_sequence.npy').astype(np.float64)
    orig_test_cluster_ids = np.array(np.load('data/test_cluster_id.npy'))
    print(orig_test_sequences.shape)
    print(orig_test_cluster_ids.shape)

    # Keep only the first 1/100 of the test data.
    sequence_limit = orig_test_sequences.shape[0] // 100
    orig_test_sequences = orig_test_sequences[:sequence_limit]
    label_limit = orig_test_cluster_ids.shape[0] // 100
    orig_test_cluster_ids = orig_test_cluster_ids[:label_limit]
    print(orig_test_sequences.shape)
    print(orig_test_cluster_ids.shape)

    # Cut the data into equally sized sections; the remainder that does not
    # fill a whole section is dropped.
    total_len = orig_test_sequences.shape[0]
    num_sections = total_len // 86
    usable_len = total_len - total_len % num_sections
    test_sequences = np.split(orig_test_sequences[:usable_len], num_sections)
    test_cluster_ids = np.split(orig_test_cluster_ids[:usable_len],
                                num_sections)

    model = uisrnn.UISRNN(model_args)
    # skip training and load the saved checkpoint instead
    model.load(SAVED_MODEL_NAME)

    # testing
    for test_sequence, section_labels in zip(test_sequences,
                                             test_cluster_ids):
        test_cluster_id = section_labels.tolist()
        predicted_cluster_id = model.predict(test_sequence, inference_args)
        predicted_cluster_ids.append(predicted_cluster_id)

        accuracy = uisrnn.compute_sequence_match_accuracy(
            test_cluster_id, predicted_cluster_id)
        test_record.append((accuracy, len(test_cluster_id)))

        print('Ground truth labels:')
        print(test_cluster_id)
        print('Predicted labels:')
        print(predicted_cluster_id)
        print('-' * 80)

    output_string = uisrnn.output_result(model_args, training_args,
                                         test_record)
    print('Finished diarization experiment')
    print(output_string)
def main(wav_path, embedding_per_second=1.0, overlap_rate=0.5):
    """Diarize a wav file and transcribe every speaker segment.

    Pipeline: compute one embedding per spectrogram slice with the VGG
    speaker model, label the embeddings with a pre-trained UIS-RNN model,
    turn the labels into per-speaker time slices and map the slice
    boundaries back onto the timeline of the original recording. Each
    segment is then cut from the recording, exported as
    ``speaker<spk>-<start>-<stop>.wav`` in the working directory and sent
    to ``recognize_google``, falling back to ``recognize_sphinx`` and
    finally to an empty string when the speech is not understood.

    Uses ``args``, ``SAVED_MODEL_NAME``, ``speech_reg``, ``AudioSegment``,
    ``load_data``, ``genMap``, ``arrangeResult`` and ``fmtTime`` from the
    enclosing module.

    Args:
        wav_path: Path of the recording; forwarded to ``load_data`` and
            ``AudioSegment.from_wav``.
        embedding_per_second: Forwarded to ``load_data``; together with
            ``overlap_rate`` it determines the step between embeddings.
        overlap_rate: Forwarded to ``load_data``.

    Returns:
        The speaker-slice mapping; every slice dict carries ``start``,
        ``stop`` and the recognised text under ``content``.
    """
    # gpu configuration
    toolkits.initialize_GPU(args)

    params = {
        'dim': (257, None, 1),
        'nfft': 512,
        'spec_len': 250,
        'win_length': 400,
        'hop_length': 160,
        'n_classes': 5994,
        'sampling_rate': 16000,
        'normalize': True,
    }

    network_eval = spkModel.vggvox_resnet2d_icassp(
        input_dim=params['dim'],
        num_class=params['n_classes'],
        mode='eval',
        args=args)
    network_eval.load_weights(args.resume, by_name=True)

    model_args, _, inference_args = uisrnn.parse_arguments()
    model_args.observation_dim = 512
    uisrnnModel = uisrnn.UISRNN(model_args)
    uisrnnModel.load(SAVED_MODEL_NAME)

    specs, intervals = load_data(wav_path,
                                 embedding_per_second=embedding_per_second,
                                 overlap_rate=overlap_rate)
    mapTable, keys = genMap(intervals)

    feats = []
    for spec in specs:
        spec = np.expand_dims(np.expand_dims(spec, 0), -1)
        v = network_eval.predict(spec)
        feats += [v]

    feats = np.array(feats)[:, 0, :].astype(float)  # [splits, embedding dim]
    predicted_label = uisrnnModel.predict(feats, inference_args)

    # speaker embedding every ?ms
    time_spec_rate = 1000 * (1.0 / embedding_per_second) * (1.0 - overlap_rate)
    speakerSlice = arrangeResult(predicted_label, time_spec_rate)

    def _to_original_time(t):
        """Map ``t`` back to the original wav timeline (which contains mute)."""
        for i, key in enumerate(keys):
            if key > t:
                prev_key = keys[i - 1]
                return mapTable[prev_key] + (t - prev_key)
        # No split point lies beyond t: keep the historical fallback of 0.
        return 0

    # The previous implementation used 0 as its "not found yet" marker, so a
    # boundary that legitimately mapped to 0 was recomputed against the next
    # split point. Resolving each boundary independently avoids that.
    for spk, timeDicts in speakerSlice.items():
        for tid, timeDict in enumerate(timeDicts):
            s = _to_original_time(timeDict['start'])
            e = _to_original_time(timeDict['stop'])
            speakerSlice[spk][tid]['start'] = s
            speakerSlice[spk][tid]['stop'] = e

    speech_r = speech_reg.Recognizer()
    sound = AudioSegment.from_wav(wav_path)
    for spk in speakerSlice.keys():
        print('========= ' + str(spk) + ' =========')
        for item_dict in speakerSlice[spk]:
            audio_seg = sound[item_dict['start']:item_dict['stop']]
            s = item_dict['start']
            e = item_dict['stop']
            s = fmtTime(s)  # change point moves to the center of the slice
            e = fmtTime(e)
            print(s + ' ==> ' + e)

            # Placeholder: stays in place if recognition raises an error
            # that is not handled below.
            item_dict.update({'content': audio_seg})

            filename = 'speaker' + str(spk) + '-' + str(
                item_dict['start'] / 1000) + '-' + str(
                    item_dict['stop'] / 1000) + '.wav'
            audio_seg.export(filename, format="wav")
            audio = speech_reg.AudioFile(filename)
            with audio as source:
                words = speech_r.record(source)

            try:
                res = speech_r.recognize_google(words)
            except speech_reg.UnknownValueError:
                try:
                    res = speech_r.recognize_sphinx(words)
                except speech_reg.UnknownValueError:
                    res = ''
            item_dict.update({'content': res})
            print(res)

    return speakerSlice
def main(wav_path,
         embedding_per_second=1.0,
         n_classes=5994,
         overlap_rate=0.5,
         plot_results=True):
    """Diarize a wav file and return embeddings, labels and segments.

    Pipeline: compute one embedding per spectrogram slice with the VGG
    speaker model, label the embeddings with a pre-trained UIS-RNN model,
    turn the labels into per-speaker time slices and map the slice
    boundaries back onto the timeline of the original recording.
    Intermediate values are printed for debugging.

    Uses ``args``, ``model_args``, ``inference_args``, ``SAVED_MODEL_NAME``,
    ``load_data``, ``genMap``, ``arrangeResult``, ``fmtTime`` and
    ``PlotDiar`` from the enclosing module; ``model_args.observation_dim``
    is overwritten with 512.

    Args:
        wav_path: Path of the recording; forwarded to ``load_data``.
        embedding_per_second: Forwarded to ``load_data``; together with
            ``overlap_rate`` it determines the step between embeddings.
        n_classes: Passed as ``num_class`` when building the speaker
            network.
        overlap_rate: Forwarded to ``load_data``.
        plot_results: When true, show the result with ``PlotDiar``.

    Returns:
        Tuple ``(feats, predicted_label, intervals, speaker_assingments,
        time_spec_rate)`` where ``speaker_assingments`` is a list of
        ``(start, end, speaker, wav_path)`` with formatted times.
    """
    # gpu configuration
    toolkits.initialize_GPU(args)

    params = {
        'dim': (257, None, 1),
        'nfft': 512,
        'spec_len': 250,
        'win_length': 400,
        'hop_length': 160,
        # Previously hard-coded to 5994, which silently ignored the
        # n_classes argument.
        'n_classes': n_classes,
        'sampling_rate': 16000,
        'normalize': True,
    }

    network_eval = spkModel.vggvox_resnet2d_icassp(
        input_dim=params['dim'],
        num_class=params['n_classes'],
        mode='eval',
        args=args)
    network_eval.load_weights(args.resume, by_name=True)

    # model_args, _, inference_args = uisrnn.parse_arguments()
    model_args.observation_dim = 512
    uisrnnModel = uisrnn.UISRNN(model_args)
    uisrnnModel.load(SAVED_MODEL_NAME)

    specs, intervals = load_data(wav_path,
                                 embedding_per_second=embedding_per_second,
                                 overlap_rate=overlap_rate)
    mapTable, keys = genMap(intervals)
    print('intervals', intervals, len(intervals))
    print('mapTable', mapTable, len(mapTable))
    print('keys', keys, len(keys))

    feats = []
    for spec in specs:
        spec = np.expand_dims(np.expand_dims(spec, 0), -1)
        v = network_eval.predict(spec)
        feats += [v]

    feats = np.array(feats)[:, 0, :].astype(float)  # [splits, embedding dim]
    predicted_label = uisrnnModel.predict(feats, inference_args)
    print(feats.shape)
    print(inference_args)
    print('predicted_label', predicted_label)

    # speaker embedding every ?ms
    time_spec_rate = 1000 * (1.0 / embedding_per_second) * (1.0 - overlap_rate)
    print('time_spec_rate', time_spec_rate)
    speakerSlice = arrangeResult(predicted_label, time_spec_rate)
    print('speakerSlice', speakerSlice)

    # time map to origin wav (contains mute)
    for spk, timeDicts in speakerSlice.items():
        print(spk, timeDicts)
        for tid, timeDict in enumerate(timeDicts):
            print(tid, timeDict)
            s = 0
            e = 0
            # Explicit flags instead of testing s/e against 0: a boundary
            # that legitimately maps to 0 used to be recomputed against the
            # next split point.
            s_found = False
            e_found = False
            for i, key in enumerate(keys):
                if s_found and e_found:
                    break
                if not s_found and key > timeDict['start']:
                    offset = timeDict['start'] - keys[i - 1]
                    print('offset', offset)
                    s = mapTable[keys[i - 1]] + offset
                    s_found = True
                if not e_found and key > timeDict['stop']:
                    offset = timeDict['stop'] - keys[i - 1]
                    e = mapTable[keys[i - 1]] + offset
                    e_found = True
                print('i,s,e')
                print(i, s, e, tid, spk)
            print('>>>>>', i, s, e, tid, spk)
            speakerSlice[spk][tid]['start'] = s
            speakerSlice[spk][tid]['stop'] = e

    speaker_assingments = []
    for spk, timeDicts in speakerSlice.items():
        speaker = str(spk)
        print('========= ' + str(spk) + ' =========')
        for timeDict in timeDicts:
            start = timeDict['start']
            end = timeDict['stop']
            start = fmtTime(
                start)  # change point moves to the center of the slice
            end = fmtTime(end)
            print(start + ' ==> ' + end)
            speaker_assingments.append((start, end, speaker, wav_path))

    if plot_results:
        p = PlotDiar(map=speakerSlice, wav=wav_path, gui=True, size=(25, 6))
        p.draw()
        p.plot.show()

    return feats, predicted_label, intervals, speaker_assingments, time_spec_rate
def dia_audio(wav_path, embedding_per_second=0.3, overlap_rate=0.33):
    """Diarize a wav file and return the per-speaker time slices.

    Pipeline: compute one embedding per spectrogram slice with the VGG
    speaker model, label the embeddings with a pre-trained UIS-RNN model,
    turn the labels into per-speaker time slices and map the slice
    boundaries back onto the timeline of the original recording.

    Uses ``args``, ``model_args``, ``inference_args``, ``SAVED_MODEL_NAME``,
    ``load_data``, ``genMap`` and ``arrangeResult`` from the enclosing
    module; ``model_args.observation_dim`` is overwritten with 512.

    Args:
        wav_path: Path of the recording; forwarded to ``load_data``.
        embedding_per_second: Forwarded to ``load_data``; together with
            ``overlap_rate`` it determines the step between embeddings.
        overlap_rate: Forwarded to ``load_data``.

    Returns:
        The mapping produced by ``arrangeResult`` with the ``start`` and
        ``stop`` of every slice replaced by the remapped times.
    """
    # gpu configuration (disabled in this variant)
    # toolkits.initialize_GPU(args)

    params = {
        'dim': (257, None, 1),
        'nfft': 512,
        'spec_len': 250,
        'win_length': 400,
        'hop_length': 160,
        'n_classes': 5994,
        'sampling_rate': 16000,
        'normalize': True,
    }

    network_eval = spkModel.vggvox_resnet2d_icassp(
        input_dim=params['dim'],
        num_class=params['n_classes'],
        mode='eval',
        args=args)
    network_eval.load_weights(args.resume, by_name=True)

    model_args.observation_dim = 512
    uisrnnModel = uisrnn.UISRNN(model_args)
    uisrnnModel.load(SAVED_MODEL_NAME)

    specs, intervals = load_data(wav_path,
                                 embedding_per_second=embedding_per_second,
                                 overlap_rate=overlap_rate)
    mapTable, keys = genMap(intervals)

    feats = []
    for spec in specs:
        spec = np.expand_dims(np.expand_dims(spec, 0), -1)
        v = network_eval.predict(spec)
        feats += [v]

    feats = np.array(feats)[:, 0, :].astype(float)  # [splits, embedding dim]
    predicted_label = uisrnnModel.predict(feats, inference_args)

    # speaker embedding every ?ms
    time_spec_rate = 1000 * (1.0 / embedding_per_second) * (1.0 - overlap_rate)
    speakerSlice = arrangeResult(predicted_label, time_spec_rate)

    def _to_original_time(t):
        """Map ``t`` back to the original wav timeline (which contains mute)."""
        for i, key in enumerate(keys):
            if key > t:
                prev_key = keys[i - 1]
                return mapTable[prev_key] + (t - prev_key)
        # No split point lies beyond t: keep the historical fallback of 0.
        return 0

    # The previous implementation used 0 as its "not found yet" marker, so a
    # boundary that legitimately mapped to 0 was recomputed against the next
    # split point. Resolving each boundary independently avoids that.
    for spk, timeDicts in speakerSlice.items():
        for tid, timeDict in enumerate(timeDicts):
            s = _to_original_time(timeDict['start'])
            e = _to_original_time(timeDict['stop'])
            speakerSlice[spk][tid]['start'] = s
            speakerSlice[spk][tid]['stop'] = e

    # The former reporting loop only formatted each boundary and discarded
    # the result (its prints were commented out), so it has been dropped.
    return speakerSlice
def main(wav_path,
         embedding_per_second=1.0,
         overlap_rate=0.5,
         retain_audio_clip=False):
    """Diarize a wav file, transcribe the segments and plot the result.

    Pipeline: compute one embedding per spectrogram slice with the VGG
    speaker model, label the embeddings with a pre-trained UIS-RNN model,
    turn the labels into per-speaker time slices and map the slice
    boundaries back onto the timeline of the original recording. Every
    slice is handed to ``get_transcript``; the entries returned by
    ``print_transcipt`` are printed and appended to
    ``<dir_name>/FinalTranscript.txt``.

    Uses ``args``, ``SAVED_MODEL_NAME``, ``dir_name``, ``load_data``,
    ``genMap``, ``arrangeResult``, ``get_transcript``, ``print_transcipt``,
    ``fmtTime`` and ``PlotDiar`` from the enclosing module.

    Args:
        wav_path: Path of the recording; forwarded to ``load_data``.
        embedding_per_second: Forwarded to ``load_data``; together with
            ``overlap_rate`` it determines the step between embeddings.
        overlap_rate: Forwarded to ``load_data``.
        retain_audio_clip: When false, ``dir_name`` is removed with
            ``shutil.rmtree`` after the transcript has been written.

    Returns:
        The value returned by ``print_transcipt``.
    """
    # gpu configuration
    toolkits.initialize_GPU(args)

    params = {
        'dim': (257, None, 1),
        'nfft': 512,
        'spec_len': 250,
        'win_length': 400,
        'hop_length': 160,
        'n_classes': 5994,
        'sampling_rate': 16000,
        'normalize': True,
    }

    network_eval = spkModel.vggvox_resnet2d_icassp(
        input_dim=params['dim'],
        num_class=params['n_classes'],
        mode='eval',
        args=args)
    network_eval.load_weights(args.resume, by_name=True)

    model_args, _, inference_args = uisrnn.parse_arguments()
    model_args.observation_dim = 512
    uisrnnModel = uisrnn.UISRNN(model_args)
    uisrnnModel.load(SAVED_MODEL_NAME)

    specs, intervals = load_data(wav_path,
                                 embedding_per_second=embedding_per_second,
                                 overlap_rate=overlap_rate)
    mapTable, keys = genMap(intervals)

    feats = []
    for spec in specs:
        spec = np.expand_dims(np.expand_dims(spec, 0), -1)
        v = network_eval.predict(spec)
        feats += [v]

    feats = np.array(feats)[:, 0, :].astype(float)  # [splits, embedding dim]
    predicted_label = uisrnnModel.predict(feats, inference_args)

    # speaker embedding every ?ms
    time_spec_rate = 1000 * (1.0 / embedding_per_second) * (1.0 - overlap_rate)
    speakerSlice = arrangeResult(predicted_label, time_spec_rate)

    def _to_original_time(t):
        """Map ``t`` back to the original wav timeline (which contains mute)."""
        for i, key in enumerate(keys):
            if key > t:
                prev_key = keys[i - 1]
                return mapTable[prev_key] + (t - prev_key)
        # No split point lies beyond t: keep the historical fallback of 0.
        return 0

    # The previous implementation used 0 as its "not found yet" marker, so a
    # boundary that legitimately mapped to 0 was recomputed against the next
    # split point. Resolving each boundary independently avoids that.
    for spk, timeDicts in speakerSlice.items():
        for tid, timeDict in enumerate(timeDicts):
            s = _to_original_time(timeDict['start'])
            e = _to_original_time(timeDict['stop'])
            speakerSlice[spk][tid]['start'] = s
            speakerSlice[spk][tid]['stop'] = e

    for spk, timeDicts in speakerSlice.items():
        for timeDict in timeDicts:
            s = timeDict['start']
            e = timeDict['stop']
            get_transcript(str(spk), s, e)

    result = print_transcipt()

    # Open the transcript once and let the context manager close it. The old
    # code re-opened the file for every item without closing it, and its
    # `finally: file.close()` raised UnboundLocalError when `result` was
    # empty or the first open() failed.
    transcript_path = os.path.join(dir_name, 'FinalTranscript.txt')
    try:
        with open(transcript_path, 'a') as file:
            for item in result:
                start = fmtTime(item[1])
                end = fmtTime(item[2])
                transcription = f"{start} ==> {end}: [Speaker : {item[0]}] : {item[3]}"
                print(transcription)
                file.write(transcription)
    except Exception as exp:
        # Best effort: a failed transcript write must not abort the run.
        print(f"Failed in main() while writing to file with exception {exp}")

    if not retain_audio_clip:
        shutil.rmtree(dir_name)
    else:
        print(
            f'Audio files of transcriptions can be found in {dir_name} folder')

    p = PlotDiar(map=speakerSlice, wav=wav_path, gui=True, size=(25, 6))
    p.draw()
    p.plot.show()

    return result