def speakerDiarizationWrapper(inputFile, numSpeakers, useLDA):
    # lda_dim=0 disables pyAudioAnalysis's LDA projection step;
    # omitting it keeps the library default
    if useLDA:
        aS.speaker_diarization(inputFile, numSpeakers, plot_res=True)
    else:
        aS.speaker_diarization(inputFile, numSpeakers, lda_dim=0,
                               plot_res=True)
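# A minimal usage sketch for the wrapper above (assumptions: aS is the usual
# pyAudioAnalysis import, and "interview.wav" is a hypothetical input file;
# passing 0 as the speaker count would ask pyAudioAnalysis to estimate it):
from pyAudioAnalysis import audioSegmentation as aS

speakerDiarizationWrapper("interview.wav", numSpeakers=2, useLDA=True)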
def diarize_sentences(self):
    if self.num_of_speakers != 1:
        from pyAudioAnalysis.audioSegmentation import speaker_diarization

        input_file = os.path.join(
            self.full_path,
            (DEFAULT_STT_INPUT_FILENAME + DEFAULT_STT_INPUT_FORMAT))
        output_file = os.path.join(
            REPO_PATH, PY_AUDIO_ANALYSIS_DATA_DIRECTORY,
            (DEFAULT_STT_INPUT_FILENAME + ".wav"))
        command = "ffmpeg -i " + input_file + " " + output_file
        os.system(command)
        input("\a\nFFmpeg audio conversion for diarization complete. "
              "Press <enter> to continue")

        diarized_speakers = speaker_diarization(output_file,
                                                self.num_of_speakers)
        input("\a\nDiarization complete. Press <enter> to continue")

        # list of speaker labels, e.g. [0,0,0,0,0,1,1,1,1,1,0,0,0];
        # .tolist() returns a new list rather than converting the ndarray
        # in place, so the result must be assigned back
        diarized_speakers = diarized_speakers.tolist()

        # the number of speakers in the audio/video (this could have been
        # determined dynamically by the diarizer)
        self.num_of_speakers = int(max(diarized_speakers) + 1)

        # sometimes the clustering does not label the original speaker as 0;
        # correct that by shifting every label down by one (zeros wrap
        # around to the highest label)
        if int(diarized_speakers[0]) != 0:
            for i in range(len(diarized_speakers)):
                if diarized_speakers[i] > 0:
                    diarized_speakers[i] -= 1
                elif diarized_speakers[i] == 0:
                    diarized_speakers[i] = self.num_of_speakers - 1

        # parallel list to diarized_speakers: the mid-point time of each
        # label (pyAudioAnalysis's default mid-term step is 0.2 s)
        speaker_timing = [
            round((i * 0.2 + 0.1), 1) for i in range(len(diarized_speakers))
        ]

        print(f"\a\n{self.num_of_speakers} unique speakers detected:")
        for i in range(self.num_of_speakers):
            self.speaker_genders[i + 1] = input(
                f"\tSpeaker #{i + 1} gender (male=1, female=2, unknown=3): ")

        for sentence in self.sentences:
            # find the sub-list of speaker labels that fall inside the
            # sentence's time interval
            start = sentence.start_time
            end = sentence.end_time
            valid_times = [
                time for time in speaker_timing if start <= time <= end
            ]
            valid_speakers = [
                diarized_speakers[speaker_timing.index(time)]
                for time in valid_times
            ]
            # set the sentence's speaker and gender from the most prevalent
            # speaker in the interval
            sentence.speaker = int(
                max(set(valid_speakers), key=valid_speakers.count)) + 1
            sentence.gender = int(self.speaker_genders[sentence.speaker])
def pipeline(args):
    start_time = time.time()
    args.filename, to_delete = convert(args.filename)

    t = time.time()
    print("diarization...")
    diarization = np.array(
        speaker_diarization(args.filename,
                            n_speakers=args.n_speakers,
                            mid_step=args.chunk_size,
                            short_window=args.short_window,
                            mid_window=args.mid_window,
                            lda_dim=args.lda_dim)).astype("int")
    diarization = make_diarization_chronological(diarization)
    print("Done!")
    print("Took {} seconds".format(int(time.time() - t)))
    print("Number of chunks: {}".format(len(diarization)))
    print("Found {} speakers".format(len(np.unique(diarization))))

    print("building segments...")
    segment_list, speaker_list = sound_to_segments(diarization,
                                                   args.chunk_size)

    if args.show_speakers:
        print("showing speakers...")
        show_speakers(args.filename, segment_list, speaker_list)
        print("Choose a speed for each speaker (separated by spaces):")
        args.speeds = np.array(input().split(" ")).astype(float)

    if args.auto:
        print("Automatically finding speaker speeds...")
        speeds = find_speaker_speeds(args.filename, segment_list,
                                     speaker_list, max_length=60,
                                     min_length=350)
        print("Speaker speeds (syllables / minute):")
        print(list(60 * np.array(speeds)))  # convert to syllables / minute
        args.speeds = [max(speeds) / speed for speed in speeds]
        print("Going to speed up the speakers by:")
        print(args.speeds)

    print("speeding up...")
    speed_up(segment_list, speaker_list, args.speeds, args.filename,
             output_file="temp_folder/audio_speedup.wav")
    print("adding intro...")
    add_intro("temp_folder/audio_speedup.wav", args.save_file)

    if to_delete:
        os.remove(args.filename)
    print("Done in {} seconds! Saved the result to {}".format(
        int(time.time() - start_time), args.save_file))
    return args.speeds
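# make_diarization_chronological() is this pipeline's own helper and its
# source is not shown above. A plausible minimal sketch (an assumption, not
# the project's actual implementation): remap cluster labels to the order in
# which each speaker first appears, so label 0 is always whoever speaks first.
def make_diarization_chronological(diarization):
    mapping = {}
    for label in diarization:
        if label not in mapping:
            mapping[label] = len(mapping)  # next unused chronological label
    return np.array([mapping[label] for label in diarization])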
# note: time.process_time() counts CPU time only; for a wall-clock 60-second
# polling window, time.monotonic() would be the usual choice
while (time.process_time() - time_start < 60):
    print("Checking...")
    try:
        message = pop_message(client, queue_url)
        print(message)
        if message == "Listening":
            # record `seconds` of mono audio at `fs` Hz
            myrecording = sd.rec(int(seconds * fs), samplerate=fs,
                                 channels=1)
            sd.wait()  # wait until recording is finished
            write('output.wav', fs, myrecording)  # save as WAV file
            mood = aT.file_regression("output.wav", [
                "data/svmSpeechEmotion_valence",
                "data/svmSpeechEmotion_valenceMEANS",
                "data/svmSpeechEmotion_arousal",
                "data/svmSpeechEmotion_arousalMEANS"
            ], "svm")
            # n_speakers=0 lets pyAudioAnalysis estimate the speaker count
            people = audioSegmentation.speaker_diarization(
                "output.wav", 0, mid_window=2.0, mid_step=0.2,
                short_window=0.05, lda_dim=35, plot_res=False)
            result = GenrePicker(mood[0][0], mood[0][1], people)
            print(result)
        elif message == "Responding":
            print(result)
            post_message(client, result, queue_url)
    except Exception:
        # swallow transient queue/audio errors and keep polling
        # (a bare "except:" would also hide KeyboardInterrupt)
        pass
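# pop_message() and post_message() are this script's own helpers and are not
# shown above. A minimal sketch under the assumption that `client` is a boto3
# SQS client and `queue_url` an SQS queue URL (hypothetical, for illustration):
def pop_message(client, queue_url):
    response = client.receive_message(QueueUrl=queue_url,
                                      MaxNumberOfMessages=1)
    messages = response.get("Messages", [])
    if not messages:
        return None
    # delete the message so it is not delivered again
    client.delete_message(QueueUrl=queue_url,
                          ReceiptHandle=messages[0]["ReceiptHandle"])
    return messages[0]["Body"]

def post_message(client, body, queue_url):
    client.send_message(QueueUrl=queue_url, MessageBody=str(body))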
def main(argv):
    if argv[1] == "-shortTerm":
        for i in range(nExp):
            [Fs, x] = audioBasicIO.read_audio_file("diarizationExample.wav")
            duration = x.shape[0] / float(Fs)
            t1 = time.time()
            F = MidTermFeatures.short_term_feature_extraction(
                x, Fs, 0.050 * Fs, 0.050 * Fs)
            t2 = time.time()
            perTime1 = duration / (t2 - t1)
            print("Short-term feature extraction: "
                  "{0:.1f} x realtime".format(perTime1))
    elif argv[1] == "-classifyFile":
        for i in range(nExp):
            [Fs, x] = audioBasicIO.read_audio_file("diarizationExample.wav")
            duration = x.shape[0] / float(Fs)
            t1 = time.time()
            aT.file_classification("diarizationExample.wav", "svmSM", "svm")
            t2 = time.time()
            perTime1 = duration / (t2 - t1)
            print("Mid-term feature extraction + classification \t "
                  "{0:.1f} x realtime".format(perTime1))
    elif argv[1] == "-mtClassify":
        for i in range(nExp):
            [Fs, x] = audioBasicIO.read_audio_file("diarizationExample.wav")
            duration = x.shape[0] / float(Fs)
            t1 = time.time()
            [flagsInd, classesAll, acc] = aS.mid_term_file_classification(
                "diarizationExample.wav", "svmSM", "svm", False, '')
            t2 = time.time()
            perTime1 = duration / (t2 - t1)
            print("Fixed-size classification - segmentation \t "
                  "{0:.1f} x realtime".format(perTime1))
    elif argv[1] == "-hmmSegmentation":
        for i in range(nExp):
            [Fs, x] = audioBasicIO.read_audio_file("diarizationExample.wav")
            duration = x.shape[0] / float(Fs)
            t1 = time.time()
            aS.hmm_segmentation('diarizationExample.wav', 'hmmRadioSM',
                                False, '')
            t2 = time.time()
            perTime1 = duration / (t2 - t1)
            print("HMM-based classification - segmentation \t "
                  "{0:.1f} x realtime".format(perTime1))
    elif argv[1] == "-silenceRemoval":
        for i in range(nExp):
            [Fs, x] = audioBasicIO.read_audio_file("diarizationExample.wav")
            duration = x.shape[0] / float(Fs)
            t1 = time.time()
            [Fs, x] = audioBasicIO.read_audio_file("diarizationExample.wav")
            segments = aS.silence_removal(x, Fs, 0.050, 0.050,
                                          smooth_window=1.0,
                                          weight=0.3, plot=False)
            t2 = time.time()
            perTime1 = duration / (t2 - t1)
            print("Silence removal \t {0:.1f} x realtime".format(perTime1))
    elif argv[1] == "-thumbnailing":
        for i in range(nExp):
            [Fs1, x1] = audioBasicIO.read_audio_file("scottish.wav")
            duration1 = x1.shape[0] / float(Fs1)
            t1 = time.time()
            # find thumbnail endpoints
            [A1, A2, B1, B2, Smatrix] = aS.music_thumbnailing(
                x1, Fs1, 1.0, 1.0, 15.0)
            t2 = time.time()
            perTime1 = duration1 / (t2 - t1)
            print("Thumbnail \t {0:.1f} x realtime".format(perTime1))
    elif argv[1] == "-diarization-noLDA":
        for i in range(nExp):
            [Fs1, x1] = audioBasicIO.read_audio_file("diarizationExample.wav")
            duration1 = x1.shape[0] / float(Fs1)
            t1 = time.time()
            aS.speaker_diarization("diarizationExample.wav", 4,
                                   lda_dim=0, plot_res=False)
            t2 = time.time()
            perTime1 = duration1 / (t2 - t1)
            print("Diarization \t {0:.1f} x realtime".format(perTime1))
    elif argv[1] == "-diarization-LDA":
        for i in range(nExp):
            [Fs1, x1] = audioBasicIO.read_audio_file("diarizationExample.wav")
            duration1 = x1.shape[0] / float(Fs1)
            t1 = time.time()
            aS.speaker_diarization("diarizationExample.wav", 4,
                                   plot_res=False)
            t2 = time.time()
            perTime1 = duration1 / (t2 - t1)
            print("Diarization \t {0:.1f} x realtime".format(perTime1))
tempAudioData = np.empty_like(data)
tempAudioData[:] = data
step_size = 0.1
frameset_to_take = int(framerate * step_size)
ctr = np.zeros(10)
amp_sum = np.zeros(10)
amp_cnt = np.zeros(10)
amp_mean = np.zeros(10)
counter = 0
try:
    # segment the audio file into (at most) two speakers using
    # pyAudioAnalysis's clustering-based diarization
    temp = aseg.speaker_diarization("c.wav", 2, mid_window=0.1,
                                    mid_step=0.1, short_window=0.1,
                                    lda_dim=0, plot_res=False)
    # accumulate the absolute amplitude of each identified speaker
    for k in range(temp.size):
        for l in range(frameset_to_take):
            if counter < data.size:
                if data.ndim == 2:
                    amp_sum[int(temp[k])] += abs(data[int(counter), 0])
                else:
                    amp_sum[int(temp[k])] += abs(data[int(counter)])
                amp_cnt[int(temp[k])] += 1
                counter += 1
    # compute each speaker's mean amplitude from the accumulated sums
    for m in range(10):
        if amp_cnt[m] != 0:
            amp_mean[m] = amp_sum[m] / amp_cnt[m]
def test_speaker_diarization():
    labels, purity_cluster_m, purity_speaker_m = \
        aS.speaker_diarization("test_data/diarizationExample.wav", 4,
                               plot_res=False)
    assert purity_cluster_m > 0.9, "Diarization cluster purity is low"
    assert purity_speaker_m > 0.9, "Diarization speaker purity is low"
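# The purity metrics above are only meaningful because pyAudioAnalysis finds
# a ground-truth annotation next to the test WAV (a ".segments" file with the
# same basename). A quick standalone run of the same check, assuming the same
# test_data layout as the pyAudioAnalysis repository:
if __name__ == "__main__":
    labels, pc, ps = aS.speaker_diarization(
        "test_data/diarizationExample.wav", 4, plot_res=False)
    print("cluster purity: {:.2f}, speaker purity: {:.2f}".format(pc, ps))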
def pyAudioDiar():
    # pyAudioAnalysis diarization
    duration, result = aS.speaker_diarization(
        labelFileNameSound.get(), int(labelNumberOfSpeakers.get()),
        lda_dim=0, plot_res=False)

    # s4d (SIDEKIT) diarization on the same file
    show = 'diarizationExample'
    input_show = labelFileNameSound.get()
    input_sad = None
    win_size = 250
    thr_l = 2
    thr_h = 3
    thr_vit = -250
    wdir = os.path.join('out', show)
    if not os.path.exists(wdir):
        os.makedirs(wdir)
    fs = get_feature_server(input_show, feature_server_type='basic')
    cep, _ = fs.load(show)
    if input_sad is not None:
        init_diar = Diar.read_seg(input_sad)
        init_diar.pack(50)
    else:
        init_diar = segmentation.init_seg(cep, show)
    seg_diar = segmentation.segmentation(cep, init_diar, win_size)
    bicl_diar = segmentation.bic_linear(cep, seg_diar, thr_l, sr=False)
    bic = hac_bic.HAC_BIC(cep, bicl_diar, thr_h, sr=False)
    bich_diar = bic.perform(to_the_end=True)
    vit_diar = viterbi.viterbi_decoding(cep, bich_diar, thr_vit)

    # flatten the s4d segments into one speaker label per 20 ms step
    resList = []
    currentPosition = 0
    for row in vit_diar:
        speakerValue = int(row[1][1:])
        while currentPosition < (row[3] + row[4]):
            resList.append(speakerValue)
            currentPosition += 20

    # build the ground-truth speaker track from the reference segment file
    currentPosition = 0
    realityList = []
    realityFile = pd.read_csv(labelFileNameSegment.get(), delimiter='\t',
                              encoding='utf-8',
                              names=['start', 'end', 'speaker'])
    for index, row in realityFile.iterrows():
        speakerValue = int(row['speaker'][1:])
        while currentPosition < row['end']:
            realityList.append(int(speakerValue))
            currentPosition += 0.2

    # plot ground truth, s4d and pyAudioAnalysis results for comparison
    plot.subplot(3, 1, 2)
    plot.title("s4d:")
    plot.plot(np.arange(0, duration, duration / len(resList)),
              resList, 'ro')
    plot.subplot(3, 1, 1)
    plot.title("Ground truth:")
    plot.plot(np.arange(0, duration, duration / len(realityList)),
              realityList, 'bo')
    plot.subplot(3, 1, 3)
    plot.title("pyAudioAnalysis:")
    plot.plot(np.arange(0, duration, duration / len(result)),
              result, 'go')
    plot.show()