def speakerDiarizationWrapper(inputFile, numSpeakers, useLDA):
    if useLDA:
        #cls = aS.speakerDiarization(inputFile, numSpeakers, plot_res=True)
        cls = aS.speakerDiarization(inputFile, numSpeakers, plot_res=False)

        sound = AudioSegment.from_file(inputFile)
        print("type = ", type(sound))
        speaker_0 = AudioSegment.silent(1)
        speaker_1 = AudioSegment.silent(1)

        segs,flags = aS.flags2segs(cls, 0.2)    

        for s in range(segs.shape[0]):
            if(flags[s] == 0.0):
                # segs is in seconds, but pydub slices AudioSegment objects in milliseconds
                speaker_0 = speaker_0 + sound[round(segs[s,0]*1000) : round(segs[s,1]*1000)+1]

            elif(flags[s] == 1.0):
                speaker_1 = speaker_1 + sound[round(segs[s,0]*1000) : round(segs[s,1]*1000)+1]
            print("{} {} {}\n".format(segs[s,0], segs[s,1], flags[s]))

        speaker_0.export("./ExportedData/Speaker_0.wav", format="wav")
        speaker_1.export("./ExportedData/Speaker_1.wav", format="wav")

    else:
        sound = AudioSegment.from_file(inputFile)
        speaker_0 = AudioSegment.silent(100)
        speaker_1 = AudioSegment.silent(100)
        #cls = aS.speakerDiarization(inputFile, numSpeakers, lda_dim=0, plot_res=True)
        cls = aS.speakerDiarization(inputFile, numSpeakers, lda_dim=0, plot_res=False)

        #print("type = ", type(sound))

        segs,flags = aS.flags2segs(cls, 0.2)    

        for s in range(segs.shape[0]):

            if(flags[s] == 0.0):
                #print("Inside 0")
                start = round(segs[s,0]*1000)
                end = round(segs[s,1]*1000 + 1)
                speaker_0 = speaker_0.append(sound[start : end] , crossfade = 100)

            elif(flags[s] == 1.0):
                #print("Inside 1")
                start = round(segs[s,0]*1000)
                end = round((segs[s,1])*1000 + 1)
                speaker_1 = speaker_1.append(sound[start : end],crossfade = 100)

            # Uncomment to print the start and end timings of each speaker's segments:
            #print("{} {} {}\n".format(segs[s,0], segs[s,1], flags[s]))
        
        l = inputFile
        arr = l.split('/')              # split the path on "/"
        name = arr[-1].split('.')[0]    # the file name is the last element of "arr";
        #print(name)                    # split('.') separates it from its extension
                                        

        speaker_0.export("./ExportedData/"+name+"_0.wav", format="wav")
        speaker_1.export("./ExportedData/"+name+"_1.wav", format="wav")
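A minimal usage sketch for the wrapper above. The imports mirror the names used in the function body (aS for pyAudioAnalysis's audioSegmentation, AudioSegment from pydub); the input path is hypothetical and the ./ExportedData/ output directory must already exist:

from pyAudioAnalysis import audioSegmentation as aS
from pydub import AudioSegment

# Hypothetical two-speaker recording; useLDA=False exports
# ./ExportedData/<name>_0.wav and ./ExportedData/<name>_1.wav.
speakerDiarizationWrapper("recordings/meeting.wav", 2, useLDA=False)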
Example #2
def split_call_into_speakers(in_loc, out_loc):
    #   this function splits every audio file in a directory into segments at speaker turns, using the pyAudioAnalysis library
    #
    #   in_loc: directory that contains all audio files
    #   out_loc: directory that stores all diarized segments

    for audio in os.listdir(in_loc):
        if audio != '.DS_Store':
            p = os.path.join(in_loc, audio)
            no_rings_audio = AudioSegment.from_file(p, format='wav')
            basename = os.path.splitext(os.path.basename(audio))[0]
            # split on speakers now setting num speakers to 2
            diarized = aS.speakerDiarization(p, 2, mtSize=0.5, mtStep=0.1)
            # assume the first diarized window belongs to the customer; its label separates customer from salesperson
            cust = diarized[0]
            # output the segments
            segs, flags = aS.flags2segs(diarized, 0.1)  #mtstep from above
            for seg in range(segs.shape[0]):
                # skip segments shorter than 1s (usually 'um' or something)
                if segs[seg, 1] - segs[seg, 0] < 1:
                    continue
                out_seg = no_rings_audio[segs[seg, 0] * 1000:segs[seg, 1] *
                                         1000]
                if flags[seg] == cust:
                    out_seg.export(out_loc + basename + '.' + str(seg) +
                                   '_cust.wav',
                                   format='wav')
                else:
                    out_seg.export(out_loc + basename + '.' + str(seg) +
                                   '_sales.wav',
                                   format='wav')
def split_call_into_speakers(call_file, out_loc):
    '''
    Attempts to split a call file into different segments each time the speaker changes using
    speaker diarization. This method assumes there are two speakers in the file (sales and customer)
    and will cut out dial tones and any receptionists before the two speakers' conversation.
    '''
    # set output directories
    no_rings_out_dir = os.path.join(out_loc, 'calls_no_ringtones')
    if not os.path.exists(no_rings_out_dir):
        os.makedirs(no_rings_out_dir)
    diarized_out_dir = os.path.join(out_loc, 'calls_split_by_speaker')
    if not os.path.exists(diarized_out_dir):
        os.makedirs(diarized_out_dir)

    # load in raw audio file
    print(call_file)
    raw_audio = AudioSegment.from_file(call_file, 'wav')
    file_name = os.path.splitext(os.path.basename(call_file))[0]

    # use a trained HMM to locate the ringtones, keep only the audio after the
    # last detected ring, and export it as an intermediate file
    curr_path = os.path.dirname(os.path.realpath(__file__))
    ring_labels = aS.hmmSegmentation(call_file,
                                     os.path.join(curr_path, 'hmmRingDetect'),
                                     False)
    segs, flags = aS.flags2segs(
        ring_labels[0],
        1.0)  # 1.0 is the mid-term window step from above model
    no_rings_audio = raw_audio[segs[-1, 0] * 1000:segs[-1, 1] * 1000]
    temp_out_loc = os.path.join(no_rings_out_dir, file_name) + '.wav'
    no_rings_audio.export(temp_out_loc, format='wav')

    # split on speakers now setting num speakers to 2
    diarized = aS.speakerDiarization(temp_out_loc, 2, mtSize=0.5, mtStep=0.1)

    # assume the first diarized window belongs to the customer; its label separates customer from salesperson
    cust = diarized[0]

    # output the segments
    no_rings_audio = AudioSegment.from_file(
        temp_out_loc, format='wav')  # update segment so indexing is right
    segs, flags = aS.flags2segs(diarized, 0.1)  #mtstep from above
    curr_call_out_base = os.path.join(diarized_out_dir, file_name)
    if not os.path.exists(curr_call_out_base):
        os.makedirs(curr_call_out_base)
    for seg in range(segs.shape[0]):
        # skip segments shorter than 1s (usually 'um' or something)
        if segs[seg, 1] - segs[seg, 0] < 1:
            continue
        out_seg = no_rings_audio[segs[seg, 0] * 1000:segs[seg, 1] * 1000]
        if flags[seg] == cust:
            out_seg.export(os.path.join(curr_call_out_base,
                                        str(seg) + '_cust.wav'),
                           format='wav')
        else:
            out_seg.export(os.path.join(curr_call_out_base,
                                        str(seg) + '_sales.wav'),
                           format='wav')
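A hedged usage sketch for the per-file version above. The paths are hypothetical; the function also expects a trained 'hmmRingDetect' model in the same directory as the script:

# Hypothetical paths; the output sub-directories are created by the function itself.
split_call_into_speakers('calls/raw/call_001.wav', 'calls/processed')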
Example #4
def split_by_speaker(filename, config):
    ''' Splits a file of audio into segments identified by speaker

		Parameters:
			filename
				string, the name of the audio file
			config
				CAT.settings.Config - all settings associated with the program

		Returns:
			{
				speaker_id: list of windows of audio data (list of byte strings)
			},
			list of multi-dimensional means of the normal PDF associated with each speaker,
			list of covariance matrices of the normal PDF associated with each speaker
	'''

    # LDA is disabled so that all speakers are analyzed in the same space
    # and all clusters across all speaker identifications are roughly
    # Gaussian in that space
    speaker_detected_by_window, speaker_means, speaker_covariances = audioSegmentation.speakerDiarization(
        filename, config.get("max_speakers"), lda_dim=0)

    # calculate necessary stats on labelled windows
    WINDOW_LENGTH = .2  # in seconds
    LENGTH_OF_WINDOW_IN_FRAMES = int(config.get("rate") * WINDOW_LENGTH)
    LENGTH_OF_WINDOW_IN_BYTES = LENGTH_OF_WINDOW_IN_FRAMES * config.get(
        "num_channels") * config.get("num_bytes")

    # open file
    audio = utilities.read_file(filename)

    # split file into multiple segments based on speaker and sort by speaker
    segments_by_speaker = collections.defaultdict(list)
    previous_speaker = None
    for window_index in range(len(speaker_detected_by_window)):
        previous_speaker = speaker_detected_by_window[
            window_index - 1] if window_index > 0 else None
        speaker = int(speaker_detected_by_window[window_index])
        start_frame = LENGTH_OF_WINDOW_IN_BYTES * window_index

        window = audio[start_frame:start_frame + LENGTH_OF_WINDOW_IN_BYTES]
        if speaker == previous_speaker:
            segments_by_speaker[speaker][-1] += window
        else:
            segments_by_speaker[speaker].append(window)

    return segments_by_speaker, speaker_means, speaker_covariances
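The segment lists returned above are raw PCM byte strings, so they can be written out directly with the standard-library wave module. A minimal sketch, assuming the same config keys that split_by_speaker reads ("rate", "num_channels", "num_bytes"); export_speaker_segments is a hypothetical helper, not part of the original code:

import wave

def export_speaker_segments(segments_by_speaker, config, out_dir='.'):
    # Write every contiguous segment of every speaker to its own WAV file.
    for speaker, segments in segments_by_speaker.items():
        for i, segment in enumerate(segments):
            path = '{}/speaker{}_segment{}.wav'.format(out_dir, speaker, i)
            with wave.open(path, 'wb') as wav_out:
                wav_out.setnchannels(config.get("num_channels"))
                wav_out.setsampwidth(config.get("num_bytes"))   # bytes per sample
                wav_out.setframerate(config.get("rate"))
                wav_out.writeframes(segment)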
Example #5
def diarize(inputfile,
            outputfile='',
            num_speakers=0,
            buffer_secs=0,
            to_csv=True,
            plot=False):
    if outputfile == '':
        outputfile = inputfile[:-4] + '.diarized.csv'
    try:
        wav_source = True
        if inputfile.lower()[-4:] != '.wav':  # Creates a temporary WAV
            wav_source = False  # if input is MP3
            temp_filename = inputfile.split('/')[-1] + str(
                datetime.now()).replace(' ', '__').replace(':', '_') + '.wav'
            wav_path = '/var/tmp/' + temp_filename  # Pathname for temp WAV
            subprocess.call(
                ['ffmpeg', '-y', '-i', inputfile,
                 wav_path])  # '-y' option overwrites existing file if present
        else:
            wav_path = inputfile
        os.chdir(here)
        print('Processing ...')
        output = aS.speakerDiarization(wav_path,
                                       numOfSpeakers=num_speakers,
                                       PLOT=plot)
        os.chdir(working_dir)
        output = list(output)
        class_rows = class_list_to_time_rows(output, buffer_secs)
        if wav_source == False:
            os.remove(wav_path)
        if to_csv == True:
            if outputfile == '':
                outputfile = inputfile[:-4] + '.diarized.csv'
            with open(outputfile, 'w') as csv_fo:
                csv_writer = csv.writer(csv_fo)
                csv_writer.writerows(class_rows)
    except:
        print "Unexpected error:", sys.exc_info()[0]
        raise
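The helper class_list_to_time_rows (and the here / working_dir globals) are defined elsewhere in the original script. A plausible reconstruction of the helper, under the assumption that it collapses the per-window speaker labels into (speaker, start_secs, end_secs) rows and that the diarization step is pyAudioAnalysis's default 0.2 s:

def class_list_to_time_rows(class_list, buffer_secs=0, step=0.2):
    # Run-length encode the window labels into one row per contiguous speaker turn,
    # padding each turn by buffer_secs on both sides.
    rows = []
    start = 0
    for i in range(1, len(class_list) + 1):
        if i == len(class_list) or class_list[i] != class_list[start]:
            begin = max(0, start * step - buffer_secs)
            end = i * step + buffer_secs
            rows.append([int(class_list[start]), round(begin, 2), round(end, 2)])
            start = i
    return rows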
Example #6
def speakerDiarizationWrapper(inputFile, numSpeakers, useLDA):
    if useLDA:
        aS.speakerDiarization(inputFile, numSpeakers, plot_res=True)
    else:
        aS.speakerDiarization(inputFile, numSpeakers, lda_dim=0, plot_res=True)
def main(argv):
    if argv[1] == "-shortTerm":
        for i in range(nExp):
            [Fs, x] = audioBasicIO.readAudioFile("snakehit.wav")
            duration = x.shape[0] / float(Fs)
            t1 = time.clock()
            F = audioFeatureExtraction.stFeatureExtraction(
                x, Fs, 0.050 * Fs, 0.050 * Fs)
            t2 = time.clock()
            perTime1 = duration / (t2 - t1)
            print "short-term feature extraction: {0:.1f} x realtime".format(
                perTime1)
    elif argv[1] == "-classifyFile":
        for i in range(nExp):
            [Fs, x] = audioBasicIO.readAudioFile("snakehit.wav")
            duration = x.shape[0] / float(Fs)
            t1 = time.clock()
            aT.fileClassification("snakehit.wav", "svmSM", "svm")
            t2 = time.clock()
            perTime1 = duration / (t2 - t1)
            print "Mid-term feature extraction + classification \t {0:.1f} x realtime".format(
                perTime1)
    elif argv[1] == "-mtClassify":
        for i in range(nExp):
            [Fs, x] = audioBasicIO.readAudioFile("snakehit.wav")
            duration = x.shape[0] / float(Fs)
            t1 = time.clock()
            [flagsInd, classesAll,
             acc] = aS.mtFileClassification("snakehit.wav", "svmSM", "svm",
                                            False, '')
            t2 = time.clock()
            perTime1 = duration / (t2 - t1)
            print "Fix-sized classification - segmentation \t {0:.1f} x realtime".format(
                perTime1)
    elif argv[1] == "-hmmSegmentation":
        for i in range(nExp):
            [Fs, x] = audioBasicIO.readAudioFile("snakehit.wav")
            duration = x.shape[0] / float(Fs)
            t1 = time.clock()
            aS.hmmSegmentation('snakehit.wav', 'hmmRadioSM', False, '')
            t2 = time.clock()
            perTime1 = duration / (t2 - t1)
            print "HMM-based classification - segmentation \t {0:.1f} x realtime".format(
                perTime1)
    elif argv[1] == "-silenceRemoval":
        for i in range(nExp):
            [Fs, x] = audioBasicIO.readAudioFile("snakehit.wav")
            duration = x.shape[0] / float(Fs)
            t1 = time.clock()
            [Fs, x] = audioBasicIO.readAudioFile("snakehit.wav")
            segments = aS.silenceRemoval(x,
                                         Fs,
                                         0.050,
                                         0.050,
                                         smoothWindow=1.0,
                                         Weight=0.3,
                                         plot=False)
            t2 = time.clock()
            perTime1 = duration / (t2 - t1)
            print "Silence removal \t {0:.1f} x realtime".format(perTime1)
    elif argv[1] == "-thumbnailing":
        for i in range(nExp):
            [Fs1, x1] = audioBasicIO.readAudioFile("scottish.wav")
            duration1 = x1.shape[0] / float(Fs1)
            t1 = time.clock()
            [A1, A2, B1, B2,
             Smatrix] = aS.musicThumbnailing(x1, Fs1, 1.0, 1.0,
                                             15.0)  # find thumbnail endpoints
            t2 = time.clock()
            perTime1 = duration1 / (t2 - t1)
            print "Thumbnail \t {0:.1f} x realtime".format(perTime1)
    elif argv[1] == "-diarization-noLDA":
        for i in range(nExp):
            [Fs1, x1] = audioBasicIO.readAudioFile("snakehit.wav")
            duration1 = x1.shape[0] / float(Fs1)
            t1 = time.clock()
            aS.speakerDiarization("snakehit.wav", 4, LDAdim=0, PLOT=False)
            t2 = time.clock()
            perTime1 = duration1 / (t2 - t1)
            print "Diarization \t {0:.1f} x realtime".format(perTime1)
    elif argv[1] == "-diarization-LDA":
        for i in range(nExp):
            [Fs1, x1] = audioBasicIO.readAudioFile("snakehit.wav")
            duration1 = x1.shape[0] / float(Fs1)
            t1 = time.clock()
            aS.speakerDiarization("snakehit.wav", 4, PLOT=False)
            t2 = time.clock()
            perTime1 = duration1 / (t2 - t1)
            print "Diarization \t {0:.1f} x realtime".format(perTime1)
Example #8
def get_ans(audio_file,
            save_file='ans.json',
            person_num=2,
            silence_len_ms=300,
            silence_threshold=-70,
            lang=baidu_aip.Language.CHINESE):
    sound = AudioSegment.from_file(audio_file, format='wav')
    print('total length: ' + str(len(sound)) + ' ms')
    chunks = split_on_silence(sound,
                              min_silence_len=silence_len_ms,
                              silence_thresh=silence_threshold)
    for c in chunks:
        print(c.duration_seconds)
    chunks = [i for i in chunks if i.duration_seconds > 0.5]
    count = 0
    for c in chunks:
        with open(get_name(count), 'wb') as f:
            c.export(f, 'wav')
        count += 1

    # # print(audio.n_frames())  # not used
    # audio = AudioFile(audio_file=audio_file)
    # print('Result from pocketsphinx:')
    # for phrase in audio:
    #     segments = phrase.segments(detailed=True) # => "[('forward', -617, 63, 121)]"
    #     if len(segments) == 0:
    #         continue
    #     print(segments)
    #     print('\tStart&End', segments[0][2], segments[-1][3])

    print("BAIDU:")
    selected_chunks = []
    for t in range(len(chunks)):
        ans = baidu_aip.get_wav_ans(get_name(t), lang)
        if 'result' in ans:
            print('\t' + str(t) + '\t' + '\t'.join(ans['result']))
            selected_chunks.append({
                'chunk': chunks[t],
                'result': '\t'.join(ans['result']),
                'time': chunks[t].duration_seconds,
                'person': 0
            })  # every chunk with a recognition result is collected here

    total = AudioSegment.silent(0)
    start_list = []
    for t in range(len(selected_chunks)):
        dur = selected_chunks[t]['time']
        start_list.append(int(total.duration_seconds + 0.5) * 2)
        total = total.append(selected_chunks[t]['chunk'], 0)
        selected_chunks[t].pop('chunk')
        print('length after added: ', total.duration_seconds)
        total = total.append(
            AudioSegment.silent(
                (int(total.duration_seconds + 1.6) - total.duration_seconds) *
                1000), 0)
        print('length after added silent: ', total.duration_seconds)
    start_list.append(int(total.duration_seconds + 0.5) * 2)
    total.export(get_name('linked'), 'wav')
    recognize_ans = list(
        au.speakerDiarization(get_name('linked'),
                              person_num,
                              mt_size=0.5,
                              mt_step=0.5,
                              st_win=0.1))
    for i in range(len(start_list) - 1):
        selected_chunks[i]['person'] = str(
            int(get_mode(recognize_ans[start_list[i]:start_list[i + 1]])))
    if save_file:
        with open(save_file, 'w') as f:
            json.dump(selected_chunks, f, indent=2)
    return selected_chunks
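get_name and get_mode are helpers used above but not shown. Plausible sketches, assuming get_name maps a chunk index (or the 'linked' tag) to a WAV filename and get_mode returns the most frequent diarization label in a slice:

import collections

def get_name(tag):
    # Hypothetical naming scheme for the exported chunk files.
    return 'chunk_{}.wav'.format(tag)

def get_mode(labels):
    # Most frequent speaker label in the slice; 0 if the slice is empty.
    if len(labels) == 0:
        return 0
    return collections.Counter(labels).most_common(1)[0][0]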
def speakerDiarizationWrapper(inputFile, numSpeakers, useLDA):
    if useLDA:
        aS.speakerDiarization(inputFile, numSpeakers, plot_res=True)
    else:
        aS.speakerDiarization(inputFile, numSpeakers, lda_dim=0, plot_res=True)
Example #10
import pyAudioAnalysis.audioSegmentation as au


if __name__ == '__main__':
    a = au.speakerDiarization('2.wav', 2, mt_size=0.3, mt_step=0.05, st_win=0.05)
    print(a)
import numpy as np
from pyAudioAnalysis import audioSegmentation

from func import split, create_model_LSTM, split_audio_file, individual_emo, sp0_emo

predictor = {
    0: 'neutral',
    1: 'calm',
    2: 'happy',
    3: 'sad',
    4: 'angry',
    5: 'fearful',
    6: 'disgust',
    7: 'surprised'
}

np.set_printoptions(threshold=np.inf)
a = audioSegmentation.speakerDiarization(
    '/home/shubham/Documents/SymbiosisHackathon/TataData/1.wav',
    2,
    mt_step=0.05)

print(a)

b = split(a)
print(b)

split_audio_file(b,
                 '/home/shubham/Documents/SymbiosisHackathon/TataData/1.wav')

# model = create_model_LSTM()
# model.load_weights('/home/shubham/Documents/SymbiosisHackathon/Model_A.h5')

# y, sr = librosa.load('/home/shubham/Documents/SymbiosisHackathon/newSong7.wav')
# mfccs = np.mean(librosa.feature.mfcc(y=y, sr=sr, n_mfcc=40).T,axis=0)
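The split helper imported from func above is not shown. A plausible sketch, assuming it groups the per-window labels returned by speakerDiarization into (start_secs, end_secs, speaker) tuples using the 0.05 s mt_step from the call above; split_audio_file would then slice the WAV at those boundaries:

def split(labels, step=0.05):
    # Group consecutive identical speaker labels into contiguous segments.
    segments = []
    start = 0
    for i in range(1, len(labels) + 1):
        if i == len(labels) or labels[i] != labels[start]:
            segments.append((start * step, i * step, int(labels[start])))
            start = i
    return segments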
Example #12
def speakerDiarizationWrapper(inputFile, numSpeakers, useLDA):
    if useLDA:
        aS.speakerDiarization(inputFile, numSpeakers, PLOT=True)
    else:
        aS.speakerDiarization(inputFile, numSpeakers, LDAdim=0, PLOT=True)
def main(argv):
	if argv[1] == "-shortTerm":
		for i in range(nExp):
			[Fs, x] = audioBasicIO.readAudioFile("diarizationExample.wav");
			duration = x.shape[0] / float(Fs)
			t1 = time.clock()
			F = audioFeatureExtraction.stFeatureExtraction(x, Fs, 0.050*Fs, 0.050*Fs);
			t2 = time.clock()
			perTime1 = duration / (t2-t1)
			print "short-term feature extraction: {0:.1f} x realtime".format(perTime1)
	elif argv[1] == "-classifyFile":
		for i in range(nExp):
			[Fs, x] = audioBasicIO.readAudioFile("diarizationExample.wav");
			duration = x.shape[0] / float(Fs)		
			t1 = time.clock()
			aT.fileClassification("diarizationExample.wav", "svmSM","svm")
			t2 = time.clock()
			perTime1 = duration / (t2-t1)
			print "Mid-term feature extraction + classification \t {0:.1f} x realtime".format(perTime1)
	elif argv[1] == "-mtClassify":
		for i in range(nExp):
			[Fs, x] = audioBasicIO.readAudioFile("diarizationExample.wav");
			duration = x.shape[0] / float(Fs)		
			t1 = time.clock()
			[flagsInd, classesAll, acc] = aS.mtFileClassification("diarizationExample.wav", "svmSM", "svm", False, '')
			t2 = time.clock()
			perTime1 = duration / (t2-t1)
			print "Fix-sized classification - segmentation \t {0:.1f} x realtime".format(perTime1)
	elif argv[1] == "-hmmSegmentation":
		for i in range(nExp):
			[Fs, x] = audioBasicIO.readAudioFile("diarizationExample.wav");
			duration = x.shape[0] / float(Fs)		
			t1 = time.clock()
			aS.hmmSegmentation('diarizationExample.wav', 'hmmRadioSM', False, '')             
			t2 = time.clock()
			perTime1 = duration / (t2-t1)
			print "HMM-based classification - segmentation \t {0:.1f} x realtime".format(perTime1)
	elif argv[1] == "-silenceRemoval":
		for i in range(nExp):
			[Fs, x] = audioBasicIO.readAudioFile("diarizationExample.wav");
			duration = x.shape[0] / float(Fs)				
			t1 = time.clock()
			[Fs, x] = audioBasicIO.readAudioFile("diarizationExample.wav");
			segments = aS.silenceRemoval(x, Fs, 0.050, 0.050, smoothWindow = 1.0, Weight = 0.3, plot = False)
			t2 = time.clock()
			perTime1 = duration / (t2-t1)
			print "Silence removal \t {0:.1f} x realtime".format(perTime1)
	elif argv[1] == "-thumbnailing":
		for i in range(nExp):
			[Fs1, x1] = audioBasicIO.readAudioFile("scottish.wav")
			duration1 = x1.shape[0] / float(Fs1)		
			t1 = time.clock()
			[A1, A2, B1, B2, Smatrix] = aS.musicThumbnailing(x1, Fs1, 1.0, 1.0, 15.0)	# find thumbnail endpoints			
			t2 = time.clock()
			perTime1 = duration1 / (t2-t1)
			print "Thumbnail \t {0:.1f} x realtime".format(perTime1)
	elif argv[1] == "-diarization-noLDA":
		for i in range(nExp):
			[Fs1, x1] = audioBasicIO.readAudioFile("diarizationExample.wav")
			duration1 = x1.shape[0] / float(Fs1)		
			t1 = time.clock()		
			aS.speakerDiarization("diarizationExample.wav", 4, LDAdim = 0, PLOT = False)
			t2 = time.clock()
			perTime1 = duration1 / (t2-t1)
			print "Diarization \t {0:.1f} x realtime".format(perTime1)
	elif argv[1] == "-diarization-LDA":
		for i in range(nExp):
			[Fs1, x1] = audioBasicIO.readAudioFile("diarizationExample.wav")
			duration1 = x1.shape[0] / float(Fs1)		
			t1 = time.clock()		
			aS.speakerDiarization("diarizationExample.wav", 4, PLOT = False)
			t2 = time.clock()
			perTime1 = duration1 / (t2-t1)
			print "Diarization \t {0:.1f} x realtime".format(perTime1)