Example #1
def audio_classify(self, clip):
    if not os.path.isfile("weights/hmm"):
        print("pretrained model doesn't exist, training may take a few minutes")
        self._train_hmm()
    else:
        print("pretrained model exists, using that")
    flags, classes, _, _ = audioSegmentation.hmmSegmentation(
        clip.to_soundarray(fps=44100), "weights/hmm")
    # classification is per second; group consecutive seconds with the same label
    counts = []
    prev_item = None
    first_item = None
    for item in flags:
        if prev_item is None:
            counts.append(1)
            prev_item = item
            first_item = item
        elif item == prev_item:
            counts[-1] += 1
        else:
            counts.append(1)
            prev_item = item
    counts = np.cumsum(np.array(counts))
    durations = [[], []]
    next_item = first_item
    for i in range(len(counts)):
        if i == 0:
            durations[next_item].append([0, counts[i]])
        else:
            durations[next_item].append([counts[i - 1], counts[i]])
        next_item = (next_item + 1) % 2
    return dict(zip(classes, durations))
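
The inline comment notes that hmmSegmentation labels the clip once per second; the loop then merges runs of identical labels into [start, end] second ranges per class. Below is a minimal standalone sketch of that grouping step; the flags and class names are made up, and, like the original loop, it assumes exactly two classes that alternate once equal neighbours are merged:

import numpy as np

flags = [0, 0, 1, 1, 1, 0]        # hypothetical per-second class indices
classes = ["speech", "music"]     # hypothetical class order

counts, prev = [], None
for item in flags:
    if item == prev:
        counts[-1] += 1           # extend the current run
    else:
        counts.append(1)          # start a new run
        prev = item
bounds = np.cumsum(counts)        # cumulative second marks, here [2, 5, 6]
durations = {c: [] for c in classes}
start, label = 0, flags[0]
for end in bounds:
    durations[classes[label]].append([start, int(end)])
    start, label = int(end), (label + 1) % 2
print(durations)                  # {'speech': [[0, 2], [5, 6]], 'music': [[2, 5]]}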
Example #2
def split_call_into_speakers(call_file, out_loc):
    '''
    Attempts to split a call file into different segments each time the speaker changes using
    speaker diarization. This method assumes there are two speakers in the file (sales and customer)
    and will cut out dial tones and any receptionists before the two speakers' conversation.
    '''
    # set output directories
    no_rings_out_dir = os.path.join(out_loc, 'calls_no_ringtones')
    if not os.path.exists(no_rings_out_dir):
        os.makedirs(no_rings_out_dir)
    diarized_out_dir = os.path.join(out_loc, 'calls_split_by_speaker')
    if not os.path.exists(diarized_out_dir):
        os.makedirs(diarized_out_dir)

    # load in raw audio file
    print(call_file)
    raw_audio = AudioSegment.from_file(call_file, 'wav')
    file_name = os.path.splitext(os.path.basename(call_file))[0]

    # use a trained HMM to locate ringtones, keep only the audio after the
    # last detected ring, and export that as an intermediate file
    curr_path = os.path.dirname(os.path.realpath(__file__))
    ring_labels = aS.hmmSegmentation(call_file,
                                     os.path.join(curr_path, 'hmmRingDetect'),
                                     False)
    segs, flags = aS.flags2segs(
        ring_labels[0],
        1.0)  # 1.0 is the mid-term window step from above model
    no_rings_audio = raw_audio[segs[-1, 0] * 1000:segs[-1, 1] * 1000]
    temp_out_loc = os.path.join(no_rings_out_dir, file_name) + '.wav'
    no_rings_audio.export(temp_out_loc, format='wav')

    # split on speakers now setting num speakers to 2
    diarized = aS.speakerDiarization(temp_out_loc, 2, mtSize=0.5, mtStep=0.1)

    # assume the first diarized label (the first speaker after the ringtones) is the customer
    cust = diarized[0]

    # output the segments
    no_rings_audio = AudioSegment.from_file(
        temp_out_loc, format='wav')  # update segment so indexing is right
    segs, flags = aS.flags2segs(diarized, 0.1)  # 0.1 = mt step used in speakerDiarization above
    curr_call_out_base = os.path.join(diarized_out_dir, file_name)
    if not os.path.exists(curr_call_out_base):
        os.makedirs(curr_call_out_base)
    for seg in range(segs.shape[0]):
        # skip segments shorter than 1s (usually 'um' or something)
        if segs[seg, 1] - segs[seg, 0] < 1:
            continue
        out_seg = no_rings_audio[segs[seg, 0] * 1000:segs[seg, 1] * 1000]
        if flags[seg] == cust:
            out_seg.export(os.path.join(curr_call_out_base,
                                        str(seg) + '_cust.wav'),
                           format='wav')
        else:
            out_seg.export(os.path.join(curr_call_out_base,
                                        str(seg) + '_sales.wav'),
                           format='wav')
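
A possible way to drive the function above over a folder of recordings; the "recordings" and "output" paths are hypothetical, and the snippet assumes split_call_into_speakers and its 'hmmRingDetect' model are available as in the code above:

import glob
import os

for call in sorted(glob.glob(os.path.join("recordings", "*.wav"))):  # hypothetical input folder
    split_call_into_speakers(call, "output")
# Expected layout afterwards (per the directories created above):
#   output/calls_no_ringtones/<call>.wav
#   output/calls_split_by_speaker/<call>/<n>_cust.wav and <n>_sales.wav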
Example #3
def segmentclassifyFileWrapperHMM(wavFile, hmmModelName):
    gtFile = wavFile.replace(".wav", ".segments")
    aS.hmmSegmentation(wavFile,
                       hmmModelName,
                       plot_res=True,
                       gt_file_name=gtFile)
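
The wrapper assumes a ground-truth annotation sits next to each wav with the same base name and a .segments extension. A hypothetical call (the wav path and model name are made up):

segmentclassifyFileWrapperHMM("data/scottish.wav", "hmmTemp1")
# expects data/scottish.segments to exist so the plot can show the ground-truth track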
Example #4
def main(argv):
    if argv[1] == "-shortTerm":
        for i in range(nExp):
            [Fs, x] = audioBasicIO.readAudioFile("snakehit.wav")
            duration = x.shape[0] / float(Fs)
            t1 = time.clock()
            F = audioFeatureExtraction.stFeatureExtraction(
                x, Fs, 0.050 * Fs, 0.050 * Fs)
            t2 = time.clock()
            perTime1 = duration / (t2 - t1)
            print "short-term feature extraction: {0:.1f} x realtime".format(
                perTime1)
    elif argv[1] == "-classifyFile":
        for i in range(nExp):
            [Fs, x] = audioBasicIO.readAudioFile("snakehit.wav")
            duration = x.shape[0] / float(Fs)
            t1 = time.clock()
            aT.fileClassification("snakehit.wav", "svmSM", "svm")
            t2 = time.clock()
            perTime1 = duration / (t2 - t1)
            print "Mid-term feature extraction + classification \t {0:.1f} x realtime".format(
                perTime1)
    elif argv[1] == "-mtClassify":
        for i in range(nExp):
            [Fs, x] = audioBasicIO.readAudioFile("snakehit.wav")
            duration = x.shape[0] / float(Fs)
            t1 = time.clock()
            [flagsInd, classesAll,
             acc] = aS.mtFileClassification("snakehit.wav", "svmSM", "svm",
                                            False, '')
            t2 = time.clock()
            perTime1 = duration / (t2 - t1)
            print "Fix-sized classification - segmentation \t {0:.1f} x realtime".format(
                perTime1)
    elif argv[1] == "-hmmSegmentation":
        for i in range(nExp):
            [Fs, x] = audioBasicIO.readAudioFile("snakehit.wav")
            duration = x.shape[0] / float(Fs)
            t1 = time.clock()
            aS.hmmSegmentation('snakehit.wav', 'hmmRadioSM', False, '')
            t2 = time.clock()
            perTime1 = duration / (t2 - t1)
            print "HMM-based classification - segmentation \t {0:.1f} x realtime".format(
                perTime1)
    elif argv[1] == "-silenceRemoval":
        for i in range(nExp):
            [Fs, x] = audioBasicIO.readAudioFile("snakehit.wav")
            duration = x.shape[0] / float(Fs)
            t1 = time.clock()
            [Fs, x] = audioBasicIO.readAudioFile("snakehit.wav")
            segments = aS.silenceRemoval(x,
                                         Fs,
                                         0.050,
                                         0.050,
                                         smoothWindow=1.0,
                                         Weight=0.3,
                                         plot=False)
            t2 = time.clock()
            perTime1 = duration / (t2 - t1)
            print "Silence removal \t {0:.1f} x realtime".format(perTime1)
    elif argv[1] == "-thumbnailing":
        for i in range(nExp):
            [Fs1, x1] = audioBasicIO.readAudioFile("scottish.wav")
            duration1 = x1.shape[0] / float(Fs1)
            t1 = time.clock()
            [A1, A2, B1, B2,
             Smatrix] = aS.musicThumbnailing(x1, Fs1, 1.0, 1.0,
                                             15.0)  # find thumbnail endpoints
            t2 = time.clock()
            perTime1 = duration1 / (t2 - t1)
            print "Thumbnail \t {0:.1f} x realtime".format(perTime1)
    elif argv[1] == "-diarization-noLDA":
        for i in range(nExp):
            [Fs1, x1] = audioBasicIO.readAudioFile("snakehit.wav")
            duration1 = x1.shape[0] / float(Fs1)
            t1 = time.clock()
            aS.speakerDiarization("snakehit.wav", 4, LDAdim=0, PLOT=False)
            t2 = time.clock()
            perTime1 = duration1 / (t2 - t1)
            print "Diarization \t {0:.1f} x realtime".format(perTime1)
    elif argv[1] == "-diarization-LDA":
        for i in range(nExp):
            [Fs1, x1] = audioBasicIO.readAudioFile("snakehit.wav")
            duration1 = x1.shape[0] / float(Fs1)
            t1 = time.clock()
            aS.speakerDiarization("snakehit.wav", 4, PLOT=False)
            t2 = time.clock()
            perTime1 = duration1 / (t2 - t1)
            print "Diarization \t {0:.1f} x realtime".format(perTime1)
Example #5
def segmentclassifyFileWrapperHMM(wavFile, hmmModelName):
    gtFile = wavFile.replace(".wav", ".segments")
    aS.hmmSegmentation(wavFile, hmmModelName, plot_res=True,
                       gt_file_name=gtFile)
Example #6
aT.featureAndTrain([root_data_path + "SM/speech", root_data_path + "SM/music"],
                   1.0, 1.0, 0.2, 0.2, "svm", "temp", True)

print("\n\n\n * * * TEST 5 * * * \n\n\n")
[flagsInd, classesAll, acc, CM] = aS.mtFileClassification(
    root_data_path + "pyAudioAnalysis/data//scottish.wav",
    root_data_path + "pyAudioAnalysis/data/svmSM", "svm", True,
    root_data_path + 'pyAudioAnalysis/data/scottish.segments')

print("\n\n\n * * * TEST 6 * * * \n\n\n")
aS.trainHMM_fromFile(root_data_path + 'radioFinal/train/bbc4A.wav',
                     root_data_path + 'radioFinal/train/bbc4A.segments',
                     'hmmTemp1', 1.0, 1.0)
aS.trainHMM_fromDir(root_data_path + 'radioFinal/small', 'hmmTemp2', 1.0, 1.0)
aS.hmmSegmentation(root_data_path + 'pyAudioAnalysis/data//scottish.wav',
                   'hmmTemp1', True, root_data_path +
                   'pyAudioAnalysis/data//scottish.segments')  # test 1
aS.hmmSegmentation(root_data_path + 'pyAudioAnalysis/data//scottish.wav',
                   'hmmTemp2', True, root_data_path +
                   'pyAudioAnalysis/data//scottish.segments')  # test 2

print("\n\n\n * * * TEST 7 * * * \n\n\n")
aT.featureAndTrainRegression(root_data_path +
                             "pyAudioAnalysis/data/speechEmotion",
                             1,
                             1,
                             0.050,
                             0.050,
                             "svm_rbf",
                             "temp.mod",
                             compute_beat=False)
Example #7
def segmentclassifyFileWrapperHMM(wavFile, hmmModelName):
    gtFile = wavFile.replace(".wav", ".segments")
    # flags_ind holds one class index per analysis step; classes_all the corresponding class names
    flags_ind, classes_all, _, _ = aS.hmmSegmentation(
        wavFile, hmmModelName, plot_res=False, gt_file_name=gtFile)
    print('flags_ind:', flags_ind, len(flags_ind))
    print('classes_all:', classes_all)
Example #8
def main(argv):
    if argv[1] == "-shortTerm":
        for i in range(nExp):
            [Fs, x] = audioBasicIO.readAudioFile("diarizationExample.wav")
            duration = x.shape[0] / float(Fs)
            t1 = time.clock()
            F = audioFeatureExtraction.stFeatureExtraction(x, Fs, 0.050 * Fs, 0.050 * Fs)
            t2 = time.clock()
            perTime1 = duration / (t2 - t1)
            print "short-term feature extraction: {0:.1f} x realtime".format(perTime1)
    elif argv[1] == "-classifyFile":
        for i in range(nExp):
            [Fs, x] = audioBasicIO.readAudioFile("diarizationExample.wav")
            duration = x.shape[0] / float(Fs)
            t1 = time.clock()
            aT.fileClassification("diarizationExample.wav", "svmSM", "svm")
            t2 = time.clock()
            perTime1 = duration / (t2 - t1)
            print "Mid-term feature extraction + classification \t {0:.1f} x realtime".format(perTime1)
    elif argv[1] == "-mtClassify":
        for i in range(nExp):
            [Fs, x] = audioBasicIO.readAudioFile("diarizationExample.wav")
            duration = x.shape[0] / float(Fs)
            t1 = time.clock()
            [flagsInd, classesAll, acc] = aS.mtFileClassification("diarizationExample.wav", "svmSM", "svm", False, '')
            t2 = time.clock()
            perTime1 = duration / (t2 - t1)
            print "Fix-sized classification - segmentation \t {0:.1f} x realtime".format(perTime1)
    elif argv[1] == "-hmmSegmentation":
        for i in range(nExp):
            [Fs, x] = audioBasicIO.readAudioFile("diarizationExample.wav")
            duration = x.shape[0] / float(Fs)
            t1 = time.clock()
            aS.hmmSegmentation('diarizationExample.wav', 'hmmRadioSM', False, '')
            t2 = time.clock()
            perTime1 = duration / (t2 - t1)
            print "HMM-based classification - segmentation \t {0:.1f} x realtime".format(perTime1)
    elif argv[1] == "-silenceRemoval":
        for i in range(nExp):
            [Fs, x] = audioBasicIO.readAudioFile("diarizationExample.wav")
            duration = x.shape[0] / float(Fs)
            t1 = time.clock()
            [Fs, x] = audioBasicIO.readAudioFile("diarizationExample.wav")
            segments = aS.silenceRemoval(x, Fs, 0.050, 0.050, smoothWindow=1.0, Weight=0.3, plot=False)
            t2 = time.clock()
            perTime1 = duration / (t2 - t1)
            print "Silence removal \t {0:.1f} x realtime".format(perTime1)
    elif argv[1] == "-thumbnailing":
        for i in range(nExp):
            [Fs1, x1] = audioBasicIO.readAudioFile("scottish.wav")
            duration1 = x1.shape[0] / float(Fs1)
            t1 = time.clock()
            [A1, A2, B1, B2, Smatrix] = aS.musicThumbnailing(x1, Fs1, 1.0, 1.0, 15.0)  # find thumbnail endpoints
            t2 = time.clock()
            perTime1 = duration1 / (t2 - t1)
            print "Thumbnail \t {0:.1f} x realtime".format(perTime1)
    elif argv[1] == "-diarization-noLDA":
        for i in range(nExp):
            [Fs1, x1] = audioBasicIO.readAudioFile("diarizationExample.wav")
            duration1 = x1.shape[0] / float(Fs1)
            t1 = time.clock()
            aS.speakerDiarization("diarizationExample.wav", 4, LDAdim=0, PLOT=False)
            t2 = time.clock()
            perTime1 = duration1 / (t2 - t1)
            print "Diarization \t {0:.1f} x realtime".format(perTime1)
    elif argv[1] == "-diarization-LDA":
        for i in range(nExp):
            [Fs1, x1] = audioBasicIO.readAudioFile("diarizationExample.wav")
            duration1 = x1.shape[0] / float(Fs1)
            t1 = time.clock()
            aS.speakerDiarization("diarizationExample.wav", 4, PLOT=False)
            t2 = time.clock()
            perTime1 = duration1 / (t2 - t1)
            print "Diarization \t {0:.1f} x realtime".format(perTime1)
Example #9
print("\n\n\n * * * TEST 3 * * * \n\n\n")
[Fs, x] = audioBasicIO.readAudioFile(root_data_path + "pyAudioAnalysis/data/doremi.wav")
x = audioBasicIO.stereo2mono(x)
specgram, TimeAxis, FreqAxis = audioFeatureExtraction.stChromagram(x, Fs, round(Fs * 0.040), round(Fs * 0.040), True)

print("\n\n\n * * * TEST 4 * * * \n\n\n")
aT.featureAndTrain([root_data_path +"SM/speech",root_data_path + "SM/music"], 1.0, 1.0, 0.2, 0.2, "svm", "temp", True)

print("\n\n\n * * * TEST 5 * * * \n\n\n")
[flagsInd, classesAll, acc, CM] = aS.mtFileClassification(root_data_path + "pyAudioAnalysis/data//scottish.wav", root_data_path + "pyAudioAnalysis/data/svmSM", "svm", True, root_data_path + 'pyAudioAnalysis/data/scottish.segments')

print("\n\n\n * * * TEST 6 * * * \n\n\n")
aS.trainHMM_fromFile(root_data_path + 'radioFinal/train/bbc4A.wav', root_data_path + 'radioFinal/train/bbc4A.segments', 'hmmTemp1', 1.0, 1.0)	
aS.trainHMM_fromDir(root_data_path + 'radioFinal/small', 'hmmTemp2', 1.0, 1.0)
aS.hmmSegmentation(root_data_path + 'pyAudioAnalysis/data//scottish.wav', 'hmmTemp1', True, root_data_path + 'pyAudioAnalysis/data//scottish.segments')				# test 1
aS.hmmSegmentation(root_data_path + 'pyAudioAnalysis/data//scottish.wav', 'hmmTemp2', True, root_data_path + 'pyAudioAnalysis/data//scottish.segments')				# test 2

print("\n\n\n * * * TEST 7 * * * \n\n\n")
aT.featureAndTrainRegression(root_data_path + "pyAudioAnalysis/data/speechEmotion", 1, 1, 0.050, 0.050, "svm_rbf", "temp.mod", compute_beat=False)
print(aT.fileRegression(root_data_path + "pyAudioAnalysis/data/speechEmotion/01.wav", "temp.mod", "svm_rbf"))

print("\n\n\n * * * TEST 8 * * * \n\n\n")
aT.featureAndTrainRegression(root_data_path + "pyAudioAnalysis/data/speechEmotion", 1, 1, 0.050, 0.050, "svm", "temp.mod", compute_beat=False)
print(aT.fileRegression(root_data_path + "pyAudioAnalysis/data/speechEmotion/01.wav", "temp.mod", "svm"))

print("\n\n\n * * * TEST 9 * * * \n\n\n")
aT.featureAndTrainRegression(root_data_path + "pyAudioAnalysis/data/speechEmotion", 1, 1, 0.050, 0.050, "randomforest", "temp.mod", compute_beat=False)
print(aT.fileRegression(root_data_path + "pyAudioAnalysis/data/speechEmotion/01.wav", "temp.mod", "randomforest"))

def evaluateSpeechMusic(fileName,
                        modelName,
                        method="svm",
                        postProcess=0,
                        postProcessModelName="",
                        PLOT=False):
    # load ground-truth file (MATLAB annotation)

    matFile = fileName.replace(".wav", "_true.mat")
    if os.path.isfile(matFile):
        matfile = loadmat(matFile)
        segs_gt = matfile["segs_r"]
        classes_gt1 = matfile["classes_r"]
        classes_gt = []
        for c in classes_gt1[0]:
            if c == "M":
                classes_gt.append("music")
            if c == "S" or c == "E":
                classes_gt.append("speech")
        flagsIndGT, classesAllGT = audioSegmentation.segs2flags(
            [s[0] for s in segs_gt], [s[1] for s in segs_gt], classes_gt, 1.0)
    if method == "svm" or method == "randomforest" or method == "gradientboosting" or method == "extratrees":
        # speech-music segmentation:
        [flagsInd, classesAll, acc,
         CM] = audioSegmentation.mtFileClassification(fileName, modelName,
                                                      method, False, '')
    elif method == "hmm":
        [flagsInd, classesAll, _,
         _] = audioSegmentation.hmmSegmentation(fileName,
                                                modelName,
                                                PLOT=False,
                                                gtFileName="")
    elif method == "cnn":
        WIDTH_SEC = 2.4
        [Fs, x] = io.readAudioFile(fileName)
        x = io.stereo2mono(x)
        [flagsInd, classesAll,
         CNNprobs] = mtCNN_classification(x, Fs, WIDTH_SEC, 1.0,
                                          RGB_singleFrame_net, SOUND_mean_RGB,
                                          transformer_RGB, classNamesCNN)

    for i in range(flagsIndGT.shape[0]):
        flagsIndGT[i] = classesAll.index(classesAllGT[flagsIndGT[i]])

    #plt.plot(flagsIndGT, 'r')
    #plt.plot(flagsInd)
    #plt.show()

    #print classesAllGT, classesAll
    if postProcess >= 1:
        # medfilt here!
        flagsInd = scipy.signal.medfilt(flagsInd, 11)
    if postProcess >= 2:  #load HMM
        try:
            fo = open(postProcessModelName, "rb")
        except IOError:
            print "didn't find file"
            return
        try:
            hmm = cPickle.load(fo)
            classesAll = cPickle.load(fo)
        except:
            fo.close()

        # Features = audioFeatureExtraction.stFeatureExtraction(x, Fs, 0.050*Fs, 0.050*Fs)    # feature extraction
        # [Features, _] = aF.mtFeatureExtraction(x, Fs, mtWin * Fs, mtStep * Fs, round(Fs * 0.050), round(Fs * 0.050))
        flagsInd = hmm.predict(CNNprobs)
        flagsInd = scipy.signal.medfilt(flagsInd, 3)

    if PLOT:
        plt.plot(flagsInd + 0.01)
        plt.plot(flagsIndGT, 'r')
        plt.show()
    CM = np.zeros((2, 2))
    for i in range(min(flagsInd.shape[0], flagsIndGT.shape[0])):
        CM[int(flagsIndGT[i]), int(flagsInd[i])] += 1
    print CM
    return CM, classesAll
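
A hypothetical evaluation loop over a folder of annotated recordings, accumulating the confusion matrices returned above; the "evalData" folder and "svmSM" model name are made up, and each wav is assumed to have a matching *_true.mat annotation as the function expects:

import glob
import os
import numpy as np

total_cm = np.zeros((2, 2))
for wav in sorted(glob.glob(os.path.join("evalData", "*.wav"))):
    result = evaluateSpeechMusic(wav, "svmSM", method="svm")
    if result is not None:  # the HMM post-processing path above may return None
        cm, class_names = result
        total_cm += cm
print(total_cm)
print("overall accuracy: {0:.3f}".format(np.diag(total_cm).sum() / total_cm.sum()))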