def audio_analysis(filename):
    """Predict the speaker's gender and age group from an audio file.

    The function runs three stages:

    1. Track the pitch of every hop of ``filename`` with aubio's "yin"
       detector (unit: Hz) and remember the start time of each hop.
    2. Run ``python aubiocut <filename>`` as a subprocess and parse the
       timestamps it prints.  Only frames that fall in the same 0.1 s bucket
       as one of those timestamps, and whose pitch is below the noise
       ceiling, are kept.
    3. Average the kept pitches and pass the result to ``clf.classify`` and
       ``ageclassify`` (the latter together with
       ``VocalTractLength(<first non-zero formant>)``).

    NOTE(review): another ``def audio_analysis`` appears later in this file;
    if both live at module level the later one replaces this one at import
    time -- confirm which variant is meant to be used.

    Args:
        filename: Path of the audio file to analyse.  It is handed to
            ``source``, to the ``aubiocut`` subprocess and to
            ``get_formants``.

    Returns:
        The string ``"Predicted Gender : <gender>\\nAnd <age prediction>"``.

    Raises:
        ValueError: If no frame matches any reported timestamp, so there is
            nothing to average.  (The previous code failed here with a bare
            ZeroDivisionError.)
    """
    import math
    import re

    print(filename)

    # Floor division keeps the sizes integral on Python 2 and Python 3 alike.
    downsample = 1
    samplerate = 44100 // downsample
    win_s = 4096 // downsample  # fft size
    hop_s = 512 // downsample  # hop size
    tolerance = 0.8
    # Pitches at or above this value are treated as noise in our environment.
    noise_ceiling = 10000.0

    s = source(filename, samplerate, hop_s)
    samplerate = s.samplerate  # use the rate the reader actually reports

    pitch_o = pitch("yin", win_s, hop_s, samplerate)
    pitch_o.set_unit("freq")
    pitch_o.set_tolerance(tolerance)

    pitches = []
    time_stamp = []
    total_frames = 0  # total number of frames read so far
    while True:
        samples, read = s()
        time_stamp.append(total_frames / float(samplerate))
        pitches.append(pitch_o(samples)[0])
        total_frames += read
        if read < hop_s:
            # A short read marks the last hop of the file.
            break

    # Invoke aubiocut to detect when a word is spoken.  stderr is merged into
    # stdout, so the captured text may contain more than timestamps.
    sub = subprocess.Popen(['python', 'aubiocut', filename],
                           stdout=subprocess.PIPE,
                           stderr=subprocess.STDOUT)
    out = sub.communicate()[0]

    # Timestamps look like "1.234567".  The dot is escaped so that long digit
    # runs elsewhere in the output (e.g. in a warning) are not picked up.
    timestamps = re.findall(r"\d+\.\d{4,}", out)

    # Group frame pitches by their 0.1 s bucket once, instead of rescanning
    # every frame for every timestamp.  Order and multiplicity of the selected
    # pitches are the same as with the nested scan.
    pitches_by_bucket = {}
    for frame_time, frame_pitch in zip(time_stamp, pitches):
        if frame_pitch < noise_ceiling:
            bucket = math.floor(frame_time * 10) / 10
            pitches_by_bucket.setdefault(bucket, []).append(frame_pitch)

    extracted_voice = []
    for stamp in timestamps:
        bucket = math.floor(float(stamp) * 10) / 10
        extracted_voice.extend(pitches_by_bucket.get(bucket, ()))

    if not extracted_voice:
        raise ValueError(
            "no voiced frames detected in %r: aubiocut reported %d "
            "timestamp(s), none matching a usable pitch frame"
            % (filename, len(timestamps)))

    avg = sum(extracted_voice, 0.0) / len(extracted_voice)
    print("Average Pitch of Extracted Voice: " + str(avg))

    # Use the first non-zero formant.  If every entry is zero the whole
    # sequence is passed on unchanged, exactly as before.
    # NOTE(review): confirm VocalTractLength copes with that fallback.
    formantf = get_formants(filename)
    formantf = next((value for value in formantf if value != 0), formantf)

    gender = clf.classify([avg])
    age_predict = ageclassify(avg, VocalTractLength(formantf))
    return "Predicted Gender : " + gender + "\n" + "And " + age_predict
def audio_analysis(filename):
    """Classify the speaker's gender from an audio file and plot the analysis.

    The function runs four stages:

    1. Track the pitch and confidence of every hop of ``filename`` with
       aubio's "yin" detector (unit: Hz), printing
       ``"<time> <pitch> <confidence>"`` for each hop.
    2. Run ``python aubiocut <filename>`` as a subprocess and parse the
       timestamps it prints.  Only frames that fall in the same 0.1 s bucket
       as one of those timestamps, and whose pitch is below the noise
       ceiling, are kept.
    3. Average the kept pitches and pass the result to ``clf.classify``.
    4. Show a three-panel matplotlib figure: waveform, raw and
       confidence-masked pitch (plus ``<basename>.f0.Corrected`` ground truth
       when that file exists), and confidence against the tolerance line.
       ``plt.show()`` is called before returning.

    Args:
        filename: Path of the audio file to analyse.  It is handed to
            ``source``, to the ``aubiocut`` subprocess and to
            ``get_waveform_plot``.

    Returns:
        Whatever ``clf.classify([average_pitch])`` returns.

    Raises:
        ValueError: If no frame matches any reported timestamp, so there is
            nothing to average.  (The previous code failed here with a bare
            ZeroDivisionError.)
    """
    import math
    import os.path
    import re

    from numpy import array, ma
    import matplotlib.pyplot as plt
    from demo_waveform_plot import get_waveform_plot, set_xlabels_sample2time

    print(type(filename))

    # Floor division keeps the sizes integral on Python 2 and Python 3 alike.
    downsample = 1
    samplerate = 44100 // downsample
    win_s = 4096 // downsample  # fft size
    hop_s = 512 // downsample  # hop size
    tolerance = 0.8
    # Pitches at or above this value are treated as noise in our environment.
    noise_ceiling = 10000.0

    s = source(filename, samplerate, hop_s)
    samplerate = s.samplerate  # use the rate the reader actually reports

    pitch_o = pitch("yin", win_s, hop_s, samplerate)
    pitch_o.set_unit("freq")
    pitch_o.set_tolerance(tolerance)

    pitches = []
    confidences = []
    time_stamp = []
    total_frames = 0  # total number of frames read so far
    while True:
        samples, read = s()
        # Do NOT name this local ``pitch``: assigning to that name anywhere in
        # the function makes it local everywhere, so the ``pitch("yin", ...)``
        # call above would raise UnboundLocalError.
        frame_pitch = pitch_o(samples)[0]
        confidence = pitch_o.get_confidence()
        frame_time = total_frames / float(samplerate)
        print("%f %f %f" % (frame_time, frame_pitch, confidence))
        time_stamp.append(frame_time)
        pitches.append(frame_pitch)
        confidences.append(confidence)
        total_frames += read
        if read < hop_s:
            # A short read marks the last hop of the file.
            break

    # Invoke aubiocut to detect when a word is spoken.  stderr is merged into
    # stdout, so the captured text may contain more than timestamps.
    sub = subprocess.Popen(['python', 'aubiocut', filename],
                           stdout=subprocess.PIPE,
                           stderr=subprocess.STDOUT)
    out = sub.communicate()[0]

    # Timestamps look like "1.234567".  The dot is escaped so that long digit
    # runs elsewhere in the output (e.g. in a warning) are not picked up.
    timestamps = re.findall(r"\d+\.\d{4,}", out)
    print(timestamps)

    # Group frame pitches by their 0.1 s bucket once, instead of rescanning
    # every frame for every timestamp.  Order and multiplicity of the selected
    # pitches are the same as with the nested scan.
    pitches_by_bucket = {}
    for frame_time, frame_pitch in zip(time_stamp, pitches):
        if frame_pitch < noise_ceiling:
            bucket = math.floor(frame_time * 10) / 10
            pitches_by_bucket.setdefault(bucket, []).append(frame_pitch)

    extracted_voice = []
    for stamp in timestamps:
        bucket = math.floor(float(stamp) * 10) / 10
        extracted_voice.extend(pitches_by_bucket.get(bucket, ()))
    print(extracted_voice)

    if not extracted_voice:
        raise ValueError(
            "no voiced frames detected in %r: aubiocut reported %d "
            "timestamp(s), none matching a usable pitch frame"
            % (filename, len(timestamps)))

    avg = sum(extracted_voice, 0.0) / len(extracted_voice)
    print("Average Pitch of Extracted Voice: " + str(avg))
    gender = clf.classify([avg])

    # ---- plotting -------------------------------------------------------
    skip = 1  # drop the first frame before plotting
    pitches = array(pitches[skip:])
    confidences = array(confidences[skip:])
    times = [t * hop_s for t in range(len(pitches))]

    fig = plt.figure()

    # Panel 1: waveform.
    ax1 = fig.add_subplot(311)
    ax1 = get_waveform_plot(filename, samplerate=samplerate,
                            block_size=hop_s, ax=ax1)
    plt.setp(ax1.get_xticklabels(), visible=False)
    ax1.set_xlabel('')

    def array_from_text_file(filename, dtype='float'):
        """Load a whitespace-separated text table into a numpy array.

        ``filename`` is joined onto the directory of this module, so a
        relative name is looked up next to the source file.
        """
        filename = os.path.join(os.path.dirname(__file__), filename)
        # ``with`` closes the handle; the bare open() used before leaked it.
        with open(filename) as handle:
            rows = [line.split() for line in handle]
        return array(rows, dtype=dtype)

    # Panel 2: pitch track, with ground truth overlaid when available.
    ax2 = fig.add_subplot(312, sharex=ax1)
    ground_truth = os.path.splitext(filename)[0] + '.f0.Corrected'
    if os.path.isfile(ground_truth):
        ground_truth = array_from_text_file(ground_truth)
        true_freqs = ground_truth[:, 2]
        true_freqs = ma.masked_where(true_freqs < 2, true_freqs)
        true_times = float(samplerate) * ground_truth[:, 0]
        ax2.plot(true_times, true_freqs, 'r')
        ax2.axis(ymin=0.9 * true_freqs.min(), ymax=1.1 * true_freqs.max())
    # plot raw pitches
    ax2.plot(times, pitches, '--g')
    # plot cleaned up pitches: hide frames whose confidence is below tolerance
    cleaned_pitches = ma.masked_where(confidences < tolerance, pitches)
    ax2.plot(times, cleaned_pitches, '.-')
    plt.setp(ax2.get_xticklabels(), visible=False)
    ax2.set_ylabel('f0 (Hz)')

    # Panel 3: confidence, with a horizontal line at the tolerance.
    ax3 = fig.add_subplot(313, sharex=ax1)
    ax3.plot(times, confidences)
    ax3.plot(times, [tolerance] * len(confidences))
    if times:
        # A file too short to leave any frame after the skip has no x-range
        # to set; indexing times[0] / times[-1] would raise IndexError.
        ax3.axis(xmin=times[0], xmax=times[-1])
        set_xlabels_sample2time(ax3, times[-1], samplerate)
    ax3.set_ylabel('confidence')  # label was misspelled 'condidence'
    plt.show()
    return gender
print extracted_voice avg=0.0 for i in extracted_voice: avg+=i avg=avg/(len(extracted_voice)) print "Average Pitch of Extracted Voice: "+ str(avg) formantf = get_formants(filename) for i in formantf: if i != 0: formantf = i break ageclassify(avg, VocalTractLength(formantf)) clf.classify([avg]) skip = 1 pitches = array(pitches[skip:]) confidences = array(confidences[skip:]) times = [t * hop_s for t in range(len(pitches))] fig = plt.figure() ax1 = fig.add_subplot(311) ax1 = get_waveform_plot(filename, samplerate = samplerate, block_size = hop_s, ax = ax1) plt.setp(ax1.get_xticklabels(), visible = False) ax1.set_xlabel('') def array_from_text_file(filename, dtype = 'float'): import os.path