def get_mfcc_feat(filename, winstep=0.01, nfilt=40, numcep=13, preemph=0.95,
                  appendEnergy=True, begin_index=-1, end_index=-1):
    """
    Compute MFCC features from an audio signal.

    :param winstep: the step between successive windows in seconds. Default is 0.01s (10 milliseconds)
    :param numcep: the number of cepstra to return, default 13
    :param nfilt: the number of filters in the filterbank, default 40
    :param preemph: apply a preemphasis filter with preemph as coefficient. 0 is no filter. Default is 0.95.
    :param appendEnergy: if true, the zeroth cepstral coefficient is replaced with the log of the total frame energy.
    :param begin_index: the frame where the recording begins
    :param end_index: the frame where the recording ends
    :return mfcc_feat: A numpy array of size (NUMFRAMES by numcep) containing features. Each row holds 1 feature vector.
    :return fbank_feat: A numpy array of size (NUMFRAMES by nfilt) containing features. Each row holds 1 feature vector.
    :return normalization_feature: the 39-dimensional feature after finite differencing and normalization
    """
    (rate, signal) = wav.read(filename)
    if begin_index != -1:
        # The frame indices assume 44.1 kHz audio with 20 ms frames; cast to
        # int because slice indices must be integers.
        signal = signal[int(begin_index * 44100 * 0.02):int(end_index * 44100 * 0.02)]
    if rate == 44100:
        winlen = 0.02
        nfft = 1024
    elif rate == 16000:
        winlen = 0.025
        nfft = 512
    else:
        raise ValueError("unsupported sample rate: %d" % rate)
    mfcc_feat = mfcc.mfcc(signal=signal, samplerate=rate, winlen=winlen,
                          winstep=winstep, nfft=nfft, nfilt=nfilt, numcep=numcep,
                          preemph=preemph, appendEnergy=appendEnergy)
    numpy.savetxt("result.txt", mfcc_feat, delimiter=",")
    fbank_feat = mfcc.logfbank(signal=signal, samplerate=rate, winlen=winlen,
                               winstep=winstep, nfft=nfft, nfilt=nfilt, preemph=preemph)
    # numpy.savetxt("result2.txt", fbank_feat, delimiter=",")
    normalization_feature = mfcc.normalization(mfcc_feat)
    return mfcc_feat, fbank_feat, normalization_feature
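# A minimal usage sketch for get_mfcc_feat above; "digits.wav" is a
# hypothetical 44.1 kHz recording, and the frame indices follow the 20 ms
# framing the function assumes.
mfcc_feat, fbank_feat, norm_feat = get_mfcc_feat("digits.wav",
                                                 begin_index=10, end_index=60)
print(mfcc_feat.shape)  # (NUMFRAMES, 13)
print(norm_feat.shape)  # (NUMFRAMES, 39) per the docstring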
def get_class_proba_sound(clf_gb, clf_rf, data, interval, numFrames):
    # Number of audio samples per video frame; integer division so the
    # result is usable in slice arithmetic under both Python 2 and 3.
    coeff = data.shape[0] // numFrames
    features_nb = 2587
    X = np.zeros((len(interval), features_nb))
    for i, inter in enumerate(interval):
        name, (beg, end) = inter
        space = end - beg - 9
        limit = 40
        data_interval = np.zeros(40 * coeff)
        if space > limit:
            data_interval = data[(beg - 1) * coeff:(beg + limit - 1) * coeff]
        else:
            data_interval[:space * coeff] = data[(beg - 1) * coeff:(end - 10) * coeff]
        ceps, mspec, spec = mf.mfcc(data_interval)
        X[i, :2587] = ceps.reshape(2587)
        # mspec and spec features are computed but unused here:
        # X[i, 2587:10547] = mspec.reshape(7960)
        # X[i, 10547:22835] = spec.reshape(12288)
    X = np.nan_to_num(X)
    X = X.clip(min=-100)
    return clf_gb.predict_proba(X), clf_rf.predict_proba(X)
def __init__(self, audio):
    self.audio = audio
    self.window_length = 0.025  # seconds
    self.window_step = 0.01     # seconds
    self.num_cepstrals = 12
    self.nfft = 512
    self.mfcc = mfcc.mfcc()
def record(self, button):
    text, ok = QtWidgets.QInputDialog().getText(self, "Emotion Recognition", "Enter Your Name:")
    if ok and text:
        self.speaker = text
        print "Speaker: " + self.speaker
        wFile = "Test/" + self.speaker + ".wav"
        start_record(self, wFile, 5, self.state)
        self.state = "File saved as " + self.speaker  # update the status shown on repaint
        self.repaint()
        print 'File saved as ' + self.speaker + ".wav"
        self.add_files()
        fFile = "feat/Test/" + self.speaker + ".htk"
        try:
            # call the MFCC feature extraction subroutine
            f, E, fs = mfcc(wFile, self.winlen, self.ovrlen, self.pre_coef,
                            self.nfilter, self.nftt)
            # VAD part
            if self.opts == 1:
                # Energy-threshold-based VAD [comment this line out if you
                # would like to plug in the rVAD labels]
                f = vad_thr(f, E)
            elif self.opts == 0:
                # [Plug in the VAD label file generated by the rVAD matlab code]
                l = numpy.loadtxt('..corresponding vad label file')
                if (len(f) - len(l)) == 1:
                    # 1-[end-frame] correction [matlab/python]
                    l = numpy.append(l, l[-1:, ])
                elif (len(f) - len(l)) == -1:
                    l = numpy.delete(l, -1)
                if (len(l) == len(f)) and (len(numpy.argwhere(l == 1)) != 0):
                    idx = numpy.where(l == 1)
                    f = f[idx]
                else:
                    print "mismatch frames between label and feature files, or no voiced frame in VAD"
                    exit()
            # Zero-mean unit-variance normalization after VAD
            f = cmvn(f)
            # write the VAD+normalized features to file
            if not os.path.exists(os.path.dirname(fFile)):
                # create the directory for the feature file
                os.makedirs(os.path.dirname(fFile))
            writehtk(fFile, f, 0.01)
        except Exception:
            print("Fail1..%s ---> %s\n" % (wFile, fFile))
        if button.text() == "Live Detection":
            self.test_emotion(name=self.speaker)
    else:
        self.state = "Not valid Name"
        self.repaint()
        return
def collect_mfcc(arg, dirname, fnames):
    for fname in fnames:
        if fname.endswith(".wav"):
            signal, FS = waveio.wave_from_file(os.path.join(dirname, fname))
            feature = list(mfcc.mfcc(signal, FS))
            feature_collection[fname] = []
            for point in feature:
                feature_collection[fname].append(lbg.cluster_point(point, mu))
def calc_mfcc(arg, dirname, fnames):
    for fname in fnames:
        if fname.endswith(".wav"):
            signal, FS = waveio.wave_from_file(os.path.join(dirname, fname))
            feature = list(mfcc(signal, FS))
            print "File: %s generated %d MFCC vectors" % (fname, len(feature))
            feature_collection["feature"] += feature
def make_fake_mfcc(construct_record):
    """Build a fake recording.

    Input: a string describing the fake recording.
    Output: a (label, fake_mfcc_sequence) tuple.
    E.g. input "123" yields (['1', '2', '3'], the MFCC sequence of "123").
    """
    label = []
    return_mfcc_sequence = []
    for digit in construct_record:
        label.append(digit)
        return_mfcc_sequence += mfcc.mfcc(
            readwave.read_wave("./fake/" + digit + ".wav"), "fm")
    return label, return_mfcc_sequence
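# A minimal usage sketch for make_fake_mfcc above, assuming the per-digit
# recordings under ./fake/ that it reads actually exist.
label, sequence = make_fake_mfcc("123")
print(label)  # ['1', '2', '3']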
def loop(self):
    self.working = True
    while self.working:
        frames = self.input_audio.read()
        if self.input_frames is not None:
            self.input_frames = numpy.append(self.input_frames, frames)
        mfcc_features = mfcc.mfcc(frames)
        self.state.step(self, mfcc_features)
        if self.output_queue is not None:
            output = self.network.capture_output(1)
            self.output_queue.put(output)
    self.input_audio.close()
def compare(arg, dirname, fnames):
    for fname in fnames:
        if test in fname:
            # Get the MFCC for each test sample (read the file once rather than twice)
            signal, fs = waveio.wave_from_file(os.path.join(dirname, fname))
            test_features = mfcc(signal=signal, fs=fs)
            # Record the overhead and score for each test sample versus each
            # template (each sample maps to a dict keyed by template).
            overhead[fname] = {}
            score[fname] = {}
            for template in template_feature.keys():
                time_start = time()
                score[fname][template] = dtw(template_feature[template], test_features)
                overhead[fname][template] = time() - time_start
def training(nfiltbank):
    # Single-speaker variant: build one codebook from one training file.
    nCentroid = 16
    directory = 'train'
    fname = '/shefali1.wav'
    (fs, s) = read(directory + fname)
    mel_coeff = mfcc(s, fs, nfiltbank)
    codebook = lbg(mel_coeff, nCentroid)
    print('Training complete')
    return codebook
def training(nfiltbank):
    nSpeaker = 8
    nCentroid = 16
    codebooks = np.empty((nSpeaker, nfiltbank, nCentroid))
    directory = 'train'
    for i in range(nSpeaker):
        fname = '/s' + str(i + 1) + '.wav'
        (fs, s) = read(directory + fname)
        mel_coeff = mfcc(s, fs, nfiltbank)
        codebooks[i, :, :] = lbg(mel_coeff, nCentroid)
    print('Training complete')
    return codebooks
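# A minimal usage sketch for the training routine above, assuming the same
# read/mfcc/lbg helpers and a 'train' directory holding s1.wav..s8.wav;
# nfiltbank = 20 is an illustrative choice, not prescribed by the code.
nfiltbank = 20
codebooks = training(nfiltbank)
print(codebooks.shape)  # (8, 20, 16): one (nfiltbank x nCentroid) codebook per speaker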
def predict(model_name, filename):
    samples, sample_rate = audio.decode_wav(filename)
    coefficients = mfcc(samples, sample_rate)
    coefficients = tf.reshape(tf.cast(coefficients, tf.float32), [1, 98, 13, 1])
    model = tf.keras.models.load_model(model_name)
    prediction = model.predict(coefficients)
    # Compare the highest class probability (np.max), not its index
    # (np.argmax), against the confidence threshold.
    if np.max(prediction) >= 0.8:
        output = constant.train_commands[np.argmax(prediction)]
    else:
        output = None
    return output
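# A minimal usage sketch for predict above; "commands.h5" and "yes.wav" are
# hypothetical model and input paths.
command = predict("commands.h5", "yes.wav")
if command is not None:
    print("Recognized command:", command)
else:
    print("No command recognized with sufficient confidence.")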
def train(n):
    k = 16
    import wave
    code = []
    for i in range(n):
        wav_file = wave.open('{}'.format(i + 1) + '.wav')
        raw_frames = wav_file.readframes(-1)
        num_frames = wav_file.getnframes()
        num_channels = wav_file.getnchannels()
        sample_rate = wav_file.getframerate()
        sample_width = wav_file.getsampwidth()
        # Unpack the raw PCM bytes into 32-bit signed integers, sign-extending
        # samples narrower than 4 bytes.
        temp_buffer = np.empty((num_frames, num_channels, 4), dtype=np.uint8)
        raw_bytes = np.frombuffer(raw_frames, dtype=np.uint8)
        temp_buffer[:, :, :sample_width] = raw_bytes.reshape(-1, num_channels, sample_width)
        temp_buffer[:, :, sample_width:] = \
            (temp_buffer[:, :, sample_width - 1:sample_width] >> 7) * 255
        frames = temp_buffer.view('<i4').reshape(temp_buffer.shape[:-1])
        # Take every third sample of the first channel and treat the result
        # as 16 kHz audio.
        sample_rate = 16000
        signal = frames[::3, 0]
        sample_rate, signal = cut.cut(sample_rate, signal)
        v = mfcc(signal, sample_rate)
        code.append(vqlbg(v, k))
        print(i)
    if n == 3:
        # Pairwise average minimum codebook distances between the 3 speakers
        d1 = disteu(code[0], code[1])
        d2 = disteu(code[1], code[2])
        d3 = disteu(code[2], code[0])
        dk = []
        dk.append(sum(d1.min(1)) / d1.shape[0])
        dk.append(sum(d2.min(1)) / d2.shape[0])
        dk.append(sum(d3.min(1)) / d3.shape[0])
        dmax = max(dk)
        dmin = min(dk)
    else:
        dmax = 0
        dmin = 0
    return code, dmax, dmin
def compare_files(filenames, save_preprocessed_files):
    class Sim:  # note: defined but never used in this function
        def __init__(self, name1, name2):
            self.name1 = name1
            self.name2 = name2

    characteristics = {}
    for filename in filenames:
        print("processing {}".format(filename))
        signal = sound_file.load(filename)
        signal = pre_processing.process(signal)
        if save_preprocessed_files:
            sound_file.save(filename[:-4] + "_p.wav", signal)
        characteristics[filename] = mfcc.mfcc(signal)
    metrics = _metrics_of_characteristics(characteristics, filenames)
    _simple_metrics_comparison(metrics)
    _dbscan_metrics_comparison(metrics)
    return metrics
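# A minimal usage sketch for compare_files above, assuming the same
# sound_file/pre_processing/mfcc helpers; the file names are hypothetical.
metrics = compare_files(["take1.wav", "take2.wav"], save_preprocessed_files=False)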
def get_mpc(path, filename, ncep, dataset):
    signal, rate = get_audio_info(path)
    mfcc_feat, mpc_feat, spectrum = mfcc(signal, fs=rate, nfft=512, nceps=ncep, nwin=512)
    feat = mpc_feat[:, :ncep]
    mean_feat = np.mean(feat, axis=0)
    covvar = np.cov(feat, rowvar=0)
    # Keep only the lower triangle of the (symmetric) covariance matrix.
    cov_feat = []
    for i in range(0, covvar.shape[0]):
        for j in range(0, i + 1):
            cov_feat.append(covvar[i][j])
    cov_feat = np.array(cov_feat)
    feature_vector = np.append(mean_feat, cov_feat)
    data.append(feature_vector)
    # The label is encoded in the file name, with a dataset-specific separator.
    if dataset == 'ismir':
        labels.append(re.split('-', filename, maxsplit=1)[0])
    elif dataset == 'gtzan':
        labels.append(re.split(r'\.', filename, maxsplit=1)[0])
    else:
        print 'unrecognised dataset type'
        sys.exit()
def test(isOnline, test_data):
    if isOnline:
        test_sequence = mfcc.mfcc(record.record(), "fast_mode")
        hmm(test_sequence, 3, "test0", True)
    else:
        for i in xrange(10):
            ### junyo's test sequences
            for index in test_data[0]:
                name = ".\\txtDictionary\\junyo_" + str(i) + "_" + str(index) + ".txt"
                sequence = np.loadtxt(name)
                sequence = sequence[1:]
                hmm(sequence, i, name, False)
            ### Jianwei's test sequences
            for index in test_data[1]:
                name = ".\\txtDictionary\\" + str(i) + "_" + str(index) + ".txt"
                sequence = np.loadtxt(name)
                sequence = sequence[1:]
                hmm(sequence, i, name, False)
def test(name, code):
    import wave
    wav_file = wave.open(name + '.wav')
    raw_frames = wav_file.readframes(-1)
    num_frames = wav_file.getnframes()
    num_channels = wav_file.getnchannels()
    sample_rate = wav_file.getframerate()
    sample_width = wav_file.getsampwidth()
    # Unpack the raw PCM bytes into 32-bit signed integers, sign-extending
    # samples narrower than 4 bytes.
    temp_buffer = np.empty((num_frames, num_channels, 4), dtype=np.uint8)
    raw_bytes = np.frombuffer(raw_frames, dtype=np.uint8)
    temp_buffer[:, :, :sample_width] = raw_bytes.reshape(-1, num_channels, sample_width)
    temp_buffer[:, :, sample_width:] = \
        (temp_buffer[:, :, sample_width - 1:sample_width] >> 7) * 255
    frames = temp_buffer.view('<i4').reshape(temp_buffer.shape[:-1])
    # Take every third sample of the first channel and treat the result
    # as 16 kHz audio.
    sample_rate = 16000
    signal = frames[::3, 0]
    sample_rate, signal = cut.cut(sample_rate, signal)
    v = mfcc(signal, sample_rate)
    # Average the minimum codebook distance over all trained codebooks.
    dist = 0
    for i in range(len(code)):
        d = disteu(v, code[i])
        dist += sum(d.min(1)) / d.shape[0]
    dist = dist / len(code)
    print(dist)
    return dist < 4.0
def GetNumber(self, AudioIn, Fs):
    # Compute the MFCC of the input signal
    ReceivedSignal = mfcc.mfcc(AudioIn, Fs)
    value = []
    # Match against each template via DTW
    value.append(dtw.dtw(self.mfcc0, ReceivedSignal))
    value.append(dtw.dtw(self.mfcc1, ReceivedSignal))
    value.append(dtw.dtw(self.mfcc2, ReceivedSignal))
    value.append(dtw.dtw(self.mfcc3, ReceivedSignal))
    value.append(dtw.dtw(self.mfcc4, ReceivedSignal))
    value.append(dtw.dtw(self.mfcc5, ReceivedSignal))
    value.append(dtw.dtw(self.mfcc6, ReceivedSignal))
    value.append(dtw.dtw(self.mfcc7, ReceivedSignal))
    value.append(dtw.dtw(self.mfcc8, ReceivedSignal))
    value.append(dtw.dtw(self.mfcc9, ReceivedSignal))
    value.append(dtw.dtw(self.mfccAuther, ReceivedSignal))
    # The index of the closest template is the recognized number
    number = value.index(min(value))
    return number
def features(audio, rate, ncp=ncep):
    audio = clip(audio.astype(np.float64))
    feats = np.empty(ncp + nother)
    # Mean MFCCs over all frames
    coefs = mfcc.mfcc(audio, samplerate=rate, numcep=ncp, nfilt=2 * ncp,
                      nfft=4096, winlen=4096 / rate, winstep=4096 / 3 / rate)
    feats[0:ncp] = np.mean(coefs, axis=0)
    fft = np.abs(np.fft.rfft(audio))
    pwr = fft**2
    # Spectral flatness: geometric mean over arithmetic mean of the power spectrum
    feats[ncp] = ms.gmean(pwr) / np.mean(pwr)
    # Spectral centroid
    freq = np.fft.rfftfreq(len(audio), 1 / rate)
    feats[ncp + 1] = np.sum(fft * freq) / np.sum(fft)
    # RMS energy
    feats[ncp + 2] = np.sqrt(np.mean(np.square(audio)))
    return feats
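# A minimal usage sketch for features above, assuming the surrounding
# module's ncep/nother constants and clip/ms helpers; the sine tone is
# synthetic illustrative input.
rate = 16000
t = np.arange(rate) / rate
tone = np.sin(2 * np.pi * 440 * t)
print(features(tone, rate).shape)  # (ncep + nother,)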
        min_cost = node.cost
    # Prune: deactivate any trellis node whose cost exceeds the current
    # minimum by more than the beam threshold.
    for node in trellis:
        if node.cost > min_cost + PRUNING_THRESHOLD:
            node.active = False
            node.cost = sys.float_info.max
    # pruning done; now start the next loop, taking in the next mfcc in the sequence

    # collect the result by walking the back-pointer table
    pointer = trellis[-1].backPointer
    result = []
    while pointer != 0:
        result.insert(0, back_pointer_table[pointer].meaning)
        pointer = back_pointer_table[pointer].previous_index
    # each meaning string carries the recognized symbol right after a '"'
    for i in range(len(result)):
        ob = result[i].find('"')
        result[i] = result[i][ob + 1]
    return result


if __name__ == '__main__':
    generated_trellis = trellis_generate("network_gen.txt")
    print('trellis construct done')
    # continuous_speech = mfcc.mfcc(record.record(), "fm")
    continuous_speech = mfcc.mfcc(readwave.read_wave("./fake/9.wav"), "fm")
    continuous_speech += mfcc.mfcc(readwave.read_wave("./fake/1.wav"), "fm")
    continuous_speech += mfcc.mfcc(readwave.read_wave("./fake/0.wav"), "fm")
    continuous_speech += mfcc.mfcc(readwave.read_wave("./fake/9.wav"), "fm")
    continuous_speech += mfcc.mfcc(readwave.read_wave("./fake/6.wav"), "fm")
    continuous_speech += mfcc.mfcc(readwave.read_wave("./fake/7.wav"), "fm")
    continuous_speech += mfcc.mfcc(readwave.read_wave("./fake/8.wav"), "fm")
    getresult = back_pointer_dtw(continuous_speech, generated_trellis)
    print(getresult)
def SystemInit(self):
    # Read the template signals
    self.sample_rate, self.signal1 = scipy.io.wavfile.read('1.wav')
    self.sample_rate, self.signal2 = scipy.io.wavfile.read('2.wav')
    self.sample_rate, self.signal3 = scipy.io.wavfile.read('3.wav')
    self.sample_rate, self.signal4 = scipy.io.wavfile.read('4.wav')
    self.sample_rate, self.signal5 = scipy.io.wavfile.read('5.wav')
    self.sample_rate, self.signal6 = scipy.io.wavfile.read('6.wav')
    self.sample_rate, self.signal7 = scipy.io.wavfile.read('7.wav')
    self.sample_rate, self.signal8 = scipy.io.wavfile.read('8.wav')
    self.sample_rate, self.signal9 = scipy.io.wavfile.read('9.wav')
    self.sample_rate, self.signal0 = scipy.io.wavfile.read('0.wav')
    self.sample_rate, self.signalAuther = scipy.io.wavfile.read('Auther.wav')
    # Compute the MFCC of each template signal
    self.mfcc0 = mfcc.mfcc(self.signal0, self.sample_rate)
    self.mfcc1 = mfcc.mfcc(self.signal1, self.sample_rate)
    self.mfcc2 = mfcc.mfcc(self.signal2, self.sample_rate)
    self.mfcc3 = mfcc.mfcc(self.signal3, self.sample_rate)
    self.mfcc4 = mfcc.mfcc(self.signal4, self.sample_rate)
    self.mfcc5 = mfcc.mfcc(self.signal5, self.sample_rate)
    self.mfcc6 = mfcc.mfcc(self.signal6, self.sample_rate)
    self.mfcc7 = mfcc.mfcc(self.signal7, self.sample_rate)
    self.mfcc8 = mfcc.mfcc(self.signal8, self.sample_rate)
    self.mfcc9 = mfcc.mfcc(self.signal9, self.sample_rate)
    self.mfccAuther = mfcc.mfcc(self.signalAuther, self.sample_rate)
clf = pickle.load(open('model_svm', 'rb'))
# Load the instrument names corresponding to the labels
instruments = pickle.load(open('names', 'rb'))
# Read the directory of test files
musicpath = input("Enter the path of the folder containing testing audios :")
print("Computing...")
filename = [name for name in os.listdir(musicpath) if re.match(r'.*wav', name)]
result = []
for i in range(len(filename)):
    fs, music = wavfile.read(os.path.join(musicpath, filename[i]))
    if music.dtype.type not in [np.int16, np.int32, np.float32]:
        # Only these three sample formats are supported; 8-bit and 24-bit
        # wav files are not.
        raise TypeError(
            'only 16bit, 32bit PCM and 32bit floating-point wave files are supported')
    # Preprocessing
    frameTime = 0.02  # frame length in seconds
    # f: the framed signal; frameLength: the frame length in samples
    f, frameLength = preprocessing(music, fs, frameTime)
    # Compute the MFCCs
    ceps = mfcc(f, fs, frameLength)
    result.append(clf.predict(ceps))  # per-frame predictions
for i in range(len(result)):
    # Majority vote over the frames: pick the most frequent prediction
    result[i] = np.argmax(np.bincount(result[i]))
    print('The instrument played in "' + filename[i] + '" is detected as: ' + instruments[result[i]])
import readwave
import mfcc
import numpy as np

for digit in range(0, 10):
    for index in range(0, 5):
        file_name = "./junyo_iso_11_1/" + str(digit) + "_" + str(index) + ".wav"
        speech = mfcc.mfcc(readwave.read_wave(file_name), "fm")
        np.savetxt(str(digit) + "_" + str(index) + ".txt", speech)
def main():
    sample_rate, signal11 = scipy.io.wavfile.read('1.wav')
    sample_rate, signal12 = scipy.io.wavfile.read('2.wav')
    sample_rate, signal13 = scipy.io.wavfile.read('3.wav')
    sample_rate, signal14 = scipy.io.wavfile.read('4.wav')
    sample_rate, signal15 = scipy.io.wavfile.read('5.wav')
    sample_rate, signal16 = scipy.io.wavfile.read('6.wav')
    sample_rate, signal17 = scipy.io.wavfile.read('7.wav')
    sample_rate, signal18 = scipy.io.wavfile.read('8.wav')
    sample_rate, signal19 = scipy.io.wavfile.read('9.wav')
    sample_rate, signal10 = scipy.io.wavfile.read('0.wav')
    print('ok1')
    mfcc10 = mfcc(signal10, sample_rate)
    mfcc11 = mfcc(signal11, sample_rate)
    mfcc12 = mfcc(signal12, sample_rate)
    mfcc13 = mfcc(signal13, sample_rate)
    mfcc14 = mfcc(signal14, sample_rate)
    mfcc15 = mfcc(signal15, sample_rate)
    mfcc16 = mfcc(signal16, sample_rate)
    mfcc17 = mfcc(signal17, sample_rate)
    mfcc18 = mfcc(signal18, sample_rate)
    mfcc19 = mfcc(signal19, sample_rate)
    print('ok2')
    # Match the input recording against every digit template via DTW;
    # the smallest distance wins.
    value = []
    sample_rate, signalinput = scipy.io.wavfile.read('1.wav')
    ReceivedSignal = mfcc(signalinput, sample_rate)
    value.append(dtw(mfcc10, ReceivedSignal))
    value.append(dtw(mfcc11, ReceivedSignal))
    value.append(dtw(mfcc12, ReceivedSignal))
    value.append(dtw(mfcc13, ReceivedSignal))
    value.append(dtw(mfcc14, ReceivedSignal))
    value.append(dtw(mfcc15, ReceivedSignal))
    value.append(dtw(mfcc16, ReceivedSignal))
    value.append(dtw(mfcc17, ReceivedSignal))
    value.append(dtw(mfcc18, ReceivedSignal))
    value.append(dtw(mfcc19, ReceivedSignal))
    print(value)
    number = value.index(min(value))
    print(number)
import matplotlib.pyplot as plt
import numpy as np

from mfcc import mfcc
from read import read
from stft import stft

fname = "sineSweep.wav"
(srate, data) = read(fname, "mono")
N = 1024
X = stft(data, N)
X = np.abs(X)
# Keep the non-negative-frequency bins; integer division so the index is
# valid under Python 3.
X = X[:N // 2 + 1]
X = mfcc(X, 44100)
# magnitude-to-decibel conversion
# X = 10 * np.log10(X)
plt.imshow(X[1:], interpolation='nearest', aspect='auto', origin='lower')
plt.show()
getwav = [line.replace('\n', '') for line in wavedata]  # strip the trailing '\n'
print('Get wave data list: ', getwav)

"""
Read the wave data
"""
wavdata_, fs_, time_shift = read_wav_data(getwav[0])

'''
Compute features
Functions used: MFCC, Delta MFCC, AveMFCC, AveDeltaMFCC
MFCC:        mfcc.mfcc(line)
Delta MFCC:  mfcc.delta(line)
Averages:    line.mean(axis=1)
'''
# MFCC
wavdata, fs, time_song, ceps = mfcc.mfcc(wavdata_, fs_)

# Delta MFCC
delta_mfcc = np.empty((0, len(ceps)), int)
for line in range(len(ceps.T)):
    delta_mfcc = np.append(delta_mfcc, [mfcc.delta(ceps.T[line])], axis=0)

# MFCC average
ave_mfcc = []
ave_mfcc.append(ceps.T.mean(axis=1))

# Delta MFCC average
ave_DeltaMfcc = []
ave_DeltaMfcc.append(delta_mfcc.mean(axis=1))
def get_mfcc(filename):
    samples, sample_rate = decode_wav(filename)
    return mfcc(samples, sample_rate)
ubmDir = 'GMM' + str(nmix)
wFile = "Test_dec.wav"
start_record(wFile, 3)
fFile = "feat/Test/" + wFile + ".htk"

with open(ubmDir + '/' + 'ubm') as f:
    print "load ubm .. %s" % (f)
    ubm_mu, ubm_cov, ubm_w = pickle.load(f)

winlen, ovrlen, pre_coef, nfilter, nftt = 0.025, 0.02, 0.97, 26, 512
opts = 1

try:
    # call the MFCC feature extraction subroutine
    f, E, fs = mfcc(wFile, winlen, ovrlen, pre_coef, nfilter, nftt)
    # VAD part
    if opts == 1:
        # Energy-threshold-based VAD [comment this line out if you would
        # like to plug in the rVAD labels]
        f = vad_thr(f, E)
    elif opts == 0:
        # [Plug in the VAD label file generated by the rVAD matlab code]
        l = numpy.loadtxt('..corresponding vad label file')
        if (len(f) - len(l)) == 1:
            # 1-[end-frame] correction [matlab/python]
            l = numpy.append(l, l[-1:, ])
        elif (len(f) - len(l)) == -1:
            l = numpy.delete(l, -1)
            fft = numpy.fft.rfft(buf.data, n_fft, axis=-1)
            # Power spectrum and mel filtering
            mspec = numpy.log(numpy.dot(fft.real**2 + fft.imag**2, fbank.T))
            # Use the DCT to 'compress' the coefficients (spectrum -> cepstrum
            # domain). The C0 term is removed as it is the constant term.
            buf.data = scipy.fftpack.realtransforms.dct(
                mspec, type=2, norm='ortho', axis=-1)[1:nceps + 1]
            buf.data = buf.data[:, numpy.newaxis]
            next.send(buf)
    except GeneratorExit:
        next.close()


if __name__ == '__main__':
    import numpy
    import ringBuffer
    from testPipeline import audio_reader, sink
    from mfcc import mfcc

    # Build the processing pipeline back to front:
    # sink <- cepstral ring buffer <- mfcc coroutine <- signal ring buffer <- audio reader
    output = sink()
    buf_cep = ringBuffer.ring_buffer(output, 2, 0, 13)
    cep = mfcc(buf_cep)
    buf_sig = ringBuffer.ring_buffer(cep, 200, 80)
    inp = audio_reader(buf_sig, "taaa.wav")
    next(inp)
def test_emotion(self, name):
    if name:
        value = name
    else:
        file = self.files.currentItem()
        print file.text()
        value = file.text().split(".")[0]
    fFile = "feat/Test/" + value + ".htk"
    self.emotion = test(fFile, self.Tardest, self.ubm_mu, self.ubm_cov, self.ubm_w)
    self.result = "Detected as: "
    self.repaint()
    start_play(self.emotion + "_dec.wav")
    self.state = "Say Yes or No"
    self.repaint()
    wFile = "Test_dec.wav"
    start_record(self, wFile, 3, self.state)
    fFile = "feat/Test/" + wFile + ".htk"
    try:
        # call the MFCC feature extraction subroutine
        f, E, fs = mfcc(wFile, self.winlen, self.ovrlen, self.pre_coef,
                        self.nfilter, self.nftt)
        # VAD part
        if self.opts == 1:
            # Energy-threshold-based VAD [comment this line out if you
            # would like to plug in the rVAD labels]
            f = vad_thr(f, E)
        elif self.opts == 0:
            # [Plug in the VAD label file generated by the rVAD matlab code]
            l = numpy.loadtxt('..corresponding vad label file')
            if (len(f) - len(l)) == 1:
                # 1-[end-frame] correction [matlab/python]
                l = numpy.append(l, l[-1:, ])
            elif (len(f) - len(l)) == -1:
                l = numpy.delete(l, -1)
            if (len(l) == len(f)) and (len(numpy.argwhere(l == 1)) != 0):
                idx = numpy.where(l == 1)
                f = f[idx]
            else:
                print "mismatch frames between label and feature files, or no voiced frame in VAD"
                exit()
        # Zero-mean unit-variance normalization after VAD
        f = cmvn(f)
        # write the VAD+normalized features to file
        if not os.path.exists(os.path.dirname(fFile)):
            # create the directory for the feature file
            os.makedirs(os.path.dirname(fFile))
        writehtk(fFile, f, 0.01)
    except Exception:
        print("Fail ..%s ---> %s\n" % (wFile, fFile))
    decision = test_decision(fFile, 'DEC3_Tau10.0', self.ubm_mu, self.ubm_cov, self.ubm_w)
    if decision == "YES":
        start_play(self.emotion + ".wav")
    self.result = ""
    self.emotion = ""
    self.state = ""
    self.repaint()
    return
def feature_fetch(target, dirname, fnames):
    for fname in fnames:
        if target in fname:
            with open(os.path.join(dirname, fname)) as f:
                signal, FS = waveio.wave_from_file(f)
                samples.append(mfcc.mfcc(signal, FS))
import os
import waveio
from mfcc import mfcc
from dtw import dtw
from time import time

voice_dir = "voiceMaterial"
templates = ["strawberry", "apple", "beef", "egg", "lemon", "mushroom",
             "noodle", "orange", "spaghetti", "spam", "watermelon"]
test = "mushroom"

# Get the MFCC for each template, using its 1st sample (read each file once)
print "Get MFCC for each template, begin..."
template_feature = {}
for template in templates:
    signal, fs = waveio.wave_from_file(os.path.join(voice_dir, template + "1.wav"))
    template_feature[template] = mfcc(signal=signal, fs=fs)
print "Get MFCC for each template, finish."

overhead = {}
score = {}


def compare(arg, dirname, fnames):
    for fname in fnames:
        if test in fname:
            # Get the MFCC for each test sample
            signal, fs = waveio.wave_from_file(os.path.join(dirname, fname))
            test_features = mfcc(signal=signal, fs=fs)
            # Record the overhead and score for each test sample versus each
            # template (each sample maps to a dict keyed by template).
            overhead[fname] = {}
            score[fname] = {}
            for template in template_feature.keys():
                time_start = time()
                score[fname][template] = dtw(template_feature[template], test_features)
                overhead[fname][template] = time() - time_start
import numpy as np
from scipy.io.wavfile import read
from vqlbg import EUDistance
from mfcc import mfcc
from train_id import training

nSpeaker = 1
nfiltbank = 20
codebooks = training(nfiltbank)
directory = 'test'
nCorrect_MFCC = 0


def minDistance(features, codebooks):
    # Return the index of the codebook with the smallest average minimum
    # distance to the feature vectors.
    speaker = 0
    distmin = np.inf
    for k in range(np.shape(codebooks)[0]):
        D = EUDistance(features, codebooks[k, :, :])
        dist = np.sum(np.min(D, axis=1)) / np.shape(D)[0]
        if dist < distmin:
            distmin = dist
            speaker = k
    return speaker


fname = '/s1.wav'
(fs, s) = read(directory + fname)
mel_coefs = mfcc(s, fs, nfiltbank)
sp_mfcc = minDistance(mel_coefs, codebooks)
print('Speaker matches speaker', sp_mfcc + 1)
def get_mfccs(index, paths=csv['path']):
    import mfcc as mf
    mf.set_param_values()
    mfccs = np.asarray([mf.mfcc(i) for i in paths[index]], dtype=np.float32)
    return mfccs
import argparse
import sys
import traceback

import librosa
import numpy as np

import mfcc

if __name__ == "__main__":
    parser = argparse.ArgumentParser()
    parser.add_argument("--input", default=None, type=str, help="Input path for mp3 file")
    parser.add_argument("--output", default=None, type=str, help="Output path for numpy array")
    args = parser.parse_args()
    try:
        audio, sr = librosa.load(args.input, sr=16000, mono=True, dtype=np.float32)
    except Exception as ex:
        print("Load audio failed:", ex)
        traceback.print_exc(file=sys.stdout)
        sys.exit(-1)
    # Convert the float signal in [-1, 1] to the 16-bit PCM range
    audio = (audio * 32768).astype(np.int16)
    output = mfcc.mfcc(sr, audio)
    np.save(args.output, output)
def train_model_on_sound(wav_list):
    gestures = {'vattene': 0, 'vieniqui': 1, 'perfetto': 2, 'furbo': 3,
                'cheduepalle': 4, 'chevuoi': 5, 'daccordo': 6, 'seipazzo': 7,
                'combinato': 8, 'freganiente': 9, 'ok': 10, 'cosatifarei': 11,
                'basta': 12, 'prendere': 13, 'noncenepiu': 14, 'fame': 15,
                'tantotempo': 16, 'buonissimo': 17, 'messidaccordo': 18,
                'sonostufo': 19}
    dataX = []
    for wav in wav_list:
        path = re.sub(r'_audio\.wav$', '', wav)
        print '\n', '##############'
        print path[-25:]
        sample = VideoMat(path, True)
        sk = Skelet(sample)
        rate, data = get_data(wav)
        labels = sample.labels
        coeff = data.shape[0] / sample.numFrames
        interval = get_interval(data, sample.numFrames)
        # comment out the next two lines to use the true interval data
        interval = interval_analysis(interval, sk)
        interval = [['', (beg, end)] for name, (beg, end) in interval if end - beg > 10]
        features_nb = 2588  # change to add the mspec and spec features
        for value in labels:
            if value != 0:
                name, (beg, end) = value
                for inter in interval:
                    name2, (beg2, end2) = inter
                    if beg2 > beg - 5 and end2 < end + 5:
                        space = end2 - beg2 - 9
                        limit = 40
                        data_interval = np.zeros(40 * coeff)
                        if space > limit:
                            data_interval = data[(beg2 - 1) * coeff:(beg2 + limit - 1) * coeff]
                        else:
                            data_interval[:space * coeff] = data[(beg2 - 1) * coeff:(end2 - 10) * coeff]
                        ceps, mspec, spec = mf.mfcc(data_interval)
                        data_tmp = np.zeros(features_nb)
                        data_tmp[0] = gestures[name]
                        data_tmp[1:2588] = ceps.reshape(2587)
                        # data_tmp[2588:10548] = mspec.reshape(7960)
                        # data_tmp[1273:13561] = spec.reshape(12288)
                        data_tmp = np.nan_to_num(data_tmp)
                        dataX.append(copy.copy(data_tmp))
                        break
    print len(dataX)
    data = np.asarray(dataX)
    Y = data[:, 0]
    X = data[:, 1:2588]
    X = X.clip(min=-100)
    clf = GradientBoostingClassifier(n_estimators=200, verbose=2, max_depth=7,
                                     min_samples_leaf=10, min_samples_split=20,
                                     random_state=0)
    clf = clf.fit(X, Y)
    pickle.dump(clf, open('gradient_boosting_model_sound.pkl', 'wb'))
    clf = RandomForestClassifier(n_estimators=300, criterion='entropy',
                                 min_samples_split=10, min_samples_leaf=1,
                                 verbose=2, random_state=1)  # n_jobs=2
    clf = clf.fit(X, Y)
    pickle.dump(clf, open('random_forest_model_sound.pkl', 'wb'))
    clf = ExtraTreesClassifier(n_estimators=300, min_samples_split=10,
                               min_samples_leaf=1, verbose=2, random_state=1)  # n_jobs=2
    clf = clf.fit(X, Y)
    pickle.dump(clf, open('extra_trees_model_sound.pkl', 'wb'))
RATE = 16000
RECORD_SECONDS = 2
CHUNKSIZE = 1024

# initialize portaudio
p = pyaudio.PyAudio()
stream = p.open(format=pyaudio.paInt16, channels=2, rate=RATE, input=True,
                frames_per_buffer=CHUNKSIZE)

frames = []
print "Record"
for _ in range(0, int(RATE / CHUNKSIZE * RECORD_SECONDS)):
    data = stream.read(CHUNKSIZE)
    frames.append(np.frombuffer(data, dtype=np.int16))  # fromstring is deprecated
print "Done recording"

numpydata = np.hstack(frames)
signal = numpydata.astype(np.int64)
mfcc_feat = mfcc.mfcc(signal, samplerate=RATE, winlen=0.03, winstep=0.01,
                      numcep=12, nfilt=15, lowfreq=300, highfreq=3700)
print mfcc_feat.shape

# close the stream
stream.stop_stream()
stream.close()
p.terminate()
#~~~~~~~~~~~~~
# Real world demo
#~~~~~~~~~~~~~
from time import time

M_set = [64, 128, 256]
for M in M_set:
    print "Generating VQ with M == %d" % M
    timer = time()
    # Train the VQ
    mu, clusters = vq_generator(dirname="/home/magodo/code/voiceMaterial/word",
                                M=M, return_clusters=True)
    # Store the trained VQ: a tuple of mu (a list of M codebook vectors) and
    # clusters (a dict of M vector sets, one per codebook vector)
    import pickle
    with open("VQ.pkl" + "-%d" % M, "wb") as f:
        pickle.dump((mu, clusters), f)
    train_time = (time() - timer) / 60.0
    print "Used %f minutes" % train_time

# Apply the trained VQ to a demo.wav
signal, FS = waveio.wave_from_file("demo.wav")
feature = list(mfcc(signal, FS))
vq_array = []
for i in feature:
    vq_array.append(lbg.cluster_point(i, mu))
print vq_array