def load_data_shared(ind):
    # Training and testing data
    timit_data_train = genfromtxt('timit_data_1280_train.csv', delimiter=',')
    timit_vwlname_train = genfromtxt('timit_vwlname_1280_train.csv', delimiter=',')
    timit_vwlname_train[:] = [x - 1 for x in timit_vwlname_train]
    timit_data_test = genfromtxt('timit_data_1280_test.csv', delimiter=',')
    timit_vwlname_test = genfromtxt('timit_vwlname_1280_test.csv', delimiter=',')
    timit_vwlname_test[:] = [x - 1 for x in timit_vwlname_test]

    fs = 16000
    datalen = 1280
    narr = np.array([13, 26, 39])  # Number of features in each frame
    i = 0
    j = 0

    trainfeature = np.zeros((len(timit_data_train), (datalen*100/fs - 1)*narr[ind]))
    for x in timit_data_train:
        fbank_flat = logfbank(x, fs).flatten()
        mfcc_flat = mfcc(x, fs).flatten()
        if ind == 0:
            trainfeature[i, :] = mfcc_flat
        elif ind == 1:
            trainfeature[i, :] = fbank_flat
        else:
            trainfeature[i, :] = np.concatenate((mfcc_flat, fbank_flat))
        i = i + 1

    testfeature = np.zeros((len(timit_data_test), (datalen*100/fs - 1)*narr[ind]))
    for x in timit_data_test:
        fbank_flat = logfbank(x, fs).flatten()
        mfcc_flat = mfcc(x, fs).flatten()
        if ind == 0:
            testfeature[j, :] = mfcc_flat
        elif ind == 1:
            testfeature[j, :] = fbank_flat
        else:
            testfeature[j, :] = np.concatenate((mfcc_flat, fbank_flat))
        j = j + 1

    training_data = (trainfeature, timit_vwlname_train)
    test_data = (testfeature, timit_vwlname_test)
    # For now, I am using test data as validation data. Should change later.
    validation_data = test_data

    def shared(data):
        """Place the data into shared variables. This allows Theano to copy
        the data to the GPU, if one is available.
        """
        shared_x = theano.shared(
            np.asarray(data[0], dtype=theano.config.floatX), borrow=True)
        shared_y = theano.shared(
            np.asarray(data[1], dtype=theano.config.floatX), borrow=True)
        return shared_x, T.cast(shared_y, "int32")

    return [shared(training_data), shared(validation_data), shared(test_data)]
def svm_baseline():
    #### Change here
    ind = 0  # 0 for mfcc, 1 for filterbank, 2 for both
    narr = np.array([13, 26, 39])  # corresponding length of feature in a frame

    # Training and testing data
    timit_data_train = genfromtxt('timit_data_1280_train.csv', delimiter=',')
    timit_vwlname_train = genfromtxt('timit_vwlname_1280_train.csv', delimiter=',')
    timit_vwlname_train[:] = [x - 1 for x in timit_vwlname_train]
    timit_data_test = genfromtxt('timit_data_1280_test.csv', delimiter=',')
    timit_vwlname_test = genfromtxt('timit_vwlname_1280_test.csv', delimiter=',')
    timit_vwlname_test[:] = [x - 1 for x in timit_vwlname_test]

    fs = 16000
    datalen = 1280
    i = 0
    j = 0

    trainfeature = np.zeros((len(timit_data_train), (datalen*100/fs - 1)*narr[ind]))
    for x in timit_data_train:
        fbank_flat = logfbank(x, fs).flatten()
        mfcc_flat = mfcc(x, fs).flatten()
        if ind == 0:
            trainfeature[i, :] = mfcc_flat
        elif ind == 1:
            trainfeature[i, :] = fbank_flat
        else:
            trainfeature[i, :] = np.concatenate((mfcc_flat, fbank_flat))
        i = i + 1

    testfeature = np.zeros((len(timit_data_test), (datalen*100/fs - 1)*narr[ind]))
    for x in timit_data_test:
        fbank_flat = logfbank(x, fs).flatten()
        mfcc_flat = mfcc(x, fs).flatten()
        if ind == 0:
            testfeature[j, :] = mfcc_flat
        elif ind == 1:
            testfeature[j, :] = fbank_flat
        else:
            testfeature[j, :] = np.concatenate((mfcc_flat, fbank_flat))
        j = j + 1

    training_data = (list(trainfeature), timit_vwlname_train)
    test_data = (list(testfeature), timit_vwlname_test)

    # train
    clf = svm.SVC()
    clf.fit(training_data[0], training_data[1])

    # test
    predictions = [int(a) for a in clf.predict(test_data[0])]
    num_correct = sum(int(a == y) for a, y in zip(predictions, test_data[1]))
    print "Using svm_baseline classifier:"
    print "%s of %s values correct. %s percent " % (
        num_correct, len(test_data[1]), (num_correct*100)/len(test_data[1]))
def getFeatures(signal, rate):
    """ Extracts Important Vocal Features
        author: chris
    """
    if signal.shape[0] > mem_cut_off:
        mfcc, fbank = getFeatures(signal[mem_cut_off:], rate)
        return (np.concatenate((fs.mfcc(signal[:mem_cut_off], rate), mfcc)),
                np.concatenate((fs.logfbank(signal, rate), fbank)))
    else:
        return fs.mfcc(signal, rate), fs.logfbank(signal, rate)
def cal_bic(wavfilename, sadfilename):
    sample_rate, wav = wavfile.read(wavfilename)
    mfcc_feat = features.mfcc(wav, sample_rate)
    ref = getsad_ref(sadfilename)
    an_win_mfcc = mfcc_cut_vad_an_win(mfcc_feat, ref)
    [time, bic_value] = bic(an_win_mfcc)
    return time, bic_value
def fill(self, class_id):
    """Fills the internal structure with new training samples. Do not call directly.

    :param class_id: class identification
    """
    # get training samples
    for i in range(len(self.sel_files)):
        row = self.sel_files[0]
        samples = self.all_files[row].samples
        feat = mfcc(samples, 16000, winlen=0.030, appendEnergy=False, VAD=simpleVAD)
        # add two feature vectors from the middle
        self.X.append(feat[int(len(feat) / 2 - 1)])
        self.y.append(class_id)
        self.X.append(feat[int(len(feat) / 2 + 1)])
        self.y.append(class_id)
        # clear from the list
        del_iter = self.file_store.get_iter(Gtk.TreePath.new_from_indices([row]))
        self.file_store.remove(del_iter)
        del self.all_files[row]
    # print results
    if Classifier.new_training(self.X, self.y):
        self.status_label.set_text("{0} samples, {1} classes".format(len(self.X), len(self.class_names)))
    else:
        self.status_label.set_text("The classifier needs more classes")
def extract_mfcc_features(signal, win_len=0.0232, win_overlap=0.5, n_mel_bands=40,
                          n_coefs=25, fs=48000, nfft=1024):
    """ Return a feature vector for a one-channel signal.

    Returns the same features as the ones defined in the paper
        Salamon, J., Jacoby, C., & Bello, J. (2014). A Dataset and Taxonomy for
        Urban Sound Research. ACM International Conference on Multimedia, (3).
        doi:10.1145/2647868.2655045

    :param signal: one dimension array
    :param win_len: length of window to split the signal into
    :param win_overlap: overlap over window, 1 > win_overlap >= 0
    :param n_mel_bands: numbers of mel bands to use
    :param n_coefs: number of dct coefs to return
    :param fs: signal sampling rate
    :return: a dict of feature arrays
    """
    win_step = win_len * win_overlap  # 50%
    features = {}
    res = mfcc(signal, samplerate=fs, winlen=win_len, winstep=win_step,
               nfilt=n_mel_bands, lowfreq=0, highfreq=22050, numcep=n_coefs,
               nfft=nfft)
    ## TODO: revisit nfft. I am not sure what it corresponds to for the mel
    ## filterbank; the paper does not mention it. Above all, check that it
    ## actually works with this nfft and fs.
    #print("fs {}, signal.shape {}".format(fs, signal.shape))
    #print(res.shape)
    features["minimum"] = np.min(res, axis=0)
    features["maximum"] = np.max(res, axis=0)
    features["median"] = np.median(res, axis=0)
    features["mean"] = np.mean(res, axis=0)
    features["variance"] = np.var(res, axis=0)
    features["skewness"] = scipy.stats.skew(res, axis=0)
    features["kurtosis"] = scipy.stats.kurtosis(res, axis=0)
    features["mean_first_diff"] = np.mean(np.diff(res, axis=0), axis=0)
    features["variance_first_diff"] = np.var(np.diff(res, axis=0), axis=0)
    features["mean_second_diff"] = np.mean(np.diff(res, axis=0, n=2), axis=0)
    features["var_second_diff"] = np.var(np.diff(res, axis=0, n=2), axis=0)
    return features
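# Hedged usage sketch for extract_mfcc_features above (not part of the original
# code): 'example.wav' is a hypothetical mono 48 kHz clip, and scipy plus
# python_speech_features are assumed to be installed.
from scipy.io import wavfile

fs, signal = wavfile.read('example.wav')
if signal.ndim > 1:
    signal = signal[:, 0]  # the function expects a one-dimensional array
feats = extract_mfcc_features(signal, fs=fs)
print(sorted(feats.keys()))  # minimum, maximum, median, mean, variance, ...
print(feats['mean'].shape)   # (n_coefs,) - one value per cepstral coefficient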
def compute_mfcc(sig, rate, winlen=0.025, winstep=0.01, numcep=12, nfilt=26,
                 nfft=512, lowfreq=0, highfreq=None, preemph=0.97, ceplifter=22,
                 include_energy=True, snip_edges=True):
    if snip_edges:
        # snip the edges
        sig = snip(sig, rate, winlen, winstep)
    return mfcc(sig, rate, winlen, winstep, numcep, nfilt, nfft, lowfreq,
                highfreq, preemph, ceplifter, include_energy)
def features_from_base(basepath, order=0):
    (females, males) = read_speakers(basepath)

    # list of lists (sorted)
    female_utterances_list = [read_utterances(basepath, female) for female in females]
    male_utterances_list = [read_utterances(basepath, male) for male in males]

    # utterances as Wave objects
    female_utterances_list = read_utterances_files(basepath, female_utterances_list, 'f')
    male_utterances_list = read_utterances_files(basepath, male_utterances_list, 'm')

    for utterances in female_utterances_list:
        for utterance in utterances:
            uttMFCCs = features.mfcc(utterance.signal,
                                     samplerate=utterance.sample_rate,
                                     numcep=19,
                                     highfreq=utterance.sample_rate / 2)
            if (order > 0):
                uttMFCCs = features.appendDeltasAllFrames(uttMFCCs, order)
            print(uttMFCCs.shape)
            print(uttMFCCs)
            print()
def run_tests(test_files):
    # Classify input data
    for test_file in test_files:
        # Read input file
        sampling_freq, signal = wavfile.read(test_file)

        # Extract MFCC features
        with warnings.catch_warnings():
            warnings.simplefilter('ignore')
            features_mfcc = mfcc(signal, sampling_freq)

        # Define variables
        max_score = -float('inf')
        predicted_label = None

        # Run the current feature vector through all the HMM
        # models and pick the one with the highest score
        for item in speech_models:
            model, label = item
            score = model.compute_score(features_mfcc)
            if score > max_score:
                max_score = score
                predicted_label = label

        # Print the predicted output
        start_index = test_file.find('/') + 1
        end_index = test_file.rfind('/')
        original_label = test_file[start_index:end_index]
        print('\nOriginal: ', original_label)
        print('Predicted:', predicted_label)
def vector_quantize(myfiles, outdir, model):
    # given a list of files, transform them to spectral vectors and compute the KMeans VQ
    for f in myfiles:
        print "Quantizing: ", f
        (rate, sig) = wav.read(f)
        #print rate, sig.shape
        # get the spectral vectors
        mfcc_feat = mfcc(sig, rate)
        #print mfcc_feat.shape
        fbank_feat = mfcc_feat  #logfbank(sig,rate)  # this has the spectral vectors now
        #print fbank_feat.shape
        val = model.predict(fbank_feat)

        #fcomps = os.path.split(f)  # file components: path, filename
        fcomps = f.split("/")
        fn = fcomps[-2] + "/" + fcomps[-1].split('.')[0] + '_vq.txt'
        #outpath = os.path.join(fcomps[0], 'outputs')
        fn = os.path.join(outdir, fn)
        d = os.path.dirname(fn)
        if not os.path.exists(d):
            os.makedirs(d)
        #print fn
        #val = trim_background(val)
        #raw_input("enter...")
        f = open(fn, 'wb')
        for v in val:
            f.write(str(v) + '\n')
        f.close()
        print 'output vector quantized file: ', f, ' written'
    return
def training():
    '''
    Takes an input signal and searches the current dataset for a hit.
    If hit, adds it to the correct dataset.
    If miss, asks the user for the correct word and adds it to the dataset.
    '''
    print("please speak a word into the microphone")
    record_to_file('training.wav')
    print("done - result written to training.wav")

    (rate, sig) = wav.read("training.wav")
    mfcc_feat = mfcc(sig, rate)
    fbank_feat = logfbank(sig, rate)
    recording = fbank_feat[1:3, :]

    testing = check_for_match(recording)
    verify = raw_input("did you say " + testing + " ")
    if verify == 'y':
        parse_array(recording, testing)
    if verify == 'n':
        correct_word = raw_input("what word did you mean? ")
        print correct_word
        parse_array(recording, correct_word)
def generate_testing_mfccs(myfiles, outdir):
    for f in myfiles:
        print "Generating MFCCs for: ", f
        (rate, sig) = wav.read(f)
        #print rate, sig.shape
        # get the spectral vectors
        mfcc_feat = mfcc(sig, rate)
        #mfcc_feat = scaler.transform(mfcc_feat)

        #fcomps = os.path.split(f)  # file components: path, filename
        fcomps = f.split("/")
        fn = fcomps[-2] + "/" + fcomps[-1].split('.')[0] + '_mfcc.txt'
        #outpath = os.path.join(fcomps[0], 'outputs')
        fn = os.path.join(outdir, fn)
        d = os.path.dirname(fn)
        if not os.path.exists(d):
            os.makedirs(d)

        f = open(fn, 'wb')
        final_mfccs_str = ""
        for vector in mfcc_feat:
            str_mfcc = ""
            for element in vector:
                str_mfcc += str(element) + ","
            str_mfcc = str_mfcc[:-1]
            final_mfccs_str += str_mfcc + "\n"
        f.write(final_mfccs_str)
        f.close()
        print 'output MFCC file: ', f, ' written'
    return
def __init__(self, filename):
    self.filename = filename
    self.frequency, self.sound = wavfile.read(wavFilesPath + filename)
    self.channel1 = self.sound[:, 0]
    self.channel2 = self.sound[:, 1]
    self.duration = len(self.sound) / self.frequency
    self.mfccFeatures = mfcc(self.sound, self.frequency)
def extractLow(signal):
    return mfcc(signal, samplerate=SAMPLING_RATE, winlen=LO_FRAME_DURATION,
                winstep=LO_FRAME_STEP, numcep=NUM_CEPTRUM, appendEnergy=True)
def get_data(rootdir=TIMIT_main_dir):
    inputs = []
    targets = []
    for dir_path, sub_dirs, files in os.walk(rootdir):
        for file in files:
            if (os.path.join(dir_path, file)).endswith('.wav'):
                wav_file_name = os.path.join(dir_path, file)
                input_data, f_s = sf.read(wav_file_name)
                # mfcc_feat = MFCC_input(mfcc(input_data,f_s))
                mfcc_feat = mfcc(input_data, f_s)
                # Delta features
                delta_feat = mfcc_feat[:-1] - mfcc_feat[1:]
                # Delta-Delta features
                deltadelta_feat = delta_feat[:-1] - delta_feat[1:]
                # Removing the first two frames
                mfcc_feat = mfcc_feat[2:]
                delta_feat = delta_feat[1:]
                # Concatenating mfcc, delta and delta-delta features
                full_input = np.concatenate((mfcc_feat, delta_feat, deltadelta_feat), axis=1)
                inputs.append(np.asarray(full_input, dtype=theano.config.floatX))  # Rakeshvar wants one frame along each column but I am using Lasagne

                text_file_name = wav_file_name[:-4] + '.txt'
                target_data_file = open(text_file_name)
                target_data = str(target_data_file.read()).lower().translate(None, '!:,".;?')
                # target_data = str(target_data_file.read()).lower().translate(str.maketrans('', '', '!:,".;?'))
                target_data = target_data[8:-1]  # No '.' in lexfree dictionary
                targets.append(target_data)
    return inputs, targets
def shifted_delta_cepstra(self, wav_fn, delta=1, shift=3, k_conc=3):
    """
    :param delta: represents the time advance and delay for the sdc
           k_conc: is the number of blocks whose delta coefficients are concd
           shift: is the time shift between consecutive blocks

    Shifted delta cepstra are feature vectors created by concatenating delta
    cepstra computed across multiple speech frames.  See the paper
        PA Torres-Carrasquillo et al (2002)
        Approaches to language identification using Gaussian mixture models
        and shifted delta cepstral features.
    """
    (rate, sig) = wav.read(wav_fn)
    mfcc_feats = mfcc(sig, rate)
    # len(mfcc) == 39 == 3 * (12 cepstral + 1 energy)
    # TODO include original cepstra as well?
    delta_feats = mfcc_feats[delta:] - mfcc_feats[:-delta]
    output_duration = delta_feats.shape[0] - shift * k_conc
    shifted = np.zeros((output_duration, (k_conc + 1) * delta_feats.shape[1]))
    mfcc_dim = mfcc_feats.shape[1]
    shifted[:, 0:mfcc_dim] = mfcc_feats[:output_duration]
    for i in xrange(output_duration):
        shifted[i, mfcc_dim:] = delta_feats[i:i + k_conc * shift:shift, :].reshape((1, -1))
    logger.debug('{} --> {}'.format(mfcc_feats.shape, shifted.shape))
    return shifted
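# Illustrative shape check for shifted_delta_cepstra above (an assumption-laden
# sketch on synthetic data, not part of the original code): with the defaults
# delta=1, shift=3, k_conc=3 and 13-dimensional MFCC frames, each output frame
# concatenates the plain cepstra with k_conc delta blocks, giving
# (k_conc + 1) * 13 = 52 columns, while the usable frame count shrinks by
# delta + shift * k_conc.
import numpy as np

n_frames, n_ceps = 100, 13
mfcc_feats = np.random.randn(n_frames, n_ceps)  # stand-in for mfcc(sig, rate)
delta, shift, k_conc = 1, 3, 3
delta_feats = mfcc_feats[delta:] - mfcc_feats[:-delta]
output_duration = delta_feats.shape[0] - shift * k_conc
print(output_duration, (k_conc + 1) * n_ceps)   # 90 52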
def build_codebook(trgfile, codesize=32, fname=None):
    # given a training file, constructs the codebook using kmeans
    #print "Codesize is ", codesize
    (rate, sig) = wav.read(trgfile)
    print rate, sig.shape
    # get the spectral vectors
    print("MFCC generation begins")
    mfcc_feat = mfcc(sig, rate)
    print("MFCC generation ends")
    print mfcc_feat.shape
    #sys.exit(0)  # debug-only early exit; must stay disabled or no codebook is ever built
    #print("Fbank creation begins")
    #fbank_feat = logfbank(sig,rate)  # this has the spectral vectors now
    #print("Fbank creation ends")
    #print fbank_feat.shape
    print "codesize = ", codesize
    km = KMeans(n_clusters=codesize)
    #km.fit(fbank_feat)
    km.fit(mfcc_feat)
    if fname is not None:
        pickle.dump(km, open(fname, 'wb'))
    return km
def main():
    if len(sys.argv) < 2:
        sys.stderr.write('Usage: python ' + sys.argv[0] + ' lang_test.wav')
        sys.exit(1)
    file = sys.argv[1]

    languages = pickle.load(open('languages.dat', 'r'))

    (rate, sig) = wav.read(file)  # returns (sample rate, numpy.ndarray of samples)
    mfcc_feat = mfcc(sig, rate)
    mfccs_deltas = recognizer_util.get_deltas(mfcc_feat, 0)
    mfccs_deltas_ddeltas = recognizer_util.get_deltas(mfccs_deltas, 13)
    test_avg = recognizer_util.col_avg(mfccs_deltas_ddeltas)

    results = {}
    for language in languages.keys():
        dist = get_distance(languages[language], test_avg)
        results[dist] = language

    sorted = results.keys()
    sorted.sort()
    print
    language = results[sorted[0]]
    sys.stdout.write(language)
    print
def _gen_features(self, data_dir, outfile):
    """ Generates a csv file containing labeled lines for each speaker """
    with open(outfile, 'w') as ohandle:
        melwriter = csv.writer(ohandle)
        speakers = os.listdir(data_dir)

        for spkr_dir in speakers:
            for soundclip in os.listdir(os.path.join(data_dir, spkr_dir)):
                # generate mel coefficients for the current clip
                clip_path = os.path.abspath(os.path.join(data_dir, spkr_dir, soundclip))
                sample_rate, data = wavfile.read(clip_path)
                ceps = mfcc(data, sample_rate)

                # write an entry into the training file for the current speaker;
                # the vector stored in the csv file has the speaker's name at the end
                fvec = self._mfcc_to_fvec(ceps)
                fvec.append(spkr_dir)
                logging.debug(fvec)  # see the numbers [as if they make sense]

                # write one row to the csv file
                melwriter.writerow(fvec)
def create_mfcc(method, filename):
    """Perform standard preprocessing, as described by Alex Graves (2012)
    http://www.cs.toronto.edu/~graves/preprint.pdf

    Output consists of 12 MFCC and 1 energy, as well as the first derivative
    of these: [1 energy, 12 MFCC, 1 diff(energy), 12 diff(MFCC)]

    method is a dummy input!!
    """
    (rate, sample) = wav.read(filename)
    mfcc = features.mfcc(sample, rate, winlen=0.025, winstep=0.01, numcep=13,
                         nfilt=26, preemph=0.97, appendEnergy=True)
    derivative = np.zeros(mfcc.shape)
    for i in range(1, mfcc.shape[0] - 1):
        derivative[i, :] = mfcc[i + 1, :] - mfcc[i - 1, :]

    out = np.concatenate((mfcc, derivative), axis=1)
    return out, out.shape[0]
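# Hedged usage sketch for create_mfcc above (not part of the original code):
# 'utterance.wav' is a hypothetical recording, and python_speech_features is
# assumed to be importable as `features` with scipy.io.wavfile as `wav`, as in
# the surrounding code. The output has 26 columns laid out as
# [1 energy, 12 MFCC, 1 diff(energy), 12 diff(MFCC)].
out, n_frames = create_mfcc('unused_method', 'utterance.wav')
print(out.shape)                  # (n_frames, 26)
print(n_frames == out.shape[0])   # True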
def compute_features(filename):
    fs, audio_array = wav.read(filename)
    mfcc_25 = mfcc(audio_array, samplerate=fs, winlen=0.064, winstep=0.032,
                   numcep=25, nfilt=40, nfft=512, lowfreq=0, highfreq=fs/2,
                   preemph=0, ceplifter=0, appendEnergy=True)
    first = np.diff(mfcc_25, axis=0)
    second = np.diff(first, axis=0)

    minimum = np.amin(mfcc_25, axis=0)
    maximum = np.amax(mfcc_25, axis=0)
    median = np.median(mfcc_25, axis=0)
    mean = np.mean(mfcc_25, axis=0)
    variance = np.var(mfcc_25, axis=0)
    skewness = scipy.stats.skew(mfcc_25, axis=0)
    kurtosis = scipy.stats.kurtosis(mfcc_25, axis=0)
    first_mean = np.mean(first, axis=0)
    first_variance = np.var(first, axis=0)    # do not overwrite `variance` here
    second_mean = np.mean(second, axis=0)
    second_variance = np.var(second, axis=0)  # likewise, keep this separate from `variance`

    features = np.concatenate((minimum, maximum, median, mean, variance,
                               skewness, kurtosis, first_mean, first_variance,
                               second_mean, second_variance), axis=0)
    return features
def vector_quantize(myfiles, outdir, model):
    # given a list of files, transform them to spectral vectors and compute the KMeans VQ
    for f in myfiles:
        print "Quantizing: ", f
        (rate, sig) = wav.read(f)
        #print rate, sig.shape
        # get the spectral vectors
        mfcc_feat = mfcc(sig, rate)
        #print mfcc_feat.shape
        fbank_feat = mfcc_feat  #logfbank(sig,rate)  # this has the spectral vectors now
        #print fbank_feat.shape
        val = model.predict(fbank_feat)

        #fcomps = os.path.split(f)  # file components: path, filename
        fcomps = f.split("/")
        fn = fcomps[-2] + "/" + fcomps[-1].split('.')[0] + '_vq.txt'
        #outpath = os.path.join(fcomps[0], 'outputs')
        fn = os.path.join(outdir, fn)
        d = os.path.dirname(fn)
        if not os.path.exists(d):
            os.makedirs(d)
        #print fn
        val = trim_background(val)
        #raw_input("enter...")
        f = open(fn, 'wb')
        for v in val:
            f.write(str(v) + '\n')
        f.close()
        print 'output vector quantized file: ', f, ' written'
    return
def feature_extract_mfcc(self, sound, rate):
    """ Extract all the features used for training - frequency domain: MFCC, pitch.

    :return:
    """
    reg = re.compile('(\d+)-(\d+)-(\d+).wav')
    #plotter.plot_frame(sound, show=True)
    mfcc0 = mfcc(sound.reshape(1, -1), rate, winlen=cfg.frame, winstep=cfg.step,
                 nfft=1536, winfunc=np.hamming)
    mfcc0 = mfcc0 - np.mean(mfcc0)
    mfcc1 = delta(mfcc0, 3)
    mfcc2 = delta(mfcc1, 3)
    mfcc0 = scale(mfcc0)
    '''
    if audio in ['01','00']:
        print(filename)
        plotter.plot_mfcc(mfcc0,'311')
        plotter.plot_mfcc(mfcc1,'312')
        plotter.plot_mfcc(mfcc2,'313')
        plotter.show()
    '''
    return (mfcc0, mfcc1, mfcc2), min(len(mfcc0), 200)
def compare(control_path, exp_path):
    """
    Compares two wav files and returns a score.
    Uses mel frequency cepstrum coefficients as well as dynamic time warping.

    :param control_path: the 'correct' wav - what you are comparing to
    :param exp_path: the unknown wav
    """
    (rate, sig) = wavread(control_path)
    (rate2, sig2) = wavread(exp_path)
    x = mfcc(sig, rate)
    y = mfcc(sig2, rate2)
    dist, cost, acc = dtw.dtw(x, y, dist=lambda x, y: dtw.norm(x - y, ord=1))
    return dist
def read_data(files_amount, total_length, nc=13, path=''):
    for i in range(files_amount):
        (rate, sig) = wav.read(path + str(i) + '.wav')
        mfcc_feat = mfcc(sig, rate, numcep=nc)
        mfcc_feat = np.reshape(mfcc_feat, (len(mfcc_feat)*nc, 1))

        # replace any NaN/inf coefficients with zero
        if any(np.isnan(mfcc_feat)) or any(np.isinf(mfcc_feat)):
            ind = [x for x in range(len(mfcc_feat))
                   if np.isnan(mfcc_feat[x]) or np.isinf(mfcc_feat[x])]
            for x in ind:
                mfcc_feat[x] = 0

        if i == 0:
            mfcc_data = mfcc_feat
        if i != 0:
            if len(mfcc_feat) == total_length:
                mfcc_data = np.hstack((mfcc_data, mfcc_feat))
            else:
                if len(mfcc_feat) > total_length:
                    # truncate to the common length
                    mfcc_data = np.hstack((mfcc_data, mfcc_feat[:total_length]))
                else:
                    # zero-pad up to the common length
                    xx = np.vstack((mfcc_feat,
                                    np.reshape(np.asarray([0] * (total_length - len(mfcc_feat))),
                                               (total_length - len(mfcc_feat), 1))))
                    mfcc_data = np.hstack((mfcc_data, xx))
    return mfcc_data
def crossover(playlist_1, playlist_2, playlist_size):
    # Crosses over playlists
    global all_playlists
    #print "Playlist_size: ", playlist_size
    one = all_playlists[playlist_1]
    two = all_playlists[playlist_2]
    child = Playlist(playlist_size)
    one_percentage = (one.fitness / float(one.fitness + two.fitness))
    #print "One %: ", one_percentage
    one_genes = int(floor(playlist_size * one_percentage))
    print "One genes: ", one_genes
    one_copy = copy.deepcopy(one)
    two_copy = copy.deepcopy(two)
    # Get genes from first parent
    for i in range(one_genes):
        #if len(one_copy.songs) <= 1:
        #    two_genes += 1
        #    break
        all_songs.append(file)
        woteva = mfcc(sig, rate)
        woteva = reduce_matrix(woteva, 500)
        return woteva
def make_mean_mfcc(filename):
    try:
        (rate, sig) = wav.read(filename)
        mfcc_feat = mfcc(sig, rate)
        avg_mfcc = np.mean(mfcc_feat, axis=0)
        return avg_mfcc
    except:
        pass
def predict(self, signal, fs=44100):
    if len(signal.shape) > 1:
        signal = signal[:, 0]
    signal_new = remove_silence(fs, signal)
    # if len(signal_new) < len(signal) / 4:
    #     return "Silence"
    mfcc_vecs = mfcc(signal_new, fs, numcep=15)
    return self.predict_feat(mfcc_vecs)
def readSegFeat(start_t, end_t, signal, sr):
    try:
        sig = signal[int(sr * start_t):int(sr * end_t)]
    except:
        sig = signal[int(sr * start_t):-1]
    cleansig = remove_silence(sr, sig)
    mfcc_vecs = mfcc(cleansig, sr, numcep=15)
    return mfcc_vecs
def extract_feats(signal, sr):
    feats = mfcc(signal, sr)
    #fbank_feat = logfbank(signal, sr, nfilt=17)
    #feats = np.hstack((mfcc_feat, fbank_feat))
    mu = np.mean(feats, axis=0)
    sigma = np.std(feats, axis=0)
    feature = (feats - mu) / sigma
    return feature
def addMFCC(data_dict):
    for name in data_dict:
        data_dict[name]['mfcc'] = []
        audio_path = data_dict[name]['raw']
        sr, WAV = wav.read(audio_path)
        MFCC = mfcc(WAV, sr)
        data_dict[name]['mfcc'].append(MFCC)
    return data_dict
def extract_mfcc():
    # find all .wav files in the audio directory
    audioDir = "audio/"
    if not os.path.exists(audioDir):
        os.makedirs(audioDir)
        print "The audio directory did not exist!"
    audioFiles = []
    for file in os.listdir(audioDir):
        if fnmatch.fnmatch(file, '*.wav'):
            audioFiles.append(file)

    # report the number of .wav files found in the audio directory
    print ""
    print "Found %d audio recordings in the %s directory!" % (len(audioFiles), audioDir)
    print ""

    # loop over every .wav file inside the audio directory
    for x in range(0, len(audioFiles)):
        # name of the .wav recording, of the .png plot, of the .txt dump of the
        # recording, and of the directory that stores all of them
        filename = audioFiles[x]
        floatFile = filename.split(".")[0] + ".txt"
        directory = audioDir + filename.split(".")[0]

        # check whether the file and directories exist
        if not os.path.isfile(audioDir + filename):
            sys.exit("File does not exist!")
        if not os.path.exists(directory):
            os.makedirs(directory)
        if os.path.exists(directory):
            shutil.rmtree(directory)
            os.makedirs(directory)

        # read the wav file and compute the MFCCs
        (rate, sig) = wav.read(audioDir + filename)
        mfcc_feat = mfcc(sig, rate, winlen=0.025, winstep=0.01, numcep=13, preemph=0.99)

        # report each processed .wav file
        print "MFCC features computed for %s!" % filename

        # save the signal as floats and the MFCC features to .txt files
        np.savetxt(directory + "/" + floatFile, sig, fmt="%.4f")
        np.savetxt(directory + "/mfcc_features.txt", mfcc_feat, fmt="%.16f", delimiter=",")

    # report the number of .wav files from which MFCCs were extracted
    print "MFCC features computed for %d files in total!" % len(audioFiles)
    return 1
def feature_extract(wav_name, winlen=0.025, winstep=0.01):
    """This function returns (mfcc) feature vectors extracted from wav_name"""
    rate, signal = wav.read(wav_name)
    # average the channels down to mono
    signal = numpy.sum(signal, axis=1) / signal.shape[1]
    signal = sigproc.framesig(signal, rate * winlen, rate * winstep)
    signal = vad.vad_filter(signal)
    signal = sigproc.deframesig(signal, 0, rate * winlen, rate * winstep)
    mfcc_feat = mfcc(signal, rate)
    return mfcc_feat
def source_save(self):
    for key, value in self.train_file_map.items():
        wav = wave.open(key)
        rate = wav.getframerate()

        # The binary file needs to be munged because it is 2-channel,
        # encoded as 24-bit signed integer Pulse Code Modulation (PCM)
        # with 44.1 kHz sampling.
        # The initial way I was hoping to munge the raw byte data:
        #nframes = wav.getnframes()
        #buf = wav.readframes(nframes)
        # data is 24 bits in 3 bytes. np.int24 does not exist!
        #dt = np.dtype(np.int24)
        # data is in little-endian format
        #dt = dt.newbyteorder('<')
        #sig = np.frombuffer(buf, dtype=dt)
        # numpy doesn't support int24 yet, so had to use this:
        # http://stackoverflow.com/questions/12080279/how-do-i-create-a-numpy-dtype-that-includes-24-bit-integers
        rawdatamap = np.memmap(key, dtype=np.dtype('u1'), mode='r')
        usablebytes = rawdatamap.shape[0] - rawdatamap.shape[0] % 12
        frames = int(usablebytes / 12)
        rawbytes = rawdatamap[:usablebytes]
        # This line is the difficult part which required stackoverflow: it views the data as
        # 32-bit integers, but because it is actually 24-bit data, the extra byte of each
        # value is redundant.
        realdata = as_strided(rawbytes.view(np.int32), strides=(12, 3,), shape=(frames, 2))
        # This ANDs the bits with a mask of the last 24 bits, to get rid of the redundant data
        sig = realdata & 0x00ffffff

        # mfcc is the mel frequency cepstral coefficient
        # http://practicalcryptography.com/miscellaneous/machine-learning/guide-mel-frequency-cepstral-coefficients-mfccs/
        # mfcc_feat needs to be stored in MongoDB; it is a numpy array that is 5999 in length.
        # Each audio file is a scene which is being classified; one of the feature vectors
        # used to classify the scene is the mfcc_feat array.
        # mfcc will return an array that is 5999 rows by 13 columns.
        # Each column is a feature vector for training the classifier for that audio sample's
        # class (i.e. tram, park).
        # The window length for analysis is 0.025 seconds,
        # the window step between windows is 0.01 seconds.
        # This is the entire array of feature vectors for each audio sample.
        # Additional feature vectors might be added later, but this is good for initial tests.
        mfcc_feat = mfcc(sig, samplerate=rate)

        # Insert records into mongodb
        self.insert_mongo(self.mfcc_fv, mfcc_feat, key, value)
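# Tiny self-contained illustration of the 24-bit unpacking trick used in
# source_save above (a sketch, not part of the original code; it assumes the
# same little-endian, 3-bytes-per-sample, 2-channel layout). Four samples with
# values 1, 2, 3, 4 are packed by hand; the strided int32 view plus the
# 0x00ffffff mask recovers the first stereo frame of each 12-byte block.
import numpy as np
from numpy.lib.stride_tricks import as_strided

rawbytes = np.array([1, 0, 0, 2, 0, 0, 3, 0, 0, 4, 0, 0], dtype=np.uint8)
frames = rawbytes.shape[0] // 12
realdata = as_strided(rawbytes.view(np.int32), strides=(12, 3), shape=(frames, 2))
print(realdata & 0x00ffffff)  # [[1 2]]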
def _extract_mfcc(filename):
    """Extracts mfccs from wav files"""
    savename = filename[0:len(filename) - 4] + '.mfc'
    samp_rate, X = read(filename)
    # ceps, mspec, spec = mfcc(X)
    ceps = feat.mfcc(X, samp_rate)
    num_ceps = ceps.shape[0]
    # average the middle 80% of the frames into a single feature vector
    x = np.mean(ceps[int(num_ceps * 1 / 10):int(num_ceps * 9 / 10)], axis=0)
    np.save(savename, x)
def start(seed):
    # Prepare global variables and the seed_mfcc
    (rate, sig) = wav.read(seed)
    for file in glob.glob("*.wav"):
        all_songs.append(file)
    woteva = mfcc(sig, rate)
    woteva = reduce_matrix(woteva, 500)
    return woteva
def learn(wav_filename, old_data=None):
    rate, signal = wav.read(wav_filename, 'r')
    mfcc_feat = mfcc(signal, rate)
    if old_data is not None:
        mfcc_feat = np.concatenate((mfcc_feat, old_data))
    gmm = mixture.GMM(GMM_CLUSTERS)
    gmm.fit(mfcc_feat)
    return gmm, mfcc_feat
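# Sketch of how learn() above can be chained to accumulate data across clips
# (not part of the original code; the file names are hypothetical, and
# GMM_CLUSTERS / sklearn's mixture module must be set up as in the surrounding
# code). Each call refits the GMM on the new clip's MFCCs concatenated with
# everything seen so far.
gmm, data = learn('speaker_a_take1.wav')
gmm, data = learn('speaker_a_take2.wav', old_data=data)
print(gmm.means_.shape)  # (GMM_CLUSTERS, 13) - one mean per mixture component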
def plot_bic(wavfilename, sadfilename):
    sample_rate, wav = wavfile.read(wavfilename)
    mfcc_feat = features.mfcc(wav, sample_rate)
    ref = getsad_ref(sadfilename)
    an_win_mfcc = mfcc_cut_vad_an_win(mfcc_feat, ref)
    [time, bic_value] = bic(an_win_mfcc)
    pyplot.plot(time, bic_value)
    pyplot.scatter(time, bic_value)
    pyplot.show()
def generate_speech(addr):
    try:
        (rate, sig) = wav.read(addr)
        #plot_graf(sig, rate)
        mfcc_feat = mfcc(sig, rate, highfreq=4000, numcep=20)
        return lbg.generate_codebook(mfcc_feat, 16)[0]
    except ValueError:
        print("ValueError: Not a WAV file.\nExit.")
        return -1
def get_binned_features(self):
    binned_features = {}
    for i in range(len(self.binned_signals)):
        binned_features[i] = features.mfcc(self.binned_signals[i],
                                           self.sample_rate,
                                           winlen=self.feature_winlen,
                                           numcep=self.feature_numcep,
                                           nfilt=self.feature_nfilt)
    return binned_features
def get_MFCC_feature(sig, rate):
    p_array = mfcc(sig, rate, winlen=0.025, winstep=0.01)  # get the mel-frequency cepstral coefficients
    col_num = p_array.shape[1]
    feature_array = []
    for ii in range(col_num):
        # test1: take the standard deviation of one dimension
        # feature_array.append(np.std(p_array[:,ii]))
        feature_array.append(p_array[:, ii])
    return feature_array
def MFCC(data, samp):
    mfcc_feat = mfcc(data, samp)
    # shift and rescale the coefficients into the 0-255 range and return them
    # as a greyscale image array
    mMin = mfcc_feat.min()
    mMax = mfcc_feat.max()
    mfcc_feat -= mMin
    mfcc_feat *= 255 / mfcc_feat.max()
    outImg = np.array(mfcc_feat, np.uint8)
    return outImg
def generate(self, testsample):
    (rate, audio) = wav.read(testsample.path)
    # grab first channel
    one_channel = _extract_single_channel(audio)
    N = len(audio)
    mfcc_feat = mfcc(one_channel, rate)
    cols = mfcc_feat.shape[0] * mfcc_feat.shape[1]
    return mfcc_feat.reshape((1, cols))[0]
def build_models(input_folder):
    # Initialize the variable to store all the models
    speech_models = []

    # Parse the input directory
    for dirname in os.listdir(input_folder):
        # Get the name of the subfolder
        subfolder = os.path.join(input_folder, dirname)
        if not os.path.isdir(subfolder):
            continue

        # Extract the label
        label = subfolder[subfolder.rfind('/') + 1:]

        # Initialize the variable to store the training data
        X = np.array([])

        # Create a list of files to be used for training
        # We will leave one file per folder for testing
        training_files = [x for x in os.listdir(subfolder) if x.endswith('.wav')][:-1]

        # Iterate through the training files and build the models
        for filename in training_files:
            # Extract the current file path
            filepath = os.path.join(subfolder, filename)

            # Read the audio signal from the input file
            sampling_freq, signal = wavfile.read(filepath)

            # Extract the MFCC features
            with warnings.catch_warnings():
                warnings.simplefilter('ignore')
                features_mfcc = mfcc(signal, sampling_freq)

            # Append to the variable X
            if len(X) == 0:
                X = features_mfcc
            else:
                X = np.append(X, features_mfcc, axis=0)

        # Create the HMM model
        model = ModelHMM()

        # Train the HMM model
        model.train(X)

        # Save the model for the current word
        speech_models.append((model, label))

        # Reset the variable
        model = None

    return speech_models
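# Hedged end-to-end sketch combining build_models above with run_tests defined
# earlier (not part of the original code): 'data' is a hypothetical folder of
# per-word subfolders, and the held-out test file is taken to be the last .wav
# in each subfolder, mirroring the [:-1] split used during training.
import os

speech_models = build_models('data')
test_files = []
for dirname in os.listdir('data'):
    subfolder = os.path.join('data', dirname)
    if os.path.isdir(subfolder):
        wavs = [x for x in os.listdir(subfolder) if x.endswith('.wav')]
        if wavs:
            test_files.append(os.path.join(subfolder, wavs[-1]))
run_tests(test_files)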
def predict(self, soundclip):
    """ Recognizes the speaker in the sound clip. """
    sample_rate, data = wavfile.read(os.path.abspath(soundclip))
    ceps = mfcc(data, sample_rate)
    fvec = self._mfcc_to_fvec(ceps)
    speaker_id = self.recognizer.predict(fvec)[0]
    return self.spkr_iton[speaker_id]