def convert_inputs_to_ctc_format(audio, fs, target_text):
    # print('convert_inputs_to_ctc_format target_text:' + target_text)
    inputs = mfcc(audio, samplerate=fs, numcep=num_features)
    # Transform into a 3D array
    train_inputs = np.asarray(inputs[np.newaxis, :])
    train_inputs = (train_inputs - np.mean(train_inputs)) / np.std(train_inputs)
    train_seq_len = [train_inputs.shape[1]]

    # Keep only the characters in [a-z ] and drop punctuation
    original = ' '.join(target_text.strip().lower().split(' ')).\
        replace('.', '').\
        replace('?', '').\
        replace(',', '').\
        replace("'", '').\
        replace('!', '').\
        replace('-', '')
    # print('original:' + original)
    targets = original.replace(' ', '  ')
    targets = targets.split(' ')

    # Adding blank label
    targets = np.hstack([SPACE_TOKEN if x == '' else list(x) for x in targets])

    # Transform each char into its index
    targets = np.asarray([SPACE_INDEX if x == SPACE_TOKEN else ord(x) - FIRST_INDEX
                          for x in targets])

    # Creating sparse representation to feed the placeholder
    train_targets = sparse_tuple_from([targets])

    return train_inputs, train_targets, train_seq_len, original
def audiofile_to_input_vector(audio_filename, n_input, n_context):
    """
    Convert an audio file into MFCC features with left/right context.
    :param audio_filename: path to the wav file
    :param n_input: number of MFCC coefficients per frame
    :param n_context: number of context frames on each side
    :return: normalized feature matrix
    """
    # Load the wav file
    fs, audio = wav.read(audio_filename)

    # Get the MFCC coefficients
    orig_inputs = mfcc(audio, samplerate=fs, numcep=n_input)
    # print(np.shape(orig_inputs))  # (277, 26)

    orig_inputs = orig_inputs[::2]  # (139, 26) keep every second frame

    # train_inputs = np.array([], np.float32)
    # print(orig_inputs.shape[0])
    train_inputs = np.zeros((orig_inputs.shape[0], n_input + 2 * n_input * n_context))
    # print(np.shape(train_inputs))  # (139, 494)

    # empty_mfcc = np.array([])
    empty_mfcc = np.zeros((n_input))

    # Each input row is built from three parts concatenated in order:
    # the previous n_context frames, the current frame, and the next n_context frames.
    time_slices = range(train_inputs.shape[0])  # 139 slices
    context_past_min = time_slices[0] + n_context
    context_future_max = time_slices[-1] - n_context
    for time_slice in time_slices:
        # Pad the first n_context frames with zero MFCC features
        need_empty_past = max(0, (context_past_min - time_slice))
        empty_source_past = list(empty_mfcc for empty_slots in range(need_empty_past))
        data_source_past = orig_inputs[max(0, time_slice - n_context):time_slice]

        # Pad the last n_context frames with zero MFCC features
        need_empty_future = max(0, (time_slice - context_future_max))
        empty_source_future = list(empty_mfcc for empty_slots in range(need_empty_future))
        data_source_future = orig_inputs[time_slice + 1:time_slice + n_context + 1]

        if need_empty_past:
            past = np.concatenate((empty_source_past, data_source_past))
        else:
            past = data_source_past

        if need_empty_future:
            future = np.concatenate((data_source_future, empty_source_future))
        else:
            future = data_source_future

        past = np.reshape(past, n_context * n_input)
        now = orig_inputs[time_slice]
        future = np.reshape(future, n_context * n_input)

        # 234, 26, 234
        # train_data = np.concatenate((past, now, future))
        train_inputs[time_slice] = np.concatenate((past, now, future))

    # Standardize the data: subtract the mean, then divide by the standard deviation
    train_inputs = (train_inputs - np.mean(train_inputs)) / np.std(train_inputs)

    return train_inputs
def featurize(self, audio_clip):
    """ For a given audio clip, calculate the corresponding feature
    Params:
        audio_clip (str): Path to the audio clip
    """
    if self.spectrogram:
        return spectrogram_from_file(
            audio_clip, step=self.step, window=self.window,
            max_freq=self.max_freq)
    else:
        (rate, sig) = wav.read(audio_clip)
        return mfcc(sig, rate, numcep=self.mfcc_dim)
def getMEL(data, n_mfcc=12, invCoeffOrder=False, winsize=20, frames=64):
    """
    Go through all samples of each syllable and extract the MFCCs for the
    12 mel frequency channels.

    :param data: list of syllables with sample data
    :param n_mfcc: number of mel frequency cepstral coefficients to return (default = 12)
    :param invCoeffOrder: if True, extract the last n MFCCs instead of the first n (default = False)
    :param winsize: size of the time window (ms) used for MFCC extraction
    :param frames: desired number of time frames in the final MFCC data

    :returns syllables: list with MFCCs for n_mfcc mel channels for each sample of each syllable
    """
    syllables = []
    i = 0
    for syllable in data:
        samples = []
        for sample in syllable:
            W = winsize / 1000. * sample[1]
            winstep = (np.round(1 + (len(sample[0]) - W) / (frames - 1))) / float(sample[1])
            i += 1
            if invCoeffOrder:
                samples.append(
                    mfcc(sample[0],
                         samplerate=sample[1],
                         winlen=winsize / 1000.,
                         winstep=winstep,
                         numcep=n_mfcc
                         )[:, -n_mfcc::])
            else:
                samples.append(
                    mfcc(sample[0],
                         samplerate=sample[1],
                         winlen=winsize / 1000.,
                         winstep=winstep,
                         numcep=n_mfcc + 1
                         )[:, 1::])
        syllables.append(samples)
    return syllables
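# Illustrative usage sketch for getMEL above (an assumption, not part of the original snippet):
# `data` is taken to be a list of syllables, each a list of (waveform, sample_rate) pairs,
# since the function reads sample[0] as the signal and sample[1] as the rate. The fake 0.5 s
# syllable below exists only for illustration.
import numpy as np

fake_syllable = [(np.random.randn(8000), 16000)]              # one sample: (waveform, sample_rate)
mels = getMEL([fake_syllable], n_mfcc=12, winsize=20, frames=64)
print(len(mels), mels[0][0].shape)                            # 1 syllable, roughly (64, 12) per sample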
def audiofile_to_input_vector(audio_filename, numcep, numcontext):
    r"""
    Given a WAV audio file at ``audio_filename``, calculates ``numcep`` MFCC features
    at every 0.02s time step with a window length of 0.032s. Appends ``numcontext``
    context frames to the left and right of each time step, and returns this data
    in a numpy array.
    """
    # Load wav file
    fs, audio = wav.read(audio_filename)

    # Get mfcc coefficients
    features = mfcc(audio, samplerate=fs, numcep=numcep, winlen=0.032, winstep=0.02,
                    winfunc=np.hamming)

    # Add empty initial and final contexts
    empty_context = np.zeros((numcontext, numcep), dtype=features.dtype)
    features = np.concatenate((empty_context, features, empty_context))

    return features
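# Illustrative usage sketch for audiofile_to_input_vector above: "sample.wav" is a hypothetical
# mono WAV path, and numcep=26, numcontext=9 mirror common DeepSpeech-style settings. The
# returned array carries 2*numcontext extra all-zero context rows.
feats = audiofile_to_input_vector("sample.wav", numcep=26, numcontext=9)
print(feats.shape)  # (num_frames + 2*9, 26)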
def extract_features_and_targets(wav_file, txt_file):
    """
    Extract MFCC features from an audio file and target character annotations from
    a corresponding text transcription

    Args:
        wav_file: audio wav file
        txt_file: text file with transcription

    Returns:
        features, targets, sequence length, original text transcription
    """
    fs, audio = wav.read(wav_file)

    features = mfcc(audio, samplerate=fs, lowfreq=50)
    mean_scale = np.mean(features, axis=0)
    std_scale = np.std(features, axis=0)
    features = (features - mean_scale[np.newaxis, :]) / std_scale[np.newaxis, :]
    seq_len = features.shape[0]

    # Read targets
    with open(txt_file, 'r') as f:
        for line in f.readlines():
            if line[0] == ';':
                continue

            # Keep only the characters in [a-z ] and drop punctuation
            original = ' '.join(line.strip().lower().split(' ')).replace('.', '').\
                replace("'", '').replace('-', '').replace(',', '')

            targets = original.replace(' ', '  ')
            targets = targets.split(' ')

    # Adding blank label
    targets = np.hstack([Space_Token if x == '' else list(x) for x in targets])

    # Transform each char into its index
    targets = np.asarray([Space_Index if x == Space_Token else ord(x) - Index_Start
                          for x in targets])

    # shape (None, num_steps, num_features)
    features = np.asarray(features[np.newaxis, :])

    return features, targets, seq_len, original
def extract_cepstrum(df, rate, mfcc_start=0, mfcc_end=-1, winlen=0.025, winstep=0.01,
                     numcep=16, nfilt=32, nfft=512, lowfreq=400, highfreq=12000,
                     noise=None):
    '''
    Extracts the cepstrum features from the raw signal data

    df : a dataframe where the indices are the timepoints for each supposed key press
    rate : the rate at which the sound file was processed, either when the call was made
           to spl.open_audio() or scipy.io.wavfile.read()
    mfcc_start/mfcc_end : indices to slice the feature vector
    The remainder of the args are passed into the mfcc function
    '''
    rate = float(rate)

    # Convert raw signal into a list of numpy arrays
    char_data = df[df.columns[list(df.columns).index('0'):]].values
    if noise:
        char_data += np.random.normal(0, noise, char_data.shape)
    keypress_sigs = [np.nan_to_num(np.squeeze(l))
                     for l in np.split(char_data, char_data.shape[0], axis=0)]

    # Create the keypress features one by one
    keypress_feats = []
    for keypress_sig in keypress_sigs:
        mfcc_feat = mfcc(keypress_sig, samplerate=rate, winlen=winlen, winstep=winstep,
                         numcep=numcep, nfilt=nfilt, lowfreq=lowfreq, nfft=nfft,
                         highfreq=highfreq)
        keypress_feats.append(np.concatenate(mfcc_feat[mfcc_start:mfcc_end, :]).T)

    # Create cepstrum dataframe
    cepstrum_df = pd.DataFrame(np.vstack(keypress_feats))

    # Copy over the true char labels
    if 'char' in cepstrum_df:
        cepstrum_df['char'] = df['char']

    # Put the char labels at the front
    cepstrum_df = cepstrum_df.reindex(columns=[cepstrum_df.columns[-1]] +
                                      list(cepstrum_df.columns[:-1]))
    return cepstrum_df
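# Illustrative usage sketch for extract_cepstrum above: builds a dummy DataFrame with three
# fake keypresses of 4410 samples each, using string column names starting at '0' as the
# function expects; 44100 Hz is an assumed sample rate, and the data are random noise.
import numpy as np
import pandas as pd

dummy = pd.DataFrame(np.random.randn(3, 4410), columns=[str(c) for c in range(4410)])
cep_df = extract_cepstrum(dummy, rate=44100)
print(cep_df.shape)  # one row of concatenated MFCC frames per keypress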
def audioToInputVector(audio, fs, numcep, numcontext):
    # NOTE: relies on a DeprecationWarning.displayed flag being set up elsewhere in the
    # module; the Python builtin DeprecationWarning does not carry such an attribute.
    if DeprecationWarning.displayed is not True:
        DeprecationWarning.displayed = True
        print('------------------------------------------------------------------------', file=sys.stderr)
        print('WARNING: libdeepspeech failed to load, resorting to deprecated code', file=sys.stderr)
        print('         Refer to README.md for instructions on installing libdeepspeech', file=sys.stderr)
        print('------------------------------------------------------------------------', file=sys.stderr)

    # Get mfcc coefficients
    features = mfcc(audio, samplerate=fs, numcep=numcep)

    # We only keep every second feature (BiRNN stride = 2)
    features = features[::2]

    # One stride per time step in the input
    num_strides = len(features)

    # Add empty initial and final contexts
    empty_context = np.zeros((numcontext, numcep), dtype=features.dtype)
    features = np.concatenate((empty_context, features, empty_context))

    # Create a view into the array with overlapping strides of size
    # numcontext (past) + 1 (present) + numcontext (future)
    window_size = 2 * numcontext + 1
    train_inputs = np.lib.stride_tricks.as_strided(
        features,
        (num_strides, window_size, numcep),
        (features.strides[0], features.strides[0], features.strides[1]),
        writeable=False)

    # Flatten the second and third dimensions
    train_inputs = np.reshape(train_inputs, [num_strides, -1])

    # Whiten inputs (TODO: Should we whiten?)
    # Copy the strided array so that we can write to it safely
    train_inputs = np.copy(train_inputs)
    train_inputs = (train_inputs - np.mean(train_inputs)) / np.std(train_inputs)

    # Return results
    return train_inputs
def input_preprocess():
    ### audio_filename = maybe_download('LDC93S1.wav', 93638)
    ### target_filename = maybe_download('LDC93S1.txt', 62)
    fs, audio = wav.read(audio_filename)
    inputs = mfcc(audio, samplerate=fs)

    # Transform into a 3D array
    train_inputs = np.asarray(inputs[np.newaxis, :])
    train_inputs = (train_inputs - np.mean(train_inputs)) / np.std(train_inputs)
    train_seq_len = [train_inputs.shape[1]]

    num_examples = 1
    with open(target_filename, 'r') as f:
        # Only the last line is necessary
        line = f.readlines()[-1]

        # Keep only the words in [a-z] and drop the period
        original = ' '.join(line.strip().lower().split(' ')[2:]).replace('.', '')
        targets = original.replace(' ', '  ')
        targets = targets.split(' ')

    # Adding blank label
    targets = np.hstack([Space_Token if x == '' else list(x) for x in targets])

    # Transform each char into its index
    targets = np.asarray([Space_Index if x == Space_Token else ord(x) - Index_Start
                          for x in targets])

    # Creating sparse representation to feed the placeholder
    train_targets = sparse_tuple_from([targets])

    # We don't have a validation dataset :(
    val_inputs, val_targets, val_seq_len = train_inputs, train_targets, train_seq_len

    return inputs, train_inputs, train_targets, train_seq_len
data_lines = f.readlines()
data_num = len(data_lines)
training_num = int(data_num * TRAIN_FRACTION)

with open(TEXT_FILE, 'r') as f:
    for i, line in enumerate(f):
        parts = line.split(',')

        # Get just the filename part of the audio file
        audio_file = parts[0]
        last_slash = audio_file.rfind('/') + 1
        audio_file = audio_file[last_slash:-4]

        text = parts[1]

        rate, signal = wav.read(AUDIO_PATH + "/" + audio_file + ".wav")
        mfcc_feat = mfcc(signal, rate, numcep=26)
        mfcc_feat = mfcc_feat.transpose()

        characters = np.array([])
        for character in text:
            characters = np.append(characters, ALPHABET.index(character))

        if i < training_num:
            np.save(TRAIN_INPUT_PATH + "/" + audio_file, mfcc_feat)
            np.save(TRAIN_TARGET_PATH + "/" + audio_file, characters)
        else:
            np.save(TEST_INPUT_PATH + "/" + audio_file, mfcc_feat)
            np.save(TEST_TARGET_PATH + "/" + audio_file, characters)

inputs, outputs = load_batched_data(TRAIN_INPUT_PATH, TRAIN_TARGET_PATH)

# 8 input samples:
def extract_features(audio, rate):
    mfcc_feat = mfcc.mfcc(audio, rate, 0.025, 0.01, 20, appendEnergy=True)
    mfcc_feat = preprocessing.scale(mfcc_feat)
    delta = calculate_delta(mfcc_feat)
    combined = np.hstack((mfcc_feat, delta))
    return combined
def readwav(audio_filename, n_input=26):
    # Read the audio file
    fs, audio = wav.read(audio_filename)
    # Extract the MFCC features
    orig_inputs = mfcc(audio, samplerate=fs, numcep=26)
def get_speech_features(signal, sample_freq, num_features, pad_to=8, features_type='spectrogram', window_size=20e-3, window_stride=10e-3, augmentation=None): """Function to convert raw audio signal to numpy array of features. Args: signal (np.array): np.array containing raw audio signal. sample_freq (float): frames per second. num_features (int): number of speech features in frequency domain. pad_to (int): if specified, the length will be padded to become divisible by ``pad_to`` parameter. features_type (string): 'mfcc' or 'spectrogram'. window_size (float): size of analysis window in milli-seconds. window_stride (float): stride of analysis window in milli-seconds. augmentation (dict, optional): dictionary of augmentation parameters. See :func:`get_speech_features_from_file` for specification and example. Returns: np.array: np.array of audio features with shape=[num_time_steps, num_features]. audio_duration (float): duration of the signal in seconds """ if augmentation is not None: if 'time_stretch_ratio' not in augmentation: raise ValueError( 'time_stretch_ratio has to be included in augmentation ' 'when augmentation it is not None') if 'noise_level_min' not in augmentation: raise ValueError( 'noise_level_min has to be included in augmentation ' 'when augmentation it is not None') if 'noise_level_max' not in augmentation: raise ValueError( 'noise_level_max has to be included in augmentation ' 'when augmentation it is not None') signal = augment_audio_signal(signal, sample_freq, augmentation) else: signal = (normalize_signal(signal.astype(np.float32)) * 32767.0).astype(np.int16) audio_duration = len(signal) * 1.0 / sample_freq n_window_size = int(sample_freq * window_size) n_window_stride = int(sample_freq * window_stride) # making sure length of the audio is divisible by 8 (fp16 optimization) length = 1 + int( math.ceil((1.0 * signal.shape[0] - n_window_size) / n_window_stride)) if pad_to > 0: if length % pad_to != 0: pad_size = (pad_to - length % pad_to) * n_window_stride signal = np.pad(signal, (0, pad_size), mode='constant') if features_type == 'spectrogram': frames = psf.sigproc.framesig(sig=signal, frame_len=n_window_size, frame_step=n_window_stride, winfunc=np.hanning) # features = np.log1p(psf.sigproc.powspec(frames, NFFT=N_window_size)) features = psf.sigproc.logpowspec(frames, NFFT=n_window_size) assert num_features <= n_window_size // 2 + 1, \ "num_features for spectrogram should be <= (sample_freq * window_size // 2 + 1)" # cut high frequency part features = features[:, :num_features] elif features_type == 'mfcc': features = psf.mfcc(signal=signal, samplerate=sample_freq, winlen=window_size, winstep=window_stride, numcep=num_features, nfilt=2 * num_features, nfft=512, lowfreq=0, highfreq=None, preemph=0.97, ceplifter=2 * num_features, appendEnergy=False) elif features_type == 'logfbank': features = psf.logfbank(signal=signal, samplerate=sample_freq, winlen=window_size, winstep=window_stride, nfilt=num_features, nfft=512, lowfreq=0, highfreq=sample_freq / 2, preemph=0.97) else: raise ValueError('Unknown features type: {}'.format(features_type)) if pad_to > 0: assert features.shape[0] % pad_to == 0 mean = np.mean(features) std_dev = np.std(features) features = (features - mean) / std_dev return features, audio_duration
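# Illustrative usage sketch for get_speech_features above: assumes a mono 16 kHz WAV read
# with scipy and the module's own helpers (normalize_signal, psf, math) being importable;
# "sample.wav" is a hypothetical path and 160 spectrogram features are requested.
import scipy.io.wavfile as wav

sample_freq, signal = wav.read("sample.wav")
features, duration = get_speech_features(signal, sample_freq, num_features=160,
                                         features_type='spectrogram')
print(features.shape, duration)  # (num_time_steps, 160), duration in seconds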
x = len(data)
p = 25000 - x
l = 0
tests = np.empty([200, 4043])
new_data = np.empty([25000, ])   # note: np.empty is uninitialized; np.zeros would give true zero-padding
y1 = np.empty([25000, ])
y = p // 2
for i in range(0, y - 1):
    new_data[i] = y1[i]
for i in range(y, 25000 - p + y - 1):
    new_data[i] = data[i - y]
for i in range(25000 - y, 24999):
    new_data[i] = y1[i]

data1 = mfcc(new_data, samplerate)
data = data1
data = data.reshape(4043,)
nIn = 4043
nOut = 5
x = data


def sigmoid(x):
    x = np.array(x, dtype=np.float128)
    x = x.reshape(nOut, 1)
    for i in range(0, 5):
        if x[i] < -700:
            x[i] = 0
def extract_mfcc(sound):
    (rate, sig) = wav.read(StringIO.StringIO(sound))
    mfcc_feat = features.mfcc(sig, rate)
    return numpy.asarray(mfcc_feat, dtype='float32')
#!/usr/bin/env python
from python_speech_features import mfcc
from python_speech_features import delta
from python_speech_features import logfbank
import scipy.io.wavfile as wav

(rate, sig) = wav.read("english.wav")
mfcc_feat = mfcc(sig, rate)
d_mfcc_feat = delta(mfcc_feat, 2)
fbank_feat = logfbank(sig, rate)

print(fbank_feat[1:3, :])
""" demo04_mfcc.py mfcc矩阵 """ import scipy.io.wavfile as wf import python_speech_features as sf import matplotlib.pyplot as mp sample_rate, sigs = wf.read('../ml_data/speeches/training/banana/banana01.wav') mfcc = sf.mfcc(sigs, sample_rate) mp.matshow(mfcc.T, cmap='gist_rainbow') mp.show()
def pinzhen(file_path, n_input, n_context):
    wav_data, fs = read_wav_data(file_path)
    origin_inputs = mfcc(wav_data, samplerate=fs, numcep=n_input)
    # Optional frame skipping to keep only the features we need.
    # print(origin_inputs)
    # Frame skipping: take one column of features every other frame.
    # origin_inputs = origin_inputs[::2]
    # print(origin_inputs)

    # Initialize the full feature matrix (current frame plus past/future context).
    train_inputs = np.zeros(shape=(origin_inputs.shape[0], n_input + 2 * n_input * n_context))

    # Initialize the empty MFCC vector used for padding.
    empty_mfcc = np.zeros((n_input))

    time_slices = range(train_inputs.shape[0])

    # Positions from which past/future context frames become fully available.
    context_past_min = time_slices[0] + n_context
    context_future_max = time_slices[-1] - n_context
    for time_slice in time_slices:
        need_empty_past = max(0, (context_past_min - time_slice))
        empty_source_past = list(empty_mfcc for empty_slots in range(need_empty_past))
        data_source_past = origin_inputs[max(0, time_slice - n_context):time_slice]

        need_empty_future = max(0, (time_slice - context_future_max))
        empty_source_future = list(empty_mfcc for empty_slots in range(need_empty_future))
        data_source_future = origin_inputs[time_slice + 1:time_slice + n_context + 1]

        if need_empty_past:
            past = np.concatenate((empty_source_past, data_source_past))
        else:
            past = data_source_past

        if need_empty_future:
            future = np.concatenate((data_source_future, empty_source_future))
        else:
            future = data_source_future

        past = np.reshape(past, n_context * n_input)
        now = origin_inputs[time_slice]
        future = np.reshape(future, n_context * n_input)

        train_inputs[time_slice] = np.concatenate((past, now, future))

    # Mean normalization: subtract the mean, then divide by the standard deviation.
    train_inputs = (train_inputs - np.mean(train_inputs)) / np.std(train_inputs)
    return train_inputs
def makeData(from_file, to_file, core): # samples = np.zeros((0, num_cols)) for counter, wp in enumerate(wavpaths[from_file:to_file + 1], 1): rate, sig = scipy.io.wavfile.read(wp) np_mfcc = python_speech_features.mfcc(sig, rate, winlen=window, winstep=step) np_mfcc_d = python_speech_features.delta(np_mfcc, 2) np_mfcc_dd = python_speech_features.delta(np_mfcc_d, 2) np_mfcc_all = np.append(np.append(np_mfcc, np_mfcc_d, axis=1), np_mfcc_dd, axis=1) # print(np_mfcc_all.shape) wn = wp.split("/")[-1] # get corpus info if wn[0] in ["F", "M"]: corpus = "ifa" elif wn[0] == "D": corpus = "ifadv" elif wn[0] == "p": corpus = "ecsd" else: corpus = "cgn-" + wn[0] # sent_id = ".".join(wn.split(".")[:-1]) print(core, counter, "/", to_file - from_file, sent_id) tg_path = af_path + chunk_folder + sent_id + ".TextGrid" tg = textgrid.TextGrid() with makeTempFile(tg_path) as tempf: tg.read(tempf.name) intervals = tg.tiers[0].intervals end_time = round(intervals[-1].maxTime, 3) start_time = round(intervals[0].minTime, 3) # print(start_time, end_time) classes = np.zeros((0, num_cols_per_frame)) int_i = 0 num_frames = np_mfcc_all.shape[0] useable_frame_indices = [] for frame in range(1, num_frames + 1): frame_s = round(start_time + (frame - 1) * step, 3) frame_e = frame_s + window if frame_e > end_time: # because '0' samples can be appended to sig so it can be divided by an integer of frames frame_e = end_time intvl = intervals[int_i] if frame_s < round(intvl.minTime, 3): print(frame_s, round(intvl.minTime, 3)) assert frame_s >= round(intvl.minTime, 3) if frame_e <= round(intvl.maxTime, 3): # calculate the proportion of the frame that is within the useable centre of the interval int_dur = round(intvl.duration(), 3) prop_dur = int_dur * prop_used used_s = round(intvl.minTime, 3) + (int_dur - prop_dur) / 2 used_e = used_s + prop_dur x1 = used_s - frame_s x1 = 0 if x1 <= 0 else window if x1 > window else x1 x2 = frame_e - used_e x2 = 0 if x2 <= 0 else window if x2 > window else x2 prop_f_in_used_i = (window - (x1 + x2)) / window # print(prop_f_in_used_i, int_dur, used_s, used_e, frame_s, frame_e) if prop_f_in_used_i > 0.5: useable_frame_indices.append(frame - 1) label_list = getFeatureLabel(intvl.mark) else: label_list = [99 for i in range(len(features))] row = np.array([np.append(np_mfcc_all[frame - 1, ], label_list)]) classes = np.append(classes, row, axis=0) else: assert frame_e > round(intvl.maxTime, 3) proportions = [(round(intvl.maxTime, 3) - frame_s, int_i)] new_int = intvl new_int_i = int_i next_int_i = int_i while frame_e > round(new_int.maxTime, 3): new_int_i += 1 new_int = intervals[new_int_i] overlap = (frame_e - round(new_int.minTime, 3)) if frame_e <= round(new_int.maxTime, 3) else (round(new_int.maxTime, 3) - round(new_int.minTime, 3)) proportions.append((overlap, new_int_i)) if (frame_s + step) >= round(new_int.minTime, 3): next_int_i = new_int_i best_int_i = max(proportions)[1] best_int = intervals[best_int_i] # calculate the proportion of the frame that is within the useable centre of the interval int_dur = round(best_int.duration(), 3) prop_dur = int_dur * prop_used used_s = round(best_int.minTime, 3) + (int_dur - prop_dur) / 2 used_e = used_s + prop_dur x1 = used_s - frame_s x1 = 0 if x1 <= 0 else window if x1 > window else x1 x2 = frame_e - used_e x2 = 0 if x2 <= 0 else window if x2 > window else x2 prop_f_in_used_i = (window - (x1 + x2)) / window if prop_f_in_used_i > 0.5: useable_frame_indices.append(frame - 1) label_list = getFeatureLabel(best_int.mark) else: label_list = [99 for i in 
range(len(features))] row = np.array([np.append(np_mfcc_all[frame - 1, ], label_list)]) classes = np.append(classes, row, axis=0) int_i = next_int_i # print(useable_frame_indices) for old_row in range(classes.shape[0]): if (old_row >= 2 * frame_window) and ((old_row - frame_window) in useable_frame_indices): new_labels = classes[old_row - frame_window, num_cols_per_frame - len(features):] if new_labels[0] < 90: new_feat = classes[old_row - (2 * frame_window):old_row + 1, :num_cols_per_frame - len(features)].flatten() new_row = np.array([np.append(np.append(new_feat, corpora[corpus]), new_labels)]) # samples = np.append(samples, new_row, axis=0) with open(af_path + "AF_s" + str(int(core) + running_cores) + ".csv", "a") as f: # with open(scratch + "AF_s" + core + ".csv", "a") as f: np.savetxt(f, new_row, fmt='%.5e', delimiter=",")
def evaluate(self, opt, videofile): self.__S__.eval() # ========== ========== # Load video # ========== ========== cap = cv2.VideoCapture(videofile) frame_num = 1 images = [] while frame_num: frame_num += 1 ret, image = cap.read() if ret == 0: break images.append(image) im = numpy.stack(images, axis=3) im = numpy.expand_dims(im, axis=0) im = numpy.transpose(im, (0, 3, 4, 1, 2)) imtv = torch.autograd.Variable( torch.from_numpy(im.astype(float)).float()) # ========== ========== # Load audio # ========== ========== audiotmp = os.path.join(opt.tmp_dir, 'audio.wav') command = ( "ffmpeg -y -i %s -async 1 -ac 1 -vn -acodec pcm_s16le -ar 16000 %s" % (videofile, audiotmp)) output = subprocess.call(command, shell=True, stdout=None) sample_rate, audio = wavfile.read(audiotmp) mfcc = zip(*python_speech_features.mfcc(audio, sample_rate)) mfcc = numpy.stack([numpy.array(i) for i in mfcc]) cc = numpy.expand_dims(numpy.expand_dims(mfcc, axis=0), axis=0) cct = torch.autograd.Variable( torch.from_numpy(cc.astype(float)).float()) # ========== ========== # Check audio and video input length # ========== ========== if (float(len(audio)) / 16000) < (float(len(images)) / 25): print( " *** WARNING: The audio (%.4fs) is shorter than the video (%.4fs). Type 'cont' to continue. *** " % (float(len(audio)) / 16000, float(len(images)) / 25)) pdb.set_trace() # ========== ========== # Generate video and audio feats # ========== ========== lastframe = len(images) - 6 im_feat = [] cc_feat = [] tS = time.time() for i in range(0, lastframe, opt.batch_size): im_batch = [ imtv[:, :, vframe:vframe + 5, :, :] for vframe in range(i, min(lastframe, i + opt.batch_size)) ] im_in = torch.cat(im_batch, 0) im_out = self.__S__.forward_lip(im_in.cuda()) im_feat.append(im_out.data.cpu()) cc_batch = [ cct[:, :, :, vframe * 4:vframe * 4 + 20] for vframe in range(i, min(lastframe, i + opt.batch_size)) ] cc_in = torch.cat(cc_batch, 0) cc_out = self.__S__.forward_aud(cc_in.cuda()) cc_feat.append(cc_out.data.cpu()) im_feat = torch.cat(im_feat, 0) cc_feat = torch.cat(cc_feat, 0) # ========== ========== # Compute offset # ========== ========== print('Compute time %.3f sec.' % (time.time() - tS)) dists = calc_pdist(im_feat, cc_feat, vshift=opt.vshift) mdist = torch.mean(torch.stack(dists, 1), 1) minval, minidx = torch.min(mdist, 0) offset = opt.vshift - minidx conf = torch.median(mdist) - minval fdist = numpy.stack([dist[minidx].numpy() for dist in dists]) # fdist = numpy.pad(fdist, (3,3), 'constant', constant_values=15) fconf = torch.median(mdist).numpy() - fdist fconfm = signal.medfilt(fconf, kernel_size=9) numpy.set_printoptions(formatter={'float': '{: 0.3f}'.format}) print('Framewise conf: ') print(fconfm) print('AV offset: \t%d \nMin dist: \t%.3f\nConfidence: \t%.3f' % (offset, minval, conf)) dists_npy = numpy.array([dist.numpy() for dist in dists]) return offset.numpy(), conf.numpy(), dists_npy
#!/usr/bin/env python
import scipy.io.wavfile as w
from python_speech_features import mfcc
from python_speech_features import delta
from python_speech_features import logfbank

# Load waveform
audio_name = '/home/alanwuha/Documents/Projects/datasets/iemocap/IEMOCAP_full_release/Session1/sentences/wav/Ses01F_impro01/Ses01F_impro01_F000.wav'
sample_rate, waveform = w.read(audio_name)

# Compute MFCC
mfcc_feat = mfcc(waveform, sample_rate, preemph=0)
d_mfcc_feat = delta(mfcc_feat, 2)
fbank_feat = logfbank(waveform, sample_rate)

print(fbank_feat[1:3, :])
#!/usr/bin/env python3
# -*- coding: utf-8 -*-
"""
Created on Fri Mar 8 09:53:24 2019

@author: mkumar
"""
from scipy.io import wavfile
from matplotlib.pyplot import specgram
from python_speech_features import mfcc

sample_rate, X = wavfile.read('./genres/rock/rock.00025.wav')
print(sample_rate, X.shape)
specgram(X, Fs=sample_rate, xextent=(0, 30))

ceps = mfcc(X)
print(ceps.shape)
new_data = np.empty([25000, ])
y1 = np.empty([25000, ])
y = p // 2
for i in range(0, y):
    new_data[i] = y1[i]
for i in range(y, x + y):
    new_data[i] = data[i - y]
for i in range(x + y, 25000):
    new_data[i] = y1[i]

data = (mfcc(y=new_data, sr=samplerate, n_mfcc=39).T)   # librosa-style mfcc signature
data = data.reshape((1, data.shape[0], data.shape[1]))
print(data.shape)
nIn = 4043
nOut = 5

########################################################################################
# Loss function
# This is an implementation of the categorical cross-entropy loss from scratch.
# The inputs are the predicted probabilities and a one-hot vector; the loss sums
# -y*log(p) over each tuple, then over the batch to give the final loss.
def categorical_cross_entropy(ytrue, ypred, axis=-1):
def calculates_mfcc(data):
    return mfcc(data, samplerate=SAMPLE_RATE, winlen=0.02, winstep=0.01)
        print(video)
        video.download('./games')
        fn = './games/game_' + str(i) + '.mp4'
        new_fn = 'game_' + str(i) + '.wav'
        # extract the .wav
        os.system(cmd.format(fn, './games_audio/' + new_fn))
        # delete the video file
        os.system(rm.format('./games/game_' + str(i) + '.mp4'))
        # read the wav
        fs, x = wav.read('./games_audio/' + new_fn)
        # mfcc coefficients (first channel)
        mel = mfcc(x[:, 0], fs)
        # save the mfcc
        np.save('./games_audio/game_' + str(i) + '.npy', mel.astype(np.float32))
        # remove the .wav
        os.system(rm.format('./games_audio/' + new_fn))
        j += 1
    except:
        pass
def mfccs(self):
    """Returns the Mel-Frequency Cepstral Coefficients for this segment."""
    return mfcc(self.signal[:int(0.6 * self.sample_rate)], self.sample_rate,
                winlen=0.05, winstep=0.05, numcep=40, nfilt=80)
# -*- coding: utf-8 -*-
from scipy.io import wavfile
from python_speech_features import mfcc

# 1. Read the wav-format data
path = './music/data/1KHz-STERO.wav'
(rate, data) = wavfile.read(path)
print("Sample rate of the file: {}".format(rate))
print("Shape of the file's data: {}".format(data.shape))
print(data[:10])
print('-' * 100)

# Extract the MFCC features
mfcc_feat = mfcc(signal=data, samplerate=rate, numcep=26, nfft=2048)
print(type(mfcc_feat))
print(mfcc_feat)
def mffcRead(path):
    (rate, sig) = wav.read(path)
    mfcc_feat = mfcc(sig, rate)
    d_mfcc_feat = delta(mfcc_feat, 2)
    fbank_feat = logfbank(sig, rate)
    return fbank_feat
def extract_ratio(train_ratio, test_ratio, audio_dir): """ Extract audio in a ratio from the directory for training and testing returns two numpy array Xtrain and Xtest audio_dir: should be of the for : "wav/*/*.wav" """ if test_ratio + train_ratio != 1: print("ratios should add up to 1\n") return all_music_files = glob.glob(audio_dir) all_music_files.sort() all_mfcc = np.array([]) flag = True Testing = False Training = True #initialize the array all_mfcc = np.array([]) count = 0; #training #count = test_ratio; loop_count = -1 flag = True for train_test_loop in range(2): #extract mfcc features for the audio files for file_name in all_music_files: #for training select only train_ratio songs from each if Training: loop_count += 1 if loop_count % 100 == 0: count = 0 if count == train_ratio * 100: continue #selects only train_ratio songs in every 100 songs count += 1 #for testing select last test_ratio songs from each genre if Testing: loop_count += 1 if (loop_count + (test_ratio * 100)) % 100 == 0 and loop_count: count = 0 print('--'*10) if count == test_ratio * 100: continue count += 1 if Training or Testing: (rate, data) = scipy.io.wavfile.read(file_name) mfcc_feat = mfcc(data,rate) #redusing mfcc dimension to 104 mm = np.transpose(mfcc_feat) mf = np.mean(mm,axis=1) cf = np.cov(mm) ff=mf #ff is a vector of size 104 for i in range(mm.shape[0]): ff = np.append(ff,np.diag(cf,i)) #re initializing to size 104 if flag: all_mfcc = ff; print('*'*20) flag = False else: all_mfcc = np.vstack([all_mfcc,ff]) print("loooping----",loop_count) print("all_mfcc.shape:",all_mfcc.shape) if train_test_loop == 0: print('\n'*10,'====Collected Training data===','\n'*20) print('\n','====Collecting Testing data===','\n') Xtrain = all_mfcc count = test_ratio * 100 Testing = True Training = False loop_count = -1 all_mfcc = np.array([]) flag = True print("Xtrain.shape:", Xtrain.shape) if train_test_loop == 1: print('\n','====Collected Testing data===','\n') Xtest = all_mfcc print("Xtest.shape:", Xtest.shape) return Xtrain, Xtest
from python_speech_features import mfcc
from python_speech_features import delta
from python_speech_features import logfbank
import scipy.io.wavfile as wav

(rate, sig) = wav.read("Rak_train.wav")
mfcc_feat = mfcc(sig, rate)
d_mfcc_feat = delta(mfcc_feat, 2)
fbank_feat1 = logfbank(sig, rate)
fbank_feat1 = fbank_feat1.sum()
print(fbank_feat1)

(rate, sig) = wav.read("Qwer2.wav")
mfcc_feat = mfcc(sig, rate)
d_mfcc_feat = delta(mfcc_feat, 2)
fbank_feat2 = logfbank(sig, rate)
fbank_feat2 = fbank_feat2.sum()
print(fbank_feat2)

if fbank_feat2 > fbank_feat1:
    likely = (fbank_feat1 / fbank_feat2) * 100
    print(likely)
else:
    likely = (fbank_feat2 / fbank_feat1) * 100
    print(likely)
def feature_extractor(sound_path):
    sampling_freq, audio = wavfile.read(sound_path)
    mfcc_features = mfcc(audio, sampling_freq, nfft=2048, numcep=13, nfilt=13)
    return mfcc_features
#!/usr/bin/env python3
from python_speech_features import mfcc
from python_speech_features import delta
from python_speech_features import logfbank
import scipy.io.wavfile as wav
import numpy as np
import sys
# from keras.layers import GaussianNoise

# GUARD: Check if someone has supplied an argument
if len(sys.argv) < 2:
    raise Exception('No input file')

# Read an input file from the first argument on the commandline
inputFile = sys.argv[1]

# Calculate MFCC
(rate, sig) = wav.read(inputFile)
mfcc_features = mfcc(sig, rate, nfft=2048)
# d_mfcc_feat = delta(mfcc_feat, 2)
# fbank_feat = logfbank(sig, rate)

# Normalize MFCC by subtracting the mean and using standard deviation
# In the future, we should possibly do this only with the training data

# Print MFCC
print(mfcc_features)
sys.stdout.flush()
# Name of the model (for saving and logs)
PREMODELNAME = "rnn_full_mfcc+chroma+time+spec_nopreemph_mixednoise_resnet_ws08_512"

os.chdir(PATH_SOURCE)
print("Generating features from validation samples ...")
for aud in tqdm(glob.glob("*.wav")):
    [Fs, x] = audioBasicIO.read_audio_file(aud)
    F, f_names = frequencyandchromafeatures.feature_extraction(
        x, Fs, WINDOW_SIZE * Fs, WINDOW_STEP * Fs)
    (rate, sig) = wav.read(aud)
    mfcc_feat = mfcc(sig, rate,
                     numcep=NUMCEP,
                     nfilt=NUMFILT,
                     winlen=WINDOW_SIZE,
                     winstep=WINDOW_STEP,
                     nfft=NFFT,
                     preemph=PREEMPH)
    emotion = "N"
    if "W" in aud:
        emotion = "W"
    elif "L" in aud:
        emotion = "L"
    elif "E" in aud:
        emotion = "E"
    elif "A" in aud:
        emotion = "A"
    elif "F" in aud:
        emotion = "F"
    elif "T" in aud:
        y_test.append(dataset[x][2])


# measuring run time
start_time = time.time()

directory = "C:/Users/rezaa/OneDrive/Desktop/Auburn Spring 2021/Machine Learning/Final Project/genres/"
f = open("my.dat", 'wb')
i = 0
for folder in os.listdir(directory):
    i += 1
    if i == 11:
        break
    for file in os.listdir(directory + folder):
        (rate, sig) = wav.read(directory + folder + "/" + file)
        mfcc_feat = mfcc(sig, rate, winlen=0.020, appendEnergy=False)
        covariance = np.cov(np.matrix.transpose(mfcc_feat))
        mean_matrix = mfcc_feat.mean(0)
        covariance_mean = covariance.mean(0)
        feature = (mean_matrix, covariance_mean, i)
        pickle.dump(feature, f)
f.close()

dataset = []
X_train = []
y_train = []
X_test = []
y_test = []
loadDataset("my.dat", 0.8, X_train, y_train, X_test, y_test)

X_train_np = np.asarray(X_train)
def audiofile_to_input_vector(audio_filename, numcep, numcontext):
    # Load wav file
    fs, audio = wav.read(audio_filename)

    # Get mfcc coefficients
    orig_inputs = mfcc(audio, samplerate=fs, numcep=numcep)
    # print(np.shape(orig_inputs))  # (277, 26)
    orig_inputs = orig_inputs[::2]  # (139, 26)

    train_inputs = np.array([], np.float32)
    train_inputs.resize((orig_inputs.shape[0], numcep + 2 * numcep * numcontext))
    # print(np.shape(train_inputs))  # (139, 494)

    # Prepare pre-fix and post-fix context
    empty_mfcc = np.array([])
    empty_mfcc.resize((numcep))

    # Prepare train_inputs with past and future contexts
    time_slices = range(train_inputs.shape[0])  # 139 slices
    context_past_min = time_slices[0] + numcontext
    context_future_max = time_slices[-1] - numcontext
    for time_slice in time_slices:
        # Pad the first numcontext frames with zero MFCC features
        need_empty_past = max(0, (context_past_min - time_slice))
        empty_source_past = list(empty_mfcc for empty_slots in range(need_empty_past))
        data_source_past = orig_inputs[max(0, time_slice - numcontext):time_slice]
        assert (len(empty_source_past) + len(data_source_past) == numcontext)

        # Pad the last numcontext frames with zero MFCC features
        need_empty_future = max(0, (time_slice - context_future_max))
        empty_source_future = list(empty_mfcc for empty_slots in range(need_empty_future))
        data_source_future = orig_inputs[time_slice + 1:time_slice + numcontext + 1]
        assert (len(empty_source_future) + len(data_source_future) == numcontext)

        if need_empty_past:
            past = np.concatenate((empty_source_past, data_source_past))
        else:
            past = data_source_past

        if need_empty_future:
            future = np.concatenate((data_source_future, empty_source_future))
        else:
            future = data_source_future

        past = np.reshape(past, numcontext * numcep)
        now = orig_inputs[time_slice]
        future = np.reshape(future, numcontext * numcep)

        train_inputs[time_slice] = np.concatenate((past, now, future))
        assert (len(train_inputs[time_slice]) == numcep + 2 * numcep * numcontext)

    # Standardize: subtract the mean, then divide by the standard deviation
    train_inputs = (train_inputs - np.mean(train_inputs)) / np.std(train_inputs)

    return train_inputs
def get_speech_features(signal, fs, num_features, pad_to=8, features_type='spectrogram', window_size=20e-3, window_stride=10e-3, augmentation=None): """Function to convert raw audio signal to numpy array of features. Args: signal (np.array): np.array containing raw audio signal. fs (float): frames per second. num_features (int): number of speech features in frequency domain. pad_to (int): if specified, the length will be padded to become divisible by ``pad_to`` parameter. features_type (string): 'mfcc' or 'spectrogram'. window_size (float): size of analysis window in milli-seconds. window_stride (float): stride of analysis window in milli-seconds. augmentation (dict, optional): dictionary of augmentation parameters. See :func:`get_speech_features_from_file` for specification and example. Returns: np.array: np.array of audio features with shape=[num_time_steps, num_features]. """ if augmentation is not None: if 'time_stretch_ratio' not in augmentation: raise ValueError('time_stretch_ratio has to be included in augmentation ' 'when augmentation it is not None') if 'noise_level_min' not in augmentation: raise ValueError('noise_level_min has to be included in augmentation ' 'when augmentation it is not None') if 'noise_level_max' not in augmentation: raise ValueError('noise_level_max has to be included in augmentation ' 'when augmentation it is not None') signal = augment_audio_signal(signal, fs, augmentation) n_window_size = int(fs * window_size) n_window_stride = int(fs * window_stride) # making sure length of the audio is divisible by 8 (fp16 optimization) length = 1 + int(math.ceil( (1.0 * signal.shape[0] - n_window_size) / n_window_stride) ) if pad_to > 0: if length % pad_to != 0: pad_size = (pad_to - length % pad_to) * n_window_stride signal = np.pad(signal, (0, pad_size), mode='reflect') if features_type == 'spectrogram': frames = psf.sigproc.framesig(sig=signal, frame_len=n_window_size, frame_step=n_window_stride, winfunc=np.hanning) # features = np.log1p(psf.sigproc.powspec(frames, NFFT=N_window_size)) features = psf.sigproc.logpowspec(frames, NFFT=n_window_size) assert num_features <= n_window_size // 2 + 1, \ "num_features for spectrogram should be <= (fs * window_size // 2 + 1)" # cut high frequency part features = features[:, :num_features] elif features_type == 'mfcc': features = psf.mfcc(signal=signal, samplerate=fs, winlen=window_size, winstep=window_stride, numcep=num_features, nfilt=2*num_features, nfft=512, lowfreq=0, highfreq=None, preemph=0.97, ceplifter=2*num_features, appendEnergy=False) else: raise ValueError('Unknown features type: {}'.format(features_type)) assert features.shape[0] % pad_to == 0 m = np.mean(features) s = np.std(features) features = (features - m) / s return features
def file2mfcc(fileName):
    (rate, sig) = wav.read(fileName)
    if len(sig) != 16000:
        return False, []
    mfcc_feat = mfcc(sig, rate)
    return True, mfcc_feat
def getMFCC(data, fs, ANALYSIS_WINDOW, HOPSIZE):
    coeficientes = mfcc(signal=data, samplerate=fs, winlen=ANALYSIS_WINDOW * 1. / fs,
                        winstep=HOPSIZE * 1. / fs, numcep=5)
    return coeficientes
def _generate_mel_spectrogram(audio_clip, sample_rate):
    mfcc = zip(*python_speech_features.mfcc(audio_clip, sample_rate))
    audio_features = np.stack([np.array(i) for i in mfcc])
    audio_features = np.expand_dims(audio_features, axis=0)
    return audio_features
# ^_^ coding:utf-8 ^_^
import numpy as np
from scipy.io import wavfile
import matplotlib.pyplot as plt
from python_speech_features import mfcc, logfbank

# Read the input audio file
sampling_freq, audio = wavfile.read('input_freq.wav')

# Extract MFCC and filter-bank features
mfcc_features = mfcc(audio, sampling_freq)
filterbank_features = logfbank(audio, sampling_freq)

# Print the parameters
print('MFCC: Number of windows = {}'.format(mfcc_features.shape[0]))
print('Length of each feature = {}'.format(mfcc_features.shape[1]))
print('Filter bank: Number of windows = {}'.format(filterbank_features.shape[0]))
print('Length of each feature = {}'.format(filterbank_features.shape[1]))

# Plot the MFCC features
mfcc_features = mfcc_features.T
plt.matshow(mfcc_features)
plt.title('MFCC')

# Visualize the filter-bank features
filterbank_features = filterbank_features.T
plt.matshow(filterbank_features)
plt.title('Filter bank')

plt.show()
def getMFCCFromFile(file_path='../recordings/all.ogg'): stereo_audio_data, sample_rate = sf.read(file_path) mono_audio_data = stereo_audio_data[:,1] #uses 2nd channel :) data = list(mono_audio_data) #Convert data into matrix of frames # print(len(data)) frames = [] for i in range(0, len(data), 200): frames.append(data[i:i+200]) lastFrame = len(frames) -1 padder = 200 - len(frames[lastFrame]) % 200 pprint(padder) frames[lastFrame].extend([0] * padder) print(len(frames)) window = 1/200 #frame size ==> w(n) = 1/2N def sgn(val): if(val >= 0): return 1; return -1; zcr = [] for each_row in frames: zcr_inner = 0; for index, each_record in enumerate(each_row): if (index == 0): continue zcr_inner = zcr_inner + abs(sgn(each_record) - sgn(each_row[index-1]))/400 zcr.append(zcr_inner); ste = [] for each_row in frames: ste_inner = 0; for index, each_record in enumerate(each_row): ste_inner = ste_inner + ((each_record*(0.54 - 0.46 * cos(2*pi*(index+1)/199)))**2) ste.append(ste_inner) #assert(len(ste) == len(zcr)) #calculate multiplier # multiplier = []; for index, each_record in enumerate(zcr): if(each_record <= ste[index]): multiplier.append(1) else: multiplier.append(0) print(multiplier); total = [] for each_multiplier in multiplier: total.extend([each_multiplier] * 200) padded_data = data; padded_data.extend([0]*padder) #assert(len(padded_data) == len(total)) multiplied = [x * y for x, y in zip(padded_data, total)] #First plot the multiplier # plt.plot(numpy.asfarray(padded_data)) plt.plot(numpy.asfarray(total)) plt.savefig('amono_audio_data_all_processed_superimposed.png') plt.clf() plt.plot(numpy.asfarray(multiplied)) plt.savefig('amono_audio_data_all_processed.png') plt.clf() sf.write('final.wav', multiplied, sample_rate) # Frame "multiplied" (which is already padded) with some overlap and calculate MFCC for each frame. # You get an vectors of MFCC coefficients # Store that in some db, look for how to build HMM based clasifier based on those MFCCs # Tell others to find how to use the classifer to get MFCC as input and get output as sequence of words # or phonemes # Build a phonetic dictionary # See YAHMM indexes = [ i for i, (x, y) in enumerate(zip(multiplier[:-1],multiplier[1:])) if x!=y] pprint(indexes) ##Framing routine # #Do not neeed indexes, just do the thing in multiplied. # #Use indexes for comparision of accuracy in Total number of words recognized Vs Actual number of words # # #Framing, each frame starts from 80th sample, with size 200 #Using MFCC library, we can eradicate the following code: # mfcc_feat = mfcc(numpy.asarray(multiplied), sample_rate); pprint(mfcc_feat); pprint(len(mfcc_feat[0])) return mfcc_feat
num_layers = 1
batch_size = 1
initial_learning_rate = 1e-2
momentum = 0.9

num_examples = 1
num_batches_per_epoch = int(num_examples / batch_size)

# Loading the data
audio_filename = maybe_download('LDC93S1.wav', 93638)
target_filename = maybe_download('LDC93S1.txt', 62)

fs, audio = wav.read(audio_filename)

inputs = mfcc(audio, samplerate=fs)

# Transform into a 3D array
train_inputs = np.asarray(inputs[np.newaxis, :])
train_inputs = (train_inputs - np.mean(train_inputs)) / np.std(train_inputs)
train_seq_len = [train_inputs.shape[1]]

# Reading targets
with open(target_filename, 'r') as f:
    # Only the last line is necessary
    line = f.readlines()[-1]

    # Keep only the words in [a-z] and drop the period
    original = ' '.join(line.strip().lower().split(' ')[2:]).replace('.', '')
    targets = original.replace(' ', '  ')
    targets = targets.split(' ')
from python_speech_features import mfcc
from python_speech_features import logfbank
import scipy.io.wavfile as wav
import numpy as np

(rate, sig) = wav.read(
    "C:\\Users\\Admin\\Music\\dulieuhocmay\\test_tatden\\Recording (1009).wav")
mfcc_feat = mfcc(sig, rate, 400 / rate, 160 / rate)
fbank_feat = logfbank(sig, rate, 400 / rate, 160 / rate)
for i in sig:
    print(i)
# print(rate)

a = mfcc_feat[0:900]
# print(a.shape)
a = a.ravel()
a = a.tolist()
if 11700 > len(a):
    l = len(a)
    a = a + [0] * (11700 - l)
t = 0
# print(a)
def get_audio_features(audio, feature_type, on_error=None, **kwargs): """ Returns audio features. # Arguments audio: dict or str. If dict, it should have keys, values as returned by `load_audio()`. If str, it should be a file that will be passed to `load_audio()`. feature_type. str. One of: - raw: Returns raw audio data (1-dimensional) - mfcc: Returns MFCC features - spec: Returns a spectrogram on_error: str or None (default: None). One of: - 'raise' or None: let the error propagate (no special catching) - 'suppress': catch the error and return None kwargs. Additional arguments that depend on `feature_type`: - For 'raw': no additional parameters - For 'mfcc': - features: int (default: 13). The number of MFCC features to keep. - low_freq: int (default: None). The low-frequency cutoff. - high_freq: int (default: None). The high-frequency cutoff. - For 'spec': - low_freq: int (default: None). The low-frequency cutoff. - high_freq: int (default: None). The high-frequency cutoff. """ assert on_error in (None, 'suppress', 'raise') if isinstance(audio, str): original_path = audio try: audio = load_audio(audio) except Exception: # pylint: disable=broad-except logger.exception('Failed to load audio file: %s', audio) if on_error == 'suppress': return None else: raise else: original_path = None if len(audio['signal']) < 1: logger.error('Failed to produce audio features while processing file ' '%s. Length: %d. Sample rate: %d.', original_path, len(audio['signal']), audio['sample_rate']) if on_error == 'suppress': return None else: raise ValueError('Audio data is too short.') if feature_type == 'raw': return audio['signal'] elif feature_type == 'mfcc': try: import python_speech_features except ImportError: logger.exception('"python_speech_features" is a required Python ' 'dependency for calculating MFCC features.') raise num_features = kwargs.get('features') or 13 return python_speech_features.mfcc( audio['signal'], audio['sample_rate'], numcep=num_features, nfilt=num_features*2, lowfreq=kwargs.get('low_freq') or 0, highfreq=kwargs.get('high_freq') or None ) elif feature_type == 'spec': # Window size, in seconds window_size = 0.020 # Step size, in seconds step_size = 0.010 signal = scale_signal(audio) hop_size = int(step_size * audio['sample_rate']) frame_size = int(window_size * audio['sample_rate']) if len(signal) < frame_size: logger.error('Failed to produce FFT while processing file ' '%s. Original length: %d. Hop size: %d. Frame size: %d. ' 'Sample rate: %d.', original_path, len(signal), hop_size, frame_size, audio['sample_rate']) if on_error == 'suppress': return None else: raise ValueError('Audio data is too short.') # Cleave off any samples that do not cleanly fit into our step size. remove = (len(signal) - frame_size) % hop_size if remove: clean = signal[:-remove] else: clean = signal # Optimization: instead of doing a for loop or list comprehension to # apply the window to the signal, we can just create a new view into # the data with each window. num_frames = (len(clean) - frame_size) // hop_size + 1 frames = numpy.lib.stride_tricks.as_strided( clean, shape=(frame_size, num_frames), strides=(clean.strides[0], clean.strides[0] * hop_size) ) filter_window = numpy.hanning(frame_size) fft = numpy.fft.rfft( frames * numpy.expand_dims(filter_window, -1), axis=0 ) norm = numpy.absolute(fft)**2 scale = numpy.sum(filter_window**2) * audio['sample_rate'] scaled = norm scaled[1:-1] /= scale/2 scaled[[0, -1]] /= scale spec = scaled # At this point, `spec` is shape (frequency, time). 
# Apply frequency cutoffs, if necessary low_freq = kwargs.get('low_freq') high_freq = kwargs.get('high_freq') if low_freq or high_freq: # Number of frequency bins num_bins = spec.shape[0] # Width of each frequency bin. delta_freq = 1 / window_size # Calculate the bin that a frequency would fall into. get_bin = lambda f, alt: \ ( min( max(int(f / delta_freq + 0.5), 0) + 1, num_bins ) if f else alt ) spec = spec[get_bin(low_freq, 0):get_bin(high_freq, num_bins)] spec = numpy.log(spec + 1e-14) # Format `spec` as (time, frequency) spec = spec.T return spec else: raise ValueError('Unsupported feature type: {}'.format(feature_type))
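# Illustrative usage sketch for get_audio_features above: 'clip.wav' is a hypothetical path;
# MFCC features with the default 13 coefficients are requested and loading errors are
# suppressed, so the call returns None instead of raising.
feats = get_audio_features('clip.wav', 'mfcc', on_error='suppress', features=13)
if feats is not None:
    print(feats.shape)  # (num_frames, 13)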
import glob
from scipy.io.wavfile import read
from python_speech_features import mfcc
from python_speech_features import delta
from python_speech_features import logfbank
import scipy.io.wavfile as wav
import numpy as np

for i in range(1, 10):
    for y in range(1, 10):
        wavs = []
        dirName = str(i) + '/' + str(y) + '/'
        for filename in glob.glob(dirName + '*.wav'):
            (rate, sig) = wav.read(filename)
            mfcc_feat = mfcc(sig, rate, 0.025, 0.01, 13, 26, 1200)

            # padding begins
            b = np.zeros((14352, 13))
            result = np.zeros(b.shape)
            result[:mfcc_feat.shape[0], :mfcc_feat.shape[1]] = mfcc_feat
            # end of padding

            wavs.append(result.ravel())
            # wavs.append(mfcc_feat)

        thefile = open(dirName + str(i) + '_' + str(y) + '_' + 'MFCCfeatures.txt', 'w')
        for x in range(len(wavs)):
            for item in wavs[x]:
                thefile.write("%s " % item)
            thefile.write("\n\n\n")
def get_features(filepath):
    (rate, signal) = wav.read(filepath)
    mfcc_features = mfcc(signal=signal,
                         samplerate=rate,
                         winlen=0.025,
                         winstep=0.01,
                         winfunc=lambda m: hamming(m),
                         appendEnergy=False)
    return mfcc_features
if __name__ == '__main__':
    # recording part
    # print("please, enter your name:")
    # label = input("")
    label = "on_inspection"
    print("Read the sentence into the microphone:")
    print(str(random_sentence))
    record_to_file('on_inspection.wav')
    # print("on_inspection: done - result written to on_inspection.wav")
    label = 'on_inspection'
    (rate, sig) = wav.read("on_inspection.wav")
    mfcc_feat = mfcc(sig,
                     rate,
                     winlen=0.094,
                     nfft=FFT_LENGTH,
                     numcep=numcep,
                     lowfreq=lowfreq,
                     highfreq=highfreq)

    # let's print the results
    print('\n\n')
    print('============================================================================')
    print('================================results:====================================')
    print('============================================================================')
    print('\n\n')
for k in range(en_range + 1):
    if st_loop < end[k] and en_loop < end[k]:
        if phoneme[k] not in unvoiced:   # define `unvoiced` elsewhere
            label[i] = 1
        else:
            label[i] = -1
        break
    if k != en_range:
        if st_loop < end[k] and en_loop > end[k]:
            if ((phoneme[k] not in unvoiced) or (phoneme[k + 1] not in unvoiced)):
                label[i] = 1
            else:
                label[i] = -1
            break

# check MFCC length
from python_speech_features import mfcc
import scipy.io.wavfile as wav

filename = "newtrainwithnoisep12\\s" + str(j + 1) + ".wav"
(Fs, data) = wav.read(filename)
Obs = mfcc(data, samplerate=Fs, winlen=0.025, winstep=0.01, numcep=13)

if len(Obs) > len(label):
    append_n = len(Obs) - len(label)
    for i in range(append_n):
        label[n + i] = 0
hop_length = 512
win_length = 1024

# Window and hop (in seconds)
win_len = win_length / rate
win_hop = hop_length / rate

lifter = 22
fmin = 0
fmax = rate / 2
coef_pre_enfase = 0.97
append_energy = 0

attr = mfcc(
    signal=signal,
    samplerate=rate,
    winlen=win_len,
    winstep=win_hop,
    numcep=n_mfcc,
    nfilt=n_mels,
    nfft=n_fft,
    lowfreq=fmin,
    highfreq=fmax,
    preemph=coef_pre_enfase,
    ceplifter=lifter,
    appendEnergy=append_energy,
    winfunc=hann
)

Visualization.plot_cepstrals(
    attr, fig_name="./normal_40.png")
def wav2feature(wav_paths, feature_type='logfbank', feature_dim=40, energy=True, delta1=True, delta2=True): """Read wav file & convert to MFCC or log mel filterbank features. Args: wav_paths (list): paths to a wav file batch_size (int, optional): the batch size feature_type (string, optional): logfbank or fbank or mfcc feature_dim (int, optional): the demension of each feature energy (bool, optional): if True, add energy delta1 (bool, optional): if True, add delta features delta2 (bool, optional): if True, add delta delta features Returns: inputs: A tensor of size `[B, T, input_size]` inputs_seq_len: A tensor of size `[B]` """ if feature_type not in ['logmelfbank', 'logfbank', 'fbank', 'mfcc']: raise ValueError( 'feature_type is "logmelfbank" or "logfbank" or "fbank" or "mfcc".') if not isinstance(wav_paths, list): raise ValueError('wav_paths must be a list.') if delta2 and not delta1: delta1 = True batch_size = len(wav_paths) max_time = 0 for wav_path in wav_paths: # Read wav file fs, audio = scipy.io.wavfile.read(wav_path) if len(audio) > max_time: max_time = len(audio) input_size = feature_dim if energy: input_size + 1 if delta2: input_size *= 3 elif delta1: input_size *= 2 inputs = None inputs_seq_len = np.zeros((batch_size,), dtype=np.int32) for i, wav_path in enumerate(wav_paths): if feature_type == 'mfcc': feat = mfcc(audio, samplerate=fs, numcep=feature_dim) if energy: energy_feat = fbank(audio, samplerate=fs, nfilt=feature_dim)[1] feat = np.c_[feat, energy_feat] else: fbank_feat, energy_feat = fbank( audio, samplerate=fs, nfilt=feature_dim) if feature_type == 'logfbank': fbank_feat = np.log(fbank_feat) feat = fbank_feat if energy: # logenergy = np.log(energy_feat) feat = np.c_[feat, energy_feat] if delta2: delta1_feat = _delta(feat, N=2) delta2_feat = _delta(delta1_feat, N=2) feat = np.c_[feat, delta1_feat, delta2_feat] elif delta1: delta1_feat = _delta(feat, N=2) feat = np.c_[feat, delta1_feat] # Normalize per wav feat = (feat - np.mean(feat)) / np.std(feat) if inputs is None: max_time = feat.shape[0] input_size = feat.shape[-1] inputs = np.zeros((batch_size, max_time, input_size)) inputs[i] = feat inputs_seq_len[i] = len(feat) return inputs, inputs_seq_len
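# Illustrative usage sketch for wav2feature above: the two WAV paths are hypothetical;
# 40-dimensional log mel filterbank features with energy and both delta orders are
# requested, matching the defaults in the signature.
inputs, inputs_seq_len = wav2feature(['utt1.wav', 'utt2.wav'], feature_type='logfbank',
                                     feature_dim=40, energy=True, delta1=True, delta2=True)
print(inputs.shape)        # (2, max_time, input_size)
print(inputs_seq_len)      # number of frames per utterance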
for c in classes:
    wav_file = df[df.label == c].iloc[0, 0]
    signal, rate = librosa.load('wavfiles/' + wav_file, sr=44100)
    mask = envelope(signal, rate, 0.0005)
    signal = signal[mask]
    signals[c] = signal
    fft[c] = calc_fft(signal, rate)

    # log filter bank from python_speech_features
    # nfft is the window size: 44100 / 40 = 1102.5
    bank = logfbank(signal[:rate], rate, nfilt=26, nfft=1103).T
    fbank[c] = bank  # store values

    # signal[:rate] is 1 second of audio; .T is used for the transpose
    mel = mfcc(signal[:rate], rate, numcep=13, nfilt=26, nfft=1103).T
    mfccs[c] = mel

# plotting the graphs of the signals
plot_signals(signals)
plt.show()

plot_fft(fft)
plt.show()

plot_fbank(fbank)
plt.show()

plot_mfccs(mfccs)
from keras.models import load_model

######################################
# Load the data, then compute the MFCCs as a feature vector of dimension (49, 39):
# 49 corresponds to the time steps and 39 to the features in each time step.
# librosa.effects.trim, though not used here, can trim the sound file to the useful part.
data = []
label = []
for i in range(0, 79):
    for j in range(0, 24):
        back, sr = sf.read("back_" + str(i) + "_" + str(j) + ".wav")
        # back, index = librosa.effects.trim(back)
        x = mfcc(y=back, sr=sr, n_mfcc=39)   # librosa-style mfcc signature
        pad = 49 - x.shape[1]                # renamed from `i` to avoid clobbering the loop index
        tp = np.zeros((39, pad))
        x = np.append(x, tp, axis=1)
        data.append(x.T)
        label.append(0)

        forward, sr = sf.read("forward_" + str(i) + "_" + str(j) + ".wav")
        # forward, index = librosa.effects.trim(forward)
        x = mfcc(y=forward, sr=sr, n_mfcc=39)
        pad = 49 - x.shape[1]
        tp = np.zeros((39, pad))
        x = np.append(x, tp, axis=1)
        data.append(x.T)
        label.append(1)
def process(self, y, sample_rate):
    return python_speech_features.mfcc(
        32768 * y,
        samplerate=sample_rate,
        winlen=self.duration,
        winstep=self.step,
        numcep=self.coefs)