def process_data(wav_files, phn_files):
    """Extract acoustic features and phoneme labels for paired file lists.

    Args:
        wav_files: Paths to wav files.
        phn_files: Paths to matching .phn label files, same order.

    Returns:
        Batched (inputs, targets) as produced by ``lists_batches``.
    """
    inputs = []
    targets = []
    for wav_file, phn_file in tqdm(list(zip(wav_files, phn_files))):
        # Acoustic features: MFCC + log filterbank -> (time_stamp, n_features).
        (rate, sig) = wav.read(wav_file)
        mfcc_feat = mfcc(sig, rate)
        fbank_feat = logfbank(sig, rate)
        acoustic_features = join_features(mfcc_feat, fbank_feat)

        # Phoneme labels: fold the 48-phoneme set to 39 and skip 'q'.
        phn_labels = []
        # BUG FIX: csv.reader requires text mode on Python 3; the original
        # opened the label file with 'rb'.
        with open(phn_file, 'r') as csvfile:
            phn_reader = csv.reader(csvfile, delimiter=' ')
            for row in phn_reader:
                if row[2] == 'q':
                    continue
                phn_labels.append(
                    phoneme_set_39[phoneme_48_39.get(row[2], row[2])] - 1)

        inputs.append(acoustic_features)
        targets.append(phn_labels)
    return lists_batches(inputs, targets)
def get_mel_spec(sig, sample_rate):
    """Compute a scaled log Mel-filterbank spectrum of an audio signal.

    Args:
        sig (array-like): Real-valued signal data.
        sample_rate (int): Sample rate in Hz.

    Returns:
        np.ndarray of shape (filters, frames) scaled into [0, 5], or
        None when the spectrum is flat or extraction fails.
    """
    try:
        mel_spec = python_speech_features.logfbank(
            sig,
            samplerate=sample_rate,
            winlen=0.005,
            winstep=0.005,
            nfilt=21,
            preemph=0.97).T
        lo = np.amin(mel_spec)
        hi = np.amax(mel_spec)
        if lo == hi:
            # Constant spectrum: nothing meaningful to normalize.
            return None
        # Rescale values into the range [0, 5] and round up.
        return np.ceil((mel_spec - lo) * (5 / np.abs(lo - hi)))
    except Exception as e:
        save_as_exception("Root", "Mel Spectrum", e)
        return None
def process_wav(wav_file):
    """Read a wav file and return joined MFCC + log-filterbank features.

    Returns:
        Feature matrix of shape (time_stamp, n_features).
    """
    rate, sig = wav.read(wav_file)
    return join_features(mfcc(sig, rate), logfbank(sig, rate))
def gen_filtered_spec(self, filenames, target_len=441000):
    """Build log-filterbank spectrograms with convolution-based deltas.

    Each wav is mixed down to its first channel and padded/truncated to
    ``target_len`` samples (default 441000 = 10 s at 44.1 kHz) so every
    example has the same shape.

    Args:
        filenames: Iterable of wav file paths.
        target_len: Fixed sample count each signal is padded/cut to
            (generalized from the original hard-coded 441000).

    Returns:
        np.ndarray of shape (n_files, frames, filters, 3).
    """
    x_data = []
    kernel = [-1, 2, -1]  # discrete Laplacian used as the "delta" filter
    for filename in filenames:
        fs, wav = wavfiles.read(filename)
        if len(wav.shape) > 1:
            wav = wav[:, 0]  # keep first channel only
        if wav.shape[0] < target_len:
            wav = np.pad(wav, (0, target_len - wav.shape[0]),
                         'constant', constant_values=(0))
        elif wav.shape[0] > target_len:
            wav = wav[0:target_len]
        Sxx = logfbank(wav, fs, winlen=0.04, winstep=0.02,
                       nfft=2048, nfilt=40)
        # Filtered "deltas" along the filter axis and the time axis.
        delta = ndimage.convolve1d(Sxx, weights=kernel, axis=1,
                                   mode='nearest')
        delta_2 = ndimage.convolve1d(Sxx, weights=kernel, axis=0,
                                     mode='nearest')
        data = np.dstack((Sxx, delta, delta_2))
        x_data.append(data.reshape(1, *data.shape))
    return np.vstack(x_data)
def gen_delta_delta(self, filenames, target_len=441000):
    """Build log-filterbank spectrograms with librosa delta features.

    Each wav is mixed down to its first channel and padded/truncated to
    ``target_len`` samples (default 441000 = 10 s at 44.1 kHz).

    Args:
        filenames: Iterable of wav file paths.
        target_len: Fixed sample count each signal is padded/cut to
            (generalized from the original hard-coded 441000).

    Returns:
        np.ndarray of shape (n_files, frames, filters, 3) stacking the
        spectrogram with its first- and second-order deltas.
    """
    x_data = []
    for filename in filenames:
        fs, wav = wavfiles.read(filename)
        if len(wav.shape) > 1:
            wav = wav[:, 0]  # keep first channel only
        if wav.shape[0] < target_len:
            wav = np.pad(wav, (0, target_len - wav.shape[0]),
                         'constant', constant_values=(0))
        elif wav.shape[0] > target_len:
            wav = wav[0:target_len]
        Sxx = logfbank(wav, fs, winlen=0.04, winstep=0.02,
                       nfft=2048, nfilt=40)
        delta = librosa.feature.delta(Sxx, order=1)
        delta_2 = librosa.feature.delta(Sxx, order=2)
        data = np.dstack((Sxx, delta, delta_2))
        x_data.append(data.reshape(1, *data.shape))
    return np.vstack(x_data)
def add_data(self, data_bytes):
    """Run the keyword model on a chunk of 16-bit PCM audio.

    Args:
        data_bytes: Raw little-endian int16 PCM bytes at 16 kHz.

    Returns:
        The softmax score at index 1, or None when ``data_bytes`` is empty.
    """
    if not data_bytes:
        return
    softmax_tensor = self.sess_.graph.get_tensor_by_name(self.output_name_)
    # Normalize int16 samples into [-1, 1).
    input_data = np.frombuffer(data_bytes, dtype='i2') / pow(2, 15)
    mels = logfbank(input_data, 16000, lowfreq=50.0, highfreq=4200.0,
                    nfilt=36, nfft=1024, winlen=0.020, winstep=0.010)
    # Renamed from `input`, which shadowed the builtin; the dead timing
    # variable (`bt = time.time()`) is also removed.
    feed_dict = {
        'fingerprint_4d:0':
            np.reshape(mels, (1, mels.shape[0], mels.shape[1], 1))
    }
    predictions, = self.sess_.run(softmax_tensor, feed_dict)
    return predictions[1]
def features(df, opt, path):
    """Compute example features for one file per gender class.

    For the first file of each unique ``df.Gender`` value: raw signal,
    envelope-masked signal, FFT, STFT, log filterbank and MFCC features.

    Args:
        df: DataFrame with a `Gender` column and file names in column 0.
        opt: Options object with threshold/nfilt/nfeat/nfft attributes.
        path: Directory prefix for the wav files.

    Returns:
        Tuple of dicts keyed by class:
        (sign, signal_env, ffts, fbanks, mfccs, stf_f).
    """
    df.reset_index(inplace=True)
    sign, signal_env, ffts, fbanks, mfccs, stf_f = {}, {}, {}, {}, {}, {}
    for c in list(np.unique(df.Gender)):
        wav = df[df.Gender == c].iloc[0, 0]  # first file of this class
        s, r = librosa.load(path + wav)
        sign[c] = s
        # Drop low-energy samples via the envelope mask.
        mask = envelope(s, r, opt.threshold)
        s = s[mask]
        signal_env[c] = s
        ffts[c] = fft(s, r)
        _, _, Zxx = signal.stft(s, fs=r, window='hann', nperseg=256)
        stf_f[c] = Zxx
        # Use only the first second of audio for fbank/mfcc.
        fbanks[c] = logfbank(s[:r], r, nfilt=opt.nfilt, nfft=1103).T
        mfccs[c] = mfcc(s[:r], r, numcep=opt.nfeat,
                        nfilt=opt.nfilt, nfft=opt.nfft).T
    # BUG FIX: the original returned the `fft` function object instead of
    # the computed `ffts` dict.
    return sign, signal_env, ffts, fbanks, mfccs, stf_f
def generate_train_data(text_filepath):
    """Build compressed train/test archives from a wav manifest.

    Each manifest line is "<wav_path> ... <label>"; the path component at
    index 2 ('train'/'test') decides the split. Training examples are
    shuffled before saving.

    Args:
        text_filepath: Path to the manifest file.
    """
    train_data_list = list()
    test_data_list = list()
    test_label_list = list()
    samplerate = None  # stays None if the manifest is empty
    with open(text_filepath, 'r', encoding='utf-8') as fr:
        while True:
            line = fr.readline().split()
            if not line:
                # EOF (or a blank line) terminates the manifest.
                break
            samplerate, data = wavfile.read(line[0])
            logfb_feat = util_module.standardization_func(logfbank(data))
            temp_path = line[0].split('\\\\')
            if temp_path[2] == 'test':
                test_data_list.append(logfb_feat)
                test_label_list.append(int(line[-1]))
            elif temp_path[2] == 'train':
                # Keep (features, label) together so shuffling keeps pairs.
                train_data_list.append([logfb_feat, int(line[-1])])
    random.shuffle(train_data_list)
    train_data = np.asarray([one[0] for one in train_data_list])
    train_label = np.asarray([one[1] for one in train_data_list])
    test_data = np.asarray(test_data_list)
    test_label = np.asarray(test_label_list)
    # BUG FIX: the original opened both output files at function entry and
    # never closed them; context managers guarantee the data is flushed.
    with open(mod_train_data_path, 'wb') as fwb_train:
        np.savez_compressed(fwb_train, label=train_label,
                            data=train_data, rate=samplerate)
    with open(mod_test_data_path, 'wb') as fwb_test:
        np.savez_compressed(fwb_test, label=test_label,
                            data=test_data, rate=samplerate)
    return
def get_vector(sig, rate):
    """Frame-level features: VAD flag, zero-crossing rate, min-magnitude
    FFT bin, plus MFCCs, their gradient, and log filterbank energies.

    Args:
        sig: 16-bit mono signal array.
        rate: Sample rate in Hz.

    Returns:
        np.ndarray of shape (n_frames, 12 + 12 + 5 + 3).
    """
    # BUG FIX: the original seeded `vec` with np.empty((1, 3)), so one row
    # of uninitialized memory was included in the output. Start empty.
    vec = np.empty((0, 3))
    # Hoisted out of the loop: the VAD configuration never changes.
    vad = webrtcvad.Vad()
    vad.set_mode(2)
    start = 0
    end = 320
    while sig.shape[0] >= end + 160:
        res = vad.is_speech(sig[start:end].tobytes(), rate)  # speech flag
        # Zero crossings per 20 ms window, scaled to a per-second rate.
        zero_crosses = np.nonzero(
            np.diff(sig[start:end] > 0))[0].shape[0] / 0.02
        f = scipy.fft(sig[start:end])
        f0 = min(np.absolute(f))  # smallest spectral magnitude
        start = start + 160
        end = end + 160
        vec = np.vstack((vec, np.array([res, zero_crosses, f0], ndmin=2)))
    mfcc_feat = mfcc(sig, rate, numcep=12, winlen=0.020)[0:vec.shape[0], :]
    fbank = logfbank(sig, rate, nfilt=5)[0:vec.shape[0], :]
    mfcc_grad = np.gradient(mfcc_feat, axis=0)  # first derivative
    return np.hstack((mfcc_feat, mfcc_grad, fbank, vec))
def get_features_from_spectrogram_with_filterbank(filepath, sample_rate,
                                                  N_FFT, window_size=30,
                                                  step_size=10, eps=1e-10):
    """Hann-window spectrogram, standardized per time step, then run
    through a log Mel filterbank.

    Args:
        filepath: Path to the wav file.
        sample_rate: Sample rate in Hz.
        N_FFT: FFT length for the spectrogram.
        window_size: Spectrogram window in milliseconds.
        step_size: Spectrogram step in milliseconds.
        eps: Small offset added before the filterbank.

    Returns:
        (freqs, times, fbank_feat)
    """
    rate, width, sig = wavio.readwav(filepath)
    sig = sig.ravel()
    # Convert millisecond window geometry into sample counts.
    nperseg = int(round(window_size * sample_rate / 1e3))
    noverlap = int(round(step_size * sample_rate / 1e3))
    freqs, times, spec = signal.spectrogram(
        sig, fs=sample_rate, window='hann', nperseg=nperseg,
        noverlap=noverlap, nfft=N_FFT, detrend=False)
    # Standardize each column of the spectrogram.
    spec = (spec - np.mean(spec, axis=0)) / np.std(spec, axis=0)
    # NOTE(review): logfbank normally expects a raw signal; here the
    # normalized spectrogram is fed in — preserved as-is.
    fbank_feat = logfbank(spec.T.astype(np.float32) + eps, sample_rate,
                          winlen=0.030, winstep=0.01, nfilt=26, nfft=512,
                          lowfreq=0, highfreq=sample_rate / 4, preemph=0.97)
    return freqs, times, fbank_feat
def get_mfcc(audio_path):
    """MFCC and log-filterbank features over 40 ms non-overlapping
    windows, concatenated along the feature axis.

    Asserts the file is sampled at 16 kHz.
    """
    fs, audio = wav.read(audio_path)
    assert (fs == 16000)
    feats = (mfcc(audio, samplerate=fs, winlen=0.04, winstep=0.04),
             logfbank(audio, samplerate=fs, winlen=0.04, winstep=0.04))
    return numpy.concatenate(feats, 1)
def create_infer_batch(self):
    """Build d-vector input batches for the two wavs named in hparams.

    self.hparams.in_wav1 / in_wav2 are full wav paths, e.g.
    /home/hdd2tb/ninas96211/dev_wav_set/id10343_pCDWKHjQjso_00002.wav.
    Each file is VAD-filtered, turned into 40-filter log-mel features,
    and sliced into overlapping fixed-length windows.

    Returns:
        (wav1_data, wav2_data, match) — the two windowed feature stacks
        and True when both files share the same speaker-id prefix.
    """
    for wav_path in [self.hparams.in_wav1, self.hparams.in_wav2]:
        # file_name for ex) id10343_pCDWKHjQjso_00002
        wav_id = os.path.splitext(os.path.basename(wav_path))[0]
        audio, sample_rate = vad_ex.read_wave(wav_path)
        vad = webrtcvad.Vad(1)
        frames = list(vad_ex.frame_generator(30, audio, sample_rate))
        segments = vad_ex.vad_collector(sample_rate, 30, 300, vad, frames)
        total_wav = b""
        for i, segment in enumerate(segments):
            total_wav += segment
            print(wav_id + " : " + str(i) + "th segment appended")
        # Unpack the voiced bytes as 16-bit PCM without writing a file.
        wav_arr = np.frombuffer(total_wav, dtype=np.int16)
        print("read audio data from byte string. np array of shape:"
              + str(wav_arr.shape))
        self.save_dict[wav_id] = logfbank(
            wav_arr, samplerate=sample_rate, nfilt=40)

    # Window geometry: 100 feature frames per second of segment length.
    num_frames = self.hparams.segment_length * 100
    num_overlap_frames = num_frames * self.hparams.overlap_ratio

    dvector_dict = {}
    match = False
    prev_wav_name = ""
    for wav_name, feats in self.save_dict.items():
        if wav_name.split("_")[0] == prev_wav_name:
            print("spk_id" + wav_name.split("_")[0])
            match = True
        total_len = feats.shape[0]
        num_dvectors = int((total_len - num_overlap_frames)
                           // (num_frames - num_overlap_frames))
        print("num dvec:" + str(num_dvectors))
        dvectors = []
        for dvec_idx in range(num_dvectors):
            start_idx = int((num_frames - num_overlap_frames) * dvec_idx)
            end_idx = int(start_idx + num_frames)
            print("wavname: " + wav_name + " start_idx: " + str(start_idx))
            print("wavname: " + wav_name + " end_idx: " + str(end_idx))
            dvectors.append(feats[start_idx:end_idx, :])
        dvector_dict[wav_name] = np.asarray(dvectors)
        prev_wav_name = wav_name.split("_")[0]

    wav1_data = list(dvector_dict.values())[0]
    wav2_data = list(dvector_dict.values())[1]
    print("match: " + str(match))
    print("wav1_data.shape:" + str(wav1_data.shape))
    print("wav2_data.shape:" + str(wav2_data.shape))
    return wav1_data, wav2_data, match
def _mel(y, sr, win_len, win_step, num_features, n_fft, f_min, f_max):
    """Convert a wav signal into a logarithmically scaled mel filterbank.

    Args:
        y (np.ndarray): Wav signal.
        sr (int): Sampling rate.
        win_len (float): Window length in seconds.
        win_step (float): Window stride in seconds.
        num_features (int): Number of features to generate.
        n_fft (int): FFT length.
        f_min (float): Minimum frequency to consider.
        f_max (float): Maximum frequency to consider.

    Returns:
        np.ndarray: Mel-filterbank of shape [time, num_features].
    """
    return psf.logfbank(signal=y,
                        samplerate=sr,
                        winlen=win_len,
                        winstep=win_step,
                        nfilt=num_features,
                        nfft=n_fft,
                        lowfreq=f_min,
                        highfreq=f_max,
                        preemph=0.97)
def extract_feature(sig, rate, feature_type="MFCC", cmvn=True, delta=True):
    """Extract features from an audio signal.

    Args:
        sig: Audio samples.
        rate: Sample rate in Hz.
        feature_type: "fbank" or "MFCC".
        cmvn: Apply cepstral mean/variance normalization.
        delta: Append first- and second-order deltas.

    Returns:
        Feature matrix of shape [frames, feature_dim].
    """
    if feature_type == "fbank":
        feature = psf.logfbank(sig, rate)
    elif feature_type == "MFCC":
        feature = psf.mfcc(sig, rate)
    else:
        raise ValueError("不支持的特征抽取方法.")

    if cmvn:
        # Sufficient statistics: row 0 = sums (+ frame count in the last
        # column), row 1 = sums of squares.
        cmvn_stats = np.zeros((2, feature.shape[1] + 1))
        cmvn_stats[0, :-1] = feature.sum(axis=0)
        cmvn_stats[0, -1] = feature.shape[0]
        cmvn_stats[1, :-1] = (feature**2).sum(axis=0)
        mean = cmvn_stats[0, :-1] / cmvn_stats[0, -1]
        var = cmvn_stats[1, :-1] / cmvn_stats[0, -1] - mean * mean
        var = np.maximum(var, 1e-20)  # guard against zero variance
        scale = 1 / np.sqrt(var)
        norm_stats = np.zeros((2, feature.shape[1]))
        norm_stats[0] = -(mean * scale)
        norm_stats[1] = scale
        # x * scale - mean * scale  ==  (x - mean) / std
        feature = np.dot(feature, np.diag(norm_stats[1]))
        feature += norm_stats[0]

    if delta:
        delta_feat = psf.delta(feature, 2)
        delta_delta_feat = psf.delta(delta_feat, 2)
        feature = np.column_stack((feature, delta_feat, delta_delta_feat))
    return feature
def build_rand_feat():
    """Draw n_samples random fixed-length windows from the cleaned wavs,
    compute log-filterbank features, min-max scale them globally, and
    cache everything in `config`.

    Returns:
        (X, y): features shaped for config.mode, one-hot labels.
    """
    cached = check_data()
    if cached:
        return cached.data[0], cached.data[1]
    X, y = [], []
    _min, _max = float('inf'), -float('inf')
    for _ in tqdm(range(n_samples)):
        # Sample a class by its distribution, then a random file of it.
        rand_class = np.random.choice(class_dist.index, p=prob_dist)
        file = np.random.choice(df[df.label == rand_class].index)
        rate, wav = wavfile.read('clean/' + file)
        label = df.at[file, 'label']
        # Random window of config.step samples.
        rand_index = np.random.randint(0, wav.shape[0] - config.step)
        sample = wav[rand_index:rand_index + config.step]
        X_sample = logfbank(sample, rate,
                            nfilt=config.nfilt, nfft=config.nfft)
        _min = min(np.amin(X_sample), _min)
        _max = max(np.amax(X_sample), _max)
        X.append(X_sample)
        y.append(classes.index(label))
    config.min = _min
    config.max = _max
    X, y = np.array(X), np.array(y)
    X = (X - _min) / (_max - _min)  # global min-max normalization
    if config.mode == 'conv':
        X = X.reshape(X.shape[0], X.shape[1], X.shape[2], 1)
    elif config.mode == 'time':
        X = X.reshape(X.shape[0], X.shape[1], X.shape[2])
    y = to_categorical(y, num_classes=40)
    config.data = (X, y)
    with open(config.p_path, 'wb') as handle:
        pickle.dump(config, handle, protocol=2)
    return X, y
def make_feature(y, sr):
    """Compute the globally configured feature type for one signal.

    'fft' -> log-magnitude STFT; 'fbank' -> log filterbank; anything
    else -> MFCC. Non-fft features get first/second-order deltas stacked.
    Returns a normalized float32 matrix.
    """
    if FEATURE == 'fft':
        S = librosa.stft(y, n_fft=N_FFT, hop_length=HOP_LEN, window=hamming)
        magnitude, _ = librosa.magphase(S)
        feature = np.log1p(magnitude).transpose()
    else:
        if FEATURE == 'fbank':
            feature = logfbank(y, sr, nfilt=FEATURE_LEN,
                               winlen=WIN_LEN, winstep=WIN_STEP)
            assert feature.shape[-1] == FEATURE_LEN, '{}'.format(
                feature.shape[-1])
        else:
            feature = mfcc(y, sr, nfilt=FEATURE_LEN,
                           winlen=WIN_LEN, winstep=WIN_STEP)
        # Stack the base features with first/second order deltas.
        feature = np.hstack(
            [feature, delta(feature, N=1), delta(feature, N=2)])
    return normalize(feature).astype(np.float32)
def draw_multiple_graph_with_one_data(data, sr, **kwargs):
    """Plot the raw signal, its log-filterbank features, and the min-max
    normalized features as three subplots.

    Keyword Args:
        x_number, y_number: Subplot grid dimensions. Default to a 3x1
            layout; the original raised NameError when either was omitted.
    """
    x_num = kwargs.get('x_number', 3)
    y_num = kwargs.get('y_number', 1)

    plt.subplot(x_num, y_num, 1)
    plt.plot(data)
    plt.title('Raw Signal')
    plt.xlabel('sample rate')
    plt.ylabel('amplitude')

    # Log filterbank features, transposed so filters are on the y axis.
    data = logfbank(data, sr)
    data = draw_single_graph.transpose_the_matrix(data)
    plt.subplot(x_num, y_num, 2)
    plt.pcolormesh(data)
    plt.title('Feature Vector')
    plt.xlabel('frame sequence')
    plt.ylabel('number of filters')
    plt.colorbar()

    # Same features after min-max normalization.
    data = draw_single_graph.new_minmax_normal([data])
    data = data[0]
    plt.subplot(x_num, y_num, 3)
    plt.pcolormesh(data)
    plt.title('Normalized Feature Vector')
    plt.xlabel('frame sequence')
    plt.ylabel('number of filters')
    plt.colorbar()
    return
def generator():
    """Yield (features, label) pairs over a shuffled copy of the dataset.

    The feature type is selected by `input_feature`:
    'fbank', 'logfbank', 'mfcc', or 'raw' (signal as a single column).
    """
    shuffled_wavs, shuffled_labels = _shuffle(wav_files, labels)
    win_len = window_size_ms / 1000
    win_step = window_stride_ms / 1000
    for wav_file, label in zip(shuffled_wavs, shuffled_labels):
        signal, sample_rate, _ = read_audio(wav_file, desired_ms)
        if input_feature == 'fbank':
            feat, _ = fbank(signal, sample_rate, winlen=win_len,
                            winstep=win_step, nfilt=input_feature_dim)
        elif input_feature == 'logfbank':
            feat = logfbank(signal, sample_rate, winlen=win_len,
                            winstep=win_step, nfilt=input_feature_dim)
        elif input_feature == 'mfcc':
            feat = mfcc(signal, sample_rate, winlen=win_len,
                        winstep=win_step, nfilt=input_feature_dim,
                        numcep=input_feature_dim)
        elif input_feature == 'raw':
            feat = np.expand_dims(signal, 1)
        yield (feat, label)
def __call__(self, data):
    """Convert one sample dict into model-ready features.

    Loads audio from data['fname'], computes log-filterbank features
    (shape: time x num_features), standardizes them over the utterance,
    and returns targets/inputs with their lengths.

    Args:
        data: Dict with at least 'fname' and 'text' keys.

    Returns:
        Dict with 'target', 'target_length', 'input', 'input_length'.
    """
    samples, sample_rate = librosa.load(data['fname'], self.sample_rate)
    # T x F log mel-filterbank energies.
    features = psf.logfbank(signal=samples,
                            samplerate=sample_rate,
                            winlen=self.window_size,
                            winstep=self.window_stride,
                            nfilt=self.num_features,
                            nfft=512,
                            lowfreq=0,
                            highfreq=sample_rate / 2,
                            preemph=0.97)
    # Zero-mean, unit-variance over the whole utterance.
    features = (features - np.mean(features)) / np.std(features)
    # The unused `audio_duration` local (and its commented-out uses) has
    # been removed.
    return {
        'target': data['text'],
        'target_length': len(data['text']),
        'input': features.astype(np.float32),
        'input_length': features.shape[0],
    }
def create_pickle(self, path, wav_arr, sample_rate):
    """Serialize log-mel features plus speaker/clip metadata to a pickle.

    Clips shorter than hparams.segment_length seconds are skipped.

    Args:
        path: Source file path; its components supply the ids.
        wav_arr: Signal samples.
        sample_rate: Sample rate in Hz.
    """
    duration = round(wav_arr.shape[0] / sample_rate, 1)
    if not duration > self.hparams.segment_length:
        print("wav length smaller than 1.6s: " + path)
        return

    save_dict = {}
    logmel_feats = logfbank(wav_arr, samplerate=sample_rate,
                            nfilt=self.hparams.spectrogram_scale)
    print("created logmel feats from audio data. np array of shape:"
          + str(logmel_feats.shape))
    save_dict["LogMel_Features"] = logmel_feats

    parts = path.split("/")
    data_type = self.hparams.data_type
    if data_type == "vox1" or data_type == "vox2":
        # VoxCeleb layout: .../<speaker>/<clip>/<wav>
        data_id = "_".join(parts[-3:])
        save_dict["SpkId"] = parts[-3]
        save_dict["ClipId"] = parts[-2]
        save_dict["WavId"] = parts[-1]
        if data_type == "vox1":
            pickle_f_name = data_id.replace("wav", "pickle")
        elif data_type == "vox2":
            pickle_f_name = data_id.replace("m4a", "pickle")
    elif data_type == "libri":
        # LibriSpeech layout: .../<speaker>/<wav>
        data_id = "_".join(parts[-2:])
        save_dict["SpkId"] = parts[-2]
        save_dict["WavId"] = parts[-1]
        pickle_f_name = data_id.replace("wav", "pickle")
    print(pickle_f_name)
    with open(self.hparams.pk_dir + "/" + pickle_f_name, "wb") as f:
        pickle.dump(save_dict, f, protocol=3)
def make_train_logfb(self, **kwarg):
    """Compute log-filterbank features for saved raw-signal archives and
    write one compressed .npz per output name.

    Keyword Args:
        raw_filename: Iterable of input .npz names under self.return_path.
        logfb_filename: Iterable of output file names, parallel to inputs.

    Raises:
        KeyError: When either keyword argument is missing (the original
            fell through to a NameError instead).
    """
    raw_filename = kwarg["raw_filename"]
    logfb_filename = kwarg["logfb_filename"]
    num = 0
    for raw_f, logfb_f in zip(raw_filename, logfb_filename):
        raw_f = self.return_path + '\\' + raw_f
        loaded_data = np.load(raw_f, allow_pickle=True)
        label = loaded_data['label']
        raw_signal = loaded_data['data']
        sample_rate = loaded_data['rate']
        logfb_list = list()
        for sig, rate in zip(raw_signal, sample_rate):
            logfb_list.append(logfbank(sig, rate))
            num += 1
            print(num)  # progress counter
        logfb_data = np.asarray(logfb_list)
        # BUG FIX: use a context manager so the output file is closed
        # even when savez_compressed raises.
        with open(logfb_f, 'wb') as fwb:
            np.savez_compressed(fwb, label=label, data=logfb_data,
                                rate=sample_rate)
    return
def get_feature_vectors(file, directory, no_of_frames, start_frame):
    """Stack MFCC, delta-MFCC, delta-delta-MFCC and filterbank features
    for a window of frames from one wav file.

    Args:
        file: Wav file name.
        directory: Directory containing the file.
        no_of_frames: Number of frames in the window.
        start_frame: First frame index of the window.

    Returns:
        numpy array of shape (no_of_frames, 3*no_of_features +
        no_of_fbank_features).
    """
    rate, sig = wav.read(os.path.join(directory, file))
    fbank_feat = logfbank(sig, rate, nfft=2048)
    mfcc_feat = mfcc(sig, rate, winlen=0.032, winstep=0.016,
                     numcep=13, nfft=2048)
    d_mfcc_feat = delta(mfcc_feat, 2)
    dd_mfcc_feat = delta(d_mfcc_feat, 2)
    window = slice(start_frame, start_frame + no_of_frames)
    return numpy.hstack((
        mfcc_feat[window, :no_of_features],
        d_mfcc_feat[window, :no_of_features],
        dd_mfcc_feat[window, :no_of_features],
        fbank_feat[window, :no_of_fbank_features],
    ))
def get_audio(audio_path, is_crop=True):
    """Load audio and return 40-filter log filterbank features.

    Falls back to scipy's wav reader if wavio rejects the file. The DC
    offset is removed before the filterbank. When `is_crop` is set, the
    output is zero-padded or truncated to exactly 1024 frames.
    """
    try:
        wave_obj = wavio.read(audio_path)
        rate = wave_obj.rate
        sig = np.squeeze(wave_obj.data)
    except TypeError:
        rate, sig = wav.read(audio_path)
    sig = sig - np.mean(sig)  # remove DC offset
    fbank_feat = logfbank(sig, rate, winlen=0.025, winstep=0.01, nfilt=40,
                          nfft=512, lowfreq=0, highfreq=None, preemph=0.97)
    if is_crop:
        n_frames = fbank_feat.shape[0]
        if n_frames < 1024:
            pad = np.zeros((1024 - n_frames, 40))
            fbank_feat = np.concatenate([fbank_feat, pad], 0)
        else:
            fbank_feat = fbank_feat[:1024]
    return fbank_feat
def print_features():
    """Demo: extract MFCC and filterbank features from a fixed sample
    wav, print their dimensions, and display both as matrices."""
    file = ('/home/james/Documents/git/SpeechRecognition/'
            'hmm-speech-recognition-0.1/audio/apple/apple01.wav')
    sampling_freq, signal = wavfile.read(file)
    signal = signal[:10000]  # analyze only the first 10,000 samples

    # MFCC features.
    features_mfcc = mfcc(signal, sampling_freq)
    print('\nMFCC:\nNumber of windows =', features_mfcc.shape[0])
    print('Length of each feature =', features_mfcc.shape[1])
    plt.matshow(features_mfcc.T)
    plt.title('MFCC')

    # Filter bank features.
    features_fb = logfbank(signal, sampling_freq)
    print('\nFilter bank:\nNumber of windows =', features_fb.shape[0])
    print('Length of each feature =', features_fb.shape[1])
    plt.matshow(features_fb.T)
    plt.title('Filter bank')
    plt.show()
def collect_features(samples, feature_type='mfcc', sr=48000,
                     window_size_ms=20, window_shift_ms=10, num_filters=40,
                     num_mfcc=40, window_function=None):
    """Collects fbank and mfcc features.

    Args:
        samples: Audio samples.
        feature_type: String containing 'fbank' or 'mfcc'.
        sr: Sample rate in Hz.
        window_size_ms / window_shift_ms: Window geometry in milliseconds.
        num_filters / num_mfcc: Filterbank size / number of cepstra.
        window_function: None, or a string containing 'hamming' or 'hann';
            anything else falls back to the rectangular window that
            python_speech_features uses by default.

    Returns:
        (feats, frame_length, window_size_ms)

    Raises:
        ValueError: If feature_type names neither 'fbank' nor 'mfcc'
            (the original crashed with NameError instead).
    """
    if not window_function:
        # Default for python_speech_features: rectangular window.
        def window_function(x):
            return np.ones((x,))
    elif 'hamming' in window_function:
        window_function = hamming
    elif 'hann' in window_function:
        window_function = hann
    else:
        # Default for python_speech_features: rectangular window.
        def window_function(x):
            return np.ones((x,))
    # NOTE(review): window_function is selected but never passed to
    # logfbank/mfcc below — preserved as in the original; confirm intent.
    if len(samples) / sr * 1000 < window_size_ms:
        # Clip shorter than one window: shrink the window to fit.
        window_size_ms = len(samples) / sr * 1000
    frame_length = calc_frame_length(window_size_ms, sr)
    if 'fbank' in feature_type:
        feats = logfbank(samples,
                         samplerate=sr,
                         winlen=window_size_ms * 0.001,
                         winstep=window_shift_ms * 0.001,
                         nfilt=num_filters,
                         nfft=frame_length)
    elif 'mfcc' in feature_type:
        feats = mfcc(samples,
                     samplerate=sr,
                     winlen=window_size_ms * 0.001,
                     winstep=window_shift_ms * 0.001,
                     nfilt=num_filters,
                     numcep=num_mfcc,
                     nfft=frame_length)
    else:
        raise ValueError(
            "feature_type must contain 'fbank' or 'mfcc', got {!r}".format(
                feature_type))
    return feats, frame_length, window_size_ms
def mffcRead(path):
    """Read a wav file and return its log filterbank energy features.

    Args:
        path: Path to the wav file. (Renamed from `str`, which shadowed
            the builtin; positional callers are unaffected.)

    Returns:
        Log filterbank feature matrix.
    """
    rate, sig = wav.read(path)
    # The original also computed MFCCs and their deltas but never used
    # them; that dead work has been removed.
    return logfbank(sig, rate)
def load_data(path):
    """Walk `path` and compute 40-filter log-filterbank features for
    every .wav file, grouped per directory (one speaker per directory).

    Args:
        path: Root directory to walk.

    Returns:
        list of lists: data[speaker][utterance] -> feature matrix.
    """
    data = []
    wav_files_count = 0
    for root, dirs, files in os.walk(path):
        # All wav files in the current directory belong to one speaker.
        wav_files = [f for f in files if f.endswith('.wav')]
        data_same_person = []
        for wav_file in wav_files:
            # Use os.path.join instead of manual '/' concatenation.
            rate, sig = wav.read(os.path.join(root, wav_file))
            data_same_person.append(logfbank(sig, rate, nfilt=40))
        if wav_files:
            wav_files_count += len(wav_files)
            data.append(data_same_person)
    # Unused label_index / label_count locals from the original removed.
    return data
def get_MFCC(sr, audio):
    """MFCC features of `audio` after a noise-reduction pass.

    A low-shelf boost derived from the strongest filterbank frame is
    applied to suppress low-frequency noise before the final MFCC
    extraction; the result is feature-scaled.
    """
    # Initial MFCC pass (result overwritten; kept for parity with the
    # original flow).
    features = mfcc.mfcc(audio, sr)

    # --- Noise removal -------------------------------------------------
    features = mfcc.logfbank(audio)   # filterbank energies of the signal
    features = mfcc.lifter(features)  # boost high-frequency DCT coeffs

    # Energy per frame as sum of squares; squaring makes sign irrelevant.
    sum_of_squares = [sum(n**2 for n in r) for r in features]
    strongest_frame = sum_of_squares.index(max(sum_of_squares))
    # Convert the strongest frame's coefficients to Hz.
    hz = mfcc.mel2hz(features[strongest_frame])
    min_hz = min(hz)
    # Low-shelf filter that attenuates content below min_hz.
    speech_booster = AudioEffectsChain().lowshelf(
        frequency=min_hz * (-1), gain=20.0, slope=0.5)
    y_speech_boosted = speech_booster(audio)  # apply to original audio

    # --- Final MFCC calculation ---------------------------------------
    features = mfcc.mfcc(y_speech_boosted, sr, 0.025, 0.01, 16, nfilt=40,
                         nfft=512, appendEnergy=False, winfunc=np.hamming)
    # Feature scaling so all dimensions are comparable.
    return preprocessing.scale(features)
def load_and_fbank(filename, labels=None):
    """Load a TSV manifest of (wave_file, emotion) rows and compute
    stacked log-mel + delta features for each wav.

    Args:
        filename: Manifest path; each line is "<wav>\t<emotion>"; blank
            lines and '#' comments are skipped.
        labels: Optional existing emotion->index mapping, extended in
            place. BUG FIX: the original used a mutable default ({}),
            which leaked the mapping across calls.

    Returns:
        (X, y, labels): list of (time, 40, 3) float32 feature tensors,
        list of int32 label arrays, and the label mapping.
    """
    if labels is None:
        labels = {}
    X, y = [], []
    with open(filename, 'r') as manifest:
        for line in manifest:
            line = line.strip()
            if line == '' or line.startswith('#'):
                continue
            row = line.split('\t')
            if len(row) < 2:
                sys.stderr.write('invalid record: {}\n'.format(line))
                continue
            wave_file, emotion = row
            if emotion not in labels:
                labels[emotion] = len(labels)
            with wave.open(os.path.join(root_dir, wave_file), 'rb') as file:
                params = file.getparams()
                nchannels, sampwidth, framerate, nframes, comptype, \
                    compname = params[:6]
                str_data = file.readframes(nframes)
            # BUG FIX: np.fromstring is deprecated/removed for byte input;
            # np.frombuffer is the supported equivalent.
            wavedata = np.frombuffer(str_data, dtype=np.short)
            winlen = 0.025
            winstep = 0.01
            nfft = int(winlen * framerate)
            nfilt = 40
            mel_spec = ps.logfbank(wavedata, samplerate=framerate,
                                   winlen=winlen, winstep=winstep,
                                   nfilt=nfilt, nfft=nfft)
            delta1 = ps.delta(mel_spec, 2)
            delta2 = ps.delta(delta1, 2)
            time = mel_spec.shape[0]
            # Channels: mel spectrum, first delta, second delta.
            feature = np.empty((time, nfilt, 3), dtype=np.float32)
            feature[:, :, 0] = mel_spec
            feature[:, :, 1] = delta1
            feature[:, :, 2] = delta2
            X.append(feature)
            y.append(np.array([labels[emotion]], 'i'))
    logger.info('Loading dataset ... done.')
    sys.stdout.flush()
    return X, y, labels
def get_fbank_fea(self, wave_file):
    """Log-filterbank features for one wav, standardized and batched.

    Returns:
        (train_fbank_inputs, train_seq_len): features of shape
        (1, time, nfilt) and the sequence length as a one-item list.
    """
    fs, audio = wav.read(wave_file)
    feats = logfbank(audio, samplerate=fs)
    batch = np.asarray(feats[np.newaxis, :])
    # Zero-mean, unit-variance over the whole utterance.
    batch = (batch - np.mean(batch)) / np.std(batch)
    return batch, [batch.shape[1]]
def get_features(path):
    """Return (label, feature_chunks) for one wav file.

    The label is the first three characters of the file name; features
    are 40-filter log filterbank frames (15 ms step) split into chunks
    of 100 frames by ``split_speech``.

    Args:
        path: Path to the wav file.
    """
    wav_name = path[path.rfind('/') + 1:]
    label = wav_name[:3]
    # The original computed a hard-coded, unused `output_dir` here;
    # that dead code has been removed.
    rate, sig = wav.read(path)
    logfbank0015_feat = logfbank(sig, rate, winstep=0.015, nfilt=40)
    feat_list = split_speech(logfbank0015_feat, 100)
    return label, feat_list
def extract_fbank(sound):
    """Compute log-filterbank features from in-memory wav bytes.

    Args:
        sound: Complete wav file contents as a byte string.

    Returns:
        Log filterbank feature matrix.
    """
    # BUG FIX: Python 2's StringIO.StringIO does not exist on Python 3;
    # io.BytesIO accepts the byte payload on both versions.
    import io
    rate, sig = wav.read(io.BytesIO(sound))
    return features.logfbank(sig, rate)
def frequencybank(self, window_size=0.05, step_size=0.05, truncate=0.6):
    """Returns a Mel-frequency filter bank for this segment.

    Args:
        window_size: Analysis window length in seconds.
        step_size: Window step in seconds.
        truncate: Keep only the first `truncate` seconds; falsy disables
            truncation.
    """
    samples = self.signal
    if truncate:
        # Limit the analysis to the leading portion of the segment.
        samples = samples[:int(truncate * self.sample_rate)]
    return logfbank(samples, self.sample_rate,
                    winlen=window_size, winstep=step_size)
#!/usr/bin/env python
"""Demo: compute MFCC, delta-MFCC and log filterbank features of a wav."""
from python_speech_features import mfcc
from python_speech_features import delta
from python_speech_features import logfbank
import scipy.io.wavfile as wav

# Load the sample recording.
rate, sig = wav.read("english.wav")

mfcc_feat = mfcc(sig, rate)
d_mfcc_feat = delta(mfcc_feat, 2)  # first-order delta over +/- 2 frames
fbank_feat = logfbank(sig, rate)

# Show a couple of filterbank frames.
print(fbank_feat[1:3, :])
# ^_^ coding:utf-8 ^_^
"""Demo: compare MFCC and log-filterbank features of one recording."""
import numpy as np
from scipy.io import wavfile
import matplotlib.pyplot as plt
from python_speech_features import mfcc, logfbank

# Read the input audio file.
sampling_freq, audio = wavfile.read('input_freq.wav')

# Extract MFCC and filterbank features.
mfcc_features = mfcc(audio, sampling_freq)
filterbank_features = logfbank(audio, sampling_freq)

# Report feature dimensions.
print('MFCCL Number of windows = {}'.format(mfcc_features.shape[0]))
print('Length of each feature = {}'.format(mfcc_features.shape[1]))
print('Filter bank: Number of windows = {}'.format(filterbank_features.shape[0]))
print('Length of each feature = {}'.format(filterbank_features.shape[1]))

# Visualize the MFCC features.
mfcc_features = mfcc_features.T
plt.matshow(mfcc_features)
plt.title('MFCC')

# Visualize the filterbank features.
filterbank_features = filterbank_features.T
plt.matshow(filterbank_features)
plt.title('Filter bank')
plt.show()
def calculates_log_fbank(data):
    """40-filter log-filterbank features at the module-wide SAMPLE_RATE
    (20 ms window, 10 ms step)."""
    return logfbank(data,
                    samplerate=SAMPLE_RATE,
                    winlen=0.02,
                    winstep=0.01,
                    nfilt=40)