def _wav2feats(wavname):
    """
    Extract mel-spectrogram and log-energy features from a 16 kHz mono wav file.

    :param wavname: path to the audio file; its extension must be '.wav' or
        '.wave' (case-insensitive)
    :return: tuple ``(mspec, loge, difflen)`` where ``mspec`` and ``loge`` are
        taken from ``mfcc(..., get_mspec=True)`` and ``difflen`` is the number
        of padding rows appended to ``mspec`` (0 when no padding was needed)
    """
    suffix = os.path.splitext(wavname)[-1].lower()
    assert suffix in ('.wav', '.wave')

    sig, read_framerate, sampwidth = read_wav(wavname)

    # wav should contain a single channel
    n_axes = len(sig.shape)
    assert n_axes == 1 or (n_axes == 2 and sig.shape[1] == 1)
    # wav sample rate should be 16000 Hz
    assert read_framerate == 16000

    # The sample width is deliberately not asserted here; the original author
    # (LP) noted that the sampwidth == 4 case still has to be checked.
    sig *= 2 ** (15 - sampwidth)

    with warnings.catch_warnings():
        # ignore warnings resulting from empty signals parts
        warnings.filterwarnings(
            'ignore',
            message='divide by zero encountered in log',
            category=RuntimeWarning,
            module='sidekit')
        _, loge, _, mspec = mfcc(sig.astype(np.float32), get_mspec=True)

    # Management of short duration segments: mspec is padded up to 68 rows
    # with its own minimum value; loge is returned unpadded.
    missing = 0
    n_frames = len(loge)
    if n_frames < 68:
        missing = 68 - n_frames
        warnings.warn(
            "media %s duration is short. Robust results require length of at least 720 milliseconds"
            % wavname)
        filler = np.ones((missing, 24)) * np.min(mspec)
        mspec = np.concatenate((mspec, filler))

    return mspec, loge, missing
def feat_from_raw(raw):
    """
    Extract mel-spectrogram and log-energy features from raw 16-bit mono PCM.

    Mirrors the wav-file based extraction (see features.py) but starts from an
    in-memory buffer instead of a file name.

    :param raw: bytes-like buffer of native-endian signed 16-bit samples,
        single channel; trailing bytes that do not form a whole sample are
        ignored
    :return: tuple ``(mspec, loge, difflen)`` where ``mspec`` and ``loge`` are
        taken from ``mfcc(..., get_mspec=True)`` and ``difflen`` is the number
        of padding rows appended to ``mspec`` (0 when no padding was needed)
    """
    sampwidth = 2
    nchannels = 1
    # Integer division: the frame count feeds a struct format string.
    nframes = len(raw) // sampwidth
    # Parenthesised on purpose: "%dh" % nframes * nchannels would repeat the
    # formatted string instead of multiplying the sample count.
    samples = struct.unpack_from("%dh" % (nframes * nchannels), raw)
    sig = np.array(samples, dtype=np.float32).reshape(-1, nchannels)
    if nchannels == 1:
        # Drop only the channel axis; a blanket squeeze() would also collapse
        # a one-sample signal to a 0-d array.
        sig = sig[:, 0]

    shp = sig.shape
    # signal should contain a single channel
    assert len(shp) == 1 or (len(shp) == 2 and shp[1] == 1)
    sig *= 2 ** (15 - sampwidth)

    with warnings.catch_warnings():
        # ignore warnings resulting from empty signals parts
        warnings.filterwarnings(
            'ignore',
            message='divide by zero encountered in log',
            category=RuntimeWarning,
            module='sidekit')
        _, loge, _, mspec = mfcc(sig, get_mspec=True)

    # Management of short duration segments: mspec is padded up to 68 rows
    # with its own minimum value; loge is returned unpadded.
    difflen = 0
    if len(loge) < 68:
        difflen = 68 - len(loge)
        # There is no file name in this code path, so the message refers to
        # the raw buffer instead.
        warnings.warn(
            "raw signal duration is short. Robust results require length of at least 720 milliseconds")
        mspec = np.concatenate((mspec, np.ones((difflen, 24)) * np.min(mspec)))

    return mspec, loge, difflen
def _wav2feats(wavname):
    """
    Extract mel-spectrogram and log-energy features from a 16 kHz mono wav file.

    :param wavname: path to the audio file; its extension must be '.wav' or
        '.wave' (case-insensitive)
    :return: tuple ``(mspec, loge)`` taken from ``mfcc(..., get_mspec=True)``
    """
    suffix = os.path.splitext(wavname)[-1].lower()
    assert suffix in ('.wav', '.wave')

    sig, read_framerate, sampwidth = read_wav(wavname)

    # wav should contain a single channel
    n_axes = len(sig.shape)
    assert n_axes == 1 or (n_axes == 2 and sig.shape[1] == 1)
    # wav sample rate should be 16000 Hz
    assert read_framerate == 16000

    # Rescale in place by a factor that depends on the sample width in bytes.
    sig *= 2 ** (15 - sampwidth)

    _, loge, _, mspec = mfcc(sig.astype(np.float32), get_mspec=True)
    return mspec, loge
def __data_generation_dnn(self, list_dirs_temp):
    """
    Build one batch of context-window inputs and one-hot targets for the DNN.

    Every item contributes 25 rows. Each row is a window of 21 consecutive
    feature frames (10 on each side of a centre frame, read from a
    reflect-padded copy of the item's features) flattened into one vector.

    :param list_dirs_temp: iterable of item paths; the class label is the
        third path component from the end (``item_path.split('/')[-3]``)
    :return: tuple ``(X, target)`` with ``X`` of shape
        ``(self.batch_size, 39 * 21)`` and ``target`` of shape
        ``(self.batch_size, self.n_classes)``
    """
    context = 10            # frames kept on each side of the centre frame
    window = 2 * context + 1
    rows_per_item = 25

    # Initialization
    X = np.empty((self.batch_size, 39 * window))
    # The target width follows n_classes so it always matches the one-hot
    # vectors produced below (it used to be hard-coded to 8).
    target = np.empty((self.batch_size, self.n_classes))

    for i, item_path in enumerate(list_dirs_temp):
        if self.precomputed:
            # assumes the stored array is 3-D with the features in [:, :, 0]
            # -- TODO confirm against the code that writes these files
            feat = np.load(item_path)[:, :, 0].T
        else:
            waveform, sr = sf.read(item_path)
            # The small offset avoids feeding exact zeros to the extractor.
            if self.feat == 'mfcc':
                feat_item = mfcc(waveform + 1e-9, maxfreq=sr / 2.0,
                                 nwin=.128, shift=.032)[0]
            else:
                feat_item = plp(waveform + 1e-9, fs=sr, rasta=False,
                                nwin=.128, shift=.032)[0]
            feat_delta1 = compute_delta(feat_item)
            feat_delta2 = compute_delta(feat_delta1)
            feat = np.concatenate((feat_item, feat_delta1, feat_delta2), axis=1)

        # Reflect-pad along the frame axis so edge frames get a full window.
        mirror_feat = np.pad(feat, ((context, ), (0, )), 'reflect')

        # NOTE(review): the first centre frame depends on the item's position
        # in the batch (i), so item i needs at least (i + 1) * 25 frames.
        # Kept as in the original -- confirm this offset is intended.
        first = context + i * rows_per_item
        frames = [
            np.reshape(mirror_feat[j - context:j + context + 1, :], -1)
            for j in range(first, first + rows_per_item)
        ]
        rows = slice(rows_per_item * i, rows_per_item * (i + 1))
        X[rows, ] = np.array(frames)

        # Store class
        label = item_path.split('/')[-3]
        target_i = keras.utils.to_categorical(self.target_to_class[label],
                                              num_classes=self.n_classes)
        target[rows, :] = np.repeat(target_i.reshape((1, -1)),
                                    repeats=rows_per_item, axis=0)

    return X, target
def __data_generation_lstm(self, item_path):
    """
    Build the feature sequence and per-frame one-hot targets for one item.

    :param item_path: path of the audio file; the class label is the third
        path component from the end (``item_path.split('/')[-3]``)
    :return: tuple ``(features, target)`` where ``features`` is the stacked
        static/delta/delta-delta matrix with a leading axis of length 1 and
        ``target`` has shape ``(n_frames, self.n_classes)``
    """
    # Generate data
    waveform, sr = sf.read(item_path)
    shifted = waveform + 1e-9
    if self.feat == 'mfcc':
        static = mfcc(shifted, maxfreq=sr / 2.0)[0]
    else:
        static = plp(shifted, fs=sr, rasta=False)[0]

    first_order = compute_delta(static)
    second_order = compute_delta(first_order)
    feat = np.concatenate((static, first_order, second_order), axis=1)

    # Store class: every frame of the item carries the same one-hot label
    class_index = self.target_to_class[item_path.split('/')[-3]]
    target = np.zeros((feat.shape[0], self.n_classes))
    target[:, class_index] = 1

    return feat.reshape((1,) + feat.shape), target
def _wav2feats(wavname):
    """
    Extract mel-spectrogram and log-energy features from a 16 kHz, 16-bit mono
    wav file.

    :param wavname: path to the audio file; its extension must be '.wav' or
        '.wave' (case-insensitive)
    :return: tuple ``(mspec, loge)`` taken from ``mfcc(..., get_mspec=True)``
    """
    suffix = os.path.splitext(wavname)[-1].lower()
    assert suffix in ('.wav', '.wave')

    sig, read_framerate, sampwidth = read_wav(wavname)

    # wav should contain a single channel
    n_axes = len(sig.shape)
    assert n_axes == 1 or (n_axes == 2 and sig.shape[1] == 1)
    # wav sample rate should be 16000 Hz
    assert read_framerate == 16000
    # samples should be 2 bytes wide
    assert sampwidth == 2

    sig *= 2 ** (15 - sampwidth)

    with warnings.catch_warnings():
        # ignore warnings resulting from empty signals parts
        warnings.filterwarnings(
            'ignore',
            message='divide by zero encountered in log',
            category=RuntimeWarning,
            module='sidekit')
        _, loge, _, mspec = mfcc(sig.astype(np.float32), get_mspec=True)

    return mspec, loge
def extract(self, *file):
    """
    Compute acoustic features for one show and store them in an HDF5 file.

    :param file: variadic arguments; only ``file[0]`` is read and it must be a
        5-item sequence ``(show, channel, input_audio_filename,
        output_feature_filename, extra)`` where

        * ``show`` is the ID of the show, substituted into the file name
          structures,
        * ``channel`` is the index of the audio channel to process,
        * ``input_audio_filename`` replaces ``self.audio_filename_structure``
          when it is not None,
        * ``output_feature_filename`` replaces
          ``self.feature_filename_structure`` when it is not None,
        * ``extra``, when truthy, is passed as a second argument when
          formatting the feature file name.
    :return: None; the features are written to the feature file, which is
        closed before returning
    """
    show, channel, input_audio_filename, output_feature_filename, extra = file[0]
    backing_store = True

    # If the input audio file name does not include the ID of the show,
    # the audio_filename_structure is updated to use the input_audio_filename
    if input_audio_filename is not None:
        self.audio_filename_structure = input_audio_filename
    audio_filename = self.audio_filename_structure.format(show)

    # If the output file name does not include the ID of the show,
    # (i.e., if the feature_filename_structure does not include {})
    # the feature_filename_structure is updated to use the output_feature_filename
    if output_feature_filename is not None:
        self.feature_filename_structure = output_feature_filename
    if extra:
        feature_filename = self.feature_filename_structure.format(show, extra)
    else:
        feature_filename = self.feature_filename_structure.format(show)

    # Open audio file, get the signal and possibly the sampling frequency
    signal, _ = read_audio(audio_filename, self.sampling_frequency)
    if signal.ndim == 1:
        signal = signal[:, np.newaxis]

    # Process the target channel to return Filter-Banks and Cepstral coefficients
    length, _ = signal.shape

    # If the size of the signal is not enough for one frame, return zero features
    PARAM_TYPE = np.float32
    if length < self.window_sample:
        cep = np.empty((0, self.ceps_number), dtype=PARAM_TYPE)
        energy = np.empty((0, 1), dtype=PARAM_TYPE)
        fb = np.empty((0, self.filter_bank_size), dtype=PARAM_TYPE)
        label = np.empty((0, 1), dtype='int8')
    else:
        # Random noise is added to the input signal to avoid zero frames.
        # The global NumPy generator is reseeded so the noise is reproducible.
        np.random.seed(0)
        signal[:, channel] += 0.0001 * np.random.randn(signal.shape[0])

        dec = self.shift_sample * 250 * 25000 + self.window_sample
        dec2 = self.window_sample - self.shift_sample
        start = 0
        end = min(dec, length)

        # Process the signal by batch to avoid problems for very long signals
        # NOTE(review): each pass overwrites cep/energy/fb/label, so only the
        # last batch is kept when the signal is longer than `dec` samples.
        while start < (length - dec2):
            if self.feature_type == 'mfcc':
                # Extract cepstral coefficients, energy and filter banks
                cep, energy, _, fb = mfcc(
                    signal[start:end, channel],
                    fs=self.sampling_frequency,
                    lowfreq=self.lower_frequency,
                    maxfreq=self.higher_frequency,
                    nlinfilt=self.filter_bank_size if self.filter_bank == "lin" else 0,
                    nlogfilt=self.filter_bank_size if self.filter_bank == "log" else 0,
                    nwin=self.window_size,
                    shift=self.shift,
                    nceps=self.ceps_number,
                    get_spec=False,
                    get_mspec=True,
                    prefac=self.pre_emphasis)
            elif self.feature_type == 'plp':
                cep, energy, _, fb = plp(
                    signal[start:end, channel],
                    nwin=self.window_size,
                    fs=self.sampling_frequency,
                    plp_order=self.ceps_number,
                    shift=self.shift,
                    get_spec=False,
                    get_mspec=True,
                    prefac=self.pre_emphasis,
                    rasta=self.rasta_plp)

            # Perform feature selection
            label, _ = self._vad(cep, energy, fb, signal[start:end, channel])
            if len(label) < len(energy):
                # Pad the VAD labels with False so they cover every frame.
                label = np.hstack(
                    (label, np.zeros(len(energy) - len(label), dtype='bool')))

            start = end - dec2
            end = min(end + dec, length)

    # Create the HDF5 file
    # Create the directory if it doesn't exist
    dir_name = os.path.dirname(feature_filename)  # get the path
    # Compare by value: `is not ''` tests object identity, which is not a
    # reliable way to detect an empty string.
    if dir_name and not os.path.exists(dir_name):
        os.makedirs(dir_name)

    h5f = h5py.File(feature_filename, 'a', backing_store=backing_store, driver='core')
    try:
        # Drop everything that was not requested in save_param.
        if "cep" not in self.save_param:
            cep = None
        if "energy" not in self.save_param:
            energy = None
        if "fb" not in self.save_param:
            fb = None
        if "vad" not in self.save_param:
            label = None

        cep, fb, label = self.postProc(cep, energy, fb, label)

        # Only cep, fb and label are stored; every other slot is left empty.
        write_hdf5(show, h5f,
                   cep, None, None,
                   None, None, None,
                   fb, None, None,
                   None, None, None,
                   label)
    finally:
        # Always release the handle, even when post-processing or writing fails.
        h5f.close()
def getFeatures(self, signal, fs, winlength, *args):
    """
    Compute either cepstral coefficients or a power spectrum, plus log-energy.

    :param signal: input signal; only its real part is used
    :param fs: sampling frequency; values <= 0 are replaced by 44100
    :param winlength: analysis window length (a list is reduced to its first
        element); it is enlarged to ``ncep / fs`` when ``winlength * fs`` is
        smaller than the number of coefficients
    :param args: optional single extra argument, the number of cepstral
        coefficients. When it is given, cepstral coefficients are computed
        with ``ff.mfcc``; when it is omitted, ``ff.power_spectrum`` is used.
    :return: list ``[ceps, log_energy]`` when the coefficient count is given,
        ``[spectrum, log_energy]`` otherwise
    :raises ValueError: if more than one extra argument is supplied
    """
    # Fail early instead of printing a message and crashing later on the
    # unbound `ncep` / `calcmfccs` names.
    if len(args) > 1:
        raise ValueError('Incorrect number of inputs.')

    calcmfccs = len(args) == 1
    ncep = args[0] if calcmfccs else 1

    signal = np.real(signal)
    if fs <= 0:
        fs = 44100

    if isinstance(ncep, list):
        ncep = ncep[0]
    ncep = round(np.real(ncep))
    if ncep < 1:
        ncep = 1

    if isinstance(winlength, list):
        winlength = winlength[0]
    winlength = np.real(winlength)
    if winlength * fs < ncep:
        # float() keeps a true division even under Python 2 integer operands.
        winlength = float(ncep) / fs

    winshift = 0.5      # frame shift, as a fraction of the window length
    minfreq = 20
    maxfreq = 4000
    nbands = 30
    lifterexp = 0       # passed to ff.mfcc as nlinfilt
    preemph = 0

    if calcmfccs:
        ceps, log_energy, _, _ = ff.mfcc(signal,
                                         lowfreq=minfreq,
                                         maxfreq=maxfreq,
                                         nlinfilt=lifterexp,
                                         nlogfilt=nbands,
                                         nwin=winlength,
                                         fs=fs,
                                         nceps=int(ncep),
                                         shift=winlength * winshift,
                                         prefac=preemph)
        return [ceps, log_energy]

    spectram, log_energy = ff.power_spectrum(signal,
                                             fs=fs,
                                             win_time=winlength,
                                             shift=winlength * winshift,
                                             prefac=1)
    return [spectram, log_energy]
def extract(self, show, channel, input_audio_filename=None, output_feature_filename=None, backing_store=False):
    """
    Compute the acoustic parameters (filter banks, cepstral coefficients and
    log-energy) for a single channel from a given audio file and store them,
    with their mean and standard deviation over the selected frames, in an
    HDF5 file.

    The feature type (mfcc or plp) and RASTA filtering are taken from
    ``self.feature_type`` and ``self.rasta_plp``. Bottleneck features are not
    computed by this method.

    :param show: ID of the show
    :param channel: channel number (0 if mono file)
    :param input_audio_filename: name of the input audio file to consider if the name of the audio file is
        independent from the ID of the show
    :param output_feature_filename: name of the output feature file to consider if the name of the feature file is
        independent from the ID of the show
    :param backing_store: boolean, if False, nothing is written to disk, if True, the file is written to disk when closed

    :return: an hdf5 file handler, left open for the caller to close
    """
    # Create the filename to load

    # If the input audio file name does not include the ID of the show
    # (i.e., if the audio_filename_structure does not include {})
    # the audio_filename_structure is updated to use the input_audio_filename
    if input_audio_filename is not None:
        self.audio_filename_structure = input_audio_filename
    audio_filename = self.audio_filename_structure.format(show)

    # If the output file name does not include the ID of the show,
    # (i.e., if the feature_filename_structure does not include {})
    # the feature_filename_structure is updated to use the output_feature_filename
    if output_feature_filename is not None:
        self.feature_filename_structure = output_feature_filename
    feature_filename = self.feature_filename_structure.format(show)

    # Open audio file, get the signal and possibly the sampling frequency
    signal, _ = read_audio(audio_filename, self.sampling_frequency)
    if signal.ndim == 1:
        signal = signal[:, numpy.newaxis]

    # Process the target channel to return Filter-Banks and Cepstral coefficients
    length, _ = signal.shape

    # If the size of the signal is not enough for one frame, return zero features
    if length < self.window_sample:
        cep = numpy.empty((0, self.ceps_number), dtype=PARAM_TYPE)
        energy = numpy.empty((0, 1), dtype=PARAM_TYPE)
        fb = numpy.empty((0, self.filter_bank_size), dtype=PARAM_TYPE)
        label = numpy.empty((0, 1), dtype='int8')
    else:
        # Random noise is added to the input signal to avoid zero frames.
        # The global NumPy generator is reseeded so the noise is reproducible.
        numpy.random.seed(0)
        signal[:, channel] += 0.0001 * numpy.random.randn(signal.shape[0])

        dec = self.shift_sample * 250 * 25000 + self.window_sample
        dec2 = self.window_sample - self.shift_sample
        start = 0
        end = min(dec, length)

        # Process the signal by batch to avoid problems for very long signals
        # NOTE(review): each pass overwrites cep/energy/fb/label, so only the
        # last batch is kept when the signal is longer than `dec` samples.
        while start < (length - dec2):
            logging.info('process part : %f %f %f',
                         start / self.sampling_frequency,
                         end / self.sampling_frequency,
                         length / self.sampling_frequency)

            if self.feature_type == 'mfcc':
                # Extract cepstral coefficients, energy and filter banks
                cep, energy, _, fb = mfcc(
                    signal[start:end, channel],
                    fs=self.sampling_frequency,
                    lowfreq=self.lower_frequency,
                    maxfreq=self.higher_frequency,
                    nlinfilt=self.filter_bank_size if self.filter_bank == "lin" else 0,
                    nlogfilt=self.filter_bank_size if self.filter_bank == "log" else 0,
                    nwin=self.window_size,
                    shift=self.shift,
                    nceps=self.ceps_number,
                    get_spec=False,
                    get_mspec=True,
                    prefac=self.pre_emphasis)
            elif self.feature_type == 'plp':
                cep, energy, _, fb = plp(
                    signal[start:end, channel],
                    nwin=self.window_size,
                    fs=self.sampling_frequency,
                    plp_order=self.ceps_number,
                    shift=self.shift,
                    get_spec=False,
                    get_mspec=True,
                    prefac=self.pre_emphasis,
                    rasta=self.rasta_plp)

            # Perform feature selection
            label, _ = self._vad(cep, energy, fb, signal[start:end, channel])
            if len(label) < len(energy):
                # Pad the VAD labels with False so they cover every frame.
                label = numpy.hstack((label, numpy.zeros(len(energy)-len(label), dtype='bool')))

            start = end - dec2
            end = min(end + dec, length)
            if cep.shape[0] > 0:
                logging.info('!! size of signal cep: %f len %d type size %d',
                             cep[-1].nbytes/1024/1024,
                             len(cep[-1]),
                             cep[-1].nbytes/len(cep[-1]))

    # Compute the mean and std of fb and cepstral coefficient computed for all selected frames
    energy_mean = energy[label].mean(axis=0)
    energy_std = energy[label].std(axis=0)
    fb_mean = fb[label, :].mean(axis=0)
    fb_std = fb[label, :].std(axis=0)
    cep_mean = cep[label, :].mean(axis=0)
    cep_std = cep[label, :].std(axis=0)

    # Bottleneck features are never computed here, so their slots are always
    # empty. Defining them unconditionally avoids a NameError when "bnf" is
    # listed in save_param.
    bnf = None
    bnf_mean = None
    bnf_std = None

    # Create the HDF5 file
    # Create the directory if it doesn't exist
    dir_name = os.path.dirname(feature_filename)  # get the path
    # Compare by value: `is not ''` tests object identity, which is not a
    # reliable way to detect an empty string.
    if dir_name and not os.path.exists(dir_name):
        os.makedirs(dir_name)

    h5f = h5py.File(feature_filename, 'a', backing_store=backing_store, driver='core')

    # Drop everything that was not requested in save_param.
    if "cep" not in self.save_param:
        cep = None
        cep_mean = None
        cep_std = None
    if "energy" not in self.save_param:
        energy = None
        energy_mean = None
        energy_std = None
    if "fb" not in self.save_param:
        fb = None
        fb_mean = None
        fb_std = None
    if "vad" not in self.save_param:
        label = None
    logging.info(label)

    write_hdf5(show, h5f,
               cep, cep_mean, cep_std,
               energy, energy_mean, energy_std,
               fb, fb_mean, fb_std,
               bnf, bnf_mean, bnf_std,
               label)

    return h5f
def extract_from_signal(self, signal, sample_rate, noise_file_name=None, snr=10, reverb_file_name=None, reverb_level=-26.):
    """
    Compute the acoustic parameters (filter banks, cepstral coefficients and
    log-energy) and the frame selection labels from an in-memory signal.

    Only the first channel (column 0) of the signal is processed, and it is
    modified in place (optional noise/reverberation, then a small random
    dither).

    :param signal: array holding the audio samples; a 1-D array is viewed as
        a single-column 2-D array
    :param sample_rate: sampling rate handed to the noise and reverberation
        helpers
    :param noise_file_name: when not None, noise from this file is added to
        the signal
    :param snr: value forwarded to the noise helper together with the noise
        file name
    :param reverb_file_name: when not None, reverberation from this file is
        applied to the signal
    :param reverb_level: value forwarded to the reverberation helper

    :return: tuple ``(label, energy, cep, fb)``
    """
    if signal.ndim == 1:
        signal = signal[:, numpy.newaxis]

    # Add noise and reverberation to the signal if requested
    if noise_file_name is not None:
        signal[:, 0] = _add_noise(signal[:, 0], noise_file_name, snr, sample_rate)
    if reverb_file_name is not None:
        signal[:, 0] = _add_reverb(signal[:, 0], reverb_file_name, sample_rate, reverb_level)

    # Process the target channel to return Filter-Banks and Cepstral coefficients
    length, _ = signal.shape

    # If the size of the signal is not enough for one frame, return zero features
    if length < self.window_sample:
        cep = numpy.empty((0, self.ceps_number), dtype=PARAM_TYPE)
        energy = numpy.empty((0, 1), dtype=PARAM_TYPE)
        fb = numpy.empty((0, self.filter_bank_size), dtype=PARAM_TYPE)
        label = numpy.empty((0, 1), dtype='int8')
        return label, energy, cep, fb

    # Random noise is added to the input signal to avoid zero frames.
    numpy.random.seed(0)
    signal[:, 0] += 0.0001 * numpy.random.randn(signal.shape[0])

    batch_len = self.shift_sample * 250 * 25000 + self.window_sample
    overlap = self.window_sample - self.shift_sample
    start = 0
    end = min(batch_len, length)

    # Process the signal by batch to avoid problems for very long signals
    while start < (length - overlap):
        logging.info('process part : %f %f %f',
                     start / self.sampling_frequency,
                     end / self.sampling_frequency,
                     length / self.sampling_frequency)

        chunk = signal[start:end, 0]
        if self.feature_type == 'mfcc':
            # Extract cepstral coefficients, energy and filter banks
            n_lin = self.filter_bank_size if self.filter_bank == "lin" else 0
            n_log = self.filter_bank_size if self.filter_bank == "log" else 0
            cep, energy, _, fb = mfcc(chunk,
                                      fs=self.sampling_frequency,
                                      lowfreq=self.lower_frequency,
                                      maxfreq=self.higher_frequency,
                                      nlinfilt=n_lin,
                                      nlogfilt=n_log,
                                      nwin=self.window_size,
                                      shift=self.shift,
                                      nceps=self.ceps_number,
                                      get_spec=False,
                                      get_mspec=True,
                                      prefac=self.pre_emphasis)
        elif self.feature_type == 'plp':
            cep, energy, _, fb = plp(chunk,
                                     nwin=self.window_size,
                                     fs=self.sampling_frequency,
                                     plp_order=self.ceps_number,
                                     shift=self.shift,
                                     get_spec=False,
                                     get_mspec=True,
                                     prefac=self.pre_emphasis,
                                     rasta=self.rasta_plp)

        # Perform feature selection
        label, _ = self._vad(cep, energy, fb, signal[start:end, 0])
        n_missing = len(energy) - len(label)
        if n_missing > 0:
            # Pad the labels with False so they cover every frame.
            label = numpy.hstack((label, numpy.zeros(n_missing, dtype='bool')))

        start = end - overlap
        end = min(end + batch_len, length)
        if cep.shape[0] > 0:
            last_frame = cep[-1]
            logging.info(
                '!! size of signal cep: %f len %d type size %d',
                last_frame.nbytes / 1024 / 1024,
                len(last_frame),
                last_frame.nbytes / len(last_frame))

    return label, energy, cep, fb
def __data_generation_cnn(self, list_dirs_temp):
    """
    Build one batch of CNN inputs and one-hot targets.

    ``self.feat`` selects the input representation: 'melspec' (mel
    spectrogram in dB), 'mfcc' or 'plp' (cepstra stacked with their first and
    second deltas, or loaded as-is when ``self.precomputed`` is set), or
    'combined' (both the mfcc stack and the mel spectrogram).

    :param list_dirs_temp: iterable of item paths; the class label is the
        third path component from the end (``item_path.split('/')[-3]``)
    :return: ``(X, target)``, or ``((X1, X2), target)`` when ``self.feat`` is
        'combined' (X1 holds the mfcc stack, X2 the mel spectrogram);
        ``target`` is the one-hot encoding of the labels over
        ``self.n_classes`` classes
    """
    def _normalise(values):
        # NOTE(review): this divides by the variance rather than the standard
        # deviation. Kept unchanged because existing models and precomputed
        # features rely on this scaling -- confirm before changing.
        return (values - np.mean(values)) / np.var(values)

    def _with_deltas(static, shape):
        # Stack static features with first and second deltas, transpose, and
        # append a trailing channel axis.
        delta1 = compute_delta(static)
        delta2 = compute_delta(delta1)
        stacked = np.concatenate((static, delta1, delta2), axis=1)
        return stacked.T.reshape((*shape, 1))

    def _mel_db(waveform, shape):
        # `y` is passed by keyword: recent librosa releases reject a
        # positional signal argument.
        # NOTE(review): sr is fixed at 16000 regardless of the file's rate.
        mel = librosa.feature.melspectrogram(y=waveform, sr=16000)
        return librosa.power_to_db(mel, ref=np.max).reshape((*shape, 1))

    # Initialization
    y = np.empty((self.batch_size), dtype=int)
    if self.feat == 'combined':
        X1 = np.empty((self.batch_size, *self.dim1, self.n_channels))
        X2 = np.empty((self.batch_size, *self.dim2, self.n_channels))
    else:
        X = np.empty((self.batch_size, *self.dim, self.n_channels))

    # Generate data
    for i, item_path in enumerate(list_dirs_temp):
        if self.feat == 'melspec':
            waveform, sr = sf.read(item_path)
            X[i, ] = _normalise(_mel_db(waveform, self.dim))

        elif self.feat == 'mfcc':
            if self.precomputed:
                X[i, ] = np.load(item_path)
            else:
                waveform, sr = sf.read(item_path)
                feat_item = mfcc(waveform + 1e-9, maxfreq=sr / 2.0,
                                 nwin=.128, shift=.032)[0]
                X[i, ] = _normalise(_with_deltas(feat_item, self.dim))

        elif self.feat == 'plp':
            if self.precomputed:
                X[i, ] = np.load(item_path)
            else:
                waveform, sr = sf.read(item_path)
                feat_item = plp(waveform + 1e-9, fs=sr, rasta=False,
                                nwin=.128, shift=.032)[0]
                X[i, ] = _normalise(_with_deltas(feat_item, self.dim))

        elif self.feat == 'combined':
            waveform, sr = sf.read(item_path)
            X2[i, ] = _normalise(_mel_db(waveform, self.dim2))
            feat_item = mfcc(waveform + 1e-9, maxfreq=sr / 2.0,
                             nwin=.128, shift=.032)[0]
            X1[i, ] = _normalise(_with_deltas(feat_item, self.dim1))

        # Store class
        label = item_path.split('/')[-3]
        y[i] = self.target_to_class[label]

    target = keras.utils.to_categorical(y, num_classes=self.n_classes)
    if self.feat == 'combined':
        return (X1, X2), target
    return X, target