def test_mel_to_hz(infile):
    """Compare ``librosa.mel_to_hz`` against a stored reference fixture.

    The fixture loaded from ``infile`` supplies the mel inputs (``"f"``),
    the ``htk`` flag and the expected result (``"result"``).
    """
    fixture = load(infile)
    mels = fixture["f"]
    use_htk = fixture["htk"]

    # Array conversion must match the stored reference.
    converted = librosa.mel_to_hz(mels, htk=use_htk)
    assert np.allclose(converted, fixture["result"])

    # The same must hold when a single element is converted on its own.
    first_converted = librosa.mel_to_hz(mels[0], htk=use_htk)
    assert np.allclose(first_converted, fixture["result"][0])
def prepare_spectrogram_plot(
        self,
        type: SpectrogramType = SpectrogramType.power_level,
        frequency_scale: SpectrogramFrequencyScale = SpectrogramFrequencyScale.linear
) -> None:
    """Draw the example's spectrogram onto a fresh matplotlib figure.

    The figure gets a wrapped title, axis labels, a colour bar and tick
    formatters carrying units; it is sized to 19.20 x 10.80 inches. Nothing
    is shown or saved here.

    :param type: which spectrogram variant to request from the example.
    :param frequency_scale: linear or mel frequency axis.
    """
    spectrogram = self.example.spectrogram(type, frequency_scale=frequency_scale)

    figure, axes = plt.subplots(1, 1)
    use_mel = frequency_scale == SpectrogramFrequencyScale.mel

    # Title, wrapped so that long descriptions do not run off the figure.
    mel_prefix = "mel " if use_mel else ""
    heading = "{0}{1} spectrogram for {2}".format(mel_prefix, type.value, str(self))
    plt.title("\n".join(wrap(heading, width=100)))

    milliseconds_per_step = round(1000 / self.example.time_step_rate())
    plt.xlabel("time (data every {}ms)".format(milliseconds_per_step))

    bin_count = self.example.frequency_count_from_spectrogram(spectrogram)
    plt.ylabel(
        "frequency (data evenly distributed on {} scale, {} total)".format(
            frequency_scale.value, bin_count))

    mel_frequencies = self.example.mel_frequencies()

    # Vertical extent: mel values of the first/last mel frequency on a mel
    # axis, otherwise 0 up to the highest detectable frequency.
    duration = self.example.duration_in_s
    if use_mel:
        lowest = librosa.hz_to_mel(mel_frequencies[0])[0]
        highest = librosa.hz_to_mel(mel_frequencies[-1])[0]
    else:
        lowest = 0
        highest = self.example.highest_detectable_frequency()

    plt.imshow(
        spectrogram,
        cmap='gist_heat',
        origin='lower',
        aspect='auto',
        extent=[0, duration, lowest, highest])

    if type == SpectrogramType.power_level:
        scale_note = "in{} dB, not aligned to a particular base level".format(
            " something similar to" if use_mel else "")
    else:
        scale_note = "only proportional to physical scale"
    plt.colorbar(label="{} ({})".format(type.value, scale_note))

    class ScalarFormatterWithUnit(ScalarFormatter):
        """Scalar tick formatter that appends a fixed unit suffix."""

        def __init__(self, unit: str):
            super().__init__()
            self.unit = unit

        def __call__(self, x, pos=None) -> str:
            return super().__call__(x, pos) + self.unit

    def format_mel_tick(value, pos):
        # Show each mel tick together with its Hz equivalent.
        return "{}mel = {}Hz".format(
            int(value), int(librosa.mel_to_hz(value)[0]))

    axes.xaxis.set_major_formatter(ScalarFormatterWithUnit("s"))
    if use_mel:
        y_formatter = FuncFormatter(format_mel_tick)
    else:
        y_formatter = ScalarFormatterWithUnit("Hz")
    axes.yaxis.set_major_formatter(y_formatter)

    figure.set_size_inches(19.20, 10.80)
def retrieve_components(self, selection_order=None):
    """Return the spectrogram with only the selected segments kept.

    The spectrogram is divided into ``self.temporal_segments`` x
    ``self.frequency_segments`` cells. Every cell that is not selected is
    filled with the spectrogram's minimum value.

    Parameters
    ----------
    selection_order : iterable of int or None
        Flat segment indices. Index ``so`` addresses temporal segment
        ``so // self.frequency_segments`` and frequency segment
        ``so % self.frequency_segments`` (the segment order of
        [Mishra 2017] Figure 4). With ``None`` the stored spectrogram is
        returned unchanged (the same object, not a copy).

    Returns
    -------
    numpy.ndarray
        Array with the shape and dtype of ``self.spectrogram``.
    """
    if selection_order is None:
        return self.spectrogram

    S = np.full_like(self.spectrogram, self.spectrogram.min())
    n_bins, n_frames = S.shape[0], S.shape[1]

    # following the order of segments in [Mishra 2017] Figure 4
    temp_length = n_frames // self.temporal_segments
    freq_length = n_bins // self.frequency_segments

    left_over = n_frames - temp_length * self.temporal_segments
    if left_over > 0:
        warnings.warn("Adding last {} frames to last segment".format(left_over))

    if self.mel_scale:
        # Segment borders are spaced on the mel axis and mapped back to
        # frequency-bin indices.
        f_max = self.sr // 2
        mel_max = librosa.hz_to_mel(f_max)
        mel_step = int(mel_max // self.frequency_segments)
        hz_steps = librosa.mel_to_hz(
            list(range(0, int(np.ceil(mel_max)), mel_step)))
        # The topmost border is pinned to the maximum frequency.
        hz_steps[-1:] = f_max

        def frequency_bounds(f):
            # Scale by the real bin count instead of a hard-coded 1025 so
            # that spectrograms of any FFT size are sliced correctly.
            return (int(hz_steps[f] / f_max * n_bins),
                    int(hz_steps[f + 1] / f_max * n_bins))
    else:
        def frequency_bounds(f):
            start = f * freq_length
            return start, start + freq_length

    for so in selection_order:
        t = so // self.frequency_segments
        f = so % self.frequency_segments

        t_start = t * temp_length
        # The last temporal segment has index ``temporal_segments - 1``; it
        # absorbs the frames left over by the integer division, as the
        # warning above announces.
        if t >= self.temporal_segments - 1:
            t_end = n_frames
        else:
            t_end = t_start + temp_length

        f_start, f_end = frequency_bounds(f)
        S[f_start:f_end, t_start:t_end] = self.spectrogram[f_start:f_end, t_start:t_end]

    return S
def __test_to_hz(infile):
    """Check ``librosa.mel_to_hz`` against the reference data in ``infile``.

    The fixture supplies the mel inputs (``'f'``), the ``htk`` flag and the
    expected frequencies (``'result'``).
    """
    DATA = load(infile)
    # Pass ``htk`` by keyword: librosa releases that declare it keyword-only
    # reject the positional form.
    z = librosa.mel_to_hz(DATA['f'], htk=DATA['htk'])

    assert np.allclose(z, DATA['result'])
def feature_extraction(y, fs=44100, statistics=True, include_mfcc0=True, include_delta=True,
                       include_acceleration=True, mfcc_params=None, delta_params=None,
                       acceleration_params=None):
    """Feature extraction, MFCC based features

    Outputs features in dict, format:

        {
            'feat': feature_matrix [shape=(frame count, feature vector size)],
            'stat': {
                'mean': numpy.mean(feature_matrix, axis=0),
                'std': numpy.std(feature_matrix, axis=0),
                'N': feature_matrix.shape[0],
                'S1': numpy.sum(feature_matrix, axis=0),
                'S2': numpy.sum(feature_matrix ** 2, axis=0),
            }
        }

    Parameters
    ----------
    y: numpy.array [shape=(signal_length, )]
        Audio

    fs: int > 0 [scalar]
        Sample rate
        (Default value=44100)

    statistics: bool
        Calculate feature statistics for extracted matrix
        (Default value=True)

    include_mfcc0: bool
        Include 0th MFCC coefficient into static coefficients.
        (Default value=True)

    include_delta: bool
        Include delta MFCC coefficients.
        (Default value=True)

    include_acceleration: bool
        Include acceleration MFCC coefficients.
        (Default value=True)

    mfcc_params: dict
        Parameters for extraction of static MFCC coefficients. Required; the
        keys read here are 'window', 'n_fft', 'win_length', 'hop_length',
        'n_mels', 'fmin', 'fmax' and 'htk'.

    delta_params: dict or None
        Extra keyword arguments for the delta computation. None means no
        extra arguments.

    acceleration_params: dict or None
        Extra keyword arguments for the acceleration computation. None means
        no extra arguments.

    Returns
    -------
    result: dict
        Feature dict

    Raises
    ------
    TypeError
        If mfcc_params is None.

    """

    if mfcc_params is None:
        # Fail with a clear message instead of an opaque subscript error.
        raise TypeError("mfcc_params must be a dict of MFCC extraction parameters, not None")
    # The optional parameter sets are unpacked with ** below, so None has to
    # become an empty dict.
    delta_kwargs = {} if delta_params is None else delta_params
    acceleration_kwargs = {} if acceleration_params is None else acceleration_params

    eps = numpy.spacing(1)

    # Windowing function: map the configured name to (scipy.signal function
    # name, symmetric flag); any other name leaves the window unset.
    window_table = {
        'hamming_asymmetric': ('hamming', False),
        'hamming_symmetric': ('hamming', True),
        'hann_asymmetric': ('hann', False),
        'hann_symmetric': ('hann', True),
    }
    window_spec = window_table.get(mfcc_params['window'])
    if window_spec is None:
        window = None
    else:
        window_name, symmetric = window_spec
        window = getattr(scipy.signal, window_name)(mfcc_params['n_fft'], sym=symmetric)

    # Calculate Static Coefficients
    # (the squared magnitude is taken, so this holds a power spectrogram)
    magnitude_spectrogram = numpy.abs(librosa.stft(y + eps,
                                                   n_fft=mfcc_params['n_fft'],
                                                   win_length=mfcc_params['win_length'],
                                                   hop_length=mfcc_params['hop_length'],
                                                   center=True,
                                                   window=window)) ** 2
    mel_basis = librosa.filters.mel(sr=fs,
                                    n_fft=mfcc_params['n_fft'],
                                    n_mels=mfcc_params['n_mels'],
                                    fmin=mfcc_params['fmin'],
                                    fmax=mfcc_params['fmax'],
                                    htk=mfcc_params['htk'])
    mel_spectrum = numpy.dot(mel_basis, magnitude_spectrogram)
    mfcc = librosa.feature.mfcc(S=librosa.logamplitude(mel_spectrum))

    # Collect the feature matrix
    # NOTE(review): the coefficients are passed through mel_to_hz here even
    # though they are cepstral values, not mel frequencies -- confirm this
    # is intended.
    mfcc = librosa.mel_to_hz(mfcc)
    feature_matrix = mfcc
    if include_delta:
        # Delta coefficients
        mfcc_delta = librosa.feature.delta(mfcc, **delta_kwargs)

        # Add Delta Coefficients to feature matrix
        feature_matrix = numpy.vstack((feature_matrix, mfcc_delta))

    if include_acceleration:
        # Acceleration coefficients (aka delta-delta)
        mfcc_delta2 = librosa.feature.delta(mfcc, order=2, **acceleration_kwargs)

        # Add Acceleration Coefficients to feature matrix
        feature_matrix = numpy.vstack((feature_matrix, mfcc_delta2))

    if not include_mfcc0:
        # Omit mfcc0
        feature_matrix = feature_matrix[1:, :]

    feature_matrix = feature_matrix.T
    # Parenthesised form prints the same text on Python 2 and is valid on
    # Python 3.
    print(feature_matrix.shape)

    # Collect into data structure
    result = {'feat': feature_matrix}
    if statistics:
        result['stat'] = {
            'mean': numpy.mean(feature_matrix, axis=0),
            'std': numpy.std(feature_matrix, axis=0),
            'N': feature_matrix.shape[0],
            'S1': numpy.sum(feature_matrix, axis=0),
            'S2': numpy.sum(feature_matrix ** 2, axis=0),
        }
    return result
tsr = 13000 y, sr = librosa.load(librosa.util.example_audio_file(), sr=tsr) y = librosa.hz_to_mel(y) D = librosa.stft(y, n_fft=1024) print(D.shape) lmag = np.log(np.abs(D) + 1) agl = np.angle(D) # / np.pi lmag, agl = torch.from_numpy(lmag), torch.from_numpy(agl) tensor = torch.stack((lmag, agl), 0) tensor = tensor.squeeze() mag = tensor[0, :, :].numpy() agl = tensor[1, :, :].numpy() mag = np.exp(mag) - 1 stft = mag * np.cos(agl) + (mag * np.sin(agl) * np.complex(0, 1)) y_hat = librosa.istft(stft) y = librosa.mel_to_hz(y) y_hat = librosa.mel_to_hz(y_hat) # y = librosa.resample(y, sr, tsr) # y_hat = librosa.resample(y, sr, tsr) librosa.output.write_wav('datasets/librosa_orig.wav', y, sr) librosa.output.write_wav('datasets/librosa_stft.wav', y_hat, sr) # %% import torch import torch.nn as nn import torch.nn.functional as F class Net(nn.Module): def __init__(self):
def mel_to_hz(y, **kwargs):
    """Apply ``librosa.mel_to_hz`` to ``y`` one first-axis slice at a time.

    Each ``y[i, :]`` is converted separately, with ``kwargs`` forwarded
    unchanged, and the converted slices are stacked along a new first axis.
    """
    converted_rows = []
    for index in range(y.shape[0]):
        converted_rows.append(librosa.mel_to_hz(y[index, :], **kwargs))
    return np.stack(converted_rows)
def transform_non_affine(self, a):
    """Convert mel values ``a`` to Hz with librosa and divide by 1000."""
    hertz = librosa.mel_to_hz(a)
    return hertz / 1000.0
def test_mel2hz():
    """``mel2hz`` must agree with ``librosa.mel_to_hz(..., htk=True)``."""
    # Ten random mel values drawn from [0, 1).
    mels = np.random.random(10)
    actual = mel2hz(mels)
    reference = librosa.mel_to_hz(mels, htk=True)
    assert np.allclose(actual, reference)