def testSpectrogramToMelMatrixChecksFrequencyBounds(self):
    # Exercises the band-edge validation of SpectrogramToMelMatrix: each legal
    # boundary value must be accepted, and each value just outside the legal
    # range (or a zero-width band) must raise ValueError.

    def make_matrix(lower_hz, upper_hz):
        # Every case shares the same geometry; only the band edges vary.
        return mfcc_mel.SpectrogramToMelMatrix(
            num_spectrogram_bins=513,
            audio_sample_rate=22050,
            num_mel_bins=20,
            lower_edge_hertz=lower_hz,
            upper_edge_hertz=upper_hz)

    # A lower edge of exactly 0 Hz is legal ...
    make_matrix(0.0, 4000.0)
    # ... but a negative lower edge is not.
    with self.assertRaises(ValueError):
        make_matrix(-1.0, 4000.0)

    # An upper edge sitting exactly on Nyquist (22050 / 2) is legal ...
    make_matrix(20.0, 11025.0)
    # ... but anything above Nyquist is not.
    with self.assertRaises(ValueError):
        make_matrix(20.0, 16000.0)

    # The band must have strictly positive width.
    with self.assertRaises(ValueError):
        make_matrix(20.0, 20.0)
def testMelSpectrumAgreesWithGoldenValues(self):
    # Counterpart of dsp/mfcc:mel_spectrum_test: projects a synthetic spectrum
    # through the mel weight matrix and compares against recorded values.
    num_bins = 513

    # Single-row input of shape [1, num_bins]: sqrt(1), sqrt(2), ..., sqrt(513).
    spectrum_row = np.sqrt(np.arange(1, num_bins + 1)).reshape(1, -1)

    mel_weights = mfcc_mel.SpectrogramToMelMatrix(
        num_spectrogram_bins=num_bins,
        audio_sample_rate=22050,
        num_mel_bins=20,
        lower_edge_hertz=20.0,
        upper_edge_hertz=4000.0)
    actual = np.dot(spectrum_row, mel_weights)

    golden = np.array([
        7.422619,
        10.30330648,
        13.72703292,
        17.24158686,
        21.35253118,
        25.77781089,
        31.30624108,
        37.05877236,
        43.9436536,
        51.80306637,
        60.79867148,
        71.14363376,
        82.90910141,
        96.50069158,
        112.08428368,
        129.96721968,
        150.4277597,
        173.74997634,
        200.86037462,
        231.59802942,
    ])
    np.testing.assert_array_almost_equal(golden, actual[0])
def build_mel_calculation_graph(waveform_input,
                                sample_rate=16000,
                                window_length_seconds=0.025,
                                hop_length_seconds=0.010,
                                num_mel_bins=64,
                                lower_edge_hz=125.0,
                                upper_edge_hz=7500.0,
                                frame_width=96,
                                frame_hop=10,
                                tflite_compatible=False):
    """Build a TF graph to go from waveform to mel spectrum patches.

    Args:
      waveform_input: 1D Tensor which will be filled with 16 kHz waveform as
        tf.float32.
      sample_rate: Scalar giving the sampling rate of the waveform.  Only
        16 kHz is acceptable at present.
      window_length_seconds: Duration of window used for each Fourier
        transform.
      hop_length_seconds: Time shift between successive analysis time frames.
      num_mel_bins: The number of mel frequency bins to calculate.
      lower_edge_hz: Frequency boundary at bottom edge of mel mapping.
      upper_edge_hz: Frequency boundary at top edge of mel mapping.
      frame_width: The number of successive time frames to include in each
        patch.
      frame_hop: The frame advance between successive patches.
      tflite_compatible: Avoid ops not currently supported in tflite.

    Returns:
      Tensor holding [num_patches, frame_width, num_mel_bins]
      log-mel-spectrogram patches.
    """
    # `waveform_input` is a [?] vector as a tensor.
    # `magnitude_spectrogram` is a [?, fft_length/2 + 1] tensor of
    # spectrograms.

    # Derive the dependent parameters.
    window_length_samples = int(round(window_length_seconds * sample_rate))
    hop_length_samples = int(round(hop_length_seconds * sample_rate))
    # Round the window length up to a power of two for the FFT size.
    fft_length = 2**int(
        math.ceil(math.log(window_length_samples) / math.log(2.0)))

    if tflite_compatible:
        magnitude_spectrogram = _stft_magnitude_tflite(
            waveform_input, window_length_samples, hop_length_samples,
            fft_length)
    else:
        magnitude_spectrogram = _stft_magnitude_full_tf(
            waveform_input, window_length_samples, hop_length_samples,
            fft_length)

    # Warp the linear-scale, magnitude spectrograms into the mel-scale.
    # `tf.compat.dimension_value` handles both the TF1 `Dimension` object and
    # the plain int (or None) that `shape[-1]` yields under TF2 shape
    # semantics, where `.value` would raise AttributeError.
    num_spectrogram_bins = tf.compat.dimension_value(
        magnitude_spectrogram.shape[-1])
    if tflite_compatible:
        # Precompute the weights with the numpy implementation and embed them
        # in the graph as a constant.
        linear_to_mel_weight_matrix = tf.constant(
            mfcc_mel.SpectrogramToMelMatrix(num_mel_bins,
                                            num_spectrogram_bins,
                                            sample_rate,
                                            lower_edge_hz,
                                            upper_edge_hz).astype(np.float32),
            name='linear_to_mel_matrix')
    else:
        # In full tf, the mel weight matrix is calculated at run time within
        # the TF graph.  This avoids including a matrix of 64 x 256 float
        # values (i.e., 100 kB or more, depending on the representation) in
        # the exported graph.
        linear_to_mel_weight_matrix = tf.signal.linear_to_mel_weight_matrix(
            num_mel_bins, num_spectrogram_bins, sample_rate, lower_edge_hz,
            upper_edge_hz)

    mel_spectrogram = tf.matmul(
        magnitude_spectrogram,
        linear_to_mel_weight_matrix,
        name='mel_spectrogram')
    # Small additive offset keeps the log finite when a mel bin is zero.
    log_offset = 0.001
    # `tf.math.log` is the same op as the TF1-only top-level alias `tf.log`,
    # and is available in both TF1 and TF2.
    log_mel_spectrogram = tf.math.log(
        mel_spectrogram + log_offset, name='log_mel_spectrogram')

    # log_mel_spectrogram is a [?, num_mel_bins] gram.
    if tflite_compatible:
        features = _fixed_frame(
            log_mel_spectrogram,
            frame_length=frame_width,
            frame_step=frame_hop,
            first_axis=True)
    else:
        features = tf.signal.frame(
            log_mel_spectrogram,
            frame_length=frame_width,
            frame_step=frame_hop,
            axis=0)
    # features is [num_patches, frame_width, num_mel_bins].
    return features