def waveform_to_examples(data, sample_rate): """Converts audio waveform into an array of examples for VGGish. Args: data: np.array of either one dimension (mono) or two dimensions (multi-channel, with the outer dimension representing channels). Each sample is generally expected to lie in the range [-1.0, +1.0], although this is not required. sample_rate: Sample rate of data. Returns: 3-D np.array of shape [num_examples, num_frames, num_bands] which represents a sequence of examples, each of which contains a patch of log mel spectrogram, covering num_frames frames of audio and num_bands mel frequency bands, where the frame length is vggish_params.STFT_HOP_LENGTH_SECONDS. """ # Convert to mono. if len(data.shape) > 1: data = np.mean(data, axis=1) # Resample to the rate assumed by VGGish. if sample_rate != vggish_params.SAMPLE_RATE: data = resampy.resample(data, sample_rate, vggish_params.SAMPLE_RATE) # Compute log mel spectrogram features. log_mel = mel_features.log_mel_spectrogram( data, audio_sample_rate=vggish_params.SAMPLE_RATE, log_offset=vggish_params.LOG_OFFSET, window_length_secs=vggish_params.STFT_WINDOW_LENGTH_SECONDS, hop_length_secs=vggish_params.STFT_HOP_LENGTH_SECONDS, num_mel_bins=vggish_params.NUM_MEL_BINS, lower_edge_hertz=vggish_params.MEL_MIN_HZ, upper_edge_hertz=vggish_params.MEL_MAX_HZ) # Frame features into examples. features_sample_rate = 1.0 / vggish_params.STFT_HOP_LENGTH_SECONDS example_window_length = int(round( vggish_params.EXAMPLE_WINDOW_SECONDS * features_sample_rate)) example_hop_length = int(round( vggish_params.EXAMPLE_HOP_SECONDS * features_sample_rate)) log_mel_examples = mel_features.frame( log_mel, window_length=example_window_length, hop_length=example_hop_length) return log_mel_examples
def wavfile_to_examples(wav_file): """Converting the waveform in to mel psectrum """ sr, wav_data = wavfile.read(wav_file) print(("SR, {}".format(sr))) print(("wav_data, {}, and shape is {}".format(wav_data, wav_data.shape))) print("max element in wav_data is {}".format(np.amax(wav_data))) assert wav_data.dtype == np.int16, 'Bad sample type: %r' % wav_data.dtype samples = wav_data / 32768.0 # Convert to [-1.0, +1.0] data = wav_data sample_rate = sr if len(data.shape) > 1: data = np.mean(data, axis=1) # Resample to the rate assumed by VGGish. if sample_rate != vggish_params.SAMPLE_RATE: data = resampy.resample(data, sample_rate, vggish_params.SAMPLE_RATE) # Compute log mel spectrogram features. log_mel = mel_features.log_mel_spectrogram( data, audio_sample_rate=vggish_params.SAMPLE_RATE, log_offset=vggish_params.LOG_OFFSET, window_length_secs=vggish_params.STFT_WINDOW_LENGTH_SECONDS, hop_length_secs=vggish_params.STFT_HOP_LENGTH_SECONDS, num_mel_bins=vggish_params.NUM_MEL_BINS, lower_edge_hertz=vggish_params.MEL_MIN_HZ, upper_edge_hertz=vggish_params.MEL_MAX_HZ) # Frame features into examples. features_sample_rate = 1.0 / vggish_params.STFT_HOP_LENGTH_SECONDS example_window_length = int( round(vggish_params.EXAMPLE_WINDOW_SECONDS * features_sample_rate)) example_hop_length = int( round(vggish_params.EXAMPLE_HOP_SECONDS * features_sample_rate)) log_mel_examples = mel_features.frame(log_mel, window_length=example_window_length, hop_length=example_hop_length) return log_mel_examples
def vggish_melspec(y, sr=22050, do_resample=False, frames=None): """ Extract melspec for vggish model """ if sr != vggish_params.SAMPLE_RATE and do_resample: if frames is not None: raise Exception("Resampled not supported with frames argument.") y = resample(y, sr, vggish_params.SAMPLE_RATE) sr = vggish_params.SAMPLE_RATE log_mel = mel_features.log_mel_spectrogram( y, frames=frames, audio_sample_rate=sr, log_offset=vggish_params.LOG_OFFSET, window_length_secs=vggish_params.STFT_WINDOW_LENGTH_SECONDS, hop_length_secs=vggish_params.STFT_HOP_LENGTH_SECONDS, num_mel_bins=vggish_params.NUM_MEL_BINS, lower_edge_hertz=vggish_params.MEL_MIN_HZ, upper_edge_hertz=vggish_params.MEL_MAX_HZ) return log_mel