def preprocess_sound(data, sample_rate):
    """Convert a waveform into a batch of log mel spectrogram examples.

    Args:
      data: np.array waveform; 1-D (mono) or 2-D (samples x channels).
      sample_rate: sample rate of `data` in Hz.

    Returns:
      3-D np.array [num_examples, num_frames, num_bands] of log mel patches.
    """
    # Mix multi-channel audio down to mono.
    if data.ndim > 1:
        data = np.mean(data, axis=1)
    # Resample to the rate assumed by VGGish.
    if sample_rate != SAMPLE_RATE:
        data = resampy.resample(data, sample_rate, SAMPLE_RATE)
    # Log mel spectrogram of the whole clip.
    log_mel = mel_features.log_mel_spectrogram(
        data,
        audio_sample_rate=SAMPLE_RATE,
        log_offset=LOG_OFFSET,
        window_length_secs=STFT_WINDOW_LENGTH_SECONDS,
        hop_length_secs=STFT_HOP_LENGTH_SECONDS,
        num_mel_bins=NUM_MEL_BINS,
        lower_edge_hertz=MEL_MIN_HZ,
        upper_edge_hertz=MEL_MAX_HZ)
    # Slice the spectrogram into fixed-length example windows.
    spectrogram_fps = 1.0 / STFT_HOP_LENGTH_SECONDS  # spectrogram frames per second
    window_frames = int(round(EXAMPLE_WINDOW_SECONDS * spectrogram_fps))
    hop_frames = int(round(EXAMPLE_HOP_SECONDS * spectrogram_fps))
    return mel_features.frame(
        log_mel, window_length=window_frames, hop_length=hop_frames)
def waveform_to_examples(data, sample_rate):
    """Converts an audio waveform into an array of examples for VGGish.

    Args:
      data: np.array of one dimension (mono) or two dimensions
        (multi-channel, samples x channels).
      sample_rate: Sample rate of `data` in Hz.

    Returns:
      3-D np.array of shape [num_examples, num_frames, num_bands]: a sequence
      of log mel spectrogram patches, each covering num_frames frames of audio
      and num_bands mel frequency bands.
    """
    # Convert to mono.
    if len(data.shape) > 1:
        data = np.mean(data, axis=1)
    # BUG FIX: `sample_rate` was previously accepted but never used, so audio
    # not already at vggish_params.SAMPLE_RATE produced features on the wrong
    # time/frequency scale. Resample to the rate assumed by VGGish, matching
    # the other waveform_to_examples implementations in this file.
    if sample_rate != vggish_params.SAMPLE_RATE:
        data = resampy.resample(data, sample_rate, vggish_params.SAMPLE_RATE)
    # Compute log mel spectrogram features.
    log_mel = mel_features.log_mel_spectrogram(
        data,
        audio_sample_rate=vggish_params.SAMPLE_RATE,
        log_offset=vggish_params.LOG_OFFSET,
        window_length_secs=vggish_params.STFT_WINDOW_LENGTH_SECONDS,
        hop_length_secs=vggish_params.STFT_HOP_LENGTH_SECONDS,
        num_mel_bins=vggish_params.NUM_MEL_BINS,
        lower_edge_hertz=vggish_params.MEL_MIN_HZ,
        upper_edge_hertz=vggish_params.MEL_MAX_HZ)
    # Frame features into examples.
    features_sample_rate = 1.0 / vggish_params.STFT_HOP_LENGTH_SECONDS
    example_window_length = int(round(
        vggish_params.EXAMPLE_WINDOW_SECONDS * features_sample_rate))
    example_hop_length = int(round(
        vggish_params.EXAMPLE_HOP_SECONDS * features_sample_rate))
    log_mel_examples = mel_features.frame(
        log_mel,
        window_length=example_window_length,
        hop_length=example_hop_length)
    return log_mel_examples
def wavfile_to_examples(wav_file):
    """Read a 16-bit wav file and convert it to log mel spectrogram examples.

    Args:
      wav_file: path (or file object) readable by scipy.io.wavfile.read.

    Returns:
      3-D np.array [num_examples, num_frames, num_bands] of log mel patches.
    """
    sample_rate, wav_data = wavfile.read(wav_file)
    assert wav_data.dtype == np.int16, 'Bad sample type: %r' % wav_data.dtype
    # Scale int16 PCM into [-1.0, +1.0].
    samples = wav_data / 32768.0  # Convert to [-1.0, +1.0]
    # Mix multi-channel audio down to mono.
    if samples.ndim > 1:
        samples = np.mean(samples, axis=1)
    # Resample to the rate assumed by VGGish.
    if sample_rate != SAMPLE_RATE:
        samples = resampy.resample(samples, sample_rate, SAMPLE_RATE)
    # Log mel spectrogram of the whole clip.
    log_mel = mel_features.log_mel_spectrogram(
        samples,
        audio_sample_rate=SAMPLE_RATE,
        log_offset=LOG_OFFSET,
        window_length_secs=STFT_WINDOW_LENGTH_SECONDS,
        hop_length_secs=STFT_HOP_LENGTH_SECONDS,
        num_mel_bins=NUM_MEL_BINS,
        lower_edge_hertz=MEL_MIN_HZ,
        upper_edge_hertz=MEL_MAX_HZ)
    # Frame the spectrogram into fixed-length examples.
    frames_per_second = 1.0 / STFT_HOP_LENGTH_SECONDS
    window_frames = int(round(EXAMPLE_WINDOW_SECONDS * frames_per_second))
    hop_frames = int(round(EXAMPLE_HOP_SECONDS * frames_per_second))
    return mel_features.frame(
        log_mel, window_length=window_frames, hop_length=hop_frames)
def shorter_waveform_to_examples(data):
    """Compute log mel spectrogram examples for one short audio segment.

    Args:
      data: 1-D waveform of the short audio (assumed already at
        params.SAMPLE_RATE — no resampling is done here).

    Returns:
      3-D np.array of spectrogram patches, each params.EXAMPLE_WINDOW_SECONDS
      long, hopped by params.EXAMPLE_HOP_SECONDS.
    """
    # Log mel spectrogram of the short audio; num_mel_bins is pinned by params.
    log_mel = mel_features.log_mel_spectrogram(
        data,
        audio_sample_rate=params.SAMPLE_RATE,
        log_offset=params.LOG_OFFSET,
        window_length_secs=params.STFT_WINDOW_LENGTH_SECONDS,
        hop_length_secs=params.STFT_HOP_LENGTH_SECONDS,
        num_mel_bins=params.NUM_MEL_BINS,
        lower_edge_hertz=params.MEL_MIN_HZ,
        upper_edge_hertz=params.MEL_MAX_HZ)
    # Frame the spectrogram into example windows. The spectrogram frame rate
    # is the inverse of the STFT hop.
    spectrogram_fps = 1.0 / params.STFT_HOP_LENGTH_SECONDS
    window_frames = int(round(params.EXAMPLE_WINDOW_SECONDS * spectrogram_fps))
    hop_frames = int(round(params.EXAMPLE_HOP_SECONDS * spectrogram_fps))
    return mel_features.frame(
        log_mel, window_length=window_frames, hop_length=hop_frames)
def segment_long_audio(wav_file):
    """Segment a long audio file into overlapping short audio chunks.

    Chunks are params.SHORT_AUDIO_WINDOW_LENGTH_MIN minutes long, hopped by
    params.SHORT_AUDIO_HOP_LENGTH_MIN minutes.

    Args:
      wav_file: path to the long 16-bit wav file.

    Returns:
      2-D np.array [num_chunks, chunk_samples] of short waveforms, or 0 when
      the file contains no samples (kept for backward compatibility with
      callers that compare the result against 0).
    """
    sample_rate, wav_data = wavfile.read(wav_file)  # single audio file
    assert wav_data.dtype == np.int16, 'Bad sample type: %r' % wav_data.dtype
    data = wav_data / 32768.0  # Convert to [-1.0, +1.0]
    # Convert to mono.
    if len(data.shape) > 1:
        data = np.mean(data, axis=1)
    if len(data) == 0:
        return 0
    # BUG FIX: the resample target previously referenced
    # vggish_params.SAMPLE_RATE, inconsistent with the params.* module used
    # everywhere else in this function (and a NameError if vggish_params is
    # not imported). Resample to params.SAMPLE_RATE.
    if sample_rate != params.SAMPLE_RATE:
        data = resampy.resample(data, sample_rate, params.SAMPLE_RATE)
    # Frame the long waveform into shorter ones. Cast to int so fractional
    # minute settings still yield valid sample counts for frame().
    data_example_window_length = int(
        params.SHORT_AUDIO_WINDOW_LENGTH_MIN * 60 * params.SAMPLE_RATE)
    data_example_hop_length = int(
        params.SHORT_AUDIO_HOP_LENGTH_MIN * 60 * params.SAMPLE_RATE)
    data_examples = mel_features.frame(
        data,
        window_length=data_example_window_length,
        hop_length=data_example_hop_length)
    return data_examples
def _waveform_to_mel_spectrogram_segments(data, sample_rate):
    """Converts audio from a single wav file into an array of examples for VGGish.

    Args:
      data: np.array of either one dimension (mono) or two dimensions
        (multi-channel, with the outer dimension representing channels).
        Each sample is generally expected to lie in the range [-1.0, +1.0],
        although this is not required. Shape is (num_frame, )
      sample_rate: Sample rate of data.

    Returns:
      3-D np.array of shape [num_examples, num_frames, num_bands] which
      represents a sequence of examples, each of which contains a patch of log
      mel spectrogram, covering num_frames frames of audio and num_bands mel
      frequency bands, where the frame length is
      mel_params.STFT_HOP_LENGTH_SECONDS.

      IMPORTANT: if data.shape < (80000, ) then
      log_mel_examples.shape=(0, 496, 64). The zero is problematic downstream,
      so code will have to check for that.
    """
    # Mix multi-channel audio down to mono if necessary.
    if data.ndim > 1:
        data = np.mean(data, axis=1)
    # Resample to the rate assumed by VGGish.
    if sample_rate != mel_params.SAMPLE_RATE:
        data = resampy.resample(data, sample_rate, mel_params.SAMPLE_RATE)
    # Log mel spectrogram of the whole clip.
    log_mel = log_mel_spectrogram(
        data,
        audio_sample_rate=mel_params.SAMPLE_RATE,
        log_offset=mel_params.LOG_OFFSET,
        window_length_secs=mel_params.STFT_WINDOW_LENGTH_SECONDS,
        hop_length_secs=mel_params.STFT_HOP_LENGTH_SECONDS,
        num_mel_bins=mel_params.NUM_MEL_BINS,
        lower_edge_hertz=mel_params.MEL_MIN_HZ,
        upper_edge_hertz=mel_params.MEL_MAX_HZ)
    # Frame the spectrogram into fixed-size example patches.
    spectrogram_fps = 1.0 / mel_params.STFT_HOP_LENGTH_SECONDS
    window_frames = int(round(mel_params.EXAMPLE_WINDOW_SECONDS * spectrogram_fps))
    hop_frames = int(round(mel_params.EXAMPLE_HOP_SECONDS * spectrogram_fps))
    # If the spectrogram has fewer rows than one example window, frame()
    # returns an array with a leading dimension of 0.
    log_mel_examples = frame(
        log_mel, window_length=window_frames, hop_length=hop_frames)
    if log_mel_examples.shape[0] == 0:
        print('\nWARNING: audio sample too short! Using all zeros for that example.\n')
    return log_mel_examples
def waveform_to_examples(data, sample_rate):
    """Converts audio waveform into an array of examples for VGGish.

    Args:
      data: np.array of either one dimension (mono) or two dimensions
        (multi-channel, with the outer dimension representing channels).
        Each sample is generally expected to lie in the range [-1.0, +1.0],
        although this is not required.
      sample_rate: Sample rate of data.

    Returns:
      3-D np.array of shape [num_examples, num_frames, num_bands] which
      represents a sequence of examples, each of which contains a patch of log
      mel spectrogram, covering num_frames frames of audio and num_bands mel
      frequency bands, where the frame length is
      vggish_params.STFT_HOP_LENGTH_SECONDS.
    """
    vprint('waveform_to_examples input data shape')
    vprint(data.shape)
    # Mix multi-channel audio down to mono.
    if data.ndim > 1:
        data = np.mean(data, axis=1)
    # Resample to the rate assumed by VGGish.
    if sample_rate != vggish_params.SAMPLE_RATE:
        data = resampy.resample(data, sample_rate, vggish_params.SAMPLE_RATE)
    vprint('waveform_to_examples resampled mono shape')
    vprint(data.shape)
    # Log mel spectrogram of the whole clip.
    log_mel = mel_features.log_mel_spectrogram(
        data,
        audio_sample_rate=vggish_params.SAMPLE_RATE,
        log_offset=vggish_params.LOG_OFFSET,
        window_length_secs=vggish_params.STFT_WINDOW_LENGTH_SECONDS,
        hop_length_secs=vggish_params.STFT_HOP_LENGTH_SECONDS,
        num_mel_bins=vggish_params.NUM_MEL_BINS,
        lower_edge_hertz=vggish_params.MEL_MIN_HZ,
        upper_edge_hertz=vggish_params.MEL_MAX_HZ)
    vprint('waveform_to_examples log_mel shape')
    vprint(log_mel.shape)
    # Frame the spectrogram into fixed-length examples.
    spectrogram_fps = 1.0 / vggish_params.STFT_HOP_LENGTH_SECONDS
    window_frames = int(round(
        vggish_params.EXAMPLE_WINDOW_SECONDS * spectrogram_fps))
    hop_frames = int(round(
        vggish_params.EXAMPLE_HOP_SECONDS * spectrogram_fps))
    log_mel_examples = mel_features.frame(
        log_mel, window_length=window_frames, hop_length=hop_frames)
    vprint('waveform_to_examples log_mel reshaped')
    vprint(log_mel_examples.shape)
    return log_mel_examples
def waveform_to_examples(data, sample_rate):
    """Converts audio waveform into an array of examples for VGGish.

    Args:
      data: np.array of either one dimension (mono) or two dimensions
        (multi-channel, with the outer dimension representing channels).
        Each sample is generally expected to lie in the range [-1.0, +1.0],
        although this is not required.
      sample_rate: Sample rate of data.

    Returns:
      3-D np.array of shape [num_examples, num_frames, num_bands] which
      represents a sequence of examples, each of which contains a patch of log
      mel spectrogram, covering num_frames frames of audio and num_bands mel
      frequency bands, where the frame length is
      vggish_params.STFT_HOP_LENGTH_SECONDS.
    """
    # Mix multi-channel audio down to mono.
    if data.ndim > 1:
        data = np.mean(data, axis=1)
    # Resample to the rate assumed by VGGish.
    if sample_rate != vggish_params.SAMPLE_RATE:
        data = resampy.resample(data, sample_rate, vggish_params.SAMPLE_RATE)
    # Log mel spectrogram of the whole clip.
    log_mel = mel_features.log_mel_spectrogram(
        data,
        audio_sample_rate=vggish_params.SAMPLE_RATE,
        log_offset=vggish_params.LOG_OFFSET,
        window_length_secs=vggish_params.STFT_WINDOW_LENGTH_SECONDS,
        hop_length_secs=vggish_params.STFT_HOP_LENGTH_SECONDS,
        num_mel_bins=vggish_params.NUM_MEL_BINS,
        lower_edge_hertz=vggish_params.MEL_MIN_HZ,
        upper_edge_hertz=vggish_params.MEL_MAX_HZ)
    # Frame the spectrogram into fixed-length example windows.
    spectrogram_fps = 1.0 / vggish_params.STFT_HOP_LENGTH_SECONDS
    window_frames = int(round(
        vggish_params.EXAMPLE_WINDOW_SECONDS * spectrogram_fps))
    hop_frames = int(round(
        vggish_params.EXAMPLE_HOP_SECONDS * spectrogram_fps))
    return mel_features.frame(
        log_mel, window_length=window_frames, hop_length=hop_frames)
def wavform_to_frames(data, sample_rate):
    """Frame a waveform into STFT-aligned windows to link with VGGish features.

    Args:
      data: np.array waveform; 1-D (mono) or 2-D (samples x channels).
      sample_rate: sample rate of `data` in Hz.

    Returns:
      2-D np.array of waveform windows, one per STFT frame.
    """
    # Mix multi-channel audio down to mono.
    if data.ndim > 1:
        data = np.mean(data, axis=1)
    # Resample to the rate assumed by VGGish.
    if sample_rate != vggish_params.SAMPLE_RATE:
        data = resampy.resample(data, sample_rate, vggish_params.SAMPLE_RATE)
    # Window/hop lengths in samples, matching the STFT configuration so each
    # waveform frame lines up with one spectrogram frame.
    rate = vggish_params.SAMPLE_RATE
    window_samples = int(round(rate * vggish_params.STFT_WINDOW_LENGTH_SECONDS))
    hop_samples = int(round(rate * vggish_params.STFT_HOP_LENGTH_SECONDS))
    return mel_features.frame(data, window_samples, hop_samples)
def wavfile_to_examples(wav_file):
    """Read a wav file and convert it to zero-padded log mel FBANK examples.

    The spectrogram is zero-padded up to a multiple of params.NUM_FRAMES
    before framing, so the tail of the audio is kept in a final example.

    Args:
      wav_file: path (or file object) readable by scipy.io.wavfile.read.

    Returns:
      3-D np.array of log mel examples, or 0 when the file has no samples
      (kept so existing callers comparing against 0 still work).
    """
    sample_rate, wav_data = wavfile.read(wav_file)
    assert wav_data.dtype == np.int16, 'Bad sample type: %r' % wav_data.dtype
    # Scale int16 PCM into [-1.0, +1.0].
    data = wav_data / 32768.0  # Convert to [-1.0, +1.0]
    # Mix multi-channel audio down to mono.
    if data.ndim > 1:
        data = np.mean(data, axis=1)
    if len(data) == 0:
        return 0
    # Resample to the expected rate.
    if sample_rate != params.SAMPLE_RATE:
        data = resampy.resample(data, sample_rate, params.SAMPLE_RATE)
    # Log mel spectrogram (log FBANK); num_mel_bins pinned by params.
    log_mel = mel_features.log_mel_spectrogram(
        data,
        audio_sample_rate=params.SAMPLE_RATE,
        log_offset=params.LOG_OFFSET,
        window_length_secs=params.STFT_WINDOW_LENGTH_SECONDS,
        hop_length_secs=params.STFT_HOP_LENGTH_SECONDS,
        num_mel_bins=params.NUM_MEL_BINS,
        lower_edge_hertz=params.MEL_MIN_HZ,
        upper_edge_hertz=params.MEL_MAX_HZ)
    spectrogram_fps = 1.0 / params.STFT_HOP_LENGTH_SECONDS
    window_frames = int(round(params.EXAMPLE_WINDOW_SECONDS * spectrogram_fps))
    hop_frames = int(round(params.EXAMPLE_HOP_SECONDS * spectrogram_fps))
    # Zero-pad the spectrogram so its length is an exact multiple of
    # params.NUM_FRAMES (same rounding-up as ceil(n / NUM_FRAMES) * NUM_FRAMES).
    remainder = log_mel.shape[0] % params.NUM_FRAMES
    if remainder:
        padded_len = log_mel.shape[0] + (params.NUM_FRAMES - remainder)
        padded = np.zeros((padded_len, log_mel.shape[1]))
        padded[:log_mel.shape[0], :log_mel.shape[1]] = log_mel
        log_mel = padded
    return mel_features.frame(
        log_mel, window_length=window_frames, hop_length=hop_frames)
# Generate synthetic audio examples, run them through the VGGish embedding
# graph, and collect matched (waveform frames, embedding) pairs.
# NOTE(review): relies on names defined elsewhere in this script — `sess`,
# `num_example_to_gen`, `embedding_tensor`, `features_tensor`,
# `generate_audio` — confirm they are in scope at this point.
frames_list = []
embeddings_list = []
for i in range(0, num_example_to_gen):
    #generate audio examples and get feature tensors for each
    # 2-second clip at 16 kHz (arguments to gen_audio — presumably
    # (duration_secs, sample_rate); verify against generate_audio).
    example_waveform = generate_audio.gen_audio(2, 16000)
    examples_batch = vggish_input.waveform_to_examples(
        example_waveform, 16000)
    # Run inference and postprocessing.
    [embedding_batch
     ] = sess.run([embedding_tensor],
                  feed_dict={features_tensor: examples_batch})
    # Frame the raw waveform with non-overlapping 0.96 s windows so each
    # frame lines up with one VGGish example/embedding.
    audio_frames = mel_features.frame(example_waveform, int(0.96 * 16000),
                                      int(0.96 * 16000))
    #audio = audio[0:num_frames_to_keep]
    frames_list.append(audio_frames)
    embeddings_list.append(embedding_batch)
    print('adding number ' + str(i))
    #print(embedding_batch)
    #postprocessed_batch = pproc.postprocess(embedding_batch)
    #print(postprocessed_batch)
#100,000,000 floats should be about 200mb. That's about 6250*16000 = 100,000,000
#Will size output into numpy arrays roughly 200mb each, later to be used as TFrecord objects which like to be around that size.
#convert to numpy and write to disk
# Flatten the per-clip frame batches into one 2-D array of frames.
frames_array = np.array(frames_list)
frames_array = np.reshape(frames_array, (-1, frames_list[0].shape[1]))
def waveform_to_examples(data, sample_rate): """Converts audio waveform into an array of examples for VGGish. Args: data: np.array of either one dimension (mono) or two dimensions (multi-channel, with the outer dimension representing channels). Each sample is generally expected to lie in the range [-1.0, +1.0], although this is not required. sample_rate: Sample rate of data. Returns: - Length of the audio_sample after padding. - 3-D np.array of shape [num_examples, num_frames, num_bands] which represents a sequence of examples, each of which contains a patch of log mel spectrogram, covering num_frames frames of audio and num_bands mel frequency bands, where the frame length is vggish_params.STFT_HOP_LENGTH_SECONDS. """ # Convert to mono. if len(data.shape) > 1: data = np.mean(data, axis=1) # Resample to the rate assumed by VGGish. if sample_rate != vggish_params.SAMPLE_RATE: data = resampy.resample(data, sample_rate, vggish_params.SAMPLE_RATE) ###################################################################### olength = len(data) temp_data = [] OVERLAP_SAMPLE_RATE = int(0.5 * vggish_params.SAMPLE_RATE) for i in range(0, len(data), OVERLAP_SAMPLE_RATE): end = i + vggish_params.SAMPLE_RATE chunk = data[i:min(end, len(data))] temp_data.extend(chunk) pad_length = vggish_params.SAMPLE_RATE - (len(temp_data) % OVERLAP_SAMPLE_RATE) temp_data = np.asarray(temp_data) # limit = int(np.ceil(2*len(data)/float(vggish_params.SAMPLE_RATE))) data = np.pad(temp_data, (0, pad_length), 'constant') ###################################################################### # Compute log mel spectrogram features. 
log_mel = mel_features.log_mel_spectrogram( data, audio_sample_rate=vggish_params.SAMPLE_RATE, log_offset=vggish_params.LOG_OFFSET, window_length_secs=vggish_params.STFT_WINDOW_LENGTH_SECONDS, hop_length_secs=vggish_params.STFT_HOP_LENGTH_SECONDS, num_mel_bins=vggish_params.NUM_MEL_BINS, lower_edge_hertz=vggish_params.MEL_MIN_HZ, upper_edge_hertz=vggish_params.MEL_MAX_HZ) # Frame features into examples. features_sample_rate = 1.0 / vggish_params.STFT_HOP_LENGTH_SECONDS example_window_length = int( round(vggish_params.EXAMPLE_WINDOW_SECONDS * features_sample_rate)) example_hop_length = int( round(vggish_params.EXAMPLE_HOP_SECONDS * features_sample_rate)) log_mel_examples = mel_features.frame(log_mel, window_length=example_window_length, hop_length=example_hop_length) return olength, len(data), log_mel_examples
def waveform_to_examples(data, sample_rate, file_path):
    """Converts audio waveform into an array of examples for VGGish.

    Args:
      data: np.array of either one dimension (mono) or two dimensions
        (multi-channel, with the outer dimension representing channels).
        Each sample is generally expected to lie in the range [-1.0, +1.0],
        although this is not required.
      sample_rate: Sample rate of data.
      file_path: path of the source file; only its basename is recorded in
        the returned metadata dict.

    Returns:
      Tuple of:
      - dict of the mel-spectrogram parameters used (plus the log_mel array
        and the source file name), suitable for CSV export;
      - 3-D np.array of shape [num_examples, num_frames, num_bands] of log
        mel spectrogram patches.
    """
    # Convert to mono.
    if len(data.shape) > 1:
        data = np.mean(data, axis=1)
    # Resample to the rate assumed by VGGish.
    if sample_rate != vggish_params.SAMPLE_RATE:
        data = resampy.resample(data, sample_rate, vggish_params.SAMPLE_RATE)
    # begin mod — capture the configuration once; used both for the feature
    # computation below and for the exported metadata dict.
    audio_sample_rate = vggish_params.SAMPLE_RATE
    log_offset = vggish_params.LOG_OFFSET
    window_length_secs = vggish_params.STFT_WINDOW_LENGTH_SECONDS
    hop_length_secs = vggish_params.STFT_HOP_LENGTH_SECONDS
    num_mel_bins = vggish_params.NUM_MEL_BINS
    lower_edge_hertz = vggish_params.MEL_MIN_HZ
    upper_edge_hertz = vggish_params.MEL_MAX_HZ
    # end mod
    # Compute log mel spectrogram features. FIX: use the locals captured
    # above (previously they were extracted but the call re-read
    # vggish_params.*, so the recorded metadata could silently diverge from
    # the values actually used).
    log_mel = mel_features.log_mel_spectrogram(
        data,
        audio_sample_rate=audio_sample_rate,
        log_offset=log_offset,
        window_length_secs=window_length_secs,
        hop_length_secs=hop_length_secs,
        num_mel_bins=num_mel_bins,
        lower_edge_hertz=lower_edge_hertz,
        upper_edge_hertz=upper_edge_hertz)
    # Frame features into examples.
    features_sample_rate = 1.0 / vggish_params.STFT_HOP_LENGTH_SECONDS
    example_window_length = int(
        round(vggish_params.EXAMPLE_WINDOW_SECONDS * features_sample_rate))
    example_hop_length = int(
        round(vggish_params.EXAMPLE_HOP_SECONDS * features_sample_rate))
    log_mel_examples = mel_features.frame(
        log_mel,
        window_length=example_window_length,
        hop_length=example_hop_length)
    output_csv_dict = {
        "file_name": os.path.basename(file_path),
        "audio_sample_rate": audio_sample_rate,
        "log_offset": log_offset,
        "window_length_secs": window_length_secs,
        "hop_length_secs": hop_length_secs,
        "num_mel_bins": num_mel_bins,
        "lower_edge_hertz": lower_edge_hertz,
        # FIX: upper_edge_hertz was captured but never recorded, leaving the
        # exported parameter set incomplete.
        "upper_edge_hertz": upper_edge_hertz,
        "log_mel": log_mel
    }
    # dict_to_csv(output_csv_dict)
    return output_csv_dict, log_mel_examples