Esempio n. 1
0
def preprocess_sound(data, sample_rate):
    """Convert a waveform into framed log mel spectrogram examples.

    Mirrors the VGGish input pipeline: mono mixdown, resample to the
    model rate, log mel spectrogram, then framing into fixed-length
    example windows.

    Args:
        data: np.array, 1-D (mono) or 2-D (multi-channel, channels in
          the outer dimension per the mean over axis 1).
        sample_rate: sample rate of `data` in Hz.

    Returns:
        3-D np.array [num_examples, num_frames, num_bands] of log mel
        spectrogram patches.
    """
    # Mix multi-channel audio down to mono.
    if len(data.shape) > 1:
        data = np.mean(data, axis=1)
    # Resample to the rate assumed by VGGish.
    if sample_rate != SAMPLE_RATE:
        data = resampy.resample(data, sample_rate, SAMPLE_RATE)

    # Log mel spectrogram of the whole clip.
    log_mel = mel_features.log_mel_spectrogram(
        data,
        audio_sample_rate=SAMPLE_RATE,
        log_offset=LOG_OFFSET,
        window_length_secs=STFT_WINDOW_LENGTH_SECONDS,
        hop_length_secs=STFT_HOP_LENGTH_SECONDS,
        num_mel_bins=NUM_MEL_BINS,
        lower_edge_hertz=MEL_MIN_HZ,
        upper_edge_hertz=MEL_MAX_HZ)

    # Frame the spectrogram into fixed-size examples: convert the example
    # window/hop from seconds to spectrogram-frame counts first.
    frames_per_second = 1.0 / STFT_HOP_LENGTH_SECONDS
    window_frames = int(round(EXAMPLE_WINDOW_SECONDS * frames_per_second))
    hop_frames = int(round(EXAMPLE_HOP_SECONDS * frames_per_second))
    return mel_features.frame(log_mel,
                              window_length=window_frames,
                              hop_length=hop_frames)
Esempio n. 2
0
def waveform_to_examples(data, sample_rate):
  """Converts audio waveform into an array of examples for VGGish.

  Args:
    data: np.array of either one dimension (mono) or two dimensions
      (multi-channel, with the outer dimension representing channels).
      Each sample is generally expected to lie in the range [-1.0, +1.0],
      although this is not required.
    sample_rate: Sample rate of data.

  Returns:
    3-D np.array of shape [num_examples, num_frames, num_bands] which
    represents a sequence of examples, each of which contains a patch of
    log mel spectrogram, covering num_frames frames of audio and num_bands
    mel frequency bands, where the frame length is
    vggish_params.STFT_HOP_LENGTH_SECONDS.
  """
  # Convert to mono.
  if len(data.shape) > 1:
    data = np.mean(data, axis=1)

  # BUG FIX: `sample_rate` was previously accepted but never used, so audio
  # at any other rate was featurized as if it were already at
  # vggish_params.SAMPLE_RATE. Resample to the rate the feature extraction
  # below assumes, matching the other pipeline variants in this codebase.
  if sample_rate != vggish_params.SAMPLE_RATE:
    data = resampy.resample(data, sample_rate, vggish_params.SAMPLE_RATE)

  # Compute log mel spectrogram features.
  log_mel = mel_features.log_mel_spectrogram(
      data,
      audio_sample_rate=vggish_params.SAMPLE_RATE,
      log_offset=vggish_params.LOG_OFFSET,
      window_length_secs=vggish_params.STFT_WINDOW_LENGTH_SECONDS,
      hop_length_secs=vggish_params.STFT_HOP_LENGTH_SECONDS,
      num_mel_bins=vggish_params.NUM_MEL_BINS,
      lower_edge_hertz=vggish_params.MEL_MIN_HZ,
      upper_edge_hertz=vggish_params.MEL_MAX_HZ)

  # Frame features into examples: window/hop converted from seconds to
  # spectrogram-frame counts.
  features_sample_rate = 1.0 / vggish_params.STFT_HOP_LENGTH_SECONDS
  example_window_length = int(round(
      vggish_params.EXAMPLE_WINDOW_SECONDS * features_sample_rate))
  example_hop_length = int(round(
      vggish_params.EXAMPLE_HOP_SECONDS * features_sample_rate))
  log_mel_examples = mel_features.frame(
      log_mel,
      window_length=example_window_length,
      hop_length=example_hop_length)
  return log_mel_examples
Esempio n. 3
0
def wavfile_to_examples(wav_file):
    """Read a 16-bit PCM wav file and return framed log mel examples.

    Args:
        wav_file: path to (or file object of) a wav file with int16 samples.

    Returns:
        3-D np.array [num_examples, num_frames, num_bands] of log mel
        spectrogram patches.
    """
    sample_rate, wav_data = wavfile.read(wav_file)
    assert wav_data.dtype == np.int16, 'Bad sample type: %r' % wav_data.dtype
    samples = wav_data / 32768.0  # Scale int16 PCM into [-1.0, +1.0].

    # Mix down to mono if multi-channel.
    if len(samples.shape) > 1:
        samples = np.mean(samples, axis=1)
    # Resample to the rate assumed by VGGish.
    if sample_rate != SAMPLE_RATE:
        samples = resampy.resample(samples, sample_rate, SAMPLE_RATE)

    # Log mel spectrogram of the whole clip.
    log_mel = mel_features.log_mel_spectrogram(
        samples,
        audio_sample_rate=SAMPLE_RATE,
        log_offset=LOG_OFFSET,
        window_length_secs=STFT_WINDOW_LENGTH_SECONDS,
        hop_length_secs=STFT_HOP_LENGTH_SECONDS,
        num_mel_bins=NUM_MEL_BINS,
        lower_edge_hertz=MEL_MIN_HZ,
        upper_edge_hertz=MEL_MAX_HZ)

    # Frame spectrogram frames into fixed-length examples.
    spectrogram_rate = 1.0 / STFT_HOP_LENGTH_SECONDS
    window_length = int(round(EXAMPLE_WINDOW_SECONDS * spectrogram_rate))
    hop_length = int(round(EXAMPLE_HOP_SECONDS * spectrogram_rate))
    return mel_features.frame(log_mel,
                              window_length=window_length,
                              hop_length=hop_length)
Esempio n. 4
0
def shorter_waveform_to_examples(data):
    """
    Compute framed log mel spectrogram examples for one short audio clip.

    Input: short audio data.
    Output: list of spectrograms in this short audio, each spanning
    params.EXAMPLE_WINDOW_SECONDS and hopped by params.EXAMPLE_HOP_SECONDS.
    """
    # Log mel spectrogram of the whole short clip.
    log_mel = mel_features.log_mel_spectrogram(
        data,
        audio_sample_rate=params.SAMPLE_RATE,
        log_offset=params.LOG_OFFSET,
        window_length_secs=params.STFT_WINDOW_LENGTH_SECONDS,
        hop_length_secs=params.STFT_HOP_LENGTH_SECONDS,
        num_mel_bins=params.NUM_MEL_BINS,  # forced number of mel bins
        lower_edge_hertz=params.MEL_MIN_HZ,
        upper_edge_hertz=params.MEL_MAX_HZ)

    # Spectrogram frames per second, then example window/hop in frames.
    fps = 1.0 / params.STFT_HOP_LENGTH_SECONDS
    win_frames = int(round(params.EXAMPLE_WINDOW_SECONDS * fps))
    hop_frames = int(round(params.EXAMPLE_HOP_SECONDS * fps))
    return mel_features.frame(log_mel,
                              window_length=win_frames,
                              hop_length=hop_frames)
Esempio n. 5
0
def segment_long_audio(wav_file):
    """Segment a long audio file into overlapping short clips.

    Clips last params.SHORT_AUDIO_WINDOW_LENGTH_MIN minutes and are hopped
    by params.SHORT_AUDIO_HOP_LENGTH_MIN minutes.

    Args:
        wav_file: path to (or file object of) a wav file with int16 samples.

    Returns:
        2-D np.array of short-clip waveforms, or the int 0 when the file is
        empty (legacy sentinel kept for existing callers).
    """
    sample_rate, wav_data = wavfile.read(wav_file)  # single audio file
    assert wav_data.dtype == np.int16, 'Bad sample type: %r' % wav_data.dtype
    data = wav_data / 32768.0  # Convert to [-1.0, +1.0]
    # Convert to mono.
    if len(data.shape) > 1:
        data = np.mean(data, axis=1)
    if len(data) == 0:
        return 0

    # BUG FIX: the guard compared against params.SAMPLE_RATE but the resample
    # target was vggish_params.SAMPLE_RATE; use params consistently so the
    # check and the target rate cannot diverge.
    if sample_rate != params.SAMPLE_RATE:
        data = resampy.resample(data, sample_rate, params.SAMPLE_RATE)

    # Frame the long waveform into shorter clips. int(round(...)) guards
    # against fractional window/hop settings (minutes may be non-integer).
    window_samples = int(round(
        params.SHORT_AUDIO_WINDOW_LENGTH_MIN * 60 * params.SAMPLE_RATE))
    hop_samples = int(round(
        params.SHORT_AUDIO_HOP_LENGTH_MIN * 60 * params.SAMPLE_RATE))
    return mel_features.frame(
        data,
        window_length=window_samples,
        hop_length=hop_samples)
Esempio n. 6
0
def _waveform_to_mel_spectrogram_segments(data, sample_rate):
    """
    Converts audio from a single wav file into an array of examples for VGGish.

    Args:
        data: np.array of either one dimension (mono) or two dimensions
          (multi-channel, with the outer dimension representing channels).
          Each sample is generally expected to lie in the range [-1.0, +1.0],
          although this is not required. Shape is (num_frame, )
        sample_rate: Sample rate of data.

    Returns:
        3-D np.array of shape [num_examples, num_frames, num_bands] which represents
        a sequence of examples, each of which contains a patch of log mel
        spectrogram, covering num_frames frames of audio and num_bands mel frequency
        bands, where the frame length is mel_params.STFT_HOP_LENGTH_SECONDS.

    IMPORTANT: if data.shape < (80000, ) then log_mel_examples.shape=(0, 496, 64).
        The zero is problematic downstream, so code will have to check for that.
    """
    # Mono mixdown when multi-channel input is supplied.
    if len(data.shape) > 1:
        data = np.mean(data, axis=1)

    # Bring the waveform to the sample rate assumed by VGGish.
    if sample_rate != mel_params.SAMPLE_RATE:
        data = resampy.resample(data, sample_rate, mel_params.SAMPLE_RATE)

    # Full-clip log mel spectrogram.
    log_mel = log_mel_spectrogram(data,
                                  audio_sample_rate=mel_params.SAMPLE_RATE,
                                  log_offset=mel_params.LOG_OFFSET,
                                  window_length_secs=mel_params.STFT_WINDOW_LENGTH_SECONDS,
                                  hop_length_secs=mel_params.STFT_HOP_LENGTH_SECONDS,
                                  num_mel_bins=mel_params.NUM_MEL_BINS,
                                  lower_edge_hertz=mel_params.MEL_MIN_HZ,
                                  upper_edge_hertz=mel_params.MEL_MAX_HZ)

    # Segment spectrogram frames into fixed-length examples; convert the
    # example window/hop from seconds to spectrogram-frame counts.
    spectrogram_fps = 1.0 / mel_params.STFT_HOP_LENGTH_SECONDS
    window_frames = int(
        round(mel_params.EXAMPLE_WINDOW_SECONDS * spectrogram_fps))
    hop_frames = int(
        round(mel_params.EXAMPLE_HOP_SECONDS * spectrogram_fps))

    # frame() yields an empty leading axis when log_mel holds fewer rows than
    # one window; warn so downstream code can handle the degenerate shape.
    log_mel_examples = frame(log_mel,
                             window_length=window_frames,
                             hop_length=hop_frames)
    if log_mel_examples.shape[0] == 0:
        print('\nWARNING: audio sample too short! Using all zeros for that example.\n')
    return log_mel_examples
Esempio n. 7
0
def waveform_to_examples(data, sample_rate):
  """Converts audio waveform into an array of examples for VGGish.

  Args:
    data: np.array of either one dimension (mono) or two dimensions
      (multi-channel, with the outer dimension representing channels).
      Each sample is generally expected to lie in the range [-1.0, +1.0],
      although this is not required.
    sample_rate: Sample rate of data.

  Returns:
    3-D np.array of shape [num_examples, num_frames, num_bands] which
    represents a sequence of examples, each containing a patch of log mel
    spectrogram covering num_frames frames of audio and num_bands mel
    frequency bands, where the frame length is
    vggish_params.STFT_HOP_LENGTH_SECONDS.
  """
  vprint('waveform_to_examples input data shape')
  vprint(data.shape)

  # Mono mixdown, then resample to the rate assumed by VGGish.
  if len(data.shape) > 1:
    data = np.mean(data, axis=1)
  if sample_rate != vggish_params.SAMPLE_RATE:
    data = resampy.resample(data, sample_rate, vggish_params.SAMPLE_RATE)

  vprint('waveform_to_examples resampled mono shape')
  vprint(data.shape)

  # Full-clip log mel spectrogram.
  log_mel = mel_features.log_mel_spectrogram(
      data,
      audio_sample_rate=vggish_params.SAMPLE_RATE,
      log_offset=vggish_params.LOG_OFFSET,
      window_length_secs=vggish_params.STFT_WINDOW_LENGTH_SECONDS,
      hop_length_secs=vggish_params.STFT_HOP_LENGTH_SECONDS,
      num_mel_bins=vggish_params.NUM_MEL_BINS,
      lower_edge_hertz=vggish_params.MEL_MIN_HZ,
      upper_edge_hertz=vggish_params.MEL_MAX_HZ)

  vprint('waveform_to_examples log_mel shape')
  vprint(log_mel.shape)

  # Chop the spectrogram into fixed-length example windows.
  frames_per_sec = 1.0 / vggish_params.STFT_HOP_LENGTH_SECONDS
  win_len = int(round(vggish_params.EXAMPLE_WINDOW_SECONDS * frames_per_sec))
  hop_len = int(round(vggish_params.EXAMPLE_HOP_SECONDS * frames_per_sec))
  log_mel_examples = mel_features.frame(
      log_mel, window_length=win_len, hop_length=hop_len)

  vprint('waveform_to_examples log_mel reshaped')
  vprint(log_mel_examples.shape)
  return log_mel_examples
Esempio n. 8
0
def waveform_to_examples(data, sample_rate):
  """Converts audio waveform into an array of examples for VGGish.

  Args:
    data: np.array of either one dimension (mono) or two dimensions
      (multi-channel, with the outer dimension representing channels).
      Each sample is generally expected to lie in the range [-1.0, +1.0],
      although this is not required.
    sample_rate: Sample rate of data.

  Returns:
    3-D np.array of shape [num_examples, num_frames, num_bands] which
    represents a sequence of examples, each containing a patch of log mel
    spectrogram covering num_frames frames of audio and num_bands mel
    frequency bands, where the frame length is
    vggish_params.STFT_HOP_LENGTH_SECONDS.
  """
  # Multi-channel input is averaged down to a single channel.
  if len(data.shape) > 1:
    data = np.mean(data, axis=1)
  # VGGish features assume a fixed sample rate; resample when needed.
  if sample_rate != vggish_params.SAMPLE_RATE:
    data = resampy.resample(data, sample_rate, vggish_params.SAMPLE_RATE)

  # Full-clip log mel spectrogram.
  log_mel = mel_features.log_mel_spectrogram(
      data,
      audio_sample_rate=vggish_params.SAMPLE_RATE,
      log_offset=vggish_params.LOG_OFFSET,
      window_length_secs=vggish_params.STFT_WINDOW_LENGTH_SECONDS,
      hop_length_secs=vggish_params.STFT_HOP_LENGTH_SECONDS,
      num_mel_bins=vggish_params.NUM_MEL_BINS,
      lower_edge_hertz=vggish_params.MEL_MIN_HZ,
      upper_edge_hertz=vggish_params.MEL_MAX_HZ)

  # Spectrogram frames per second, then example window/hop in frames.
  fps = 1.0 / vggish_params.STFT_HOP_LENGTH_SECONDS
  window = int(round(vggish_params.EXAMPLE_WINDOW_SECONDS * fps))
  hop = int(round(vggish_params.EXAMPLE_HOP_SECONDS * fps))
  return mel_features.frame(log_mel, window_length=window, hop_length=hop)
Esempio n. 9
0
def wavform_to_frames(data, sample_rate):
    """Converts audio waveform into an array of windows to be linked with
    VGGish features.

    Args:
        data: np.array, 1-D (mono) or 2-D (multi-channel, channels in the
          outer dimension per the mean over axis 1).
        sample_rate: sample rate of `data` in Hz.

    Returns:
        2-D np.array of raw-waveform windows sized/hopped to match the STFT
        configuration in vggish_params.
    """
    # Mono mixdown, then resample to the rate assumed by VGGish.
    if len(data.shape) > 1:
        data = np.mean(data, axis=1)
    if sample_rate != vggish_params.SAMPLE_RATE:
        data = resampy.resample(data, sample_rate, vggish_params.SAMPLE_RATE)

    # Window/hop sizes in samples, mirroring the STFT settings.
    rate = vggish_params.SAMPLE_RATE
    win_samples = int(round(rate * vggish_params.STFT_WINDOW_LENGTH_SECONDS))
    hop_samples = int(round(rate * vggish_params.STFT_HOP_LENGTH_SECONDS))

    return mel_features.frame(data, win_samples, hop_samples)
Esempio n. 10
0
def wavfile_to_examples(wav_file):
    """Read an int16 wav file and return framed log mel (FBANK) examples.

    The log mel spectrogram is zero-padded up to a multiple of
    params.NUM_FRAMES rows before being split into examples.

    Args:
        wav_file: path to (or file object of) a wav file with int16 samples.

    Returns:
        3-D np.array [num_examples, num_frames, num_bands], or the int 0
        when the file contains no samples (legacy sentinel kept).
    """
    sample_rate, wav_data = wavfile.read(wav_file)
    assert wav_data.dtype == np.int16, 'Bad sample type: %r' % wav_data.dtype
    data = wav_data / 32768.0  # Convert to [-1.0, +1.0]
    # Mono mixdown.
    if len(data.shape) > 1:
        data = np.mean(data, axis=1)
    if len(data) == 0:
        return 0
    if sample_rate != params.SAMPLE_RATE:
        data = resampy.resample(data, sample_rate, params.SAMPLE_RATE)

    # Log mel spectrogram (log FBANK) of the whole clip.
    log_mel = mel_features.log_mel_spectrogram(
        data,
        audio_sample_rate=params.SAMPLE_RATE,
        log_offset=params.LOG_OFFSET,
        window_length_secs=params.STFT_WINDOW_LENGTH_SECONDS,
        hop_length_secs=params.STFT_HOP_LENGTH_SECONDS,
        num_mel_bins=params.NUM_MEL_BINS,  # here forced the num_mel_bins
        lower_edge_hertz=params.MEL_MIN_HZ,
        upper_edge_hertz=params.MEL_MAX_HZ)

    spectrogram_fps = 1.0 / params.STFT_HOP_LENGTH_SECONDS
    window_frames = int(round(params.EXAMPLE_WINDOW_SECONDS * spectrogram_fps))
    hop_frames = int(round(params.EXAMPLE_HOP_SECONDS * spectrogram_fps))

    # Zero-pad the spectrogram up to the next multiple of params.NUM_FRAMES.
    # NOTE(review): this presumes example_window_length == params.NUM_FRAMES
    # so the final partial example is kept rather than dropped -- confirm.
    leftover = log_mel.shape[0] % params.NUM_FRAMES
    if leftover:
        padded = np.zeros((log_mel.shape[0] + params.NUM_FRAMES - leftover,
                           log_mel.shape[1]))
        padded[:log_mel.shape[0], :] = log_mel
        log_mel = padded

    return mel_features.frame(log_mel,
                              window_length=window_frames,
                              hop_length=hop_frames)
Esempio n. 11
0
    # NOTE(review): this excerpt starts mid-function -- `sess`,
    # `num_example_to_gen`, `features_tensor` and `embedding_tensor` must be
    # defined upstream of what is visible here.
    frames_list = []
    embeddings_list = []

    for i in range(0, num_example_to_gen):
        #generate audio examples and get feature tensors for each
        example_waveform = generate_audio.gen_audio(2, 16000)
        examples_batch = vggish_input.waveform_to_examples(
            example_waveform, 16000)

        # Run inference and postprocessing.
        [embedding_batch
         ] = sess.run([embedding_tensor],
                      feed_dict={features_tensor: examples_batch})

        # Frame the raw waveform into non-overlapping 0.96 s windows
        # (window == hop), presumably so each window lines up with one
        # VGGish embedding -- TODO confirm against the embedding hop.
        audio_frames = mel_features.frame(example_waveform, int(0.96 * 16000),
                                          int(0.96 * 16000))
        #audio = audio[0:num_frames_to_keep]
        frames_list.append(audio_frames)
        embeddings_list.append(embedding_batch)
        print('adding number ' + str(i))

        #print(embedding_batch)
        #postprocessed_batch = pproc.postprocess(embedding_batch)
        #print(postprocessed_batch)

    #100,000,000 floats should be about 200mb. That's about 6250*16000 = 100,000,000
    #Will size output into numpy arrays roughly 200mb each, later to be used as TFrecord objects which like to be around that size.

    #convert to numpy and write to disk
    # Collapse the per-example axis: result is [total_windows, window_len].
    frames_array = np.array(frames_list)
    frames_array = np.reshape(frames_array, (-1, frames_list[0].shape[1]))
def waveform_to_examples(data, sample_rate):
    """Converts audio waveform into an array of examples for VGGish.

    Unlike the stock pipeline, this variant first re-serializes the waveform
    as overlapping one-second chunks hopped by half a second (duplicating
    interior samples, roughly doubling the length), then zero-pads before
    feature extraction.

    Args:
      data: np.array of either one dimension (mono) or two dimensions
        (multi-channel, with the outer dimension representing channels).
        Each sample is generally expected to lie in the range [-1.0, +1.0],
        although this is not required.
      sample_rate: Sample rate of data.

    Returns:
      A 3-tuple of:
      - int: length of the waveform after resampling, before chunking/padding.
      - int: length of the chunked + padded waveform that was featurized.
      - 3-D np.array of shape [num_examples, num_frames, num_bands]: a
        sequence of log mel spectrogram patches, each covering num_frames
        frames of audio and num_bands mel frequency bands, with frame length
        vggish_params.STFT_HOP_LENGTH_SECONDS.
    """
    # Convert to mono.
    if len(data.shape) > 1:
        data = np.mean(data, axis=1)
    # Resample to the rate assumed by VGGish.
    if sample_rate != vggish_params.SAMPLE_RATE:
        data = resampy.resample(data, sample_rate, vggish_params.SAMPLE_RATE)

    ######################################################################
    # Re-serialize the waveform as overlapping chunks: each chunk covers one
    # second of audio and chunks start every half second, so interior samples
    # are emitted twice.
    olength = len(data)
    temp_data = []
    OVERLAP_SAMPLE_RATE = int(0.5 * vggish_params.SAMPLE_RATE)
    for i in range(0, len(data), OVERLAP_SAMPLE_RATE):
        end = i + vggish_params.SAMPLE_RATE
        chunk = data[i:min(end, len(data))]
        temp_data.extend(chunk)

    # Zero-pad up to a chunk boundary.
    # NOTE(review): when len(temp_data) is already a multiple of
    # OVERLAP_SAMPLE_RATE this still appends a full extra second of zeros --
    # confirm that is intended.
    pad_length = vggish_params.SAMPLE_RATE - (len(temp_data) %
                                              OVERLAP_SAMPLE_RATE)
    temp_data = np.asarray(temp_data)
    # limit = int(np.ceil(2*len(data)/float(vggish_params.SAMPLE_RATE)))
    data = np.pad(temp_data, (0, pad_length), 'constant')

    ######################################################################
    # Compute log mel spectrogram features.

    log_mel = mel_features.log_mel_spectrogram(
        data,
        audio_sample_rate=vggish_params.SAMPLE_RATE,
        log_offset=vggish_params.LOG_OFFSET,
        window_length_secs=vggish_params.STFT_WINDOW_LENGTH_SECONDS,
        hop_length_secs=vggish_params.STFT_HOP_LENGTH_SECONDS,
        num_mel_bins=vggish_params.NUM_MEL_BINS,
        lower_edge_hertz=vggish_params.MEL_MIN_HZ,
        upper_edge_hertz=vggish_params.MEL_MAX_HZ)

    # Frame features into examples.
    features_sample_rate = 1.0 / vggish_params.STFT_HOP_LENGTH_SECONDS
    example_window_length = int(
        round(vggish_params.EXAMPLE_WINDOW_SECONDS * features_sample_rate))
    example_hop_length = int(
        round(vggish_params.EXAMPLE_HOP_SECONDS * features_sample_rate))
    log_mel_examples = mel_features.frame(log_mel,
                                          window_length=example_window_length,
                                          hop_length=example_hop_length)
    return olength, len(data), log_mel_examples
Esempio n. 13
0
def waveform_to_examples(data, sample_rate, file_path):
    """Converts audio waveform into an array of examples for VGGish.

    Args:
        data: np.array of either one dimension (mono) or two dimensions
          (multi-channel, with the outer dimension representing channels).
          Each sample is generally expected to lie in the range [-1.0, +1.0],
          although this is not required.
        sample_rate: Sample rate of data.
        file_path: path of the source file; only its basename is recorded
          in the returned parameter dict.

    Returns:
        A 2-tuple of:
        - dict recording the feature-extraction parameters used, plus the
          raw log mel spectrogram under the "log_mel" key.
        - 3-D np.array [num_examples, num_frames, num_bands] of log mel
          spectrogram patches, with frame length
          vggish_params.STFT_HOP_LENGTH_SECONDS.
    """
    # Mix down to mono.
    if len(data.shape) > 1:
        data = np.mean(data, axis=1)
    # Resample to the rate assumed by VGGish.
    if sample_rate != vggish_params.SAMPLE_RATE:
        data = resampy.resample(data, sample_rate, vggish_params.SAMPLE_RATE)

    # Snapshot the extraction parameters once; the same values feed the
    # feature computation and the returned dict.
    audio_sample_rate = vggish_params.SAMPLE_RATE
    log_offset = vggish_params.LOG_OFFSET
    window_length_secs = vggish_params.STFT_WINDOW_LENGTH_SECONDS
    hop_length_secs = vggish_params.STFT_HOP_LENGTH_SECONDS
    num_mel_bins = vggish_params.NUM_MEL_BINS
    lower_edge_hertz = vggish_params.MEL_MIN_HZ
    upper_edge_hertz = vggish_params.MEL_MAX_HZ

    # Compute log mel spectrogram features.
    log_mel = mel_features.log_mel_spectrogram(
        data,
        audio_sample_rate=audio_sample_rate,
        log_offset=log_offset,
        window_length_secs=window_length_secs,
        hop_length_secs=hop_length_secs,
        num_mel_bins=num_mel_bins,
        lower_edge_hertz=lower_edge_hertz,
        upper_edge_hertz=upper_edge_hertz)

    # Frame the spectrogram into fixed-length examples.
    features_sample_rate = 1.0 / hop_length_secs
    example_window_length = int(
        round(vggish_params.EXAMPLE_WINDOW_SECONDS * features_sample_rate))
    example_hop_length = int(
        round(vggish_params.EXAMPLE_HOP_SECONDS * features_sample_rate))
    log_mel_examples = mel_features.frame(log_mel,
                                          window_length=example_window_length,
                                          hop_length=example_hop_length)

    # NOTE(review): upper_edge_hertz is computed but not recorded in the
    # dict -- confirm whether its omission is intentional.
    output_csv_dict = {
        "file_name": os.path.basename(file_path),
        "audio_sample_rate": audio_sample_rate,
        "log_offset": log_offset,
        "window_length_secs": window_length_secs,
        "hop_length_secs": hop_length_secs,
        "num_mel_bins": num_mel_bins,
        "lower_edge_hertz": lower_edge_hertz,
        "log_mel": log_mel
    }

    #dict_to_csv(output_csv_dict)

    return output_csv_dict, log_mel_examples