Ejemplo n.º 1
0
def shorter_waveform_to_examples(data):
    """Turn one short audio clip into framed log-mel spectrogram examples.

    Args:
        data: Audio samples of a single short clip. They are handed to
            mel_features.log_mel_spectrogram unchanged; no mono conversion
            or resampling happens here, so the samples are treated as being
            at params.SAMPLE_RATE.

    Returns:
        The result of mel_features.frame applied to the clip's log-mel
        spectrogram: windows spanning params.EXAMPLE_WINDOW_SECONDS, stepped
        by params.EXAMPLE_HOP_SECONDS.
    """
    spectrogram_options = dict(
        audio_sample_rate=params.SAMPLE_RATE,
        log_offset=params.LOG_OFFSET,
        window_length_secs=params.STFT_WINDOW_LENGTH_SECONDS,
        hop_length_secs=params.STFT_HOP_LENGTH_SECONDS,
        num_mel_bins=params.NUM_MEL_BINS,  # number of mel bins is fixed by params
        lower_edge_hertz=params.MEL_MIN_HZ,
        upper_edge_hertz=params.MEL_MAX_HZ,
    )
    spectrogram = mel_features.log_mel_spectrogram(data, **spectrogram_options)

    # Number of spectrogram frames produced per second of audio.
    frames_per_second = 1.0 / params.STFT_HOP_LENGTH_SECONDS

    # Express the example window and hop in spectrogram frames.
    window_frames = int(round(params.EXAMPLE_WINDOW_SECONDS * frames_per_second))
    hop_frames = int(round(params.EXAMPLE_HOP_SECONDS * frames_per_second))

    return mel_features.frame(
        spectrogram,
        window_length=window_frames,
        hop_length=hop_frames)
Ejemplo n.º 2
0
def wavfile_to_examples(wav_file):
    """Read a 16-bit WAV file and convert it into framed log-mel examples.

    Args:
        wav_file: Whatever wavfile.read accepts as its source argument.

    Returns:
        The result of mel_features.frame applied to the file's log-mel
        spectrogram, using windows of EXAMPLE_WINDOW_SECONDS stepped by
        EXAMPLE_HOP_SECONDS.

    Raises:
        AssertionError: If the decoded samples are not int16.
    """
    source_rate, raw_samples = wavfile.read(wav_file)
    assert raw_samples.dtype == np.int16, 'Bad sample type: %r' % raw_samples.dtype

    # Scale int16 samples into the [-1.0, +1.0] range.
    samples = raw_samples / 32768.0

    # Average the channels when the file is not mono.
    if samples.ndim > 1:
        samples = np.mean(samples, axis=1)

    # Bring the audio to the module-level SAMPLE_RATE.
    if source_rate != SAMPLE_RATE:
        samples = resampy.resample(samples, source_rate, SAMPLE_RATE)

    spectrogram = mel_features.log_mel_spectrogram(
        samples,
        audio_sample_rate=SAMPLE_RATE,
        log_offset=LOG_OFFSET,
        window_length_secs=STFT_WINDOW_LENGTH_SECONDS,
        hop_length_secs=STFT_HOP_LENGTH_SECONDS,
        num_mel_bins=NUM_MEL_BINS,
        lower_edge_hertz=MEL_MIN_HZ,
        upper_edge_hertz=MEL_MAX_HZ,
    )

    # Express the example window and hop in spectrogram frames.
    frames_per_second = 1.0 / STFT_HOP_LENGTH_SECONDS
    window_frames = int(round(EXAMPLE_WINDOW_SECONDS * frames_per_second))
    hop_frames = int(round(EXAMPLE_HOP_SECONDS * frames_per_second))

    return mel_features.frame(
        spectrogram,
        window_length=window_frames,
        hop_length=hop_frames)
Ejemplo n.º 3
0
def waveform_to_examples(data, sample_rate):
  """Converts an audio waveform into framed log mel spectrogram examples.

  Args:
    data: Array of audio samples. If it has more than one dimension it is
      averaged over axis 1 to obtain a mono signal.
    sample_rate: Not used by this implementation. No resampling is done;
      the samples are handed to the spectrogram code as if they were
      already at vggish_params.SAMPLE_RATE.

  Returns:
    The result of mel_features.frame applied to the log mel spectrogram,
    using windows of vggish_params.EXAMPLE_WINDOW_SECONDS stepped by
    vggish_params.EXAMPLE_HOP_SECONDS.
  """
  # Collapse multi-channel audio to mono.
  is_multichannel = len(data.shape) > 1
  if is_multichannel:
    data = np.mean(data, axis=1)

  spectrogram_options = dict(
      audio_sample_rate=vggish_params.SAMPLE_RATE,
      log_offset=vggish_params.LOG_OFFSET,
      window_length_secs=vggish_params.STFT_WINDOW_LENGTH_SECONDS,
      hop_length_secs=vggish_params.STFT_HOP_LENGTH_SECONDS,
      num_mel_bins=vggish_params.NUM_MEL_BINS,
      lower_edge_hertz=vggish_params.MEL_MIN_HZ,
      upper_edge_hertz=vggish_params.MEL_MAX_HZ,
  )
  spectrogram = mel_features.log_mel_spectrogram(data, **spectrogram_options)

  # Express the example window and hop in spectrogram frames.
  frames_per_second = 1.0 / vggish_params.STFT_HOP_LENGTH_SECONDS
  window_frames = int(round(
      vggish_params.EXAMPLE_WINDOW_SECONDS * frames_per_second))
  hop_frames = int(round(
      vggish_params.EXAMPLE_HOP_SECONDS * frames_per_second))

  return mel_features.frame(
      spectrogram,
      window_length=window_frames,
      hop_length=hop_frames)
Ejemplo n.º 4
0
def preprocess_sound(data, sample_rate):
    """Convert a waveform into framed log-mel spectrogram examples.

    Args:
        data: Array of audio samples. If it has more than one dimension it
            is averaged over axis 1 to obtain a mono signal.
        sample_rate: Sample rate of `data`; the audio is resampled with
            resampy when it differs from the module-level SAMPLE_RATE.

    Returns:
        The result of mel_features.frame applied to the log-mel spectrogram,
        using windows of EXAMPLE_WINDOW_SECONDS stepped by
        EXAMPLE_HOP_SECONDS.
    """
    # Collapse multi-channel audio to mono.
    mono = np.mean(data, axis=1) if len(data.shape) > 1 else data

    # Bring the audio to the module-level SAMPLE_RATE.
    if sample_rate != SAMPLE_RATE:
        mono = resampy.resample(mono, sample_rate, SAMPLE_RATE)

    spectrogram = mel_features.log_mel_spectrogram(
        mono,
        audio_sample_rate=SAMPLE_RATE,
        log_offset=LOG_OFFSET,
        window_length_secs=STFT_WINDOW_LENGTH_SECONDS,
        hop_length_secs=STFT_HOP_LENGTH_SECONDS,
        num_mel_bins=NUM_MEL_BINS,
        lower_edge_hertz=MEL_MIN_HZ,
        upper_edge_hertz=MEL_MAX_HZ,
    )

    # Express the example window and hop in spectrogram frames.
    frames_per_second = 1.0 / STFT_HOP_LENGTH_SECONDS
    window_frames = int(round(EXAMPLE_WINDOW_SECONDS * frames_per_second))
    hop_frames = int(round(EXAMPLE_HOP_SECONDS * frames_per_second))

    return mel_features.frame(
        spectrogram,
        window_length=window_frames,
        hop_length=hop_frames)
Ejemplo n.º 5
0
def _waveform_to_mel_spectrogram_segments(data, sample_rate):
    """Convert the audio of one wav file into log-mel spectrogram segments.

    Args:
        data: Array of audio samples. If it has more than one dimension it
            is averaged over axis 1 to obtain a mono signal.
        sample_rate: Sample rate of `data`; the audio is resampled with
            resampy when it differs from mel_params.SAMPLE_RATE.

    Returns:
        The result of `frame` applied to the log-mel spectrogram, using
        windows of mel_params.EXAMPLE_WINDOW_SECONDS stepped by
        mel_params.EXAMPLE_HOP_SECONDS.

    IMPORTANT: when the audio is too short to fill a single window, the
        returned array has shape[0] == 0. This function only prints a
        warning in that case and still returns the empty array unchanged,
        so callers must check for it themselves.
    """
    # Collapse multi-channel audio to mono.
    if len(data.shape) > 1:
        data = np.mean(data, axis=1)

    target_rate = mel_params.SAMPLE_RATE
    if sample_rate != target_rate:
        data = resampy.resample(data, sample_rate, mel_params.SAMPLE_RATE)

    spectrogram = log_mel_spectrogram(
        data,
        audio_sample_rate=mel_params.SAMPLE_RATE,
        log_offset=mel_params.LOG_OFFSET,
        window_length_secs=mel_params.STFT_WINDOW_LENGTH_SECONDS,
        hop_length_secs=mel_params.STFT_HOP_LENGTH_SECONDS,
        num_mel_bins=mel_params.NUM_MEL_BINS,
        lower_edge_hertz=mel_params.MEL_MIN_HZ,
        upper_edge_hertz=mel_params.MEL_MAX_HZ,
    )

    # Express the example window and hop in spectrogram frames.
    frames_per_second = 1.0 / mel_params.STFT_HOP_LENGTH_SECONDS
    window_frames = int(
        round(mel_params.EXAMPLE_WINDOW_SECONDS * frames_per_second))
    hop_frames = int(
        round(mel_params.EXAMPLE_HOP_SECONDS * frames_per_second))

    segments = frame(spectrogram,
                     window_length=window_frames,
                     hop_length=hop_frames)

    # Too few spectrogram frames for even one window yields zero segments.
    no_segments = segments.shape[0] == 0
    if no_segments:
        print('\nWARNING: audio sample too short! Using all zeros for that example.\n')
    return segments
Ejemplo n.º 6
0
def waveform_to_examples(data, sample_rate):
  """Converts an audio waveform into framed log mel examples for VGGish.

  The array shape is reported through vprint after each processing stage.

  Args:
    data: Array of audio samples. If it has more than one dimension it is
      averaged over axis 1 to obtain a mono signal.
    sample_rate: Sample rate of data; the audio is resampled with resampy
      when it differs from vggish_params.SAMPLE_RATE.

  Returns:
    The result of mel_features.frame applied to the log mel spectrogram,
    using windows of vggish_params.EXAMPLE_WINDOW_SECONDS stepped by
    vggish_params.EXAMPLE_HOP_SECONDS.
  """
  vprint('waveform_to_examples input data shape')
  vprint(data.shape)

  # Collapse multi-channel audio to mono.
  if len(data.shape) > 1:
    data = np.mean(data, axis=1)

  # Bring the audio to the rate the rest of the pipeline assumes.
  target_rate = vggish_params.SAMPLE_RATE
  if sample_rate != target_rate:
    data = resampy.resample(data, sample_rate, vggish_params.SAMPLE_RATE)

  vprint('waveform_to_examples resampled mono shape')
  vprint(data.shape)

  spectrogram_options = dict(
      audio_sample_rate=vggish_params.SAMPLE_RATE,
      log_offset=vggish_params.LOG_OFFSET,
      window_length_secs=vggish_params.STFT_WINDOW_LENGTH_SECONDS,
      hop_length_secs=vggish_params.STFT_HOP_LENGTH_SECONDS,
      num_mel_bins=vggish_params.NUM_MEL_BINS,
      lower_edge_hertz=vggish_params.MEL_MIN_HZ,
      upper_edge_hertz=vggish_params.MEL_MAX_HZ,
  )
  spectrogram = mel_features.log_mel_spectrogram(data, **spectrogram_options)

  vprint('waveform_to_examples log_mel shape')
  vprint(spectrogram.shape)

  # Express the example window and hop in spectrogram frames.
  frames_per_second = 1.0 / vggish_params.STFT_HOP_LENGTH_SECONDS
  window_frames = int(round(
      vggish_params.EXAMPLE_WINDOW_SECONDS * frames_per_second))
  hop_frames = int(round(
      vggish_params.EXAMPLE_HOP_SECONDS * frames_per_second))
  examples = mel_features.frame(
      spectrogram,
      window_length=window_frames,
      hop_length=hop_frames)

  vprint('waveform_to_examples log_mel reshaped')
  vprint(examples.shape)
  return examples
Ejemplo n.º 7
0
def getmelspectrogram(src):
    """Compute a log-mel spectrogram of `src` quantized to uint8.

    The log-mel values are shifted by -log(1e-3) (the same value passed as
    log_offset), multiplied by 30, rounded up, and stored as uint8.

    Args:
        src: Audio samples, passed unchanged to
            mel_features.log_mel_spectrogram with a fixed 16000 Hz rate.

    Returns:
        np.ndarray of dtype uint8 holding the scaled spectrogram. Values
        that would fall outside [0, 255] are saturated to that range.
    """
    log_mel = mel_features.log_mel_spectrogram(src,
                                               audio_sample_rate=16000,
                                               log_offset=0.001,
                                               window_length_secs=0.025,
                                               hop_length_secs=0.010,
                                               num_mel_bins=32,
                                               lower_edge_hertz=60,
                                               upper_edge_hertz=3800)
    scaled = 30 * (log_mel - np.log(1e-3))
    # Casting an out-of-range float to uint8 wraps around (or is undefined,
    # depending on the platform), which would turn loud bins into arbitrary
    # small values. Saturate to the representable range before the cast.
    quantized = np.clip(np.ceil(scaled), 0, 255)
    return quantized.astype(np.uint8)
Ejemplo n.º 8
0
 def _compute_spectrogram(self, audio_samples, audio_sample_rate_hz):
     """Compute a shifted and scaled log-mel spectrogram.

     The samples are flattened and divided by 2**15 before the spectrogram
     is computed. The log-mel values are then shifted by -log(1e-3) and
     multiplied by 30. No integer cast is performed here; the scaled array
     is returned as produced by that arithmetic.

     Args:
         audio_samples: Array of audio samples (must support .flatten()).
         audio_sample_rate_hz: Sample rate forwarded to
             mel_features.log_mel_spectrogram.

     Returns:
         The scaled log-mel spectrogram.
     """
     normalized = audio_samples.flatten() / float(2**15)
     log_mel = mel_features.log_mel_spectrogram(
         normalized,
         audio_sample_rate_hz,
         log_offset=0.001,
         window_length_secs=self.spectrogram_window_length_seconds,
         hop_length_secs=self.spectrogram_hop_length_seconds,
         num_mel_bins=self.num_mel_bins,
         lower_edge_hertz=60,
         upper_edge_hertz=3800)
     return 30 * (log_mel - np.log(1e-3))
Ejemplo n.º 9
0
 def generate_mel_spectogram(self, config, filtered_signal):
     """Compute the log-mel spectrogram of an already filtered signal.

     Args:
         config: Nested mapping; config["pre_process"]["sample_rate"] is
             used as the audio sample rate.
         filtered_signal: Audio samples passed to log_mel_spectrogram.

     Returns:
         The result of log_mel_spectrogram called with log_offset=0.01 and
         otherwise default settings.
     """
     sample_rate = config["pre_process"]["sample_rate"]
     return log_mel_spectrogram(
         filtered_signal,
         audio_sample_rate=sample_rate,
         log_offset=0.01)
Ejemplo n.º 10
0
def waveform_to_examples(data, sample_rate):
  """Converts an audio waveform into framed log mel examples for VGGish.

  Args:
    data: Array of audio samples. If it has more than one dimension it is
      averaged over axis 1 to obtain a mono signal.
    sample_rate: Sample rate of data; the audio is resampled with resampy
      when it differs from vggish_params.SAMPLE_RATE.

  Returns:
    The result of mel_features.frame applied to the log mel spectrogram,
    using windows of vggish_params.EXAMPLE_WINDOW_SECONDS stepped by
    vggish_params.EXAMPLE_HOP_SECONDS.
  """
  # Collapse multi-channel audio to mono.
  mono = np.mean(data, axis=1) if len(data.shape) > 1 else data

  # Bring the audio to the rate the rest of the pipeline assumes.
  if sample_rate != vggish_params.SAMPLE_RATE:
    mono = resampy.resample(mono, sample_rate, vggish_params.SAMPLE_RATE)

  spectrogram = mel_features.log_mel_spectrogram(
      mono,
      audio_sample_rate=vggish_params.SAMPLE_RATE,
      log_offset=vggish_params.LOG_OFFSET,
      window_length_secs=vggish_params.STFT_WINDOW_LENGTH_SECONDS,
      hop_length_secs=vggish_params.STFT_HOP_LENGTH_SECONDS,
      num_mel_bins=vggish_params.NUM_MEL_BINS,
      lower_edge_hertz=vggish_params.MEL_MIN_HZ,
      upper_edge_hertz=vggish_params.MEL_MAX_HZ)

  # Express the example window and hop in spectrogram frames.
  frames_per_second = 1.0 / vggish_params.STFT_HOP_LENGTH_SECONDS
  window_frames = int(round(
      vggish_params.EXAMPLE_WINDOW_SECONDS * frames_per_second))
  hop_frames = int(round(
      vggish_params.EXAMPLE_HOP_SECONDS * frames_per_second))

  return mel_features.frame(
      spectrogram,
      window_length=window_frames,
      hop_length=hop_frames)
Ejemplo n.º 11
0
def wav_to_mel(filename, parser, model):
    """Load an audio file and return its log-mel spectrogram.

    All spectrogram settings are read from the 'mel' section of `parser`.
    Non-empty clips shorter than one second are extended to 1.01 seconds
    (at the file's native rate) with librosa.util.fix_length before
    resampling.

    Args:
        filename: Path of the audio file, loaded with librosa.load at its
            native sample rate and mixed down to mono.
        parser: Config parser exposing getint/getfloat; must provide the
            'mel' options SAMPLE_RATE, LOG_OFFSET,
            STFT_WINDOW_LENGTH_SECONDS, STFT_HOP_LENGTH_SECONDS, MEL_MIN_HZ,
            MEL_MAX_HZ, NUM_BANDS_TEACHER and NUM_BANDS_STUDENT.
        model: 'teacher' selects NUM_BANDS_TEACHER mel bins; any other value
            selects NUM_BANDS_STUDENT.

    Returns:
        The result of mel_features.log_mel_spectrogram for the (possibly
        extended and resampled) audio.
    """
    target_rate = parser.getint('mel', 'SAMPLE_RATE')
    log_offset = parser.getfloat('mel', 'LOG_OFFSET')
    window_length_secs = parser.getfloat('mel', 'STFT_WINDOW_LENGTH_SECONDS')
    hop_length_secs = parser.getfloat('mel', 'STFT_HOP_LENGTH_SECONDS')
    mel_min_hz = parser.getint('mel', 'MEL_MIN_HZ')
    mel_max_hz = parser.getint('mel', 'MEL_MAX_HZ')

    # Teacher and student models use a different number of mel bins.
    if model == 'teacher':
        bands_option = 'NUM_BANDS_TEACHER'
    else:
        bands_option = 'NUM_BANDS_STUDENT'
    num_mel_bins = parser.getint('mel', bands_option)

    # mono=True makes librosa return a 1-D array, so no transposition or
    # channel averaging is needed afterwards.
    data, native_rate = librosa.load(filename, mono=True, sr=None)

    # Extend non-empty clips shorter than one second to 1.01 seconds.
    # `size` is passed by keyword because it is keyword-only in
    # librosa >= 0.10; the keyword form works on older releases as well.
    if 0 < data.shape[0] < native_rate:
        data = librosa.util.fix_length(data, size=int(native_rate * 1.01))

    # Resample to the configured rate.
    if native_rate != target_rate:
        data = resampy.resample(data, native_rate, target_rate)

    return mel_features.log_mel_spectrogram(
        data,
        audio_sample_rate=target_rate,
        log_offset=log_offset,
        window_length_secs=window_length_secs,
        hop_length_secs=hop_length_secs,
        num_mel_bins=num_mel_bins,
        lower_edge_hertz=mel_min_hz,
        upper_edge_hertz=mel_max_hz)
Ejemplo n.º 12
0
def extract_alt_logmel(path_file,
                       frame_size=0.025,
                       frame_stride=0.010,
                       normalize=True):
    """Extract log-mel features from a WAV file via log_mel_spectrogram.

    NOTE(review): the original note says the underlying extraction code
    comes from the Google AudioSet (VGGish) repository and uses a Hann
    window rather than a Hamming window; that cannot be confirmed from
    this function alone.

    Args:
        path_file: Source handed to wavfile.read.
        frame_size: Analysis window length in seconds.
        frame_stride: Hop between windows in seconds.
        normalize: When true, subtract the per-column mean (taken over
            axis 0) plus 1e-8 from the features.

    Returns:
        The log-mel feature array, mean-subtracted when `normalize` is set.
    """
    rate, samples = wavfile.read(path_file)
    features = log_mel_spectrogram(samples,
                                   audio_sample_rate=rate,
                                   log_offset=0.0,
                                   window_length_secs=frame_size,
                                   hop_length_secs=frame_stride)
    if not normalize:
        return features

    # Remove the mean over axis 0 from every column, in place.
    column_offsets = np.mean(features, axis=0) + 1e-8
    features -= column_offsets
    return features
Ejemplo n.º 13
0
def wavedata_to_log_melspectrogram(wav_data, sample_rate):
    """Compute the log mel spectrogram of raw int16 audio samples.

    Args:
        wav_data: int16 array of audio samples. If it has more than one
            dimension it is averaged over axis 1 to obtain a mono signal.
        sample_rate: Sample rate of `wav_data`; the audio is resampled with
            resampy when it differs from vggish_params.SAMPLE_RATE.

    Returns:
        The result of mel_features.log_mel_spectrogram (not framed into
        examples).

    Raises:
        AssertionError: If `wav_data` is not of dtype int16.
    """
    assert wav_data.dtype == np.int16, 'Bad sample type: %r' % wav_data.dtype

    # Scale int16 samples into the [-1.0, +1.0] range.
    samples = wav_data / 32768.0

    # Collapse multi-channel audio to mono.
    if samples.ndim > 1:
        samples = np.mean(samples, axis=1)

    # Bring the audio to the rate the rest of the pipeline assumes.
    target_rate = vggish_params.SAMPLE_RATE
    if sample_rate != target_rate:
        samples = resampy.resample(samples, sample_rate, vggish_params.SAMPLE_RATE)

    spectrogram_options = dict(
        audio_sample_rate=vggish_params.SAMPLE_RATE,
        log_offset=vggish_params.LOG_OFFSET,
        window_length_secs=vggish_params.STFT_WINDOW_LENGTH_SECONDS,
        hop_length_secs=vggish_params.STFT_HOP_LENGTH_SECONDS,
        num_mel_bins=vggish_params.NUM_MEL_BINS,
        lower_edge_hertz=vggish_params.MEL_MIN_HZ,
        upper_edge_hertz=vggish_params.MEL_MAX_HZ,
    )
    return mel_features.log_mel_spectrogram(samples, **spectrogram_options)
def waveform_to_examples(data, sample_rate):
    """Converts an audio waveform into a single resized log mel example.

    Instead of cutting the spectrogram into windows, the whole log mel
    spectrogram is resized to (96, 64) with torchvision's Resize transform
    and returned as a batch of one.

    Args:
      data: Array of audio samples. If it has more than one dimension it is
        averaged over axis 1 to obtain a mono signal.
      sample_rate: Sample rate of data; the audio is resampled with resampy
        when it differs from vggish_params.SAMPLE_RATE.

    Returns:
      np.array holding exactly one element along its first axis: the
      resized spectrogram.
    """
    # Collapse multi-channel audio to mono.
    mono = np.mean(data, axis=1) if len(data.shape) > 1 else data

    # Bring the audio to the rate the rest of the pipeline assumes.
    if sample_rate != vggish_params.SAMPLE_RATE:
        mono = resampy.resample(mono, sample_rate, vggish_params.SAMPLE_RATE)

    spectrogram = mel_features.log_mel_spectrogram(
        mono,
        audio_sample_rate=vggish_params.SAMPLE_RATE,
        log_offset=vggish_params.LOG_OFFSET,
        window_length_secs=vggish_params.STFT_WINDOW_LENGTH_SECONDS,
        hop_length_secs=vggish_params.STFT_HOP_LENGTH_SECONDS,
        num_mel_bins=vggish_params.NUM_MEL_BINS,
        lower_edge_hertz=vggish_params.MEL_MIN_HZ,
        upper_edge_hertz=vggish_params.MEL_MAX_HZ)

    # Resize via a PIL image round-trip so the output has a fixed size.
    resize = torchvision.transforms.Resize((96, 64))
    resized_image = resize(Image.fromarray(spectrogram))
    resized = np.asarray(resized_image)

    return np.array([resized])
Ejemplo n.º 15
0
def wavfile_to_examples(wav_file):
    """Read a 16-bit WAV file and convert it into zero-padded log-mel examples.

    The log-mel spectrogram is zero-padded along its first axis up to the
    next multiple of params.NUM_FRAMES before being framed.

    Args:
        wav_file: Whatever wavfile.read accepts as its source argument.

    Returns:
        The integer 0 when the file contains no samples; otherwise the
        result of mel_features.frame applied to the padded log-mel
        spectrogram, using windows of params.EXAMPLE_WINDOW_SECONDS stepped
        by params.EXAMPLE_HOP_SECONDS.

    Raises:
        AssertionError: If the decoded samples are not int16.
    """
    source_rate, raw_samples = wavfile.read(wav_file)
    assert raw_samples.dtype == np.int16, 'Bad sample type: %r' % raw_samples.dtype

    # Scale int16 samples into the [-1.0, +1.0] range.
    samples = raw_samples / 32768.0

    # Collapse multi-channel audio to mono.
    if samples.ndim > 1:
        samples = np.mean(samples, axis=1)

    # Nothing to process: signal that with a plain 0.
    if not len(samples):
        return 0

    if source_rate != params.SAMPLE_RATE:
        samples = resampy.resample(samples, source_rate, params.SAMPLE_RATE)

    # Log-mel spectrogram (log FBANK) of the whole clip.
    spectrogram = mel_features.log_mel_spectrogram(
        samples,
        audio_sample_rate=params.SAMPLE_RATE,
        log_offset=params.LOG_OFFSET,
        window_length_secs=params.STFT_WINDOW_LENGTH_SECONDS,
        hop_length_secs=params.STFT_HOP_LENGTH_SECONDS,
        num_mel_bins=params.NUM_MEL_BINS,  # number of mel bins is fixed by params
        lower_edge_hertz=params.MEL_MIN_HZ,
        upper_edge_hertz=params.MEL_MAX_HZ)

    # Express the example window and hop in spectrogram frames.
    frames_per_second = 1.0 / params.STFT_HOP_LENGTH_SECONDS
    window_frames = int(round(params.EXAMPLE_WINDOW_SECONDS * frames_per_second))
    hop_frames = int(round(params.EXAMPLE_HOP_SECONDS * frames_per_second))

    # Zero-pad so the frame count is a whole multiple of params.NUM_FRAMES.
    frame_count, bin_count = spectrogram.shape[0], spectrogram.shape[1]
    if frame_count % params.NUM_FRAMES:
        padded_count = int(
            np.ceil(1.0 * frame_count / params.NUM_FRAMES) * params.NUM_FRAMES)
        padded = np.zeros((padded_count, bin_count))
        padded[:frame_count, :bin_count] = spectrogram
        spectrogram = padded

    return mel_features.frame(
        spectrogram,
        window_length=window_frames,
        hop_length=hop_frames)
Ejemplo n.º 16
0
def waveform_to_examples(data, sample_rate):
    """Converts an audio waveform into overlapping-chunk examples for VGGish.

    After mono conversion and resampling, the waveform is rebuilt as the
    concatenation of chunks of up to vggish_params.SAMPLE_RATE samples whose
    start positions are int(0.5 * vggish_params.SAMPLE_RATE) samples apart,
    and zeros are appended at the end. The log mel spectrogram of that
    rebuilt signal is then framed into examples.

    Args:
      data: Array of audio samples. If it has more than one dimension it is
        averaged over axis 1 to obtain a mono signal.
      sample_rate: Sample rate of data; the audio is resampled with resampy
        when it differs from vggish_params.SAMPLE_RATE.

    Returns:
      A 3-tuple (original_length, padded_length, examples):
        - original_length: number of samples after mono conversion and
          resampling, before the signal is rebuilt.
        - padded_length: number of samples in the rebuilt, zero-padded
          signal.
        - examples: result of mel_features.frame applied to the log mel
          spectrogram of the rebuilt signal.
    """
    # Collapse multi-channel audio to mono.
    if len(data.shape) > 1:
        data = np.mean(data, axis=1)
    # Bring the audio to the rate the rest of the pipeline assumes.
    if sample_rate != vggish_params.SAMPLE_RATE:
        data = resampy.resample(data, sample_rate, vggish_params.SAMPLE_RATE)

    olength = len(data)
    chunk_size = vggish_params.SAMPLE_RATE
    chunk_stride = int(0.5 * vggish_params.SAMPLE_RATE)

    # Chunks start every `chunk_stride` samples; slicing truncates the last
    # ones at the end of the signal.
    chunks = [
        data[start:start + chunk_size]
        for start in range(0, olength, chunk_stride)
    ]
    # An empty input produces no chunks; fall back to an empty array.
    stitched = np.concatenate(chunks) if chunks else np.asarray([])

    pad_length = chunk_size - (len(stitched) % chunk_stride)
    data = np.pad(stitched, (0, pad_length), 'constant')

    spectrogram = mel_features.log_mel_spectrogram(
        data,
        audio_sample_rate=vggish_params.SAMPLE_RATE,
        log_offset=vggish_params.LOG_OFFSET,
        window_length_secs=vggish_params.STFT_WINDOW_LENGTH_SECONDS,
        hop_length_secs=vggish_params.STFT_HOP_LENGTH_SECONDS,
        num_mel_bins=vggish_params.NUM_MEL_BINS,
        lower_edge_hertz=vggish_params.MEL_MIN_HZ,
        upper_edge_hertz=vggish_params.MEL_MAX_HZ)

    # Express the example window and hop in spectrogram frames.
    frames_per_second = 1.0 / vggish_params.STFT_HOP_LENGTH_SECONDS
    window_frames = int(
        round(vggish_params.EXAMPLE_WINDOW_SECONDS * frames_per_second))
    hop_frames = int(
        round(vggish_params.EXAMPLE_HOP_SECONDS * frames_per_second))
    examples = mel_features.frame(spectrogram,
                                  window_length=window_frames,
                                  hop_length=hop_frames)
    return olength, len(data), examples
Ejemplo n.º 17
0
def waveform_to_examples(data, sample_rate, file_path):
    """Converts an audio waveform into VGGish examples plus a metadata dict.

    Args:
      data: Array of audio samples. If it has more than one dimension it is
        averaged over axis 1 to obtain a mono signal.
      sample_rate: Sample rate of data; the audio is resampled with resampy
        when it differs from vggish_params.SAMPLE_RATE.
      file_path: Path of the source file; only its base name is recorded in
        the metadata dict.

    Returns:
      A 2-tuple (metadata, examples):
        - metadata: dict with the file's base name, the spectrogram
          settings that were used (sample rate, log offset, window and hop
          lengths, number of mel bins, lower edge frequency) and the
          unframed log mel spectrogram under "log_mel".
        - examples: result of mel_features.frame applied to the log mel
          spectrogram, using windows of
          vggish_params.EXAMPLE_WINDOW_SECONDS stepped by
          vggish_params.EXAMPLE_HOP_SECONDS.
    """
    # Collapse multi-channel audio to mono.
    mono = np.mean(data, axis=1) if len(data.shape) > 1 else data

    # Bring the audio to the rate the rest of the pipeline assumes.
    if sample_rate != vggish_params.SAMPLE_RATE:
        mono = resampy.resample(mono, sample_rate, vggish_params.SAMPLE_RATE)

    # Read the spectrogram settings once so the same values feed both the
    # computation and the metadata.
    audio_sample_rate = vggish_params.SAMPLE_RATE
    log_offset = vggish_params.LOG_OFFSET
    window_length_secs = vggish_params.STFT_WINDOW_LENGTH_SECONDS
    hop_length_secs = vggish_params.STFT_HOP_LENGTH_SECONDS
    num_mel_bins = vggish_params.NUM_MEL_BINS
    lower_edge_hertz = vggish_params.MEL_MIN_HZ
    upper_edge_hertz = vggish_params.MEL_MAX_HZ

    log_mel = mel_features.log_mel_spectrogram(
        mono,
        audio_sample_rate=audio_sample_rate,
        log_offset=log_offset,
        window_length_secs=window_length_secs,
        hop_length_secs=hop_length_secs,
        num_mel_bins=num_mel_bins,
        lower_edge_hertz=lower_edge_hertz,
        upper_edge_hertz=upper_edge_hertz)

    # Express the example window and hop in spectrogram frames.
    frames_per_second = 1.0 / vggish_params.STFT_HOP_LENGTH_SECONDS
    window_frames = int(
        round(vggish_params.EXAMPLE_WINDOW_SECONDS * frames_per_second))
    hop_frames = int(
        round(vggish_params.EXAMPLE_HOP_SECONDS * frames_per_second))
    examples = mel_features.frame(log_mel,
                                  window_length=window_frames,
                                  hop_length=hop_frames)

    metadata = {
        "file_name": os.path.basename(file_path),
        "audio_sample_rate": audio_sample_rate,
        "log_offset": log_offset,
        "window_length_secs": window_length_secs,
        "hop_length_secs": hop_length_secs,
        "num_mel_bins": num_mel_bins,
        "lower_edge_hertz": lower_edge_hertz,
        "log_mel": log_mel,
    }

    return metadata, examples