Example #1
def audiofile_to_features(wav_filename):
    samples = tf.io.read_file(wav_filename)
    decoded = contrib_audio.decode_wav(samples, desired_channels=1)
    features, features_len = samples_to_mfccs(decoded.audio,
                                              decoded.sample_rate)

    return features, features_len
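Because the function is built entirely from graph ops, it can be mapped over a dataset of file paths. A minimal usage sketch, assuming `audiofile_to_features` and its `samples_to_mfccs` helper are in scope and the listed WAV paths exist:

import tensorflow as tf

# Hypothetical usage: build a tf.data pipeline of (features, features_len) pairs.
wav_paths = tf.data.Dataset.from_tensor_slices(['clip_0.wav', 'clip_1.wav'])  # assumed paths
feature_ds = wav_paths.map(audiofile_to_features)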
Example #2
def audiofile_to_features(wav_filename, train_phase=False):
    samples = tf.io.read_file(wav_filename)
    decoded = contrib_audio.decode_wav(samples, desired_channels=1)
    return audio_to_features(decoded.audio,
                             decoded.sample_rate,
                             train_phase=train_phase,
                             sample_id=wav_filename)
Example #3
def audiofile_to_features():
    samples = tf.placeholder(tf.string, name='input')
    decoded = contrib_audio.decode_wav(samples, desired_channels=1)
    features, features_len = samples_to_mfccs(decoded.audio,
                                              decoded.sample_rate)

    return features, features_len, decoded.sample_rate
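This variant builds the decode graph around a string placeholder instead of a file path, so the WAV bytes are fed at run time. A hedged TF 1.x usage sketch, assuming `samples_to_mfccs` is defined and the WAV path below exists:

# Hypothetical usage: feed raw WAV bytes into the 'input' placeholder by name.
features, features_len, sample_rate = audiofile_to_features()
with tf.Session() as sess:
    wav_bytes = open('clip.wav', 'rb').read()  # assumed path
    f, n, r = sess.run([features, features_len, sample_rate],
                       feed_dict={'input:0': wav_bytes})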
Example #4
 def _test_dimensions(self, audio):
     pb = self.audio("k488", audio, 44100)
     self.assertEqual(1, len(pb.value))
     results = tensor_util.make_ndarray(pb.value[0].tensor)
     for i, (encoded, _) in enumerate(results):
         decoded, _ = audio_ops.decode_wav(encoded)
         self.assertEqual(audio[i].shape, decoded.shape)
Example #5
 def load_audio(self, audio_path):
     samples = tf.io.read_file(audio_path)
     decoded = contrib_audio.decode_wav(samples, desired_channels=1)
     audio = decoded.audio.eval(session=self.sess)
     rate = decoded.sample_rate.eval(session=self.sess)
     print('decoded.audio:', audio.shape)
     print('decoded.sample_rate:', rate)
     return audio
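The `.eval(session=...)` calls tie this helper to TensorFlow 1.x graph mode. A hedged sketch of the same decode step in eager (TF 2.x) style, using the public `tf.audio.decode_wav` op rather than `contrib_audio` (file path assumed):

import tensorflow as tf

samples = tf.io.read_file('example.wav')  # assumed path
audio, sample_rate = tf.audio.decode_wav(samples, desired_channels=1)
print('decoded.audio:', audio.shape)
print('decoded.sample_rate:', int(sample_rate))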
Example #6
def audiofile_to_features(wav_filename, clock=0.0, train_phase=False, augmentations=None):
    samples = tf.io.read_file(wav_filename)
    decoded = contrib_audio.decode_wav(samples, desired_channels=1)
    return audio_to_features(decoded.audio,
                             decoded.sample_rate,
                             clock=clock,
                             train_phase=train_phase,
                             augmentations=augmentations,
                             sample_id=wav_filename)
Example #7
def get_deepspeech_mfccs(samples, sample_rate=16000):
    decoded = contrib_audio.decode_wav(samples, desired_channels=1)
    spectrogram = contrib_audio.audio_spectrogram(decoded.audio,
                                                  window_size=512,
                                                  stride=320,
                                                  magnitude_squared=True)
    return contrib_audio.mfcc(spectrogram=spectrogram,
                              sample_rate=decoded.sample_rate,
                              dct_coefficient_count=26,
                              upper_frequency_limit=sample_rate / 2)
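A hedged usage sketch for `get_deepspeech_mfccs`; note that the argument is the raw WAV file contents, not a path (the path below is assumed):

# Hypothetical call: decode a 16 kHz WAV and compute 26 MFCC coefficients per frame.
wav_bytes = tf.io.read_file('speech_16k.wav')  # assumed path
mfccs = get_deepspeech_mfccs(wav_bytes, sample_rate=16000)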
Example #8
def DecodeWav(input_bytes):
    """Decode a wav file from its contents.

  Args:
    input_bytes: a byte array or Tensor with the wav file contents.

  Returns:
    A pair of Tensor for sample rate, decoded samples.
  """
    result = audio_ops.decode_wav(input_bytes)
    return result.sample_rate, result.audio
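A hedged usage sketch, reading the file contents with `tf.io.gfile` (path assumed):

# Hypothetical call: pass the raw bytes of a WAV file.
with tf.io.gfile.GFile('utterance.wav', 'rb') as f:  # assumed path
    sample_rate, samples = DecodeWav(f.read())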
Example #9
 def test_wav_format_roundtrip(self):
     audio = self._generate_audio(c=1)
     pb = self.audio("k488", audio, 44100)
     encoded = tensor_util.make_ndarray(pb.value[0].tensor)
     decoded, sample_rate = audio_ops.decode_wav(encoded.flat[0])
     # WAV roundtrip goes from float32 to int16 and back, so expect some
     # precision loss, but not more than 2 applications of rounding error from
     # mapping the range [-1.0, 1.0] to 2^16.
     epsilon = 2 * 2.0 / (2**16)
     self.assertAllClose(audio[0], decoded, atol=epsilon)
     self.assertEqual(44100, sample_rate.numpy())
Example #10
def audiofile_to_features(wav_filename, train_phase=False):
    samples = tf.io.read_file(wav_filename)
    decoded = contrib_audio.decode_wav(samples, desired_channels=1)
    features, features_len = samples_to_mfccs(decoded.audio, decoded.sample_rate, train_phase=train_phase)

    if train_phase:
        if FLAGS.data_aug_features_multiplicative > 0:
            features = features*tf.random.normal(mean=1, stddev=FLAGS.data_aug_features_multiplicative, shape=tf.shape(features))

        if FLAGS.data_aug_features_additive > 0:
            features = features+tf.random.normal(mean=0.0, stddev=FLAGS.data_aug_features_additive, shape=tf.shape(features))

    return features, features_len
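The `FLAGS` values are not defined in this snippet; a minimal sketch of how they could be declared with `absl.flags` (flag names taken from the code above, default values assumed):

from absl import flags

# Hypothetical flag definitions matching the names used above.
flags.DEFINE_float('data_aug_features_multiplicative', 0.0,
                   'Stddev of multiplicative noise applied to features during training.')
flags.DEFINE_float('data_aug_features_additive', 0.0,
                   'Stddev of additive noise applied to features during training.')
FLAGS = flags.FLAGS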
Example #11
    def get_unprocessed_data(self, how_many, model_settings, mode):
        """Retrieve sample data for the given partition, with no transformations.

    Args:
      how_many: Desired number of samples to return. -1 means the entire
        contents of this partition.
      model_settings: Information about the current model being trained.
      mode: Which partition to use, must be 'training', 'validation', or
        'testing'.

    Returns:
      List of sample data for the samples, and list of labels in one-hot form.
    """
        candidates = self.data_index[mode]
        if how_many == -1:
            sample_count = len(candidates)
        else:
            sample_count = how_many
        desired_samples = model_settings['desired_samples']
        words_list = self.words_list
        data = np.zeros((sample_count, desired_samples))
        labels = []
        with tf.Session(graph=tf.Graph()) as sess:
            wav_filename_placeholder = tf.placeholder(tf.string, [])
            wav_loader = io_ops.read_file(wav_filename_placeholder)
            wav_decoder = audio_ops.decode_wav(wav_loader,
                                               desired_channels=1,
                                               desired_samples=desired_samples)
            foreground_volume_placeholder = tf.placeholder(tf.float32, [])
            scaled_foreground = tf.multiply(wav_decoder.audio,
                                            foreground_volume_placeholder)
            for i in range(sample_count):
                if how_many == -1:
                    sample_index = i
                else:
                    sample_index = np.random.randint(len(candidates))
                sample = candidates[sample_index]
                input_dict = {wav_filename_placeholder: sample['file']}
                if sample['label'] == SILENCE_LABEL:
                    input_dict[foreground_volume_placeholder] = 0
                else:
                    input_dict[foreground_volume_placeholder] = 1
                data[i, :] = sess.run(scaled_foreground,
                                      feed_dict=input_dict).flatten()
                label_index = self.word_to_index[sample['label']]
                labels.append(words_list[label_index])
        return data, labels
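A hedged usage sketch, assuming an instance of the surrounding audio-processor class (called `processor` here) and a `model_settings` dict as used in the TensorFlow speech_commands example:

# Hypothetical call: fetch every validation sample with no augmentation applied.
data, labels = processor.get_unprocessed_data(-1, model_settings, 'validation')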
Example #12
def load_wav_file(filename):
    """Loads an audio file and returns a float PCM-encoded array of samples.

  Args:
    filename: Path to the .wav file to load.

  Returns:
    Numpy array holding the sample data as floats between -1.0 and 1.0.
  """
    with tf.Session(graph=tf.Graph()) as sess:
        wav_filename_placeholder = tf.placeholder(tf.string, [])
        wav_loader = io_ops.read_file(wav_filename_placeholder)
        wav_decoder = audio_ops.decode_wav(wav_loader, desired_channels=1)
        return sess.run(wav_decoder,
                        feed_dict={
                            wav_filename_placeholder: filename
                        }).audio.flatten()
Example #13
def load_wav_file(wav_filename, desired_samples):
    """
    Loads and decodes a given 16-bit PCM wav file to a float tensor.
    Args:
        wav_filename: 16-bit PCM wav file to load.
        desired_samples: number of samples to take from the audio file.

    Returns:
        Tuple consisting of the decoded audio and sample rate
    """
    wav_file = tf.io.read_file(wav_filename)  # binary file

    # Note: as of TF 2.5 this API has changed to tf.audio.decode_wav; this should be tested.
    decoded_wav = audio_ops.decode_wav(wav_file,
                                       desired_channels=1,
                                       desired_samples=desired_samples)

    return decoded_wav.audio, decoded_wav.sample_rate
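Following up on the note in the code, a hedged sketch of the equivalent call through the public TF 2.x API (untested, as the original comment warns):

# Hypothetical TF 2.x equivalent of the audio_ops.decode_wav call above.
decoded_wav = tf.audio.decode_wav(wav_file,
                                  desired_channels=1,
                                  desired_samples=desired_samples)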
Example #14
def load_wav_file(wav_filename, desired_samples):
    """Loads and then decodes a given 16bit PCM wav file.

    Decoded audio is scaled to the range [-1, 1] and padded or cropped to the desired number of samples.

    Args:
        wav_filename: 16bit PCM wav file to load.
        desired_samples: Number of samples wanted from the audio file.

    Returns:
        Tuple consisting of the decoded audio and sample rate.
    """
    wav_file = tf.io.read_file(wav_filename)
    decoded_wav = audio_ops.decode_wav(wav_file,
                                       desired_channels=1,
                                       desired_samples=desired_samples)

    return decoded_wav.audio, decoded_wav.sample_rate
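A hedged usage sketch, padding or cropping to one second of 16 kHz audio (file name assumed):

# Hypothetical call: returns a [16000, 1] float tensor in [-1, 1] plus the sample rate.
audio, sample_rate = load_wav_file('yes_0001.wav', desired_samples=16000)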
Example #15
    def prepare_background_data(self):
        """Searches a folder for background noise audio, and loads it into memory.

    It's expected that the background audio samples will be in a subdirectory
    named '_background_noise_' inside the 'data_dir' folder, as .wavs that match
    the sample rate of the training data, but can be much longer in duration.

    If the '_background_noise_' folder doesn't exist at all, this isn't an
    error, it's just taken to mean that no background noise augmentation should
    be used. If the folder does exist, but it's empty, that's treated as an
    error.

    Returns:
      List of raw PCM-encoded audio samples of background noise.

    Raises:
      Exception: If files aren't found in the folder.
    """

        # TODO: refactor this to use CSV files if background noise is used in the future.

        self.background_data = []
        background_dir = os.path.join(self.data_dir, BACKGROUND_NOISE_DIR_NAME)
        if not os.path.exists(background_dir):
            return self.background_data
        with tf.Session(graph=tf.Graph()) as sess:
            wav_filename_placeholder = tf.placeholder(tf.string, [])
            wav_loader = io_ops.read_file(wav_filename_placeholder)
            wav_decoder = audio_ops.decode_wav(wav_loader, desired_channels=1)
            search_path = os.path.join(self.data_dir,
                                       BACKGROUND_NOISE_DIR_NAME, '*.wav')
            for wav_path in gfile.Glob(search_path):
                wav_data = sess.run(wav_decoder,
                                    feed_dict={
                                        wav_filename_placeholder: wav_path
                                    }).audio.flatten()
                self.background_data.append(wav_data)
            if not self.background_data:
                raise Exception('No background wav files were found in ' +
                                search_path)
Example #16
if __name__ == "__main__":
    parser = argparse.ArgumentParser(description='Invert MFCCs to audio.')
    parser.add_argument('input_file', help='Path to .pkl / .wav input file')
    parser.add_argument('output_file', help='Path to .wav output file')
    parser.add_argument('--input_type',
                        default='mfccs',
                        help='Input type: logmel / mfccs')
    args = parser.parse_args()

    # Load from file
    ext = os.path.splitext(args.input_file)[-1]
    print("Reading from file...")
    if ext == '.wav':
        samples = tf.io.read_file(args.input_file)
        decoded = contrib_audio.decode_wav(samples, desired_channels=1)
        audio = decoded.audio
        if args.input_type == 'mfccs':
            inp = audio2mfccs(audio)
        elif args.input_type == 'logmel':
            inp = audio2logmel(audio)
        elif args.input_type == 'spectrograms':
            inp = audio2spectrograms(audio)
        else:
            raise ValueError("%s is not supported" % args.input_type)
    elif ext == '.pkl':
        audio = None
        with open(args.input_file, 'rb') as f:
            x_r = pkl.load(f)
        x_r = tf.squeeze(tf.constant(x_r), 0)
        inp = x_r
Example #17
def create_inference_graph(
        wanted_words, sample_rate, nchannels, clip_duration_ms, clip_stride_ms,
        representation, window_size_ms, window_stride_ms, nwindows,
        dct_coefficient_count, filterbank_channel_count, model_architecture,
        filter_counts, filter_sizes, final_filter_len, dropout_prob,
        batch_size, dilate_after_layer, stride_after_layer, connection_type,
        silence_percentage, unknown_percentage):
    """Creates an audio model with the nodes needed for inference.

  Uses the supplied arguments to create a model, and inserts the input and
  output nodes that are needed to use the graph for inference.

  Args:
    wanted_words: Comma-separated list of the words we're trying to recognize.
    sample_rate: How many samples per second are in the input audio files.
    clip_duration_ms: How many samples to analyze for the audio pattern.
    clip_stride_ms: How often to run recognition. Useful for models with cache.
    window_size_ms: Time slice duration to estimate frequencies from.
    window_stride_ms: How far apart time slices should be.
    dct_coefficient_count: Number of frequency bands to analyze.
    model_architecture: Name of the kind of model to generate.
  """

    words_list = input_data.prepare_words_list(wanted_words.split(','),
                                               silence_percentage,
                                               unknown_percentage)
    model_settings = models.prepare_model_settings(
        len(words_list), sample_rate, nchannels, clip_duration_ms,
        representation, window_size_ms, window_stride_ms, nwindows,
        dct_coefficient_count, filterbank_channel_count, filter_counts,
        filter_sizes, final_filter_len, dropout_prob, batch_size,
        dilate_after_layer, stride_after_layer, connection_type)

    runtime_settings = {'clip_stride_ms': clip_stride_ms}

    wav_data_placeholder = tf.placeholder(tf.string, [], name='wav_data')
    decoded_sample_data = audio_ops.decode_wav(
        wav_data_placeholder,
        desired_channels=nchannels,
        desired_samples=model_settings['desired_samples'],
        name='decoded_sample_data')
    spectrograms = []
    for ichannel in range(nchannels):
        spectrograms.append(
            audio_ops.audio_spectrogram(
                decoded_sample_data.audio,
                window_size=model_settings['window_size_samples'],
                stride=model_settings['window_stride_samples'],
                magnitude_squared=True))
    spectrogram = tf.stack(spectrograms, -1)
    mfccs = []
    for ichannel in range(nchannels):
        mfccs.append(
            audio_ops.mfcc(spectrograms[ichannel],
                           decoded_sample_data.sample_rate,
                           upper_frequency_limit=sample_rate // 2,
                           filterbank_channel_count=filterbank_channel_count,
                           dct_coefficient_count=dct_coefficient_count))
    mfcc = tf.stack(mfccs, -1)

    if representation == 'waveform':
        fingerprint_input = decoded_sample_data.audio
    elif representation == 'spectrogram':
        fingerprint_input = spectrogram
    elif representation == 'mel-cepstrum':
        fingerprint_input = mfcc

    reshaped_input = tf.reshape(fingerprint_input,
                                [-1, model_settings['fingerprint_size']])

    hidden_layers, final = models.create_model(
        reshaped_input,
        model_settings,
        model_architecture,
        is_training=False,
        runtime_settings=runtime_settings)

    # Create an output to use for inference.
    for i in range(len(hidden_layers)):
        tf.identity(hidden_layers[i], name='hidden_layer' + str(i))
    tf.nn.softmax(final, name='output_layer')
Example #18
 def fn2audio(fn):
     samples = tf.io.read_file(fn)
     decoded = contrib_audio.decode_wav(samples, desired_channels=1)
     return decoded.audio