Example #1
def audiofile_to_features(wav_filename):
    samples = tf.io.read_file(wav_filename)
    decoded = contrib_audio.decode_wav(samples, desired_channels=1)
    features, features_len = samples_to_mfccs(decoded.audio,
                                              decoded.sample_rate)

    return features, features_len
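Because the function is built entirely from graph ops, it can be mapped over a dataset of file paths. A minimal usage sketch, assuming `audiofile_to_features` and its `samples_to_mfccs` helper are in scope and the listed WAV paths exist:

import tensorflow as tf

# Hypothetical usage: build a tf.data pipeline of (features, features_len) pairs.
wav_paths = tf.data.Dataset.from_tensor_slices(['clip_0.wav', 'clip_1.wav'])  # assumed paths
feature_ds = wav_paths.map(audiofile_to_features)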
Example #2
def audiofile_to_features(wav_filename, train_phase=False):
    samples = tf.io.read_file(wav_filename)
    decoded = contrib_audio.decode_wav(samples, desired_channels=1)
    return audio_to_features(decoded.audio,
                             decoded.sample_rate,
                             train_phase=train_phase,
                             sample_id=wav_filename)
Example #3
def audiofile_to_features():
    samples = tf.placeholder(tf.string, name='input')
    decoded = contrib_audio.decode_wav(samples, desired_channels=1)
    features, features_len = samples_to_mfccs(decoded.audio,
                                              decoded.sample_rate)

    return features, features_len, decoded.sample_rate
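This variant builds the decode graph around a string placeholder instead of a file path, so the WAV bytes are fed at run time. A hedged TF 1.x usage sketch, assuming `samples_to_mfccs` is defined and the WAV path below exists:

# Hypothetical usage: feed raw WAV bytes into the 'input' placeholder by name.
features, features_len, sample_rate = audiofile_to_features()
with tf.Session() as sess:
    wav_bytes = open('clip.wav', 'rb').read()  # assumed path
    f, n, r = sess.run([features, features_len, sample_rate],
                       feed_dict={'input:0': wav_bytes})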
Example #4
 def _test_dimensions(self, audio):
     pb = self.audio("k488", audio, 44100)
     self.assertEqual(1, len(pb.value))
     results = tensor_util.make_ndarray(pb.value[0].tensor)
     for i, (encoded, _) in enumerate(results):
         decoded, _ = audio_ops.decode_wav(encoded)
         self.assertEqual(audio[i].shape, decoded.shape)
Example #5
 def load_audio(self, audio_path):
     samples = tf.io.read_file(audio_path)
     decoded = contrib_audio.decode_wav(samples, desired_channels=1)
     audio = decoded.audio.eval(session=self.sess)
     rate = decoded.sample_rate.eval(session=self.sess)
     print('decoded.audio:', audio.shape)
     print('decoded.sample_rate:', rate)
     return audio
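The `.eval(session=...)` calls tie this helper to TensorFlow 1.x graph mode. A hedged sketch of the same decode step in eager (TF 2.x) style, using the public `tf.audio.decode_wav` op rather than `contrib_audio` (file path assumed):

import tensorflow as tf

samples = tf.io.read_file('example.wav')  # assumed path
audio, sample_rate = tf.audio.decode_wav(samples, desired_channels=1)
print('decoded.audio:', audio.shape)
print('decoded.sample_rate:', int(sample_rate))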
Example #6
def audiofile_to_features(wav_filename, clock=0.0, train_phase=False, augmentations=None):
    samples = tf.io.read_file(wav_filename)
    decoded = contrib_audio.decode_wav(samples, desired_channels=1)
    return audio_to_features(decoded.audio,
                             decoded.sample_rate,
                             clock=clock,
                             train_phase=train_phase,
                             augmentations=augmentations,
                             sample_id=wav_filename)
Example #7
def get_deepspeech_mfccs(samples, sample_rate=16000):
    decoded = contrib_audio.decode_wav(samples, desired_channels=1)
    spectrogram = contrib_audio.audio_spectrogram(decoded.audio,
                                                  window_size=512,
                                                  stride=320,
                                                  magnitude_squared=True)
    return contrib_audio.mfcc(spectrogram=spectrogram,
                              sample_rate=decoded.sample_rate,
                              dct_coefficient_count=26,
                              upper_frequency_limit=sample_rate / 2)
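A hedged usage sketch for `get_deepspeech_mfccs`; note that the argument is the raw WAV file contents, not a path (the path below is assumed):

# Hypothetical call: decode a 16 kHz WAV and compute 26 MFCC coefficients per frame.
wav_bytes = tf.io.read_file('speech_16k.wav')  # assumed path
mfccs = get_deepspeech_mfccs(wav_bytes, sample_rate=16000)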
Example #8
def DecodeWav(input_bytes):
    """Decode a wav file from its contents.

  Args:
    input_bytes: a byte array or Tensor with the wav file contents.

  Returns:
    A pair of Tensor for sample rate, decoded samples.
  """
    result = audio_ops.decode_wav(input_bytes)
    return result.sample_rate, result.audio
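A hedged usage sketch, reading the file contents with `tf.io.gfile` (path assumed):

# Hypothetical call: pass the raw bytes of a WAV file.
with tf.io.gfile.GFile('utterance.wav', 'rb') as f:  # assumed path
    sample_rate, samples = DecodeWav(f.read())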
Example #9
 def test_wav_format_roundtrip(self):
     audio = self._generate_audio(c=1)
     pb = self.audio("k488", audio, 44100)
     encoded = tensor_util.make_ndarray(pb.value[0].tensor)
     decoded, sample_rate = audio_ops.decode_wav(encoded.flat[0])
     # WAV roundtrip goes from float32 to int16 and back, so expect some
     # precision loss, but not more than 2 applications of rounding error from
     # mapping the range [-1.0, 1.0] to 2^16.
     epsilon = 2 * 2.0 / (2**16)
     self.assertAllClose(audio[0], decoded, atol=epsilon)
     self.assertEqual(44100, sample_rate.numpy())
Example #10
def audiofile_to_features(wav_filename, train_phase=False):
    samples = tf.io.read_file(wav_filename)
    decoded = contrib_audio.decode_wav(samples, desired_channels=1)
    features, features_len = samples_to_mfccs(decoded.audio, decoded.sample_rate, train_phase=train_phase)

    if train_phase:
        if FLAGS.data_aug_features_multiplicative > 0:
            features = features*tf.random.normal(mean=1, stddev=FLAGS.data_aug_features_multiplicative, shape=tf.shape(features))

        if FLAGS.data_aug_features_additive > 0:
            features = features+tf.random.normal(mean=0.0, stddev=FLAGS.data_aug_features_additive, shape=tf.shape(features))

    return features, features_len
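The `FLAGS` values are not defined in this snippet; a minimal sketch of how they could be declared with `absl.flags` (flag names taken from the code above, default values assumed):

from absl import flags

# Hypothetical flag definitions matching the names used above.
flags.DEFINE_float('data_aug_features_multiplicative', 0.0,
                   'Stddev of multiplicative noise applied to features during training.')
flags.DEFINE_float('data_aug_features_additive', 0.0,
                   'Stddev of additive noise applied to features during training.')
FLAGS = flags.FLAGS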
Example #11
    def get_unprocessed_data(self, how_many, model_settings, mode):
        """Retrieve sample data for the given partition, with no transformations.

    Args:
      how_many: Desired number of samples to return. -1 means the entire
        contents of this partition.
      model_settings: Information about the current model being trained.
      mode: Which partition to use, must be 'training', 'validation', or
        'testing'.

    Returns:
      List of sample data for the samples, and list of labels in one-hot form.
    """
        candidates = self.data_index[mode]
        if how_many == -1:
            sample_count = len(candidates)
        else:
            sample_count = how_many
        desired_samples = model_settings['desired_samples']
        words_list = self.words_list
        data = np.zeros((sample_count, desired_samples))
        labels = []
        with tf.Session(graph=tf.Graph()) as sess:
            wav_filename_placeholder = tf.placeholder(tf.string, [])
            wav_loader = io_ops.read_file(wav_filename_placeholder)
            wav_decoder = audio_ops.decode_wav(wav_loader,
                                               desired_channels=1,
                                               desired_samples=desired_samples)
            foreground_volume_placeholder = tf.placeholder(tf.float32, [])
            scaled_foreground = tf.multiply(wav_decoder.audio,
                                            foreground_volume_placeholder)
            for i in range(sample_count):
                if how_many == -1:
                    sample_index = i
                else:
                    sample_index = np.random.randint(len(candidates))
                sample = candidates[sample_index]
                input_dict = {wav_filename_placeholder: sample['file']}
                if sample['label'] == SILENCE_LABEL:
                    input_dict[foreground_volume_placeholder] = 0
                else:
                    input_dict[foreground_volume_placeholder] = 1
                data[i, :] = sess.run(scaled_foreground,
                                      feed_dict=input_dict).flatten()
                label_index = self.word_to_index[sample['label']]
                labels.append(words_list[label_index])
        return data, labels
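A hedged usage sketch, assuming an instance of the surrounding audio-processor class (called `processor` here) and a `model_settings` dict as used in the TensorFlow speech_commands example:

# Hypothetical call: fetch every validation sample with no augmentation applied.
data, labels = processor.get_unprocessed_data(-1, model_settings, 'validation')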
Example #12
def load_wav_file(filename):
    """Loads an audio file and returns a float PCM-encoded array of samples.

  Args:
    filename: Path to the .wav file to load.

  Returns:
    Numpy array holding the sample data as floats between -1.0 and 1.0.
  """
    with tf.Session(graph=tf.Graph()) as sess:
        wav_filename_placeholder = tf.placeholder(tf.string, [])
        wav_loader = io_ops.read_file(wav_filename_placeholder)
        wav_decoder = audio_ops.decode_wav(wav_loader, desired_channels=1)
        return sess.run(wav_decoder,
                        feed_dict={
                            wav_filename_placeholder: filename
                        }).audio.flatten()
Example #13
def load_wav_file(wav_filename, desired_samples):
    """
    Loads and decodes a given 16-bit PCM wav file to a float tensor.
    Args:
        wav_filename: 16-bit PCM wav file to load.
        desired_samples: number of samples to take from the audio file.

    Returns:
        Tuple consisting of the decoded audio and sample rate
    """
    wav_file = tf.io.read_file(wav_filename)  # binary file

    # Note: as of TF 2.5 this API has changed to tf.audio.decode_wav; this should be tested.
    decoded_wav = audio_ops.decode_wav(wav_file,
                                       desired_channels=1,
                                       desired_samples=desired_samples)

    return decoded_wav.audio, decoded_wav.sample_rate
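Following up on the note in the code, a hedged sketch of the equivalent call through the public TF 2.x API (untested, as the original comment warns):

# Hypothetical TF 2.x equivalent of the audio_ops.decode_wav call above.
decoded_wav = tf.audio.decode_wav(wav_file,
                                  desired_channels=1,
                                  desired_samples=desired_samples)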
Example #14
def load_wav_file(wav_filename, desired_samples):
    """Loads and then decodes a given 16bit PCM wav file.

    Decoded audio is scaled to the range [-1, 1] and padded or cropped to the desired number of samples.

    Args:
        wav_filename: 16bit PCM wav file to load.
        desired_samples: Number of samples wanted from the audio file.

    Returns:
        Tuple consisting of the decoded audio and sample rate.
    """
    wav_file = tf.io.read_file(wav_filename)
    decoded_wav = audio_ops.decode_wav(wav_file,
                                       desired_channels=1,
                                       desired_samples=desired_samples)

    return decoded_wav.audio, decoded_wav.sample_rate
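A hedged usage sketch, padding or cropping to one second of 16 kHz audio (file name assumed):

# Hypothetical call: returns a [16000, 1] float tensor in [-1, 1] plus the sample rate.
audio, sample_rate = load_wav_file('yes_0001.wav', desired_samples=16000)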
Example #15
    def prepare_background_data(self):
        """Searches a folder for background noise audio, and loads it into memory.

    It's expected that the background audio samples will be in a subdirectory
    named '_background_noise_' inside the 'data_dir' folder, as .wavs that match
    the sample rate of the training data, but can be much longer in duration.

    If the '_background_noise_' folder doesn't exist at all, this isn't an
    error, it's just taken to mean that no background noise augmentation should
    be used. If the folder does exist, but it's empty, that's treated as an
    error.

    Returns:
      List of raw PCM-encoded audio samples of background noise.

    Raises:
      Exception: If files aren't found in the folder.
    """

        # TODO: refactor this to use CSV files if background noise is used in the future.

        self.background_data = []
        background_dir = os.path.join(self.data_dir, BACKGROUND_NOISE_DIR_NAME)
        if not os.path.exists(background_dir):
            return self.background_data
        with tf.Session(graph=tf.Graph()) as sess:
            wav_filename_placeholder = tf.placeholder(tf.string, [])
            wav_loader = io_ops.read_file(wav_filename_placeholder)
            wav_decoder = audio_ops.decode_wav(wav_loader, desired_channels=1)
            search_path = os.path.join(self.data_dir,
                                       BACKGROUND_NOISE_DIR_NAME, '*.wav')
            for wav_path in gfile.Glob(search_path):
                wav_data = sess.run(wav_decoder,
                                    feed_dict={
                                        wav_filename_placeholder: wav_path
                                    }).audio.flatten()
                self.background_data.append(wav_data)
            if not self.background_data:
                raise Exception('No background wav files were found in ' +
                                search_path)
Example #16
if __name__ == "__main__":
    parser = argparse.ArgumentParser(description='Invert MFCCs to audio.')
    parser.add_argument('input_file', help='Path to .pkl / .wav input file')
    parser.add_argument('output_file', help='Path to .wav output file')
    parser.add_argument('--input_type',
                        default='mfccs',
                        help='Input type: logmel / mfccs')
    args = parser.parse_args()

    # Load from file
    ext = os.path.splitext(args.input_file)[-1]
    print("Reading from file...")
    if ext == '.wav':
        samples = tf.io.read_file(args.input_file)
        decoded = contrib_audio.decode_wav(samples, desired_channels=1)
        audio = decoded.audio
        if args.input_type == 'mfccs':
            inp = audio2mfccs(audio)
        elif args.input_type == 'logmel':
            inp = audio2logmel(audio)
        elif args.input_type == 'spectrograms':
            inp = audio2spectrograms(audio)
        else:
            raise ValueError("%s is not supported" % args.input_type)
    elif ext == '.pkl':
        audio = None
        with open(args.input_file, 'rb') as f:
            x_r = pkl.load(f)
        x_r = tf.squeeze(tf.constant(x_r), 0)
        inp = x_r
Example #17
def create_inference_graph(
        wanted_words, sample_rate, nchannels, clip_duration_ms, clip_stride_ms,
        representation, window_size_ms, window_stride_ms, nwindows,
        dct_coefficient_count, filterbank_channel_count, model_architecture,
        filter_counts, filter_sizes, final_filter_len, dropout_prob,
        batch_size, dilate_after_layer, stride_after_layer, connection_type,
        silence_percentage, unknown_percentage):
    """Creates an audio model with the nodes needed for inference.

  Uses the supplied arguments to create a model, and inserts the input and
  output nodes that are needed to use the graph for inference.

  Args:
    wanted_words: Comma-separated list of the words we're trying to recognize.
    sample_rate: How many samples per second are in the input audio files.
    clip_duration_ms: How many samples to analyze for the audio pattern.
    clip_stride_ms: How often to run recognition. Useful for models with cache.
    window_size_ms: Time slice duration to estimate frequencies from.
    window_stride_ms: How far apart time slices should be.
    dct_coefficient_count: Number of frequency bands to analyze.
    model_architecture: Name of the kind of model to generate.
  """

    words_list = input_data.prepare_words_list(wanted_words.split(','),
                                               silence_percentage,
                                               unknown_percentage)
    model_settings = models.prepare_model_settings(
        len(words_list), sample_rate, nchannels, clip_duration_ms,
        representation, window_size_ms, window_stride_ms, nwindows,
        dct_coefficient_count, filterbank_channel_count, filter_counts,
        filter_sizes, final_filter_len, dropout_prob, batch_size,
        dilate_after_layer, stride_after_layer, connection_type)

    runtime_settings = {'clip_stride_ms': clip_stride_ms}

    wav_data_placeholder = tf.placeholder(tf.string, [], name='wav_data')
    decoded_sample_data = audio_ops.decode_wav(
        wav_data_placeholder,
        desired_channels=nchannels,
        desired_samples=model_settings['desired_samples'],
        name='decoded_sample_data')
    spectrograms = []
    for ichannel in range(nchannels):
        spectrograms.append(
            audio_ops.audio_spectrogram(
                decoded_sample_data.audio,
                window_size=model_settings['window_size_samples'],
                stride=model_settings['window_stride_samples'],
                magnitude_squared=True))
    spectrogram = tf.stack(spectrograms, -1)
    mfccs = []
    for ichannel in range(nchannels):
        mfccs.append(
            audio_ops.mfcc(spectrograms[ichannel],
                           decoded_sample_data.sample_rate,
                           upper_frequency_limit=sample_rate // 2,
                           filterbank_channel_count=filterbank_channel_count,
                           dct_coefficient_count=dct_coefficient_count))
    mfcc = tf.stack(mfccs, -1)

    if representation == 'waveform':
        fingerprint_input = decoded_sample_data.audio
    elif representation == 'spectrogram':
        fingerprint_input = spectrogram
    elif representation == 'mel-cepstrum':
        fingerprint_input = mfcc

    reshaped_input = tf.reshape(fingerprint_input,
                                [-1, model_settings['fingerprint_size']])

    hidden_layers, final = models.create_model(
        reshaped_input,
        model_settings,
        model_architecture,
        is_training=False,
        runtime_settings=runtime_settings)

    # Create an output to use for inference.
    for i in range(len(hidden_layers)):
        tf.identity(hidden_layers[i], name='hidden_layer' + str(i))
    tf.nn.softmax(final, name='output_layer')
Example #18
 def fn2audio(fn):
     samples = tf.io.read_file(fn)
     decoded = contrib_audio.decode_wav(samples, desired_channels=1)
     return decoded.audio