Exemple #1
0
    def __init__(self, coordinator, in_dir, logger):
        '''
            Sets up the feeder: loads and shuffles the training metadata,
            creates the input/target placeholders and connects them to a
            FIFO queue whose dequeued tensors are exposed as
            self.current_batch.

            Param:
                coordinator: stored on the instance; not used in this
                    method (presumably a tf.train.Coordinator - TODO
                    confirm)
                in_dir: directory that contains 'train.txt'
                logger: passed to load_metadata and stored on the instance

            NOTE(review): hparams is not an argument of this method; it is
            resolved from the enclosing scope (presumably a module-level
            object - confirm).
        '''
        super(DataFeeder, self).__init__()
        self._coordinator = coordinator
        self._in_dir = in_dir
        self._logger = logger
        # Read the metadata rows from <in_dir>/train.txt, then shuffle
        # them in place.
        self._metadata = load_metadata(os.path.join(in_dir, 'train.txt'),
                                       self._logger)
        random.shuffle(self._metadata)
        self._cursor = 0  # index of the next sample
        self._num_samples = len(self._metadata)
        # Keep the hyperparameters plus the individual values that are
        # exposed as attributes.
        self._hparams = hparams
        self.batch_size = hparams.get('batch_size')
        self.superbatch_size = hparams.get('superbatch_size')
        self.outputs_per_step = hparams.get('outputs_per_step')

        # Placeholders for inputs and targets. The order here must match
        # the dtype list given to the queue below.
        self._placeholders = [
            # rank-2 int32 tensor named 'inputs'
            tf.placeholder(tf.int32, [None, None], 'inputs'),
            # rank-1 int32 tensor named 'input_lengths'
            tf.placeholder(tf.int32, [None], 'input_lengths'),
            # rank-3 float32 tensor whose last dimension is num_mels
            tf.placeholder(tf.float32,
                           [None, None, hparams.get('num_mels')],
                           'mel_targets'),
            # rank-3 float32 tensor whose last dimension is num_freq
            tf.placeholder(tf.float32,
                           [None, None, hparams.get('num_freq')],
                           'linear_targets')
        ]

        # FIFO queue with a capacity of 8 elements; each element is one
        # (inputs, input_lengths, mel_targets, linear_targets) tuple fed
        # through the placeholders above.
        # NOTE(review): the original comment calls these elements
        # "superbatches" - confirm against the code that runs the
        # enqueue operation.
        queue = tf.FIFOQueue(8, [tf.int32, tf.int32, tf.float32, tf.float32],
                             name='input_queue')
        self._enqueue_operation = queue.enqueue(self._placeholders)
        # Wrap the dequeued tensors in a Batch and hand it the placeholders
        # via set_shapes (presumably to restore the static shapes that a
        # queue without declared shapes does not keep - TODO confirm).
        self.current_batch = Batch(queue.dequeue(), prep=False)
        self.current_batch.set_shapes(self._placeholders)
Exemple #2
0
def _build_mel_basis():
    '''
        Creates a filterbank matrix to combine FFT bins into
        mel-frequency bins

        Output
        The matrix returned by librosa.filters.mel for the
        'sample_rate', 'n_fft' and 'num_mels' hyperparameters
    '''
    # sr and n_fft are passed by keyword: newer librosa releases accept
    # the arguments of filters.mel by keyword only, and the keyword form
    # is also valid on older releases.
    return librosa.filters.mel(sr=hparams.get('sample_rate'),
                               n_fft=hparams.get('n_fft'),
                               n_mels=hparams.get('num_mels'))
Exemple #3
0
def spectrogram_tensorflow_inv(spect):
    '''
        Input
        spect: A spectrogram tensor

        Builds the TensorFlow graph that turns a spectrogram back into a
        waveform: undo the normalization, add back ref_level_db, convert
        dB to amplitude, raise to the 'power' hyperparameter and run
        Griffin-Lim.

        Unlike spectrogram_inv, this does NOT invert the preemphasis. The
        caller should call inv_preemphasis on the output after running
        the graph.
    '''
    denormalized = _denormalize_tensorflow(spect)
    amplitude = _db_to_amp_tensorflow(
        denormalized + hparams.get('ref_level_db'))
    sharpened = tf.pow(amplitude, hparams.get('power'))
    return _griffin_lim_tensorflow(sharpened)
Exemple #4
0
def _normalize_inv(S):
    '''
        Input
        S: A normalized spectrogram

        Output
        S clipped to [0, 1] and mapped linearly so that 0 becomes
        min_level_db and 1 becomes 0. This unwinds _normalize and is
        used in synthesizing
    '''
    min_level_db = hparams.get('min_level_db')
    clipped = np.clip(S, 0, 1)
    return (clipped * -float(min_level_db)) + min_level_db
Exemple #5
0
def _normalize(S):
    '''
        Input
        S: Spectrogram

        Output
        S mapped linearly so that min_level_db becomes 0 and 0 becomes 1,
        then clipped to [0, 1]. Only relative volume matters, so values
        outside that range are pinned to its ends.
    '''
    min_level_db = hparams.get('min_level_db')
    rescaled = (S - min_level_db) / -float(min_level_db)
    return np.clip(rescaled, 0, 1)
Exemple #6
0
def spectrogram_inv(spect):
    '''
        Input
        spect: A linear spectrogram

        Output
        The waveform estimated from the spectrogram with the
        Griffin-lim algorithm. This is used in synthesizing
    '''
    # Undo the normalization, then add back the reference level that
    # was subtracted when the spectrogram was built
    level_db = _normalize_inv(spect) + hparams.get('ref_level_db')
    # Back from dB to amplitude
    amplitude = _db_to_amp(level_db)
    # Raise to the 'power' hyperparameter before estimating the phase
    waveform = _griffin_lim(amplitude**hparams.get('power'))
    # Finally unwind the pre-emphasis filter
    return pre_emphasis_inv(waveform)
Exemple #7
0
 def _prepare_batch(self, outputs_per_step=hparams.get('outputs_per_step')):
     '''
         Prepares both inputs and targets by calling
         self._prepare_inputs() followed by self._prepare_targets().
         (The original note says this is done for inference.)

         Param:
             outputs_per_step: not referenced in the body of this
                 method. Its default is read from hparams once, when
                 the method is defined, not on every call.
     '''
     self._prepare_inputs()
     self._prepare_targets()
Exemple #8
0
 def get_embedds(self):
     '''
         Creates (or fetches) the 'embedding' variable with one row per
         entry of chars and 'embedded_depth' columns, and returns the
         rows selected by self._inputs.
     '''
     table_shape = [len(chars), hparams.get('embedded_depth')]
     table_init = tf.truncated_normal_initializer(stddev=0.5)
     table = tf.get_variable('embedding',
                             table_shape,
                             dtype=tf.float32,
                             initializer=table_init)
     return tf.nn.embedding_lookup(table, self._inputs)
Exemple #9
0
def _stft_params():
    '''
        Output
        Given the hyper parameters, return the parameters needed by
        the librosa STFT method as the tuple
        (n_fft, hop_length, win_length)

        n_fft: The FFT window size
        hop_length: The number of audio samples between STFT columns,
        derived from frame_shift_ms
        win_length: Each frame of audio is windowed, where each window
        will be of length win_length (derived from frame_length_ms) and
        then zero-padded to match up with n_fft
    '''
    sample_rate = hparams.get('sample_rate')
    # Milliseconds -> seconds -> samples, truncated to whole samples
    shift_samples = hparams.get('frame_shift_ms') / 1000 * sample_rate
    length_samples = hparams.get('frame_length_ms') / 1000 * sample_rate
    return hparams.get('n_fft'), int(shift_samples), int(length_samples)
Exemple #10
0
def round_up(x):
    '''
        Given an integer, x, round x up to the nearest multiple of the
        outputs_per_step hyperparameter

        Param:
            x: an integer

        Output:
            x itself when it is already a multiple of outputs_per_step,
            otherwise the next larger multiple

    '''
    step = hparams.get('outputs_per_step')
    overshoot = x % step
    return x if overshoot == 0 else x + step - overshoot
Exemple #11
0
def find_endpoint(wav, threshold_db=-40, min_silence_sec=0.8):
    '''
        Finds the index at which the waveform can be cut because it is
        followed by silence.

        Param:
            wav: a numpy array representing a sound signal
            threshold_db: level in dB, converted with _db_to_amp, below
                which a window counts as silent
            min_silence_sec: length of the silence window in seconds

        Output:
            start + hop of the first window whose maximum sample value
            is below the threshold, or len(wav) if there is none.
            Windows start at one hop (a quarter window) and advance by
            one hop.
    '''
    silence_window = int(hparams.get('sample_rate') * min_silence_sec)
    step = int(silence_window / 4)
    amp_threshold = _db_to_amp(threshold_db)
    window_starts = range(step, len(wav) - silence_window, step)
    # Lazily scan the windows so the search stops at the first silent one
    silent_starts = (start for start in window_starts
                     if np.max(wav[start:start + silence_window]) <
                     amp_threshold)
    first_silent = next(silent_starts, None)
    if first_silent is None:
        return len(wav)
    return first_silent + step
Exemple #12
0
def load_metadata(path, logger):
    '''
        Loads the metadata generated by the prep functions
        at the given path

        Param:
            path: path of a UTF-8 text file with one '|'-separated
                record per line; the third field is read as an integer
                frame count
            logger: object whose log() method receives a one-line
                summary

        Output:
            A list with one list of string fields per line
    '''
    with open(path, encoding='utf-8') as f:
        metadata = [line.strip().split('|') for line in f]
    # Frames -> milliseconds -> hours
    total_frames = sum(int(fields[2]) for fields in metadata)
    hours = total_frames * hparams.get('frame_shift_ms') / (3600 * 1000)
    summary = 'Loaded metadata for %d examples (%.2f hours)' % (
        len(metadata), hours)
    logger.log(summary)
    return metadata
Exemple #13
0
def _griffin_lim_tensorflow(S):
    '''
        Input
        S: A magnitude spectrogram tensor (a single, unbatched one)

        TensorFlow implementation of Griffin-Lim, built inside the
        'griffinlim' variable scope.
        Based on https://github.com/Kyubyong/tensorflow-exercises/blob/master/Audio_Processing.ipynb
    '''
    with tf.variable_scope('griffinlim'):
        # The stft/istft helpers work on batches, so add a batch
        # dimension of size 1 and remove it again at the end
        batched = tf.expand_dims(S, 0)
        magnitude = tf.identity(tf.cast(batched, dtype=tf.complex64))
        waveform = _istft_tensorflow(magnitude)
        for _ in range(hparams.get('griffin_lim_iters')):
            estimate = _stft_tensorflow(waveform)
            # Floor the modulus at 1e-8 to avoid dividing by zero when
            # reducing the estimate to its phase
            modulus = tf.maximum(1e-8, tf.abs(estimate))
            phase = estimate / tf.cast(modulus, tf.complex64)
            waveform = _istft_tensorflow(magnitude * phase)
        return tf.squeeze(waveform, 0)
Exemple #14
0
def _griffin_lim(spect):
    '''
        Input
        spect: A spectrogram

        Output
        The signal estimated by _stft_inv after the final iteration

        Apply the Griffin-Lim Algorithm (GLA) on the spectrogram
        to estimate the signal that has been STFTed. The phase starts
        out random and is re-estimated 'griffin_lim_iters' times from
        the STFT of the current signal estimate, while the magnitudes
        stay fixed.
    '''
    # Random initial phase, one unit-modulus complex number per bin
    angles = np.exp(2j * np.pi * np.random.rand(*spect.shape))
    # np.complex128 instead of the np.complex alias: the alias was
    # deprecated in NumPy 1.20 and removed in 1.24, and it resolved to
    # the same 128-bit complex type.
    S_complex = np.abs(spect).astype(np.complex128)
    y = _stft_inv(S_complex * angles)
    for _ in range(hparams.get('griffin_lim_iters')):
        # Keep only the phase of the re-analysed signal
        angles = np.exp(1j * np.angle(_stft(y)))
        y = _stft_inv(S_complex * angles)
    return y
Exemple #15
0
def pad_input(x, length):
    '''
        Given a list, x, and an int, length, add length - len(x)
        pad values to the back of the list and return a numpy vector

        Param:
            x: a list or a 1d numpy array
            length: an integer, the size of the result

        Output:
            A padded numpy vector
    '''
    # len(x) instead of x.shape[0]: it is the same number for numpy
    # arrays and also works for the plain lists this function is
    # documented to accept.
    return np.pad(x, (0, length - len(x)),
                  mode='constant',
                  constant_values=hparams.get('pad_value'))
Exemple #16
0
def mel_spectrogram(y):
    '''
        Input
        y: a numpy array representing a sound signal

        Output
        A normalized mel-scaled spectrogram: the STFT magnitudes of
        the pre-emphasized signal are mapped to mel bins, converted to
        dB, shifted down by ref_level_db and normalized.
        TODO Thresholding at ref_level_db is never discussed in
        the tacotron paper
    '''
    stft_result = _stft(pre_emphasis(y))
    mel_magnitudes = _linspect_to_melspect(np.abs(stft_result))
    level_db = _amp_to_db(mel_magnitudes) - hparams.get('ref_level_db')
    return _normalize(level_db)
Exemple #17
0
def write_metadata(metadata, output_dir):
    '''
        Writes dataset metadata to train.txt into the given output
        directory that contains the following information for all files:
        "{lin spec file name} | {mel spec file name} | {num frames} | {text}"
        and prints a short summary afterwards.

        Param:
            metadata: a sequence of records whose third item is the
                number of frames and whose fourth item is the text
            output_dir: directory in which train.txt is created

        An empty metadata sequence produces an empty train.txt and a
        summary of zeros.
    '''
    with open(os.path.join(output_dir, 'train.txt'), 'w',
              encoding='utf-8') as f:
        for m in metadata:
            f.write('|'.join([str(x) for x in m]) + '\n')
    frames = sum(m[2] for m in metadata)
    # Frames -> milliseconds -> hours
    hours = frames * hparams.get('frame_shift_ms') / (3600 * 1000)
    print('Wrote %d utterances, %d frames (%.2f hours)' %
          (len(metadata), frames, hours))
    # default=0 keeps the summary from raising ValueError when there is
    # nothing to report on.
    print('Max input length:  %d' % max((len(m[3]) for m in metadata),
                                        default=0))
    print('Max output length: %d' % max((m[2] for m in metadata),
                                        default=0))
Exemple #18
0
def pad_target(t, length):
    '''
        Pads a 2d target spectrogram along its first axis (time) up to
        the given length; the second axis (frequency) is left as is.
        The pad value is the 'pad_value' hyperparameter and is appended
        to the back of the time axis.

        Param:
            t: a numpy 2d array
            length: an integer, the size of the time axis afterwards

        Output:
            A padded 2d numpy array

    '''
    missing_frames = length - t.shape[0]
    # One (before, after) pair per axis
    pad_widths = [(0, missing_frames), (0, 0)]
    return np.pad(t,
                  pad_widths,
                  mode='constant',
                  constant_values=hparams.get('pad_value'))
Exemple #19
0
def pre_emphasis(x):
    '''
        Input
        x: a numpy array representing a sound signal

        Output
        The signal after a pre-emphasis filter that amplifies
        the high frequencies. Given an input signal x, the emphasized
        signal y is described by

            y(t) = x(t) - a*x(t-1),

        where a is the 'preemphasis' hyperparameter.

        This is done with lfilter, called here as lfilter(b, a, x) with
        feed-forward coefficients b = [1, -a] and feedback
        coefficients a = [1].
    '''
    coefficient = float(hparams.get('preemphasis'))
    feed_forward = [1, -coefficient]
    return signal.lfilter(feed_forward, [1], x)
Exemple #20
0
def spectrogram(y):
    '''
        Input
        y: a numpy array representing a sound signal

        Output
        A normalized linear-scale spectrogram: the STFT magnitudes of
        the pre-emphasized signal, converted to dB, shifted down by
        ref_level_db and normalized.
        TODO Thresholding at ref_level_db is never discussed in
        the tacotron paper
    '''
    # Short-time Fourier transform of the pre-emphasized input signal
    emphasized = pre_emphasis(y)
    magnitudes = np.abs(_stft(emphasized))
    # dB scale, measured relative to ref_level_db
    level_db = _amp_to_db(magnitudes) - hparams.get('ref_level_db')
    # Finally normalize the output
    return _normalize(level_db)
Exemple #21
0
def load_wav(path):
    '''
        Loads a single waveform file from disk at the given path,
        asking librosa for the 'sample_rate' hyperparameter as the
        sampling rate. Only the samples are returned.
    '''
    samples, _ = librosa.core.load(path, sr=hparams.get('sample_rate'))
    return samples
Exemple #22
0
def _denormalize_tensorflow(S):
    '''
        Input
        S: A normalized spectrogram tensor

        Output
        S clipped to [0, 1] and mapped linearly so that 0 becomes
        min_level_db and 1 becomes 0
    '''
    min_level_db = hparams.get('min_level_db')
    clipped = tf.clip_by_value(S, 0, 1)
    return (clipped * -float(min_level_db)) + min_level_db
Exemple #23
0
def pre_emphasis_inv(x):
    '''
        Input
        x: a numpy array representing a pre-emphasized sound signal

        Output
        The signal with the pre emphasis filter rewound. The filter
        coefficients of pre_emphasis are swapped, so [1, -a] is used as
        the feedback side. This is used in synthesizing
    '''
    coefficient = float(hparams.get('preemphasis'))
    feedback = [1, -coefficient]
    return signal.lfilter([1], feedback, x)
Exemple #24
0
def save_wav(wav, path):
    '''
        Scales the waveform to the 16-bit integer range and writes it
        to the given path at the 'sample_rate' hyperparameter.

        Param:
            wav: a numpy array representing a sound signal; it is left
                unmodified
            path: destination of the wav file
    '''
    # The peak is floored at 0.01 so that a (near-)silent signal is not
    # blown up to full scale.
    scale = 32767 / max(0.01, np.max(np.abs(wav)))
    # Scale a copy: an in-place `wav *= scale` would change the caller's
    # array and fails outright for integer arrays.
    scaled = np.asarray(wav) * scale
    # NOTE(review): librosa.output was removed in newer librosa
    # releases - confirm the pinned version still provides write_wav.
    librosa.output.write_wav(path, scaled.astype(np.int16),
                             hparams.get('sample_rate'))