Esempio n. 1
0
    def pooling_layer(self, x, pooling_type=None):
        """Pool frame-level features over the time axis of an utterance.

        Args:
          x: rank-3 tensor laid out as [B, T, D]; the rank is asserted.
          pooling_type: 'stats' or 'average'. A falsy value falls back to
            self.netconf['frame_pooling_type'].

        Returns:
          A rank-2 tensor (asserted): [B, D * 2] for 'stats' (mean and
          standard deviation concatenated), [B, D] for 'average' (mean).

        Raises:
          ValueError: if the resolved pooling type is not supported.
        """
        rank3_check = tf.debugging.assert_rank(x, 3)
        with tf.control_dependencies([rank3_check]):
            frames = tf.identity(x)

        kind = pooling_type or self.netconf['frame_pooling_type']
        if kind == 'stats':
            with tf.name_scope('stats_pooling'):
                mean, variance = tf.nn.moments(frames, 1)
                # Small constant keeps the square root well defined at zero
                # variance.
                stddev = tf.sqrt(variance + 1e-6)
                pooled = tf.concat([mean, stddev], 1)
        elif kind == 'average':
            with tf.name_scope('average_pooling'):
                pooled, _ = tf.nn.moments(frames, 1)
        else:
            raise ValueError('Unsupported frame_pooling_type: %s' % (kind))

        rank2_check = tf.debugging.assert_rank(pooled, 2)
        with tf.control_dependencies([rank2_check]):
            return tf.identity(pooled)
Esempio n. 2
0
    def curvature_range(self):
        """Build the ops tracking the smoothed curvature range.

        A window variable holding recent squared gradient norms is updated in
        place. Its minimum and maximum are taken (in the log domain when the
        matching ``_h_min_log_smooth`` / ``_h_max_log_smooth`` flag is set) and
        fed through ``self._moving_averager``. The smoothed values are stored
        in ``self._h_min`` and ``self._h_max``.

        Reads ``self._grad_norm_squared``, ``self._global_step`` and
        ``self._moving_averager``, which must be set before this is called.

        Returns:
          A one-element list holding the moving-average update op.
        """
        # set up the curvature window
        self._curv_win = tf.Variable(np.zeros([
            self._curv_win_width,
        ]),
                                     dtype=tf.float32,
                                     name="curv_win",
                                     trainable=False)
        # we can use log smoothing for curvature range to follow trend faster
        # self._curv_win = tf.scatter_update(
        #   self._curv_win, self._global_step % self._curv_win_width,
        #   tf.log(self._grad_norm_squared + EPS))
        # Overwrite the slot at (global_step mod window width) with the current
        # squared gradient norm, so the window is reused cyclically.
        self._curv_win = tf.scatter_update(
            self._curv_win, self._global_step % self._curv_win_width,
            self._grad_norm_squared + EPS)
        # note here the iterations start from iteration 0
        # Only the first min(window width, global_step + 1) slots are read, so
        # slots that have not been written yet are left out.
        valid_window = tf.slice(
            self._curv_win, tf.constant([
                0,
            ]),
            tf.expand_dims(tf.minimum(tf.constant(self._curv_win_width),
                                      self._global_step + 1),
                           dim=0))

        # Per-step extremes of the window, optionally taken in the log domain.
        if self._h_min_log_smooth:
            self._h_min_t = tf.log(tf.reduce_min(valid_window) + EPS)
        else:
            self._h_min_t = tf.reduce_min(valid_window)
        if self._h_max_log_smooth:
            self._h_max_t = tf.log(tf.reduce_max(valid_window) + EPS)
        else:
            self._h_max_t = tf.reduce_max(valid_window)

        curv_range_ops = []
        with tf.control_dependencies([self._h_min_t, self._h_max_t]):
            avg_op = self._moving_averager.apply(
                [self._h_min_t, self._h_max_t])
            # Read the averages only after the averaging op has run; values
            # smoothed in the log domain are mapped back with exp.
            with tf.control_dependencies([avg_op]):
                if self._h_min_log_smooth:
                    self._h_min = tf.exp(
                        tf.identity(
                            self._moving_averager.average(self._h_min_t)))
                else:
                    self._h_min = \
                      tf.identity(self._moving_averager.average(self._h_min_t))
                if self._h_max_log_smooth:
                    self._h_max = tf.exp(
                        tf.identity(
                            self._moving_averager.average(self._h_max_t)))
                else:
                    self._h_max = \
                      tf.identity(self._moving_averager.average(self._h_max_t))
            # Optional rescaling by the averaged gradient sparsity. This sits
            # under the outer control dependency only, not the avg_op one.
            if self._sparsity_debias:
                self._h_min = self._h_min * self._sparsity_avg
                self._h_max = self._h_max * self._sparsity_avg
        curv_range_ops.append(avg_op)
        return curv_range_ops
Esempio n. 3
0
    def apply_gradients(self, grads_tvars, global_step=None, name=None):
        """Build the op that clips, tunes and applies one gradient update.

        Args:
          grads_tvars: iterable of (gradient, variable) pairs. Pairs whose
            gradient is None are dropped.
          global_step: forwarded to the wrapped optimizer's apply_gradients.
          name: forwarded to the wrapped optimizer's apply_gradients.

        Returns:
          An op grouping the statistics update, the hyper-parameter update,
          the gradient application, the two clipping-threshold assignments and
          the increment of ``self._global_step``.

        NOTE(review): if every gradient is None the filtered list is empty and
        the tuple unpacking on the first statement raises ValueError.
        """
        self._grads, self._tvars = zip(*[(g, t) for g, t in grads_tvars
                                         if g is not None])

        # for manual gradient clipping
        if self._clip_thresh_var is not None:
            self._grads, self._grads_norm = tf.clip_by_global_norm(
                self._grads, self._clip_thresh_var)

        # loosely adaptive clipping of gradient in case exploding gradient ruins statistics
        # When not tuning, the threshold is LARGE_FLOAT_VAL (presumably large
        # enough that clipping is a no-op -- confirm against its definition).
        if self._use_adapt_grad_clip:
            thresh = tf.cond(
                self._do_tune, lambda: tf.sqrt(self._stat_protect_fac * self.
                                               _adapt_grad_clip_thresh**2),
                lambda: tf.to_float(tf.constant(LARGE_FLOAT_VAL)))
            self._grads, self._grads_norm = tf.clip_by_global_norm(
                self._grads, thresh)

        # Stage 1: gradient statistics.
        with tf.variable_scope("before_apply"):
            before_apply_op = self.before_apply()

        # Stage 2: hyper-parameter update, ordered after the statistics.
        with tf.variable_scope("update_hyper"):
            with tf.control_dependencies([before_apply_op]):
                update_hyper_op = self.update_hyper_param()

        # Stage 3: apply the gradients, ordered after the hyper-parameters.
        with tf.variable_scope("apply_updates"):
            with tf.control_dependencies([update_hyper_op]):

                # clip exploding gradient according to h_max
                # Clip to the target value only when the global norm exceeds
                # the adaptive threshold.
                if self._use_adapt_grad_clip:
                    thresh = tf.cond(
                        tf.greater(tf.global_norm(self._grads),
                                   self._adapt_grad_clip_thresh),
                        lambda: self._adapt_grad_clip_target_val,
                        lambda: tf.to_float(tf.constant(LARGE_FLOAT_VAL)))
                    self._grads, self._grads_norm = tf.clip_by_global_norm(
                        self._grads, thresh)

                apply_grad_op = self._optimizer.apply_gradients(
                    zip(self._grads, self._tvars), global_step, name)

        # Bookkeeping that must happen after the update: advance the internal
        # step counter and refresh both clipping values from sqrt(h_max).
        with tf.control_dependencies([apply_grad_op]):
            self._increment_global_step_op = tf.assign(self._global_step,
                                                       self._global_step + 1)

            self._adapt_grad_clip_thresh_op = \
              tf.assign(self._adapt_grad_clip_thresh, tf.sqrt(self._h_max) )
            self._adapt_grad_clip_target_val_op = \
              tf.assign(self._adapt_grad_clip_target_val, tf.sqrt(self._h_max) )
            # self._adapt_grad_clip_target_val_op = \
            #   tf.assign(self._adapt_grad_clip_target_val, tf.sqrt(tf.sqrt(self._h_max * self._h_min)))

        return tf.group(before_apply_op, update_hyper_op, apply_grad_op,
                        self._adapt_grad_clip_thresh_op,
                        self._adapt_grad_clip_target_val_op,
                        self._increment_global_step_op)
Esempio n. 4
0
    def call(self, audio_data, sample_rate=None):
        """
        Calculate fbank && pitch (concatenated) features of wav.

        :param audio_data: the audio signal from which to compute spectrum.
                           Should be an (1, N) tensor.
        :param sample_rate: the samplerate of the signal we working with.
                            When None, the configured sample rate is used.
        :return: A tensor with shape (num_frames, dim_features), containing
                fbank && pitch feature of every frame in speech.
        """

        p = self.config
        with tf.name_scope('fbank_pitch'):

            # Identity check: `==` would go through the tensor's overloaded
            # equality operator when a tensor is supplied.
            if sample_rate is None:
                sample_rate = tf.constant(p.sample_rate, dtype=tf.int32)

            # The supplied rate must match the configured one.
            assert_op = tf.assert_equal(tf.constant(p.sample_rate),
                                        tf.cast(sample_rate, dtype=tf.int32))
            with tf.control_dependencies([assert_op]):

                fbank_feats = tf.squeeze(self.fbank(audio_data, sample_rate))
                pitch_feats = tf.squeeze(self.pitch(audio_data, sample_rate))
                # Join the two feature sets along axis 1.
                fbank_pitch_feats = tf.concat([fbank_feats, pitch_feats], 1)

                return fbank_pitch_feats
Esempio n. 5
0
    def call(self, audio_data, sample_rate=None):
        """
        Calculate pitch features of audio data.

        :param audio_data: the audio signal from which to compute spectrum. Should be an (1, N) tensor.
        :param sample_rate: [option]the samplerate of the signal we working with. When None, the
            configured sample rate is used.
        :return: A float tensor containing pitch features of every frame in speech.
            NOTE(review): the squeeze / add-axis / transpose sequence below yields a column,
            i.e. (num_frames, 1), if the op output squeezes to rank 1; earlier docs said
            (1, num_frames) -- confirm.
        """

        p = self.config
        with tf.name_scope('pitch'):

            # Identity check: `==` would go through the tensor's overloaded
            # equality operator when a tensor is supplied.
            if sample_rate is None:
                sample_rate = tf.constant(p.sample_rate, dtype=float)

            # The supplied rate must match the configured one.
            assert_op = tf.assert_equal(tf.constant(p.sample_rate),
                                        tf.cast(sample_rate, dtype=float))
            with tf.control_dependencies([assert_op]):

                pitch = py_x_ops.pitch(audio_data,
                                       sample_rate,
                                       window_length=p.window_length,
                                       frame_length=p.frame_length,
                                       thres_autoc=p.thres_autoc)

                pitch = tf.squeeze(pitch)
                # Add a leading axis, then swap the two axes.
                pitch = tf.transpose(pitch[None, :])
                return pitch
Esempio n. 6
0
 def grad_variance(self):
     """Build the ops estimating the gradient variance.

     Gradients are fed through ``self._moving_averager``. The estimate stored
     in ``self._grad_var`` is ``self._grad_norm_squared_avg`` minus the summed
     squares of the averaged gradients, floored at EPS, and optionally scaled
     by ``self._sparsity_avg``.

     Returns:
       A one-element list holding the moving-average update op.
     """
     grad_var_ops = []
     tensor_to_avg = []
     for t, g in zip(self._tvars, self._grads):
         # Sparse (IndexedSlices) gradients are summed per index into a
         # tensor reshaped to the variable's shape before being averaged.
         if isinstance(g, ops.IndexedSlices):
             tensor_to_avg.append(
                 tf.reshape(tf.unsorted_segment_sum(g.values, g.indices,
                                                    g.dense_shape[0]),
                            shape=t.get_shape()))
         else:
             tensor_to_avg.append(g)
     avg_op = self._moving_averager.apply(tensor_to_avg)
     grad_var_ops.append(avg_op)
     # Read the averages only after the averaging op has run.
     with tf.control_dependencies([avg_op]):
         self._grad_avg = [
             self._moving_averager.average(val) for val in tensor_to_avg
         ]
         self._grad_avg_squared = [tf.square(val) for val in self._grad_avg]
     # Floor at EPS so the estimate never falls below it.
     self._grad_var = tf.maximum(
         tf.constant(EPS, dtype=self._grad_norm_squared_avg.dtype),
         self._grad_norm_squared_avg -
         tf.add_n([tf.reduce_sum(val) for val in self._grad_avg_squared]))
     if self._sparsity_debias:
         self._grad_var *= self._sparsity_avg
     return grad_var_ops
Esempio n. 7
0
    def call(self, audio_data, sample_rate=None):
        """
        Calculate power spectrum and phase spectrum of audio data.

        :param audio_data: the audio signal from which to compute spectrum. Should be an (1, N) tensor.
        :param sample_rate: [option]the samplerate of the signal we working with. When None, the
            configured sample rate is used.
        :return: Two returns:
            power spectrum -- A float tensor of size (num_frames, num_frequencies) containing
                power spectrum of every frame in speech.
            phase spectrum -- A float tensor of size (num_frames, num_frequencies) containing
                phase spectrum of every frame in speech.
        """

        p = self.config
        with tf.name_scope('analyfiltbank'):

            # Identity check: `==` would go through the tensor's overloaded
            # equality operator when a tensor is supplied.
            if sample_rate is None:
                sample_rate = tf.constant(p.sample_rate, dtype=tf.int32)

            # The supplied rate must match the configured one.
            assert_op = tf.assert_equal(tf.constant(p.sample_rate),
                                        tf.cast(sample_rate, dtype=tf.int32))
            with tf.control_dependencies([assert_op]):

                # The op is fed a float sample rate.
                sample_rate = tf.cast(sample_rate, dtype=float)
                power_spectrum, phase_spectrum = py_x_ops.analyfiltbank(
                    audio_data,
                    sample_rate,
                    window_length=p.window_length,
                    frame_length=p.frame_length)

                return power_spectrum, phase_spectrum
Esempio n. 8
0
    def call(self, audio_data, sample_rate=None):
        """
        Calculate mfcc features of audio data.

        :param audio_data: the audio signal from which to compute spectrum. Should be an (1, N) tensor.
        :param sample_rate: [option]the samplerate of the signal we working with. When None, the
            configured sample rate is used.
        :return: A float tensor of size (num_channels, num_frames, num_frequencies) containing
                mfcc features of every frame in speech.
        """
        p = self.config
        with tf.name_scope('mfcc'):

            # Identity check: `==` would go through the tensor's overloaded
            # equality operator when a tensor is supplied.
            if sample_rate is None:
                sample_rate = tf.constant(p.sample_rate, dtype=tf.int32)

            # The supplied rate must match the configured one.
            assert_op = tf.assert_equal(tf.constant(p.sample_rate),
                                        tf.cast(sample_rate, dtype=tf.int32))
            with tf.control_dependencies([assert_op]):

                spectrum_feats = self.spect(audio_data, sample_rate)
                # Add a leading axis to the spectrum before it goes to the op.
                spectrum_feats = tf.expand_dims(spectrum_feats, 0)
                fbank_feats = self.fbank(audio_data, sample_rate)
                mfcc = py_x_ops.mfcc(fbank_feats,
                                     spectrum_feats,
                                     sample_rate,
                                     use_energy=p.use_energy,
                                     cepstral_lifter=p.cepstral_lifter,
                                     coefficient_count=p.coefficient_count)
                return mfcc
Esempio n. 9
0
  def call(self, audio_data, sample_rate=None):
    """
    Calculate cepstrum of audio data.

    :param audio_data: the audio signal from which to compute spectrum. Should be an (1, N) tensor.
    :param sample_rate: [option]the samplerate of the signal we working with. When None, the
        configured sample rate is used.
    :return: A float tensor of size (num_frames, ceps_subband_num) containing normalized cepstrum
          (tag_ceps_mean_norm = True) or cepstrum (tag_ceps_mean_norm = False) of every frame in speech.
    """

    p = self.config

    with tf.name_scope('cepstrum'):

      # Identity check: `==` would go through the tensor's overloaded
      # equality operator when a tensor is supplied.
      if sample_rate is None:
        sample_rate = tf.constant(p.sample_rate, dtype=float)

      # The supplied rate must match the configured one.
      assert_op = tf.assert_equal(
          tf.constant(p.sample_rate), tf.cast(sample_rate, dtype=float))
      with tf.control_dependencies([assert_op]):

        cepstrum = py_x_ops.cepstrum(
            audio_data,
            sample_rate,
            window_length=p.window_length,
            frame_length=p.frame_length,
            ceps_subband_num=p.ceps_subband_num,
            tag_ceps_mean_norm=p.tag_ceps_mean_norm)

        return cepstrum
Esempio n. 10
0
    def call(self, audio_data, sample_rate=None):
        """
        Calculate plp features of audio data.

        :param audio_data: the audio signal from which to compute spectrum. Should be an (1, N) tensor.
        :param sample_rate: [option]the samplerate of the signal we working with. When None, the
            configured sample rate is used.
        :return: A float tensor of size (num_frames, (plp_order + 1)) containing plp features of
            every frame in speech.
        """

        p = self.config
        with tf.name_scope('plp'):

            # Identity check: `==` would go through the tensor's overloaded
            # equality operator when a tensor is supplied.
            if sample_rate is None:
                sample_rate = tf.constant(p.sample_rate, dtype=tf.int32)

            # The supplied rate must match the configured one.
            assert_op = tf.assert_equal(tf.constant(p.sample_rate),
                                        tf.cast(sample_rate, dtype=tf.int32))
            with tf.control_dependencies([assert_op]):

                # The op is fed a float sample rate.
                sample_rate = tf.cast(sample_rate, dtype=float)
                plp = py_x_ops.plp(audio_data,
                                   sample_rate,
                                   window_length=p.window_length,
                                   frame_length=p.frame_length,
                                   plp_order=p.plp_order)
                return plp
Esempio n. 11
0
    def call(self, audio_data, sample_rate=None):
        """
        Calculate power spectrum or log power spectrum of audio data.

        :param audio_data: the audio signal from which to compute spectrum. Should be an (1, N) tensor.
        :param sample_rate: [option]the samplerate of the signal we working with. When None, the
            configured sample rate is used.
        :return: A float tensor of size (num_frames, num_frequencies) containing power spectrum
            (output_type=1) or log power spectrum (output_type=2) of every frame in speech.
        """

        p = self.config
        with tf.name_scope('spectrum'):

            # Identity check: `==` would go through the tensor's overloaded
            # equality operator when a tensor is supplied.
            if sample_rate is None:
                sample_rate = tf.constant(p.sample_rate, dtype=tf.int32)

            # The supplied rate must match the configured one.
            assert_op = tf.assert_equal(tf.constant(p.sample_rate),
                                        tf.cast(sample_rate, dtype=tf.int32))
            with tf.control_dependencies([assert_op]):

                # The op is fed a float sample rate.
                sample_rate = tf.cast(sample_rate, dtype=float)
                spectrum = py_x_ops.spectrum(
                    audio_data,
                    sample_rate,
                    window_length=p.window_length,
                    frame_length=p.frame_length,
                    output_type=p.output_type,
                    snip_edges=p.snip_edges,
                    raw_energy=p.raw_energy,
                    preEph_coeff=p.preeph_coeff,
                    window_type=p.window_type,
                    remove_dc_offset=p.remove_dc_offset,
                    is_fbank=p.is_fbank)

                return spectrum
Esempio n. 12
0
  def call(self, audio_data, sample_rate=None):
    """
    Run the add_rir_noise_aecres op on audio data; the rir / noise / aecres
    stages are each toggled by the matching `if_add_*` config flag.

    :param audio_data: the audio signal to process. Should be an (1, N) tensor.
    :param sample_rate: [option]the samplerate of the signal we working with. When None, the
        configured sample rate is used.
    :return: A float tensor of size N containing add-noise audio.
    """

    p = self.config
    with tf.name_scope('add_rir_noise_aecres'):
      # Identity check: `==` would go through the tensor's overloaded
      # equality operator when a tensor is supplied.
      if sample_rate is None:
        sample_rate = tf.constant(p.sample_rate, dtype=tf.int32)

      # The supplied rate must match the configured one.
      assert_op = tf.assert_equal(
          tf.constant(p.sample_rate), tf.cast(sample_rate, dtype=tf.int32))
      with tf.control_dependencies([assert_op]):
        # The op is fed a float sample rate.
        sample_rate = tf.cast(sample_rate, dtype=float)
        add_rir_noise_aecres_out = py_x_ops.add_rir_noise_aecres(
            audio_data,
            sample_rate,
            if_add_rir=p.if_add_rir,
            rir_filelist=p.rir_filelist,
            if_add_noise=p.if_add_noise,
            snr_min=p.snr_min,
            snr_max=p.snr_max,
            noise_filelist=p.noise_filelist,
            if_add_aecres=p.if_add_aecres,
            aecres_filelist=p.aecres_filelist)

        return tf.squeeze(add_rir_noise_aecres_out)
Esempio n. 13
0
    def call(self, audio_data, sample_rate=None):
        """
        Calculate fbank features of audio data.

        Note that the configured `upper_frequency_limit` is normalised and
        written back to `self.config` in place.

        :param audio_data: the audio signal from which to compute spectrum. Should be an (1, N) tensor.
        :param sample_rate: [option]the samplerate of the signal we working with. When None, the
            configured sample rate is used.
        :return: A float tensor of size (num_channels, num_frames, num_frequencies) containing
                fbank features of every frame in speech.
        """
        p = self.config
        with tf.name_scope('fbank'):

            # Identity check: `==` would go through the tensor's overloaded
            # equality operator when a tensor is supplied.
            if sample_rate is None:
                sample_rate = tf.constant(p.sample_rate, dtype=tf.int32)

            # A non-positive upper limit is an offset from half the sample
            # rate. A limit not above the lower limit, or beyond half the
            # sample rate, is reset to half the sample rate.
            if p.upper_frequency_limit <= 0:
                p.upper_frequency_limit = p.sample_rate / 2.0 + p.upper_frequency_limit
            elif (p.upper_frequency_limit <= p.lower_frequency_limit) or (
                    p.upper_frequency_limit > p.sample_rate / 2.0):
                p.upper_frequency_limit = p.sample_rate / 2.0

            # The supplied rate must match the configured one.
            assert_op = tf.assert_equal(tf.constant(p.sample_rate),
                                        tf.cast(sample_rate, dtype=tf.int32))
            with tf.control_dependencies([assert_op]):

                spectrum = self.spect(audio_data, sample_rate)
                # Add a leading axis to the spectrum before it goes to the op.
                spectrum = tf.expand_dims(spectrum, 0)

                fbank = py_x_ops.fbank(
                    spectrum,
                    sample_rate,
                    upper_frequency_limit=p.upper_frequency_limit,
                    lower_frequency_limit=p.lower_frequency_limit,
                    filterbank_channel_count=p.filterbank_channel_count)

                return fbank
Esempio n. 14
0
    def call(self, power_spectrum, phase_spectrum, sample_rate=None):
        """
        Implement frequency domain to time domain conversion.

        :param power_spectrum: a float tensor of size (num_frames, num_frequencies).
        :param phase_spectrum: a float tensor of size (num_frames, num_frequencies).
        :param sample_rate: a scalar tensor. When None, the configured sample rate is used.
        :return: audio data
        """

        p = self.config
        with tf.name_scope('synthfiltbank'):

            # Identity check: `==` would go through the tensor's overloaded
            # equality operator when a tensor is supplied.
            if sample_rate is None:
                sample_rate = tf.constant(p.sample_rate, dtype=tf.int32)

            # The supplied rate must match the configured one.
            assert_op = tf.assert_equal(tf.constant(p.sample_rate),
                                        tf.cast(sample_rate, dtype=tf.int32))
            with tf.control_dependencies([assert_op]):

                audio_data = py_x_ops.synthfiltbank(
                    power_spectrum,
                    phase_spectrum,
                    sample_rate,
                    window_length=p.window_length,
                    frame_length=p.frame_length)

                return audio_data
Esempio n. 15
0
    def call(self, audio_data, sample_rate=None):
        """
        Calculate the zero-crossing rate of speech.

        :param audio_data: the audio signal from which to compute spectrum. Should be an (1, N) tensor.
        :param sample_rate: [option]the samplerate of the signal we working with. When None, the
            configured sample rate is used.
        :return: A tensor with shape (1, num_frames), containing zero-crossing rate of every frame
            in speech.
        """

        p = self.config
        with tf.name_scope('zcr'):

            # Identity check: `==` would go through the tensor's overloaded
            # equality operator when a tensor is supplied.
            if sample_rate is None:
                sample_rate = tf.constant(p.sample_rate, dtype=tf.int32)

            # The supplied rate must match the configured one.
            assert_op = tf.assert_equal(tf.constant(p.sample_rate),
                                        tf.cast(sample_rate, dtype=tf.int32))
            with tf.control_dependencies([assert_op]):

                # The op is fed a float sample rate.
                sample_rate = tf.cast(sample_rate, dtype=float)
                zcr = py_x_ops.zcr(audio_data,
                                   sample_rate,
                                   window_length=p.window_length,
                                   frame_length=p.frame_length)

                return zcr
Esempio n. 16
0
  def call(self, audio_data, sample_rate=None):
    """
    Calculate power of every frame in speech.

    :param audio_data: the audio signal from which to compute spectrum. Should be an (1, N) tensor.
    :param sample_rate: [option]the samplerate of the signal we working with. When None, the
        configured sample rate is used.
    :return: A float tensor of size (1, num_frames) containing power of every frame in speech.
    """

    p = self.config
    with tf.name_scope('framepow'):

      # Identity check: `==` would go through the tensor's overloaded
      # equality operator when a tensor is supplied.
      if sample_rate is None:
        sample_rate = tf.constant(p.sample_rate, dtype=float)

      # The supplied rate must match the configured one.
      assert_op = tf.assert_equal(
          tf.constant(p.sample_rate), tf.cast(sample_rate, dtype=float))
      with tf.control_dependencies([assert_op]):

        framepow = py_x_ops.frame_pow(
            audio_data,
            sample_rate,
            window_length=p.window_length,
            frame_length=p.frame_length)

        return framepow
Esempio n. 17
0
def splice(feat, left_context, right_context):
    """Splice every frame together with its neighbouring context frames.

    The sequence is padded by repeating the first frame `left_context` times
    and the last frame `right_context` times. For each time step, the window
    of `left_context + 1 + right_context` frames is flattened into one vector.

    param: feat, tf.float32, [batch, time, feat]
    param: left_context, number of frames taken before the current one (>= 0)
    param: right_context, number of frames taken after the current one (>= 0)
    return: feat, tf.float32, [batch, time, feat*(left_context + 1 + right_context)]
    reference:
      https://github.com/kaldi-asr/kaldi/src/feat/feature-functions.cc#L205:6
    """

    def _has_more_frames(step, num_steps, padded, unused_left, right,
                         unused_tas):
        # Loop condition: keep going until every frame has been emitted.
        del unused_tas
        del unused_left
        return step < num_steps

    def _splice_one_frame(step, num_steps, padded, left, right, tas):
        # Loop body: cut the window starting at `step` out of the padded
        # sequence, flatten it to [batch, window * feat] and store it.
        dims = tf.shape(padded)
        batch_size, _, feat_dim = dims[0], dims[1], dims[2]
        spliced_dim = (1 + left + right) * feat_dim

        window = padded[:, step:step + left + 1 + right, :]
        flat_window = tf.reshape(window, [batch_size, spliced_dim])
        updated_tas = tas.write(step, flat_window)
        return (step + 1, num_steps, padded, left, right, updated_tas)

    with tf.control_dependencies([
            tf.assert_greater_equal(left_context, 0),
            tf.assert_greater_equal(right_context, 0)
    ]):
        num_frames = tf.shape(feat)[1]
        frame_tas = _new_tensor_array('splice_feat_ta',
                                      num_frames,
                                      dtype=tf.float32)
        start = tf.constant(0, tf.int32)

        # Edge padding: replicate the first and the last frame.
        head_pad = tf.tile(feat[:, 0:1, :], [1, left_context, 1])
        tail_pad = tf.tile(feat[:, -1:, :], [1, right_context, 1])
        padded = tf.concat([head_pad, feat], axis=1)
        padded = tf.concat([padded, tail_pad], axis=1)

        initial_state = (start, num_frames, padded, left_context,
                         right_context, frame_tas)
        # Every loop variable is given a fully unknown shape invariant.
        unknown_shapes = tf.nest.map_structure(
            lambda _: tf.TensorShape(None), initial_state)

        final_state = tf.while_loop(_has_more_frames,
                                    _splice_one_frame,
                                    loop_vars=initial_state,
                                    shape_invariants=unknown_shapes,
                                    parallel_iterations=10,
                                    swap_memory=False)
        frame_tas = final_state[-1]

        # The stacked array is [time, batch, dim]; return it batch-major.
        time_major = frame_tas.stack()
        batch_major = tf.transpose(time_major, [1, 0, 2])
    return batch_major
Esempio n. 18
0
    def before_apply(self):
        """Build the statistics ops that run before the update is applied.

        Creates ``self._moving_averager`` and, from ``self._grads``, the
        per-variable squared gradients and squared norms together with their
        moving averages. It then collects the ops returned by
        ``curvature_range``, ``grad_variance`` and ``dist_to_opt``, plus
        ``grad_sparsity`` when ``self._sparsity_debias`` is set.

        Returns:
          A single op grouping all of the collected update ops.
        """
        self._moving_averager = tf.train.ExponentialMovingAverage(
            decay=self._beta, zero_debias=self._zero_debias)
        assert self._grads is not None and len(self._grads) > 0
        before_apply_ops = []

        # get per var g**2 and norm**2
        self._grad_squared = []
        # Placeholder only; replaced by the list built after the loop.
        self._grad_norm_squared = []
        for v, g in zip(self._tvars, self._grads):
            if g is None:
                continue
            # Place the squared gradient on the same device as its variable.
            with ops.colocate_with(v):
                self._grad_squared.append(tf.square(g))
        self._grad_norm_squared = [
            tf.reduce_sum(grad_squared) for grad_squared in self._grad_squared
        ]

        if self._sparsity_debias:
            avg_op_sparsity = self.grad_sparsity()
            before_apply_ops.append(avg_op_sparsity)

        # the following running average on squared norm of gradient is shared
        # by `grad_variance` and `dist_to_opt`
        avg_op = self._moving_averager.apply(self._grad_norm_squared)
        with tf.control_dependencies([avg_op]):
            self._grad_norm_squared_avg = [
                self._moving_averager.average(val)
                for val in self._grad_norm_squared
            ]
            # Collapse the per-variable lists into single totals.
            self._grad_norm_squared = tf.add_n(self._grad_norm_squared)
            self._grad_norm_squared_avg = tf.add_n(self._grad_norm_squared_avg)
        before_apply_ops.append(avg_op)

        # The remaining statistics are built after the norm average update.
        with tf.control_dependencies([avg_op]):
            curv_range_ops = self.curvature_range()
            before_apply_ops += curv_range_ops
            grad_var_ops = self.grad_variance()
            before_apply_ops += grad_var_ops
            dist_to_opt_ops = self.dist_to_opt()
            before_apply_ops += dist_to_opt_ops
        return tf.group(*before_apply_ops)
Esempio n. 19
0
  def get_train_op(self, loss, global_step=None):
    """Build the training op.

    The returned op groups everything in the UPDATE_OPS collection and is
    ordered after the op that applies the gradients.
    """
    grads_applied = self.get_apply_gradients_op(loss, global_step)

    # model average
    self.var_avg(global_step)

    # Collect the update ops only once the gradients have been applied.
    with tf.control_dependencies([grads_applied]):
      pending_updates = tf.get_collection(tf.GraphKeys.UPDATE_OPS)
      train_op = tf.group(*pending_updates)

    moving_vars = tf.moving_average_variables()
    utils.log_vars('moving vars', moving_vars)
    return train_op
Esempio n. 20
0
 def dist_to_opt(self):
     """Build the ops estimating the distance to the optimum.

     The per-step estimate is the averaged gradient norm divided by the
     averaged squared gradient norm (plus EPS). Its moving average is stored
     in ``self._dist_to_opt_avg``, optionally divided by the square root of
     ``self._sparsity_avg``.

     Returns:
       A list with the two moving-average update ops created here.
     """
     dist_to_opt_ops = []
     # running average of the norm of gradient
     self._grad_norm = tf.sqrt(self._grad_norm_squared)
     avg_op = self._moving_averager.apply([
         self._grad_norm,
     ])
     dist_to_opt_ops.append(avg_op)
     with tf.control_dependencies([avg_op]):
         self._grad_norm_avg = self._moving_averager.average(
             self._grad_norm)
         # single iteration distance estimation
         # note that self._grad_norm_avg is per variable
         self._dist_to_opt = (self._grad_norm_avg /
                              (self._grad_norm_squared_avg + EPS))
     # running average of distance
     avg_op = self._moving_averager.apply([self._dist_to_opt])
     dist_to_opt_ops.append(avg_op)
     with tf.control_dependencies([avg_op]):
         self._dist_to_opt_avg = tf.identity(
             self._moving_averager.average(self._dist_to_opt))
         # EPS keeps the divisor away from zero.
         if self._sparsity_debias:
             self._dist_to_opt_avg /= (tf.sqrt(self._sparsity_avg) + EPS)
     return dist_to_opt_ops
Esempio n. 21
0
    def update_hyper_param(self):
        """Build the op that refreshes the momentum and learning-rate variables.

        When ``self._do_tune`` is true the new values come from
        ``get_mu_tensor`` / ``get_lr_tensor``; otherwise the current variable
        values are reused. Unless ``self._use_unsmoothed_lr_mu`` is set, the
        new values are blended with the stored ones using ``self._beta``
        before being assigned.

        Returns:
          An op grouping the assignments to ``self._mu_var`` and
          ``self._lr_var``.
        """
        assign_hyper_ops = []
        self._mu = tf.identity(
            tf.cond(self._do_tune, lambda: self.get_mu_tensor(),
                    lambda: self._mu_var))
        # The learning rate is computed only after the momentum.
        with tf.control_dependencies([self._mu]):
            self._lr = tf.identity(
                tf.cond(self._do_tune, lambda: self.get_lr_tensor(),
                        lambda: self._lr_var))

        with tf.control_dependencies([self._mu, self._lr]):
            if self._use_unsmoothed_lr_mu:
                assign_hyper_ops.append(tf.assign(self._mu_var, self._mu))
                assign_hyper_ops.append(tf.assign(self._lr_var, self._lr))
            else:
                # Blend: beta * stored value + (1 - beta) * new value.
                self._mu = self._beta * self._mu_var + (1 -
                                                        self._beta) * self._mu
                self._lr = self._beta * self._lr_var + (1 -
                                                        self._beta) * self._lr
                with tf.control_dependencies([self._mu, self._lr]):
                    assign_hyper_ops.append(tf.assign(self._mu_var, self._mu))
                    assign_hyper_ops.append(tf.assign(self._lr_var, self._lr))
        assign_hyper_op = tf.group(*assign_hyper_ops)
        return assign_hyper_op
Esempio n. 22
0
def accuracy(logits, labels):
    """Mean accuracy of the argmax predictions.

    params:
      logits: [B, ..., D]
      labels: [B, ...]
    return:
      accuracy tensor (float32 mean of the per-element matches)
    """
    with tf.name_scope('accuracy'):
        # logits must have exactly one more (trailing) axis than labels, and
        # agree with labels on every leading axis.
        rank_check = tf.assert_equal(tf.rank(logits), tf.rank(labels) + 1)
        shape_check = tf.assert_equal(tf.shape(logits)[:-1], tf.shape(labels))
        with tf.control_dependencies([rank_check, shape_check]):
            predicted = tf.argmax(logits, axis=-1, output_type=tf.int64)
            expected = tf.cast(labels, tf.int64)
            hits = tf.equal(predicted, expected)
            return tf.reduce_mean(tf.cast(hits, dtype=tf.float32))
Esempio n. 23
0
 def call(self, wavfile):
     """
     Read a wav file and return its audio data and sample rate.

     :param wavfile: filepath of wav
     :return: 2 values. The first is a Tensor of audio data with its last axis
         squeezed away. The second is the sample rate of the input wav file,
         cast to a float tensor.
     """
     cfg = self.config
     raw_bytes = tf.io.read_file(wavfile)
     decoded = tf.audio.decode_wav(raw_bytes,
                                   desired_channels=cfg.audio_channels)
     audio_data, sample_rate = decoded

     # The decoded rate must match the configured one.
     expected_rate = tf.constant(cfg.sample_rate)
     decoded_rate = tf.cast(sample_rate, dtype=float)
     rate_check = tf.assert_equal(expected_rate, decoded_rate)
     with tf.control_dependencies([rate_check]):
         samples = tf.squeeze(audio_data, axis=-1)
         return samples, tf.cast(sample_rate, dtype=float)
Esempio n. 24
0
 def grad_sparsity(self):
     """Build the op tracking the fraction of non-zero gradient entries.

     The per-step ratio is stored in ``self._sparsity`` and its moving
     average in ``self._sparsity_avg``.

     Returns:
       The moving-average update op.
     """
     # A sparse minibatch gradient with 10 percent of its entries non-zero
     # has sparsity 0.1. The norm of the dense gradient averaged over the
     # full dataset is roughly the minibatch sparse gradient norm times
     # sqrt(sparsity). A possible extension is to correct only the sparse
     # blob.
     grads = self._grads
     nonzero_counts = [tf.count_nonzero(grad) for grad in grads]
     nonzero_total = tf.add_n(nonzero_counts)
     entry_counts = [tf.size(grad) for grad in grads]
     entry_total = tf.add_n(entry_counts)

     # Both counts are cast to the dtype of the first gradient.
     ratio_dtype = grads[0].dtype
     numerator = tf.cast(nonzero_total, ratio_dtype)
     denominator = tf.cast(entry_total, ratio_dtype)
     self._sparsity = numerator / denominator

     avg_op = self._moving_averager.apply([self._sparsity])
     with tf.control_dependencies([avg_op]):
         self._sparsity_avg = self._moving_averager.average(self._sparsity)
     return avg_op
Esempio n. 25
0
def ctc_lambda_loss(logits, labels, input_length, label_length, blank_index=0):
  '''
  ctc loss function
  param: logits, (B, T, D)
  param: input_length,  (B,) or (B, 1), input length of encoder
  param: labels, (B, T)
  param: label_length,  (B,) or (B, 1), label length for convert dense label to sparse
  param: blank_index, forwarded to ctc_data_transform
  returns: loss tensor as produced by tf.nn.ctc_loss
  '''
  # Flatten the lengths to rank 1. A bare tf.squeeze would also remove the
  # batch axis when B == 1, producing a scalar that fails the rank checks
  # below.
  ilen = tf.cast(tf.reshape(input_length, [-1]), tf.int32)
  olen = tf.cast(tf.reshape(label_length, [-1]), tf.int32)

  deps = [
      tf.assert_rank(labels, 2, name='label_rank_check'),
      tf.assert_rank(logits, 3, name='logits_rank_check'),
      tf.assert_rank(ilen, 1, name='src_len_rank_check'),  # input_length
      tf.assert_rank(olen, 1, name='tgt_len_rank_check'),  # output_length
  ]

  # The rank checks above are built on the untransformed labels and logits.
  labels, logits = ctc_data_transform(labels, logits, blank_index)

  with tf.control_dependencies(deps):
    # blank index is consistent with Espnet, zero
    batch_loss = tf.nn.ctc_loss(
        labels=labels,
        inputs=logits,
        sequence_length=ilen,
        time_major=False,
        preprocess_collapse_repeated=False,
        ctc_merge_repeated=True,
        ignore_longer_outputs_than_inputs=False)
  return batch_loss
Esempio n. 26
0
  def get_train_op(self, loss, multitask, global_step=None):
    """Build the training op, optionally rewriting the graph for quantization.

    The returned op groups everything in the UPDATE_OPS collection and is
    ordered after the op that applies the gradients.
    """
    # Quantization-aware training, switched on from the solver config.
    quant_cfg = self.config['solver']['quantization']
    if quant_cfg['enable']:
      quant_delay = quant_cfg['quant_delay']
      logging.info('Quantization training with {} delay'.format(quant_delay))
      tf.contrib.quantize.create_training_graph(quant_delay=quant_delay)

    grads_applied = self.get_apply_gradients_op(loss, multitask, global_step)

    # model average
    self.var_avg(global_step)

    # Collect the update ops only once the gradients have been applied.
    with tf.control_dependencies([grads_applied]):
      pending_updates = tf.get_collection(tf.GraphKeys.UPDATE_OPS)
      train_op = tf.group(*pending_updates)

    moving_vars = tf.moving_average_variables()
    utils.log_vars('moving vars', moving_vars)
    return train_op
Esempio n. 27
0
  def call(self, filename, audio_data, sample_rate=None):
    """
    Write wav using audio_data[tensor].

    :param filename: filepath of wav.
    :param audio_data: a tensor containing data of a wav.
    :param sample_rate: [option]the samplerate of the signal we working with. When None, the
        configured sample rate is used.
    :return: write wav operation.
    """
    p = self.config
    filename = tf.constant(filename)

    # Identity check: `==` would go through the tensor's overloaded equality
    # operator when a tensor is supplied.
    if sample_rate is None:
      sample_rate = tf.constant(p.sample_rate, dtype=tf.int32)

    # The supplied rate must match the configured one.
    assert_op = tf.assert_equal(
        tf.constant(p.sample_rate), tf.cast(sample_rate, dtype=tf.int32))
    with tf.control_dependencies([assert_op]):
      audio_data = tf.cast(audio_data, dtype=tf.float32)
      # An axis is inserted at position 1 before encoding (presumably the
      # channel axis expected by encode_wav -- confirm).
      contents = tf.audio.encode_wav(
          tf.expand_dims(audio_data, 1), tf.cast(sample_rate, dtype=tf.int32))
      w = tf.io.write_file(filename, contents)

    return w