Code example #1
  def call(self, inputs, training=None, mask=None):  # pylint: disable=too-many-locals
    input_x = tf.identity(inputs["input_x"], name='input_x')
    if self.use_dense_task:
      dense_input = inputs["input_dense"]
    if self.use_true_length:
      # [batch_size, max_doc_len, max_sen_len]
      input_hx = self.pad_to_hier_input_true_len(
          input_x,
          self.max_doc_len,
          self.max_sen_len,
          self.split_token,
          padding_token=self.padding_token)
    else:
      # [batch_size, max_doc_len, max_sen_len]
      input_hx = self.pad_to_hier_input(
          input_x,
          self.max_doc_len,
          self.max_sen_len,
          padding_token=self.padding_token)

    # [batch_size, max_doc_len]
    sen_lens = compute_sen_lens(input_hx, padding_token=self.padding_token)
    # [batch_size]
    doc_lens = compute_doc_lens(sen_lens)
    # [batch_size, max_doc_len, max_sen_len, 1]
    sen_mask = tf.expand_dims(
        tf.sequence_mask(sen_lens, self.max_sen_len, dtype=tf.float32), axis=-1)

    # [batch_size, max_doc_len, 1]
    doc_mask = tf.expand_dims(
        tf.sequence_mask(doc_lens, self.max_doc_len, dtype=tf.float32), axis=-1)

    # [batch_size, max_doc_len, max_sen_len, embed_len]
    out = self.embed(input_hx)
    if self.use_pretrained_model:
      input_px = self.get_pre_train_graph(input_x)
      input_px = tf.reshape(
          input_px,
          [-1, self.max_doc_len, self.max_sen_len, self.pretrained_model_dim])
      out = tf.concat([out, input_px], axis=-1)
    out = self.embed_d(out, training=training)
    all_sen_encoder = tf.keras.layers.TimeDistributed(self.sen_encoder)
    # [batch_size, max_doc_len, features]
    out = all_sen_encoder(out, training=training, mask=sen_mask)
    # [batch_size, features]
    out = self.doc_encoder(out, training=training, mask=doc_mask)

    if self.use_dense_input:
      dense_out = self.dense_input_linear(dense_input)
      if self.only_dense_input:
        out = dense_out
      else:
        out = tf.keras.layers.Concatenate()([out, dense_out])

    # [batch_size, class_num]
    scores = self.final_dense(out)

    return scores
Code example #2
        def grow_topk(i, alive_seq, alive_log_probs, states):
            """Inner beam search loop."""

            flat_ids = tf.reshape(alive_seq, [batch_size * beam_size, -1])

            # (batch_size * beam_size, decoded_length)
            if states:
                flat_states = nest.map_structure(_merge_beam_dim, states)
                flat_logits, flat_states = symbols_to_logits_fn(
                    flat_ids, i, flat_states)
                states = nest.map_structure(
                    lambda t: _unmerge_beam_dim(t, batch_size, beam_size),
                    flat_states)
            else:
                flat_logits = symbols_to_logits_fn(flat_ids)

            logits = tf.reshape(flat_logits, [batch_size, beam_size, -1])

            candidate_log_probs = log_prob_from_logits(logits)

            log_probs = candidate_log_probs + tf.expand_dims(alive_log_probs,
                                                             axis=2)

            length_penalty = tf.pow(((5. + tf.to_float(i + 1)) / 6.), alpha)

            curr_scores = log_probs / length_penalty
            flat_curr_scores = tf.reshape(curr_scores,
                                          [-1, beam_size * vocab_size])

            topk_scores, topk_ids = tf.nn.top_k(flat_curr_scores,
                                                k=beam_size * 2)

            topk_log_probs = topk_scores * length_penalty

            topk_beam_index = topk_ids // vocab_size
            topk_ids %= vocab_size  # Unflatten the ids
            batch_pos = compute_batch_indices(batch_size, beam_size * 2)
            topk_coordinates = tf.stack([batch_pos, topk_beam_index], axis=2)

            topk_seq = tf.gather_nd(alive_seq, topk_coordinates)
            if states:
                states = nest.map_structure(
                    lambda state: tf.gather_nd(state, topk_coordinates),
                    states)
            topk_seq = tf.concat(
                [topk_seq, tf.expand_dims(topk_ids, axis=2)], axis=2)

            topk_finished = tf.equal(topk_ids, eos_id)

            return topk_seq, topk_log_probs, topk_scores, topk_finished, states
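
The scoring step above divides accumulated log probabilities by a length penalty that looks like the GNMT normalization, lp(i) = ((5 + i + 1) / 6)^alpha. A minimal sketch of just that term (plain Python; the function name is illustrative, not from the source):

def length_penalty(step, alpha):
    # GNMT-style normalization: ((5 + length) / 6) ** alpha, with length = step + 1
    return ((5.0 + float(step + 1)) / 6.0) ** alpha

print(length_penalty(9, 0.6))  # ~1.73 for a 10-token hypothesis

Dividing by this term keeps long hypotheses competitive with short ones when selecting the top 2*beam_size candidates.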
Code example #3
File: sub_tf.py Project: zhjou/delta
    def get_pos_embedding_matrix(max_len, embed_dim, use_const, name):
        """
    generate position embedding matrix, two optional types:
    constant(untrainable) and trainable.
    Args:
      max_len, embed_dim, use_const

    Return:
      pos_embed: [max_len, embed_dim]
    """
        # First part of the PE function: sin and cos argument
        if use_const:
            pos_embed = np.array([[
                pos / np.power(10000, (i - i % 2) / embed_dim)
                for i in range(embed_dim)
            ] for pos in range(max_len)])

            # Second part, apply the cosine to even columns and sin to odds.
            pos_embed[:, 0::2] = np.sin(pos_embed[:, 0::2])  # dim 2i
            pos_embed[:, 1::2] = np.cos(pos_embed[:, 1::2])  # dim 2i+1
            pos_embed = pos_embed[np.newaxis, ...]
            pos_embed = tf.cast(pos_embed, dtype=tf.float32)
        else:
            pos_embed = tf.get_variable(
                name=name,
                shape=[max_len, embed_dim],
                initializer=tf.random_uniform_initializer(-0.1, 0.1))
            pos_embed = tf.expand_dims(pos_embed, 0)

        return pos_embed
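
The constant branch above matches the standard sinusoidal position encoding, PE(pos, 2i) = sin(pos / 10000^(2i/d)) and PE(pos, 2i+1) = cos(pos / 10000^(2i/d)). A NumPy-only sketch of the same table (the function name is illustrative):

import numpy as np

def sinusoid_table(max_len, embed_dim):
    pos = np.arange(max_len)[:, None]                       # [max_len, 1]
    i = np.arange(embed_dim)[None, :]                       # [1, embed_dim]
    # (i - i % 2) pairs each sin/cos column with the same frequency, as above
    angle = pos / np.power(10000, (i - i % 2) / embed_dim)
    table = np.where(i % 2 == 0, np.sin(angle), np.cos(angle))
    return table[np.newaxis, ...]                           # [1, max_len, embed_dim]

print(sinusoid_table(4, 6).shape)  # (1, 4, 6)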
Code example #4
    def call(self, inputs, training=None, mask=None):
        batch_size = tf.shape(inputs)[0]
        W_3d = tf.tile(tf.expand_dims(self.W, axis=0),
                       tf.stack([batch_size, 1, 1]))
        # [batch_size, steps, features]
        input_projection = tf.matmul(inputs, W_3d)

        if self.use_bias:
            input_projection += self.b

        input_projection = tf.tanh(input_projection)

        # [batch_size, steps, 1]
        similaritys = tf.reduce_sum(tf.multiply(input_projection,
                                                self.attention_context_vector),
                                    axis=2,
                                    keepdims=True)

        # [batch_size, steps, 1]
        if mask is not None:
            attention_weights = masked_softmax(similaritys, mask, axis=1)
        else:
            attention_weights = tf.nn.softmax(similaritys, axis=1)

        # [batch_size, features]
        attention_output = tf.reduce_sum(tf.multiply(inputs,
                                                     attention_weights),
                                         axis=1)
        return attention_output
Code example #5
  def call(self, inputs, training=None, mask=None):
    input_x = inputs["input_x"]
    if self.use_dense_task:
      dense_input = inputs["input_dense"]

    # [batch_size]
    lens = self.compute_lens(input_x, self.max_len)

    # [batch_size, max_len, 1]
    mask = tf.expand_dims(
        tf.sequence_mask(lens, self.max_len, dtype=tf.float32), axis=-1)

    # [batch_size, max_len, embed_len]
    out = self.embed(input_x)
    out = self.embed_d(out, training=training)
    # [batch_size, features]
    out = self.encoder(out, training=training, mask=mask)
    if self.use_dense_input:
      dense_out = self.dense_input_linear(dense_input)
      if self.only_dense_input:
        out = dense_out
      else:
        out = tf.keras.layers.Concatenate()([out, dense_out])
    # [batch_size, class_num]
    scores = self.final_dense(out)
    return scores
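
The mask construction used here (lengths -> tf.sequence_mask -> tf.expand_dims) is easy to verify in isolation; a runnable sketch, assuming TensorFlow 2.x eager mode:

import tensorflow as tf

lens = tf.constant([2, 4])  # per-example valid lengths
mask = tf.expand_dims(
    tf.sequence_mask(lens, maxlen=4, dtype=tf.float32), axis=-1)
print(mask.shape)     # (2, 4, 1): broadcastable against [batch, max_len, features]
print(mask[0, :, 0])  # [1. 1. 0. 0.]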
Code example #6
def _expand_to_beam_size(tensor, beam_size):
    """Tiles a given tensor by beam_size."""
    tensor = tf.expand_dims(tensor, axis=1)
    tile_dims = [1] * tensor.shape.ndims
    tile_dims[1] = beam_size

    return tf.tile(tensor, tile_dims)
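
A quick usage sketch (TensorFlow 2.x eager assumed, with `_expand_to_beam_size` as defined above):

import tensorflow as tf

t = tf.zeros([4, 7])                        # (batch_size, seq_len)
tiled = _expand_to_beam_size(t, beam_size=3)
print(tiled.shape)                          # (4, 3, 7): new beam axis, tiled beam_size times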
Code example #7
    def call(self, audio_data, sample_rate=None):
        """
    Caculate fbank features of audio data.
    :param audio_data: the audio signal from which to compute spectrum. Should be an (1, N) tensor.
    :param sample_rate: [option]the samplerate of the signal we working with, default is 16kHz.
    :return: A float tensor of size (num_channels, num_frames, num_frequencies) containing
            fbank features of every frame in speech.
    """
        p = self.config
        with tf.name_scope('fbank'):

            if sample_rate is None:
                sample_rate = tf.constant(p.sample_rate, dtype=tf.int32)

            if p.upper_frequency_limit <= 0:
                p.upper_frequency_limit = p.sample_rate / 2.0 + p.upper_frequency_limit
            elif (p.upper_frequency_limit <= p.lower_frequency_limit) or (
                    p.upper_frequency_limit > p.sample_rate / 2.0):
                p.upper_frequency_limit = p.sample_rate / 2.0

            assert_op = tf.assert_equal(tf.constant(p.sample_rate),
                                        tf.cast(sample_rate, dtype=tf.int32))
            with tf.control_dependencies([assert_op]):

                spectrum = self.spect(audio_data, sample_rate)
                spectrum = tf.expand_dims(spectrum, 0)

                fbank = py_x_ops.fbank(
                    spectrum,
                    sample_rate,
                    upper_frequency_limit=p.upper_frequency_limit,
                    lower_frequency_limit=p.lower_frequency_limit,
                    filterbank_channel_count=p.filterbank_channel_count)

                return fbank
Code example #8
def _create_topk_unique(inputs, k):
    """Creates the top k values in sorted order with indices."""
    height = inputs.shape[0]
    width = inputs.shape[1]
    neg_inf_r0 = tf.constant(-np.inf, dtype=tf.float32)
    ones = tf.ones([height, width], dtype=tf.float32)
    neg_inf_r2 = ones * neg_inf_r0
    inputs = tf.where(tf.is_nan(inputs), neg_inf_r2, inputs)

    tmp = inputs
    topk_r2 = tf.zeros([height, k], dtype=tf.float32)
    for i in range(k):
        kth_order_statistic = tf.reduce_max(tmp, axis=1, keepdims=True)
        k_mask = tf.tile(
            tf.expand_dims(tf.equal(tf.range(k), tf.fill([k], i)), 0),
            [height, 1])
        topk_r2 = tf.where(k_mask, tf.tile(kth_order_statistic, [1, k]),
                           topk_r2)
        ge_r2 = tf.greater_equal(inputs,
                                 tf.tile(kth_order_statistic, [1, width]))
        tmp = tf.where(ge_r2, neg_inf_r2, inputs)

    log2_ceiling = int(math.ceil(math.log(float(int(width)), 2)))
    next_power_of_two = 1 << log2_ceiling
    count_mask = next_power_of_two - 1
    mask_r0 = tf.constant(count_mask)
    mask_r2 = tf.fill([height, k], mask_r0)
    topk_r2_s32 = tf.bitcast(topk_r2, tf.int32)
    topk_indices_r2 = tf.bitwise.bitwise_and(topk_r2_s32, mask_r2)
    return topk_r2, topk_indices_r2
Code example #9
    def call(self, audio_data, sample_rate=None):
        """
    Caculate mfcc features of audio data.
    :param audio_data: the audio signal from which to compute spectrum. Should be an (1, N) tensor.
    :param sample_rate: [option]the samplerate of the signal we working with, default is 16kHz.
    :return: A float tensor of size (num_channels, num_frames, num_frequencies) containing
            mfcc features of every frame in speech.
    """
        p = self.config
        with tf.name_scope('mfcc'):

            if sample_rate is None:
                sample_rate = tf.constant(p.sample_rate, dtype=tf.int32)

            assert_op = tf.assert_equal(tf.constant(p.sample_rate),
                                        tf.cast(sample_rate, dtype=tf.int32))
            with tf.control_dependencies([assert_op]):

                spectrum_feats = self.spect(audio_data, sample_rate)
                spectrum_feats = tf.expand_dims(spectrum_feats, 0)
                fbank_feats = self.fbank(audio_data, sample_rate)
                mfcc = py_x_ops.mfcc(fbank_feats,
                                     spectrum_feats,
                                     sample_rate,
                                     use_energy=p.use_energy,
                                     cepstral_lifter=p.cepstral_lifter,
                                     coefficient_count=p.coefficient_count)
                return mfcc
Code example #10
def attention(inputs, attention_size, time_major=False, return_alphas=False):
    """Attention layer."""
    if isinstance(inputs, tuple):
        # In case of Bi-RNN, concatenate the forward and the backward RNN outputs.
        inputs = tf.concat(inputs, 2)

    if time_major:
        # (T,B,D) => (B,T,D)
        inputs = tf.transpose(inputs, [1, 0, 2])

    time_size = inputs.shape[1].value  # T value - time size of the RNN layer
    hidden_size = inputs.shape[
        2].value  # D value - hidden size of the RNN layer

    # Trainable parameters
    W_omega = tf.get_variable(name='W_omega',
                              initializer=tf.random_normal(
                                  [hidden_size, attention_size], stddev=0.1))
    b_omega = tf.get_variable(name='b_omega',
                              initializer=tf.random_normal([attention_size],
                                                           stddev=0.1))
    u_omega = tf.get_variable(name='u_omega',
                              initializer=tf.random_normal([attention_size, 1],
                                                           stddev=0.1))

    # Applying fully connected layer with non-linear activation to each of the B*T timestamps;
    #  the shape of `v` is (B,T,D)*(D,A)=(B,T,A), where A=attention_size
    #v = tf.tanh(tf.tensordot(inputs, W_omega, axes=1) + b_omega)
    #v = tf.sigmoid(tf.tensordot(inputs, W_omega, axes=1) + b_omega)
    # (B, T, D) dot (D, Atten)

    logging.info('attention inputs: {}'.format(inputs.shape))
    inputs_reshaped = tf.reshape(inputs, [-1, hidden_size])
    dot = tf.matmul(inputs_reshaped, W_omega)
    dot = tf.reshape(dot, [-1, time_size, attention_size])
    v = tf.sigmoid(dot + b_omega)
    logging.info(f'attention vector: {v.shape}')
    # For each of the timestamps its vector of size A from `v` is reduced with `u` vector
    # (B, T, Atten) dot (Atten)
    #vu = tf.tensordot(v, u_omega, axes=1)   # (B,T) shape
    v = tf.reshape(v, [-1, attention_size])
    vu = tf.matmul(v, u_omega)  # (B,T) shape
    vu = tf.squeeze(vu, axis=-1)
    vu = tf.reshape(vu, [-1, time_size])
    logging.info(f'attention energy: {vu.shape}')
    alphas = tf.nn.softmax(vu)  # (B,T) shape also

    # Output of (Bi-)RNN is reduced with attention vector; the result has (B,D) shape
    # [batch, time] -> [batch, time, 1]
    alphas = tf.expand_dims(alphas, -1)
    # [batch, time, dim] -> [batch, dim]
    output = tf.reduce_sum(inputs * alphas, 1)

    if not return_alphas:
        return output

    return output, alphas
Code example #11
    def curvature_range(self):
        # set up the curvature window
        self._curv_win = tf.Variable(np.zeros([
            self._curv_win_width,
        ]),
                                     dtype=tf.float32,
                                     name="curv_win",
                                     trainable=False)
        # we can use log smoothing for curvature range to follow trend faster
        # self._curv_win = tf.scatter_update(
        #   self._curv_win, self._global_step % self._curv_win_width,
        #   tf.log(self._grad_norm_squared + EPS))
        self._curv_win = tf.scatter_update(
            self._curv_win, self._global_step % self._curv_win_width,
            self._grad_norm_squared + EPS)
        # note here the iterations start from iteration 0
        valid_window = tf.slice(
            self._curv_win, tf.constant([
                0,
            ]),
            tf.expand_dims(tf.minimum(tf.constant(self._curv_win_width),
                                      self._global_step + 1),
                           axis=0))

        if self._h_min_log_smooth:
            self._h_min_t = tf.log(tf.reduce_min(valid_window) + EPS)
        else:
            self._h_min_t = tf.reduce_min(valid_window)
        if self._h_max_log_smooth:
            self._h_max_t = tf.log(tf.reduce_max(valid_window) + EPS)
        else:
            self._h_max_t = tf.reduce_max(valid_window)

        curv_range_ops = []
        with tf.control_dependencies([self._h_min_t, self._h_max_t]):
            avg_op = self._moving_averager.apply(
                [self._h_min_t, self._h_max_t])
            with tf.control_dependencies([avg_op]):
                if self._h_min_log_smooth:
                    self._h_min = tf.exp(
                        tf.identity(
                            self._moving_averager.average(self._h_min_t)))
                else:
                    self._h_min = \
                      tf.identity(self._moving_averager.average(self._h_min_t))
                if self._h_max_log_smooth:
                    self._h_max = tf.exp(
                        tf.identity(
                            self._moving_averager.average(self._h_max_t)))
                else:
                    self._h_max = \
                      tf.identity(self._moving_averager.average(self._h_max_t))
            if self._sparsity_debias:
                self._h_min = self._h_min * self._sparsity_avg
                self._h_max = self._h_max * self._sparsity_avg
        curv_range_ops.append(avg_op)
        return curv_range_ops
Code example #12
File: utils.py Project: zhjou/delta
def get_expand_pad_mask(inputs, pad_idx):
    """
  get padding mask from the input token idx
  inputs: [batch_size, time_steps]
  mask: [batch_size, time_steps, 1]
  """
    pad_mask = tf.cast(tf.math.greater(inputs, pad_idx), tf.float32)
    pad_mask = tf.expand_dims(pad_mask, -1)
    return pad_mask
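
A usage sketch, assuming pad_idx=0 so any token id greater than 0 counts as real input (TensorFlow 2.x eager assumed):

import tensorflow as tf

ids = tf.constant([[5, 3, 0, 0],
                   [2, 0, 0, 0]])           # [batch_size, time_steps]
mask = get_expand_pad_mask(ids, pad_idx=0)
print(tf.squeeze(mask, -1))                 # [[1. 1. 0. 0.]
                                            #  [1. 0. 0. 0.]]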
Code example #13
def embedding_look_up(text_inputs, vocab_size, embedding_size):
  """Embedding layer."""
  with tf.variable_scope("embedding"):
    W = tf.get_variable(
        name='W',
        initializer=tf.random_uniform([vocab_size, embedding_size], -1.0, 1.0))
    embedding_chars = tf.nn.embedding_lookup(W, text_inputs)
    embedding_chars_expanded = tf.expand_dims(embedding_chars, -1)
  return embedding_chars_expanded
Code example #14
    def call(self, tensors):
        """Attention layer."""
        left, right = tensors

        len_left = left.shape[1]
        len_right = right.shape[1]
        tensor_left = tf.expand_dims(left, axis=2)
        tensor_right = tf.expand_dims(right, axis=1)
        tensor_left = tf.tile(tensor_left, [1, 1, len_right, 1])
        tensor_right = tf.tile(tensor_right, [1, len_left, 1, 1])
        tensor_merged = tf.concat([tensor_left, tensor_right], axis=-1)
        middle_output = self.middle_layer(tensor_merged)
        attn_scores = self.attn(middle_output)
        attn_scores = tf.squeeze(attn_scores, axis=3)
        exp_attn_scores = tf.exp(
            attn_scores - tf.reduce_max(attn_scores, axis=-1, keepdims=True))
        exp_sum = tf.reduce_sum(exp_attn_scores, axis=-1, keepdims=True)
        attention_weights = exp_attn_scores / exp_sum
        return tf.matmul(attention_weights, right)
Code example #15
  def call(self, inputs: list, **kwargs) -> typing.Any:
    """
        The computation logic of DynamicPoolingLayer.
        :param inputs: two input tensors.
        """
    self._validate_dpool_size()
    x, dpool_index = inputs
    dpool_shape = tf.shape(dpool_index)
    batch_index_one = tf.expand_dims(
        tf.expand_dims(tf.range(dpool_shape[0]), axis=-1), axis=-1)
    batch_index = tf.expand_dims(
        tf.tile(batch_index_one, [1, self._msize1, self._msize2]), axis=-1)
    dpool_index_ex = tf.concat([batch_index, dpool_index], axis=3)
    x_expand = tf.gather_nd(x, dpool_index_ex)
    stride1 = self._msize1 // self._psize1
    stride2 = self._msize2 // self._psize2

    x_pool = tf.nn.max_pool(x_expand, [1, stride1, stride2, 1],
                            [1, stride1, stride2, 1], "VALID")
    return x_pool
Code example #16
File: speech_cls_task.py Project: youisbaby/delta
    def generate_cmvn(self, filelist=None, dry_run=False):
        del filelist
        assert self._stride == 1.0
        batch_size = self.config['solver']['optimizer']['batch_size']
        features, labels = self.input_fn(
            utils.INFER, batch_size,
            num_epoch=1)().make_one_shot_iterator().get_next()
        del labels

        suffix = self.taskconf['suffix']
        if suffix == '.npy':
            logging.info('generate cmvn from numpy')
            feature = features['inputs']
        else:
            logging.info('generate cmvn from wav')
            # tf extractor graph
            params = feat_lib.speech_ops.speech_params(
                sr=self.taskconf['audio']['sr'],
                bins=self.taskconf['audio']['feature_size'],
                add_delta_deltas=self.taskconf['audio']['add_delta_deltas'],
                audio_frame_length=self.taskconf['audio']['winlen'],
                audio_frame_step=self.taskconf['audio']['winstep'])

            #[batch, Time] -> [batch, time, audio_channel]
            waveforms = tf.expand_dims(features['inputs'], axis=-1)
            #[batch, Time, feat_size, channles]
            feature = feat_lib.speech_ops.batch_extract_feature(
                waveforms, params)

        # create stats vars
        sums, square, count = utils.create_cmvn_statis(
            self.taskconf['audio']['feature_size'],
            self.taskconf['audio']['add_delta_deltas'])
        try:
            with tf.Session() as sess:
                while True:
                    feat_np = sess.run(feature)
                    # update stats
                    sums, square, count = utils.update_cmvn_statis(feat_np,
                                                                   sums,
                                                                   square,
                                                                   count,
                                                                   axis=(0, 1))
        except tf.errors.OutOfRangeError:
            pass

        # compute cmvn
        mean, var = utils.compute_cmvn(sums, square, count)
        logging.info('mean:{}'.format(mean))
        logging.info('var:{}'.format(var))
        if not dry_run:
            np.save(self._cmvn_path, (mean, var))
        logging.info('save cmvn:{}'.format(self._cmvn_path))
        logging.info('generate cmvn done')
Code example #17
def _freq_feat_graph(feat_name, **kwargs):
  winlen = kwargs.get('winlen')
  winstep = kwargs.get('winstep')
  feature_size = kwargs.get('feature_size')
  sr = kwargs.get('sr')  #pylint: disable=invalid-name
  nfft = kwargs.get('nfft')
  del nfft

  assert feat_name in ('fbank', 'spec')

  params = speech_ops.speech_params(
      sr=sr,
      bins=feature_size,
      add_delta_deltas=False,
      audio_frame_length=winlen,
      audio_frame_step=winstep)

  graph = None
  if feat_name == 'fbank':
    # get session
    if feat_name not in _global_sess:
      graph = tf.Graph()
      #pylint: disable=not-context-manager
      with graph.as_default():
        # fbank
        filepath = tf.placeholder(dtype=tf.string, shape=[], name='wavpath')
        waveforms, sample_rate = speech_ops.read_wav(filepath, params)
        del sample_rate
        fbank = speech_ops.extract_feature(waveforms, params)
        # shape must be [T, D, C]
        feat = tf.identity(fbank, name=feat_name)
  elif feat_name == 'spec':
    # magnitude spec
    if feat_name not in _global_sess:
      graph = tf.Graph()
      #pylint: disable=not-context-manager
      with graph.as_default():
        filepath = tf.placeholder(dtype=tf.string, shape=[], name='wavpath')
        waveforms, sample_rate = speech_ops.read_wav(filepath, params)

        spec = py_x_ops.spectrum(
            waveforms[:, 0],
            tf.cast(sample_rate, tf.dtypes.float32),
            output_type=1)  #output_type: 1, power spec; 2 log power spec
        spec = tf.sqrt(spec)
        # shape must be [T, D, C]
        spec = tf.expand_dims(spec, -1)
        feat = tf.identity(spec, name=feat_name)
  else:
    raise ValueError(f"Not support freq feat: {feat_name}.")

  return graph, (_get_out_tensor_name('wavpath',
                                      0), _get_out_tensor_name(feat_name, 0))
Code example #18
    def _reshape_mask(mask):
        """
        Repeat the mask for multi-head attention.
        Input shape: (batch_size, steps)
        Output shape: (batch_size * head_num, steps)
        """
        if mask is None:
            return None
        seq_len = tf.shape(mask)[1]
        mask = tf.expand_dims(mask, axis=1)
        mask = tf.tile(mask, [1, self.head_num, 1])
        return tf.reshape(mask, shape=(-1, seq_len))
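
The same reshape can be checked standalone; a sketch with head_num fixed to 4 (illustrative value, TensorFlow 2.x eager assumed):

import tensorflow as tf

head_num = 4
mask = tf.ones([2, 6])                      # (batch_size, steps)
out = tf.reshape(
    tf.tile(tf.expand_dims(mask, axis=1), [1, head_num, 1]),
    [-1, tf.shape(mask)[1]])
print(out.shape)                            # (8, 6): heads folded into the batch axis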
Code example #19
File: transformer.py Project: lizhanyang505/delta-1
    def call(self, inps, training=None, mask=None):
        if not self.is_infer:
            dec_inp, enc_out = inps
            with tf.name_scope('while'):
                dec_out = self.decode(dec_inp, enc_out, training, mask)
                scores = self.final_dense(dec_out)
                return scores
        else:
            enc_out = inps
            init_ids = tf.cast(
                tf.ones([utils.shape_list(enc_out)[0]]) * self.sos_id,
                tf.int32)
            # Beam Search
            enc_shape = utils.shape_list(enc_out)
            enc_out = tf.tile(tf.expand_dims(enc_out, axis=1),
                              [1, self.beam_size, 1, 1])
            enc_out = tf.reshape(
                enc_out,
                [enc_shape[0] * self.beam_size, enc_shape[1], enc_shape[2]])
            enc_mask = tf.tile(tf.expand_dims(mask, axis=1),
                               [1, self.beam_size, 1, 1, 1])
            enc_mask = tf.reshape(enc_mask,
                                  [enc_shape[0] * self.beam_size, 1, 1, -1])

            def symbols_to_logits_fn(dec_inps):
                dec_out = self.decode(dec_inps, enc_out, training, enc_mask)
                scores = self.final_dense(dec_out)
                return scores[:, -1, :]

            decoded_ids, scores, _ = self.beam_search(symbols_to_logits_fn,
                                                      init_ids, self.beam_size,
                                                      self.max_dec_len,
                                                      self.vocab_size,
                                                      self.length_penalty,
                                                      self.eos_id)
            decoded_ids = decoded_ids[:, 0, 1:]

            return decoded_ids
Code example #20
 def pooling_layer(self, x, time_len):
     ''' pooling layer'''
     with tf.variable_scope('time_pooling'):
         if self.attention:
             x, self.alphas = common_layers.attention(
                 x, self.netconf['attention_size'], return_alphas=True)
             #alphas shape [batch, time, 1] -> [1, batch, time, 1]-> [1, time, batch, 1]
             tf.summary.image(
                 'alignment',
                 tf.transpose(tf.expand_dims(self.alphas, 0), [0, 2, 1, 3]))
         else:
             if self.netconf['use_lstm_layer']:
                 x = tf.concat(x, 2)
             # [batch, seq_len, dim, 1]
             x = tf.expand_dims(x, axis=-1)
             seq_len = time_len
             x = common_layers.max_pool(x,
                                        ksize=[seq_len, 1],
                                        strides=[seq_len, 1])
             if self.netconf['use_lstm_layer']:
                 x = tf.reshape(x, [-1, 2 * self.netconf['cell_num']])
             else:
                 x = tf.reshape(x, [-1, self.netconf['linear_num']])
         return x
Code example #21
def _make_example(uttids, feats, ilens, targets, olens):
    features = {
        'uttids':
        uttids,
        'inputs':
        tf.expand_dims(feats, axis=-1) if not isinstance(feats, np.ndarray)
        else np.expand_dims(feats, axis=-1),
        'input_length':
        ilens,
        'targets':
        targets,
        'target_length':
        olens
    }
    labels = {
        'ctc':
        tf.ones(tf.shape(feats)[0])
        if not isinstance(feats, np.ndarray) else np.ones(feats.shape[0])
    }  # dummy data for dummy loss function
    return features, labels
Code example #22
 def call(self, inputs, training=None, mask=None):
   input_x = tf.identity(inputs["input_x"], name="input_x")
   if self.use_dense_task:
     dense_input = inputs["input_dense"]
   embed = self.embed(input_x)
   embed_expand = tf.expand_dims(embed, axis=-1)
   conv_outs = [conv2d(embed_expand) for conv2d in self.conv2ds]
   pool_outs = [pool(co) for co, pool in zip(conv_outs, self.pools)]
   out = tf.keras.layers.Concatenate(axis=1)(pool_outs)
   out = self.flat(out)
   out = self.dropout(out, training=training)
   out = self.dense(out)
   if self.use_dense_input:
     dense_out = self.dense_input_linear(dense_input)
     if self.only_dense_input:
       out = dense_out
     else:
       out = tf.keras.layers.Concatenate()([out, dense_out])
   scores = self.final_dense(out)
   return scores
Code example #23
def _create_make_unique(inputs):
    """Replaces the lower bits of each element with iota."""
    if inputs.shape.ndims != 2:
        raise ValueError("Input of top_k_with_unique must be rank-2 "
                         "but got: %s" % inputs.shape)

    height = inputs.shape[0]
    width = inputs.shape[1]
    zeros = tf.zeros([height, width], dtype=tf.int32)

    log2_ceiling = int(math.ceil(math.log(int(width), 2)))
    next_power_of_two = 1 << log2_ceiling
    count_mask = ~(next_power_of_two - 1)
    count_mask_r0 = tf.constant(count_mask)
    count_mask_r2 = tf.fill([height, width], count_mask_r0)

    smallest_normal = 1 << 23
    smallest_normal_r0 = tf.constant(smallest_normal, dtype=tf.int32)
    smallest_normal_r2 = tf.fill([height, width], smallest_normal_r0)

    low_bit_mask = ~(1 << 31)
    low_bit_mask_r0 = tf.constant(low_bit_mask, dtype=tf.int32)
    low_bit_mask_r2 = tf.fill([height, width], low_bit_mask_r0)

    iota = tf.tile(tf.expand_dims(tf.range(width, dtype=tf.int32), 0),
                   [height, 1])

    input_r2 = tf.bitcast(inputs, tf.int32)
    abs_r2 = tf.bitwise.bitwise_and(input_r2, low_bit_mask_r2)
    if_zero_r2 = tf.equal(abs_r2, zeros)
    smallest_normal_preserving_sign_r2 = tf.bitwise.bitwise_or(
        input_r2, smallest_normal_r2)
    input_no_zeros_r2 = tf.where(if_zero_r2,
                                 smallest_normal_preserving_sign_r2, input_r2)

    and_r2 = tf.bitwise.bitwise_and(input_no_zeros_r2, count_mask_r2)
    or_r2 = tf.bitwise.bitwise_or(and_r2, iota)
    return tf.bitcast(or_r2, tf.float32)
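
The bit trick above can be reproduced in NumPy: reinterpret the float32 bit pattern as int32, zero out the low bits, and OR in the column index (the iota), which makes tied values unique while preserving the order of well-separated ones. A sketch that omits the zero-handling from the original:

import math
import numpy as np

x = np.array([[0.5, 0.5, 0.25, 0.125]], dtype=np.float32)
width = x.shape[1]
count_mask = ~((1 << int(math.ceil(math.log2(width)))) - 1)
bits = x.view(np.int32)                     # reinterpret bits, like tf.bitcast
iota = np.arange(width, dtype=np.int32)
unique = ((bits & count_mask) | iota).view(np.float32)
print(unique)  # the two 0.5 entries now differ in their lowest mantissa bits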
Code example #24
  def call(self, filename, audio_data, sample_rate=None):
    """
    Write wav using audio_data[tensor].
    :param filename: filepath of wav.
    :param audio_data: a tensor containing data of a wav.
    :param sample_rate: [option]the samplerate of the signal we working with, default is 16kHz.
    :return: write wav opration.
    """
    p = self.config
    filename = tf.constant(filename)

    if sample_rate is None:
      sample_rate = tf.constant(p.sample_rate, dtype=tf.int32)

    assert_op = tf.assert_equal(
        tf.constant(p.sample_rate), tf.cast(sample_rate, dtype=tf.int32))
    with tf.control_dependencies([assert_op]):
      audio_data = tf.cast(audio_data, dtype=tf.float32)
      contents = tf.audio.encode_wav(
          tf.expand_dims(audio_data, 1), tf.cast(sample_rate, dtype=tf.int32))
      w = tf.io.write_file(filename, contents)

    return w
Code example #25
 def call(self, inputs, training=None, mask=None):
     input_x = inputs["input_x"]
     # [batch_size, max_len]
     input_x_lens = compute_sen_lens(input_x,
                                     padding_token=self.padding_token)
     # [batch_size, max_len, 1]
     mask = tf.expand_dims(tf.sequence_mask(input_x_lens,
                                            self.max_len,
                                            dtype=tf.float32),
                           axis=-1)
     # [batch_size, max_len, embed_len]
     out = self.embed(input_x)
     # [batch_size, features]
     out = self.embed_dropout(out, training=training)
     out = self.bi_rnn(out)
     intent_out = self.attention(out, mask=mask)
     intent_out = self.dropout(intent_out)
     intent_out = self.intent_dense(intent_out)
     intent_out = tf.identity(intent_out, name="intent_logits")
     slots_out = self.dropout(out)
     slots_out = self.slots_dense(slots_out)
     slots_out = tf.identity(slots_out, name="slots_logits")
     return intent_out, slots_out
Code example #26
def extract_logfbank_with_delta(waveforms, params):
    '''
    params:
      waveforms: float32 tensor with shape [max_len]
    '''
    p = params
    mel_fbanks = compute_mel_filterbank_features(
        waveforms,
        sample_rate=p.audio_sample_rate,
        preemphasis=p.audio_preemphasis,
        frame_length=p.audio_frame_length,
        frame_step=p.audio_frame_step,
        lower_edge_hertz=p.audio_lower_edge_hertz,
        upper_edge_hertz=p.audio_upper_edge_hertz,
        num_mel_bins=p.audio_num_mel_bins,
        apply_mask=False)

    if p.audio_add_delta_deltas:
        mel_fbanks = delta_delta(mel_fbanks)
    else:
        mel_fbanks = tf.expand_dims(mel_fbanks, axis=-1)
    # shape: [nframes, nbins, nchannels]
    return mel_fbanks
Code example #27
def fbank_feat(powspec,
               sr=8000,
               feature_size=40,
               nfft=512,
               lowfreq=0,
               highfreq=None):
    ''' powspec: [audio_channels, spectrogram_length, spectrogram_feat_dim]
        return: [audio_channels, nframe, nfbank]
    '''
    del nfft

    true_fn = lambda: tf.expand_dims(powspec, 0)
    false_fn = lambda: powspec
    powspec = tf.cond(tf.equal(tf.rank(powspec), 2), true_fn, false_fn)

    feat = py_x_ops.fbank(
        powspec,
        sr,
        filterbank_channel_count=feature_size,
        lower_frequency_limit=lowfreq,
        upper_frequency_limit=highfreq,
    )
    return feat
Code example #28
    def call(self, inputs: list, **kwargs) -> typing.Any:
        """
        The computation logic of MatchingLayer.
        :param inputs: two input tensors.
        """
        x1 = inputs[0]
        x2 = inputs[1]
        if self._matching_type == 'dot':
            if self._normalize:
                x1 = tf.math.l2_normalize(x1, axis=2)
                x2 = tf.math.l2_normalize(x2, axis=2)
            return tf.expand_dims(tf.einsum('abd,acd->abc', x1, x2), 3)
        else:
            if self._matching_type == 'mul':

                def func(x, y):
                    return x * y
            elif self._matching_type == 'plus':

                def func(x, y):
                    return x + y
            elif self._matching_type == 'minus':

                def func(x, y):
                    return x - y
            elif self._matching_type == 'concat':

                def func(x, y):
                    return tf.concat([x, y], axis=3)
            else:
                raise ValueError(f"Invalid matching type."
                                 f"{self._matching_type} received."
                                 f"Mut be in `dot`, `mul`, `plus`, "
                                 f"`minus` and `concat`.")
            x1_exp = tf.stack([x1] * self._shape2[1], 2)
            x2_exp = tf.stack([x2] * self._shape1[1], 1)
            return func(x1_exp, x2_exp)
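
For the 'dot' branch, the einsum is a batched similarity matrix between every position of x1 and every position of x2; a minimal shape check (TensorFlow 2.x eager assumed):

import tensorflow as tf

x1 = tf.random.normal([2, 5, 8])   # [batch, len1, dim]
x2 = tf.random.normal([2, 7, 8])   # [batch, len2, dim]
sim = tf.expand_dims(tf.einsum('abd,acd->abc', x1, x2), 3)
print(sim.shape)                   # (2, 5, 7, 1): one score per (left, right) pair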
Code example #29
def compute_mel_filterbank_features(waveforms,
                                    sample_rate=16000,
                                    dither=1.0 / np.iinfo(np.int16).max,
                                    preemphasis=0.97,
                                    frame_length=25,
                                    frame_step=10,
                                    fft_length=None,
                                    window_fn=functools.partial(
                                        tf.signal.hann_window, periodic=True),
                                    lower_edge_hertz=80.0,
                                    upper_edge_hertz=7600.0,
                                    num_mel_bins=80,
                                    log_noise_floor=1e-3,
                                    apply_mask=True):
    """Implement mel-filterbank extraction using tf ops.
  Args:
    waveforms: float32 tensor with shape [batch_size, max_len]
    sample_rate: sampling rate of the waveform
    dither: stddev of Gaussian noise added to waveform to prevent quantization
      artefacts
    preemphasis: waveform high-pass filtering constant
    frame_length: frame length in ms
    frame_step: frame step in ms
    fft_length: number of fft bins
    window_fn: windowing function
    lower_edge_hertz: lowest frequency of the filterbank
    upper_edge_hertz: highest frequency of the filterbank
    num_mel_bins: filterbank size
    log_noise_floor: clip small values to prevent numeric overflow in log
    apply_mask: When working on a batch of samples, set padding frames to zero
  Returns:
    filterbanks: a float32 tensor with shape [batch_size, len, num_bins, 1]
  """
    # `stfts` is a complex64 Tensor representing the short-time Fourier
    # Transform of each signal in `waveforms`. Its shape is
    # [batch_size, ?, fft_unique_bins]
    # where fft_unique_bins = fft_length // 2 + 1

    # Find the wave length: the largest index for which the value is !=0
    # note that waveforms samples that are exactly 0.0 are quite common, so
    # simply doing sum(waveforms != 0, axis=-1) will not work correctly.
    wav_lens = tf.reduce_max(
        tf.expand_dims(tf.range(tf.shape(waveforms)[1]), 0) *
        tf.to_int32(tf.not_equal(waveforms, 0.0)),
        axis=-1) + 1
    if dither > 0:
        waveforms += tf.random_normal(tf.shape(waveforms), stddev=dither)
    if preemphasis > 0:
        waveforms = waveforms[:, 1:] - preemphasis * waveforms[:, :-1]
        wav_lens -= 1
    frame_length = int(frame_length * sample_rate / 1e3)
    frame_step = int(frame_step * sample_rate / 1e3)
    if fft_length is None:
        fft_length = int(2**(np.ceil(np.log2(frame_length))))

    stfts = tf.signal.stft(waveforms,
                           frame_length=frame_length,
                           frame_step=frame_step,
                           fft_length=fft_length,
                           window_fn=window_fn,
                           pad_end=True)

    stft_lens = (wav_lens + (frame_step - 1)) // frame_step
    masks = tf.to_float(
        tf.less_equal(tf.expand_dims(tf.range(tf.shape(stfts)[1]), 0),
                      tf.expand_dims(stft_lens, 1)))

    # An energy spectrogram is the magnitude of the complex-valued STFT.
    # A float32 Tensor of shape [batch_size, ?, 257].
    magnitude_spectrograms = tf.abs(stfts)

    # Warp the linear-scale, magnitude spectrograms into the mel-scale.
    num_spectrogram_bins = magnitude_spectrograms.shape[-1].value
    linear_to_mel_weight_matrix = (tf.signal.linear_to_mel_weight_matrix(
        num_mel_bins, num_spectrogram_bins, sample_rate, lower_edge_hertz,
        upper_edge_hertz))
    mel_spectrograms = tf.tensordot(magnitude_spectrograms,
                                    linear_to_mel_weight_matrix, 1)
    # Note: Shape inference for tensordot does not currently handle this case.
    mel_spectrograms.set_shape(magnitude_spectrograms.shape[:-1].concatenate(
        linear_to_mel_weight_matrix.shape[-1:]))

    log_mel_sgram = tf.log(tf.maximum(log_noise_floor, mel_spectrograms))

    if apply_mask:
        log_mel_sgram *= tf.expand_dims(tf.to_float(masks), -1)

    return tf.expand_dims(log_mel_sgram, -1, name="mel_sgrams")
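
Worked numbers for the defaults above (frame_length=25 ms, frame_step=10 ms at 16 kHz), showing where the 257 STFT bins mentioned in the comments come from:

import numpy as np

sample_rate, frame_length_ms, frame_step_ms = 16000, 25, 10
frame_length = int(frame_length_ms * sample_rate / 1e3)  # 400 samples per frame
frame_step = int(frame_step_ms * sample_rate / 1e3)      # 160-sample hop
fft_length = int(2 ** np.ceil(np.log2(frame_length)))    # 512, next power of two
print(frame_length, frame_step, fft_length // 2 + 1)     # 400 160 257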
Code example #30
File: transformer.py Project: lizhanyang505/delta-1
    def beam_search(symbols_to_logits_fn,
                    initial_ids,
                    beam_size,
                    decode_length,
                    vocab_size,
                    alpha,
                    eos_id,
                    states=None,
                    stop_early=True,
                    INF=1. * 1e20):
        """Beam search with length penalties."""
        batch_size = utils.shape_list(initial_ids)[0]

        initial_log_probs = tf.constant([[0.] + [-INF] * (beam_size - 1)])
        # (batch_size, beam_size)
        alive_log_probs = tf.tile(initial_log_probs, [batch_size, 1])

        alive_seq = utils.expand_to_beam_size(initial_ids, beam_size)
        # (batch_size, beam_size, 1)
        alive_seq = tf.expand_dims(alive_seq, axis=2)
        if states:
            states = nest.map_structure(
                lambda state: utils.expand_to_beam_size(state, beam_size),
                states)
        else:
            states = {}

        # (batch_size, beam_size, 1)
        finished_seq = tf.zeros(utils.shape_list(alive_seq), tf.int32)
        # (batch_size, beam_size)
        finished_scores = tf.ones([batch_size, beam_size]) * -INF
        # (batch_size, beam_size)
        finished_flags = tf.zeros([batch_size, beam_size], tf.bool)

        def grow_finished(finished_seq, finished_scores, finished_flags,
                          curr_seq, curr_scores, curr_finished):
            """
        Given sequences and scores from finished sequence and current finished sequence
        , will gather the top k=beam size sequences to update finished seq.
      """
            # padding zero for finished seq
            finished_seq = tf.concat(
                [finished_seq,
                 tf.zeros([batch_size, beam_size, 1], tf.int32)],
                axis=2)

            # mask unfinished curr seq
            curr_scores += (1. - tf.to_float(curr_finished)) * -INF

            # concatenating the sequences and scores along beam axis
            # (batch_size, 2xbeam_size, seq_len)
            curr_finished_seq = tf.concat([finished_seq, curr_seq], axis=1)
            curr_finished_scores = tf.concat([finished_scores, curr_scores],
                                             axis=1)
            curr_finished_flags = tf.concat([finished_flags, curr_finished],
                                            axis=1)
            return utils.compute_topk_scores_and_seq(
                curr_finished_seq, curr_finished_scores, curr_finished_scores,
                curr_finished_flags, beam_size, batch_size, "grow_finished")

        def grow_alive(curr_seq, curr_scores, curr_log_probs, curr_finished,
                       states):
            """Given sequences and scores, will gather the top k=beam size sequences."""
            curr_scores += tf.to_float(curr_finished) * -INF
            return utils.compute_topk_scores_and_seq(curr_seq, curr_scores,
                                                     curr_log_probs,
                                                     curr_finished, beam_size,
                                                     batch_size, "grow_alive",
                                                     states)

        def grow_topk(i, alive_seq, alive_log_probs, states):
            """Inner beam search loop."""
            flat_ids = tf.reshape(alive_seq, [batch_size * beam_size, -1])

            # (batch_size * beam_size, decoded_length)
            if states:
                flat_states = nest.map_structure(utils.merge_beam_dim, states)
                flat_logits, flat_states = symbols_to_logits_fn(
                    flat_ids, i, flat_states)
                states = nest.map_structure(
                    lambda t: utils.unmerge_beam_dim(t, batch_size, beam_size),
                    flat_states)
            else:
                flat_logits = symbols_to_logits_fn(flat_ids)

            logits = tf.reshape(flat_logits, [batch_size, beam_size, -1])
            candidate_log_probs = utils.log_prob_from_logits(logits)
            log_probs = candidate_log_probs + tf.expand_dims(alive_log_probs,
                                                             axis=2)

            length_penalty = tf.pow(((5. + tf.to_float(i + 1)) / 6.), alpha)

            curr_scores = log_probs / length_penalty
            flat_curr_scores = tf.reshape(curr_scores,
                                          [-1, beam_size * vocab_size])

            topk_scores, topk_ids = tf.nn.top_k(flat_curr_scores,
                                                k=beam_size * 2)
            topk_log_probs = topk_scores * length_penalty

            topk_beam_index = topk_ids // vocab_size
            topk_ids %= vocab_size  # Unflatten the ids
            batch_pos = utils.compute_batch_indices(batch_size, beam_size * 2)
            topk_coordinates = tf.stack([batch_pos, topk_beam_index], axis=2)

            topk_seq = tf.gather_nd(alive_seq, topk_coordinates)
            if states:
                states = nest.map_structure(
                    lambda state: tf.gather_nd(state, topk_coordinates),
                    states)
            topk_seq = tf.concat(
                [topk_seq, tf.expand_dims(topk_ids, axis=2)], axis=2)

            topk_finished = tf.equal(topk_ids, eos_id)

            return topk_seq, topk_log_probs, topk_scores, topk_finished, states

        def inner_loop(i, alive_seq, alive_log_probs, finished_seq,
                       finished_scores, finished_flags, states):
            """Inner beam search loop."""
            topk_seq, topk_log_probs, topk_scores, topk_finished, states = grow_topk(
                i, alive_seq, alive_log_probs, states)
            alive_seq, alive_log_probs, _, states = grow_alive(
                topk_seq, topk_scores, topk_log_probs, topk_finished, states)
            finished_seq, finished_scores, finished_flags, _ = grow_finished(
                finished_seq, finished_scores, finished_flags, topk_seq,
                topk_scores, topk_finished)

            return (i + 1, alive_seq, alive_log_probs, finished_seq,
                    finished_scores, finished_flags, states)

        def _is_finished(i, unused_alive_seq, alive_log_probs,
                         unused_finished_seq, finished_scores,
                         unused_finished_in_finished, unused_states):
            """Checking termination condition.
      """
            max_length_penalty = tf.pow(
                ((5. + tf.to_float(decode_length)) / 6.), alpha)
            lower_bound_alive_scores = alive_log_probs[:,
                                                       0] / max_length_penalty

            if not stop_early:
                lowest_score_of_finished_in_finished = tf.reduce_min(
                    finished_scores)
            else:
                lowest_score_of_finished_in_finished = tf.reduce_max(
                    finished_scores, axis=1)

            bound_is_met = tf.reduce_all(
                tf.greater(lowest_score_of_finished_in_finished,
                           lower_bound_alive_scores))

            return tf.logical_and(tf.less(i, decode_length),
                                  tf.logical_not(bound_is_met))

        inner_shape = tf.TensorShape([None, None, None])

        state_struc = nest.map_structure(utils.get_state_shape_invariants,
                                         states)
        (_, alive_seq, alive_log_probs, finished_seq, finished_scores,
         finished_flags, states) = tf.while_loop(
             _is_finished,
             inner_loop, [
                 tf.constant(0), alive_seq, alive_log_probs, finished_seq,
                 finished_scores, finished_flags, states
             ],
             shape_invariants=[
                 tf.TensorShape([]), inner_shape,
                 alive_log_probs.get_shape(), inner_shape,
                 finished_scores.get_shape(),
                 finished_flags.get_shape(), state_struc
             ],
             parallel_iterations=1,
             back_prop=False)

        alive_seq.set_shape((None, beam_size, None))
        finished_seq.set_shape((None, beam_size, None))
        finished_seq = tf.where(tf.reduce_any(finished_flags, 1), finished_seq,
                                alive_seq)
        finished_scores = tf.where(tf.reduce_any(finished_flags, 1),
                                   finished_scores, alive_log_probs)
        return finished_seq, finished_scores, states