def call(self, inputs, training=None, mask=None):  # pylint: disable=too-many-locals
    """Score a batch of documents with the sentence/document hierarchy.

    The flat token ids are folded into a document-by-sentence grid, every
    sentence is encoded, the sentence vectors are encoded into one document
    vector, and the result (optionally combined with, or replaced by, the
    projected dense features) goes through the final dense layer.

    Args:
      inputs: mapping holding the token ids under "input_x" and, when
        `self.use_dense_task` is set, dense features under "input_dense".
      training: forwarded to the dropout and encoder layers.
      mask: not read; the masks are rebuilt from the padded input.

    Returns:
      The output of `self.final_dense`, [batch_size, class_num].
    """
    token_ids = tf.identity(inputs["input_x"], name='input_x')
    if self.use_dense_task:
        dense_features = inputs["input_dense"]

    # [batch_size, max_doc_len, max_sen_len]
    if self.use_true_length:
        hier_ids = self.pad_to_hier_input_true_len(
            token_ids,
            self.max_doc_len,
            self.max_sen_len,
            self.split_token,
            padding_token=self.padding_token)
    else:
        hier_ids = self.pad_to_hier_input(
            token_ids,
            self.max_doc_len,
            self.max_sen_len,
            padding_token=self.padding_token)

    # [batch_size, max_doc_len]
    sentence_lengths = compute_sen_lens(
        hier_ids, padding_token=self.padding_token)
    # [batch_size]
    document_lengths = compute_doc_lens(sentence_lengths)

    # [batch_size, max_doc_len, max_sen_len, 1]
    sentence_valid = tf.sequence_mask(
        sentence_lengths, self.max_sen_len, dtype=tf.float32)
    sentence_mask = tf.expand_dims(sentence_valid, axis=-1)
    # [batch_size, max_doc_len, 1]
    document_valid = tf.sequence_mask(
        document_lengths, self.max_doc_len, dtype=tf.float32)
    document_mask = tf.expand_dims(document_valid, axis=-1)

    # [batch_size, max_doc_len, max_sen_len, embed_len]
    hidden = self.embed(hier_ids)
    if self.use_pretrained_model:
        pretrained = tf.reshape(
            self.get_pre_train_graph(token_ids),
            [-1, self.max_doc_len, self.max_sen_len,
             self.pretrained_model_dim])
        hidden = tf.concat([hidden, pretrained], axis=-1)
    hidden = self.embed_d(hidden, training=training)

    # Apply the sentence encoder to every sentence slot of the document.
    sentence_level = tf.keras.layers.TimeDistributed(self.sen_encoder)
    # [batch_size, max_doc_len, features]
    hidden = sentence_level(hidden, training=training, mask=sentence_mask)
    # [batch_size, features]
    hidden = self.doc_encoder(hidden, training=training, mask=document_mask)

    if self.use_dense_input:
        dense_hidden = self.dense_input_linear(dense_features)
        if self.only_dense_input:
            hidden = dense_hidden
        else:
            hidden = tf.keras.layers.Concatenate()([hidden, dense_hidden])

    # [batch_size, class_num]
    return self.final_dense(hidden)
def grow_topk(i, alive_seq, alive_log_probs, states):
    """Extend every alive beam by one token and keep the best candidates.

    Relies on `batch_size`, `beam_size`, `vocab_size`, `alpha`, `eos_id` and
    `symbols_to_logits_fn` from the enclosing scope.

    Args:
      i: current decoding step.
      alive_seq: alive sequences, indexed as [batch, beam, position].
      alive_log_probs: log probabilities of the alive sequences,
        [batch, beam].
      states: nested decoding states; a falsy value disables state handling.

    Returns:
      (topk_seq, topk_log_probs, topk_scores, topk_finished, states) for the
      `2 * beam_size` best candidates of every batch entry.
    """
    # (batch_size * beam_size, decoded_length)
    merged_ids = tf.reshape(alive_seq, [batch_size * beam_size, -1])

    if states:
        merged_states = nest.map_structure(_merge_beam_dim, states)
        merged_logits, merged_states = symbols_to_logits_fn(
            merged_ids, i, merged_states)
        states = nest.map_structure(
            lambda t: _unmerge_beam_dim(t, batch_size, beam_size),
            merged_states)
    else:
        merged_logits = symbols_to_logits_fn(merged_ids)

    beam_logits = tf.reshape(merged_logits, [batch_size, beam_size, -1])
    token_log_probs = log_prob_from_logits(beam_logits)
    total_log_probs = token_log_probs + tf.expand_dims(
        alive_log_probs, axis=2)

    # Length penalty ((5 + length) / 6) ** alpha with length = i + 1.
    penalty = tf.pow(((5. + tf.to_float(i + 1)) / 6.), alpha)
    penalized = total_log_probs / penalty
    flat_penalized = tf.reshape(penalized, [-1, beam_size * vocab_size])

    topk_scores, flat_positions = tf.nn.top_k(
        flat_penalized, k=beam_size * 2)
    topk_log_probs = topk_scores * penalty

    # A flat position encodes (beam, token) as beam * vocab_size + token.
    source_beam = flat_positions // vocab_size
    topk_ids = flat_positions % vocab_size

    batch_pos = compute_batch_indices(batch_size, beam_size * 2)
    coordinates = tf.stack([batch_pos, source_beam], axis=2)

    # Pull the sequence (and state) of the beam each candidate came from.
    topk_seq = tf.gather_nd(alive_seq, coordinates)
    if states:
        states = nest.map_structure(
            lambda state: tf.gather_nd(state, coordinates), states)

    topk_seq = tf.concat(
        [topk_seq, tf.expand_dims(topk_ids, axis=2)], axis=2)
    topk_finished = tf.equal(topk_ids, eos_id)
    return topk_seq, topk_log_probs, topk_scores, topk_finished, states
def get_pos_embedding_matrix(max_len, embed_dim, use_const, name):
    """Build a position embedding with a leading broadcast axis.

    Two variants are available: a constant (untrainable) sinusoidal table
    and a trainable variable.

    Args:
      max_len: number of positions.
      embed_dim: embedding size per position.
      use_const: if truthy, return the constant sinusoidal table; otherwise
        create a trainable variable initialised uniformly in [-0.1, 0.1].
      name: variable name; only used by the trainable variant.

    Returns:
      pos_embed: float32 tensor of shape [1, max_len, embed_dim].
    """
    if use_const:
        positions = np.arange(max_len)[:, np.newaxis]
        dims = np.arange(embed_dim)
        # Columns 2i and 2i+1 share the exponent 2i / embed_dim.
        angles = positions / np.power(10000, (dims - dims % 2) / embed_dim)
        # Sine on the even columns, cosine on the odd columns.
        angles[:, 0::2] = np.sin(angles[:, 0::2])  # dim 2i
        angles[:, 1::2] = np.cos(angles[:, 1::2])  # dim 2i+1
        pos_embed = tf.cast(angles[np.newaxis, ...], dtype=tf.float32)
    else:
        pos_embed = tf.get_variable(
            name=name,
            shape=[max_len, embed_dim],
            initializer=tf.random_uniform_initializer(-0.1, 0.1))
        pos_embed = tf.expand_dims(pos_embed, 0)
    return pos_embed
def call(self, inputs, training=None, mask=None):
    """Pool a sequence into one vector with learned attention weights.

    Each step is projected with `self.W` (plus `self.b` when
    `self.use_bias` is set), squashed with tanh and compared with
    `self.attention_context_vector`; the softmax of these similarities over
    the step axis weights the sum of the original inputs.

    Args:
      inputs: [batch_size, steps, features].
      training: not read.
      mask: optional mask handed to `masked_softmax`; when None a plain
        softmax over the step axis is used.

    Returns:
      [batch_size, features] attention-weighted sum of `inputs`.
    """
    batch_size = tf.shape(inputs)[0]
    # Repeat W for every batch entry so that matmul can run batched.
    W_3d = tf.tile(
        tf.expand_dims(self.W, axis=0), tf.stack([batch_size, 1, 1]))
    # [batch_size, steps, features]
    input_projection = tf.matmul(inputs, W_3d)
    if self.use_bias:
        input_projection += self.b
    input_projection = tf.tanh(input_projection)

    # [batch_size, steps, 1]
    # `keepdims` replaces the deprecated `keep_dims` spelling.
    similaritys = tf.reduce_sum(
        tf.multiply(input_projection, self.attention_context_vector),
        axis=2,
        keepdims=True)

    # [batch_size, steps, 1]
    if mask is not None:
        attention_weights = masked_softmax(similaritys, mask, axis=1)
    else:
        attention_weights = tf.nn.softmax(similaritys, axis=1)

    # [batch_size, features]
    attention_output = tf.reduce_sum(
        tf.multiply(inputs, attention_weights), axis=1)
    return attention_output
def call(self, inputs, training=None, mask=None):
    """Score a batch of token sequences.

    Args:
      inputs: mapping holding the token ids under "input_x" and, when
        `self.use_dense_task` is set, dense features under "input_dense".
      training: forwarded to the dropout and encoder layers.
      mask: ignored; the mask is rebuilt from the computed lengths.

    Returns:
      The output of `self.final_dense`, [batch_size, class_num].
    """
    token_ids = inputs["input_x"]
    if self.use_dense_task:
        dense_features = inputs["input_dense"]

    # [batch_size]
    lengths = self.compute_lens(token_ids, self.max_len)
    # [batch_size, max_len, 1]
    valid_steps = tf.sequence_mask(lengths, self.max_len, dtype=tf.float32)
    step_mask = tf.expand_dims(valid_steps, axis=-1)

    # [batch_size, max_len, embed_len]
    hidden = self.embed_d(self.embed(token_ids), training=training)
    # [batch_size, features]
    hidden = self.encoder(hidden, training=training, mask=step_mask)

    if self.use_dense_input:
        dense_hidden = self.dense_input_linear(dense_features)
        if self.only_dense_input:
            hidden = dense_hidden
        else:
            hidden = tf.keras.layers.Concatenate()([hidden, dense_hidden])

    # [batch_size, class_num]
    return self.final_dense(hidden)
def _expand_to_beam_size(tensor, beam_size):
    """Insert a beam axis at position 1 and repeat the tensor along it.

    Args:
      tensor: tensor whose first axis is kept as-is.
      beam_size: number of copies along the new axis.

    Returns:
      The tensor with a new axis 1 of size `beam_size`.
    """
    with_beam_axis = tf.expand_dims(tensor, axis=1)
    # Repeat only along the freshly inserted axis.
    multiples = [
        beam_size if axis == 1 else 1
        for axis in range(with_beam_axis.shape.ndims)
    ]
    return tf.tile(with_beam_axis, multiples)
def call(self, audio_data, sample_rate=None):
    """
    Calculate fbank features of audio data.

    Note: `upper_frequency_limit` of `self.config` is normalised in place.
    A non-positive value is taken relative to the Nyquist frequency, and a
    value not above the lower limit or above Nyquist is reset to Nyquist.

    :param audio_data: the audio signal from which to compute spectrum.
                       Should be an (1, N) tensor.
    :param sample_rate: [optional] the sample rate of the signal we are
                        working with; defaults to the configured sample
                        rate and must be equal to it.
    :return: A float tensor of size (num_channels, num_frames,
             num_frequencies) containing fbank features of every frame in
             speech.
    """
    p = self.config
    with tf.name_scope('fbank'):
        # Identity check: `== None` is not a reliable test on a tensor.
        if sample_rate is None:
            sample_rate = tf.constant(p.sample_rate, dtype=tf.int32)

        nyquist = p.sample_rate / 2.0
        if p.upper_frequency_limit <= 0:
            p.upper_frequency_limit = nyquist + p.upper_frequency_limit
        elif (p.upper_frequency_limit <= p.lower_frequency_limit) or (
                p.upper_frequency_limit > nyquist):
            p.upper_frequency_limit = nyquist

        # Fail at run time if the caller's sample rate differs from config.
        assert_op = tf.assert_equal(
            tf.constant(p.sample_rate), tf.cast(sample_rate, dtype=tf.int32))
        with tf.control_dependencies([assert_op]):
            spectrum = self.spect(audio_data, sample_rate)
            spectrum = tf.expand_dims(spectrum, 0)

            fbank = py_x_ops.fbank(
                spectrum,
                sample_rate,
                upper_frequency_limit=p.upper_frequency_limit,
                lower_frequency_limit=p.lower_frequency_limit,
                filterbank_channel_count=p.filterbank_channel_count)
            return fbank
def _create_topk_unique(inputs, k):
    """Creates the top k values in sorted order with indices.

    Args:
      inputs: rank-2 float32 tensor of shape [height, width]; NaNs are
        treated as -inf.
      k: number of values to select per row.

    Returns:
      topk_r2: [height, k] selected values, largest first.
      topk_indices_r2: [height, k] int32 tensor holding the low
        ceil(log2(width)) bits of each selected value's bit pattern.
        NOTE(review): presumably the column index was written into those
        bits beforehand (see `_create_make_unique`) - confirm at the caller.
    """
    height = inputs.shape[0]
    width = inputs.shape[1]
    neg_inf_r0 = tf.constant(-np.inf, dtype=tf.float32)
    ones = tf.ones([height, width], dtype=tf.float32)
    neg_inf_r2 = ones * neg_inf_r0
    inputs = tf.where(tf.is_nan(inputs), neg_inf_r2, inputs)

    # Take the row maximum k times. After each round every element that is
    # >= the value just selected is replaced by -inf, so the next round
    # yields the next smaller value.
    tmp = inputs
    topk_r2 = tf.zeros([height, k], dtype=tf.float32)
    for i in range(k):
        kth_order_statistic = tf.reduce_max(tmp, axis=1, keepdims=True)
        # Boolean mask that is True only in column i of the result.
        k_mask = tf.tile(
            tf.expand_dims(tf.equal(tf.range(k), tf.fill([k], i)), 0),
            [height, 1])
        topk_r2 = tf.where(k_mask, tf.tile(kth_order_statistic, [1, k]),
                           topk_r2)
        ge_r2 = tf.greater_equal(inputs,
                                 tf.tile(kth_order_statistic, [1, width]))
        tmp = tf.where(ge_r2, neg_inf_r2, inputs)

    # Mask that keeps the low ceil(log2(width)) bits.
    log2_ceiling = int(math.ceil(math.log(float(int(width)), 2)))
    next_power_of_two = 1 << log2_ceiling
    count_mask = next_power_of_two - 1
    mask_r0 = tf.constant(count_mask)
    mask_r2 = tf.fill([height, k], mask_r0)
    # Reinterpret the float bits as int32 and read the low bits back.
    topk_r2_s32 = tf.bitcast(topk_r2, tf.int32)
    topk_indices_r2 = tf.bitwise.bitwise_and(topk_r2_s32, mask_r2)
    return topk_r2, topk_indices_r2
def call(self, audio_data, sample_rate=None):
    """
    Calculate mfcc features of audio data.

    :param audio_data: the audio signal from which to compute spectrum.
                       Should be an (1, N) tensor.
    :param sample_rate: [optional] the sample rate of the signal we are
                        working with; defaults to the configured sample
                        rate and must be equal to it.
    :return: A float tensor of size (num_channels, num_frames,
             num_frequencies) containing mfcc features of every frame in
             speech.
    """
    p = self.config
    with tf.name_scope('mfcc'):
        # Identity check: `== None` is not a reliable test on a tensor.
        if sample_rate is None:
            sample_rate = tf.constant(p.sample_rate, dtype=tf.int32)

        # Fail at run time if the caller's sample rate differs from config.
        assert_op = tf.assert_equal(
            tf.constant(p.sample_rate), tf.cast(sample_rate, dtype=tf.int32))
        with tf.control_dependencies([assert_op]):
            spectrum_feats = self.spect(audio_data, sample_rate)
            spectrum_feats = tf.expand_dims(spectrum_feats, 0)
            fbank_feats = self.fbank(audio_data, sample_rate)
            mfcc = py_x_ops.mfcc(
                fbank_feats,
                spectrum_feats,
                sample_rate,
                use_energy=p.use_energy,
                cepstral_lifter=p.cepstral_lifter,
                coefficient_count=p.coefficient_count)
            return mfcc
def attention(inputs, attention_size, time_major=False, return_alphas=False):
    """Reduce an RNN output sequence to one vector with additive attention.

    Args:
      inputs: (B, T, D) tensor, or (T, B, D) when `time_major` is set. A
        tuple (forward and backward Bi-RNN outputs) is concatenated on the
        last axis first.
      attention_size: size A of the hidden attention projection.
      time_major: whether `inputs` is laid out as (T, B, D).
      return_alphas: also return the attention weights.

    Returns:
      The (B, D) weighted sum of the inputs, plus the (B, T, 1) weights
      when `return_alphas` is set.
    """
    if isinstance(inputs, tuple):
        # Bi-RNN: join the forward and the backward outputs.
        inputs = tf.concat(inputs, 2)
    if time_major:
        # (T,B,D) => (B,T,D)
        inputs = tf.transpose(inputs, [1, 0, 2])

    num_steps = inputs.shape[1].value  # T
    num_units = inputs.shape[2].value  # D

    # Trainable parameters
    W_omega = tf.get_variable(
        name='W_omega',
        initializer=tf.random_normal([num_units, attention_size],
                                     stddev=0.1))
    b_omega = tf.get_variable(
        name='b_omega',
        initializer=tf.random_normal([attention_size], stddev=0.1))
    u_omega = tf.get_variable(
        name='u_omega',
        initializer=tf.random_normal([attention_size, 1], stddev=0.1))

    logging.info('attention inputs: {}'.format(inputs.shape))

    # (B*T, D) x (D, A), folded back to (B, T, A); done with matmul on a
    # flattened view instead of tensordot.
    flat_inputs = tf.reshape(inputs, [-1, num_units])
    projected = tf.reshape(
        tf.matmul(flat_inputs, W_omega), [-1, num_steps, attention_size])
    hidden = tf.sigmoid(projected + b_omega)
    logging.info(f'attention vector: {hidden.shape}')

    # (B*T, A) x (A, 1) gives one energy per step, reshaped to (B, T).
    flat_hidden = tf.reshape(hidden, [-1, attention_size])
    energy = tf.matmul(flat_hidden, u_omega)
    energy = tf.squeeze(energy, axis=-1)
    energy = tf.reshape(energy, [-1, num_steps])
    logging.info(f'attention energe: {energy.shape}')

    # (B, T) -> (B, T, 1) so the weights broadcast over the feature axis.
    alphas = tf.expand_dims(tf.nn.softmax(energy), -1)

    # [batch, time, dim] -> [batch, dim]
    output = tf.reduce_sum(inputs * alphas, 1)

    if return_alphas:
        return output, alphas
    return output
def curvature_range(self):
    """Build the ops that track the smoothed curvature range.

    A window of the last `self._curv_win_width` squared gradient norms is
    kept in a non-trainable variable. Its minimum and maximum (optionally
    taken in log space) are smoothed by `self._moving_averager` and stored
    in `self._h_min` / `self._h_max`.

    Returns:
      A list holding the moving-average update op.
    """
    # set up the curvature window
    self._curv_win = tf.Variable(
        np.zeros([self._curv_win_width]),
        dtype=tf.float32,
        name="curv_win",
        trainable=False)
    # Log smoothing (storing tf.log(grad_norm_squared + EPS) in the window)
    # would let the curvature range follow the trend faster; the raw value
    # is stored here.
    self._curv_win = tf.scatter_update(
        self._curv_win, self._global_step % self._curv_win_width,
        self._grad_norm_squared + EPS)
    # note here the iterations start from iteration 0
    # `axis` replaces the deprecated `dim` keyword of tf.expand_dims.
    valid_window = tf.slice(
        self._curv_win, tf.constant([0]),
        tf.expand_dims(
            tf.minimum(
                tf.constant(self._curv_win_width), self._global_step + 1),
            axis=0))

    if self._h_min_log_smooth:
        self._h_min_t = tf.log(tf.reduce_min(valid_window) + EPS)
    else:
        self._h_min_t = tf.reduce_min(valid_window)
    if self._h_max_log_smooth:
        self._h_max_t = tf.log(tf.reduce_max(valid_window) + EPS)
    else:
        self._h_max_t = tf.reduce_max(valid_window)

    curv_range_ops = []
    with tf.control_dependencies([self._h_min_t, self._h_max_t]):
        avg_op = self._moving_averager.apply([self._h_min_t, self._h_max_t])
        # Read the averages only after they have been updated.
        with tf.control_dependencies([avg_op]):
            if self._h_min_log_smooth:
                self._h_min = tf.exp(
                    tf.identity(self._moving_averager.average(self._h_min_t)))
            else:
                self._h_min = tf.identity(
                    self._moving_averager.average(self._h_min_t))
            if self._h_max_log_smooth:
                self._h_max = tf.exp(
                    tf.identity(self._moving_averager.average(self._h_max_t)))
            else:
                self._h_max = tf.identity(
                    self._moving_averager.average(self._h_max_t))
            if self._sparsity_debias:
                self._h_min = self._h_min * self._sparsity_avg
                self._h_max = self._h_max * self._sparsity_avg
    curv_range_ops.append(avg_op)
    return curv_range_ops
def get_expand_pad_mask(inputs, pad_idx):
    """Build a float padding mask from token indices.

    A position is 1.0 when its token index is strictly greater than
    `pad_idx` and 0.0 otherwise.

    Args:
      inputs: token indices, [batch_size, time_steps].
      pad_idx: index used for padding.

    Returns:
      float32 mask of shape [batch_size, time_steps, 1].
    """
    above_pad = tf.math.greater(inputs, pad_idx)
    return tf.expand_dims(tf.cast(above_pad, tf.float32), -1)
def embedding_look_up(text_inputs, vocab_size, embedding_size):
    """Look up token embeddings and append a trailing channel axis.

    The embedding matrix `W` lives in the "embedding" variable scope and is
    initialised uniformly in [-1, 1].

    Args:
      text_inputs: token ids to look up.
      vocab_size: number of rows of the embedding matrix.
      embedding_size: number of columns of the embedding matrix.

    Returns:
      The looked-up embeddings with one extra axis of size 1 at the end.
    """
    with tf.variable_scope("embedding"):
        initial_table = tf.random_uniform(
            [vocab_size, embedding_size], -1.0, 1.0)
        W = tf.get_variable(name='W', initializer=initial_table)
        looked_up = tf.nn.embedding_lookup(W, text_inputs)
        return tf.expand_dims(looked_up, -1)
def call(self, tensors):
    """Attend from every left step over all right steps.

    Every (left step, right step) pair is concatenated, scored through
    `self.middle_layer` and `self.attn`, and the scores are softmax
    normalised over the right steps.

    Args:
      tensors: pair `(left, right)` of rank-3 tensors.

    Returns:
      For every left step, the attention-weighted sum of the right steps.
    """
    left, right = tensors
    left_steps = left.shape[1]
    right_steps = right.shape[1]

    # Broadcast both sides to the full [batch, left, right, dim] grid.
    left_grid = tf.tile(
        tf.expand_dims(left, axis=2), [1, 1, right_steps, 1])
    right_grid = tf.tile(
        tf.expand_dims(right, axis=1), [1, left_steps, 1, 1])
    pairs = tf.concat([left_grid, right_grid], axis=-1)

    scores = self.attn(self.middle_layer(pairs))
    scores = tf.squeeze(scores, axis=3)

    # Softmax over the right steps, shifted by the row maximum before exp.
    row_max = tf.reduce_max(scores, axis=-1, keepdims=True)
    unnormalised = tf.exp(scores - row_max)
    normaliser = tf.reduce_sum(unnormalised, axis=-1, keepdims=True)
    weights = unnormalised / normaliser

    return tf.matmul(weights, right)
def call(self, inputs: list, **kwargs) -> typing.Any:
    """
    The computation logic of DynamicPoolingLayer.

    The input is re-sampled at the positions given by the pooling index and
    then max-pooled with a window and stride of
    `(msize1 // psize1, msize2 // psize2)`.

    :param inputs: two input tensors: the tensor to pool and the dynamic
        pooling index.
    """
    self._validate_dpool_size()
    x, dpool_index = inputs

    # Batch coordinate for every (row, column) of the pooling index so it
    # can be placed in front of the per-example coordinates.
    num_examples = tf.shape(dpool_index)[0]
    batch_ids = tf.expand_dims(tf.range(num_examples), axis=-1)
    batch_ids = tf.expand_dims(batch_ids, axis=-1)
    batch_grid = tf.tile(batch_ids, [1, self._msize1, self._msize2])
    batch_grid = tf.expand_dims(batch_grid, axis=-1)

    gather_index = tf.concat([batch_grid, dpool_index], axis=3)
    resampled = tf.gather_nd(x, gather_index)

    window = [
        1,
        self._msize1 // self._psize1,
        self._msize2 // self._psize2,
        1,
    ]
    return tf.nn.max_pool(resampled, window, window, "VALID")
def generate_cmvn(self, filelist=None, dry_run=False):
    """Accumulate mean/variance statistics over one epoch and save them.

    Args:
      filelist: ignored.
      dry_run: when set, the statistics are computed and logged but not
        written to `self._cmvn_path`.
    """
    del filelist
    assert self._stride == 1.0

    batch_size = self.config['solver']['optimizer']['batch_size']
    dataset = self.input_fn(utils.INFER, batch_size, num_epoch=1)()
    features, labels = dataset.make_one_shot_iterator().get_next()
    del labels

    audio_conf = self.taskconf['audio']
    if self.taskconf['suffix'] == '.npy':
        logging.info('generate cmvn from numpy')
        feature = features['inputs']
    else:
        logging.info('genearte cmvn from wav')
        # tf extractor graph
        params = feat_lib.speech_ops.speech_params(
            sr=audio_conf['sr'],
            bins=audio_conf['feature_size'],
            add_delta_deltas=audio_conf['add_delta_deltas'],
            audio_frame_length=audio_conf['winlen'],
            audio_frame_step=audio_conf['winstep'])
        # [batch, Time] -> [batch, time, audio_channel]
        waveforms = tf.expand_dims(features['inputs'], axis=-1)
        # [batch, Time, feat_size, channels]
        feature = feat_lib.speech_ops.batch_extract_feature(waveforms, params)

    # create stats vars
    sums, square, count = utils.create_cmvn_statis(
        audio_conf['feature_size'], audio_conf['add_delta_deltas'])

    # Pull batches until the one-shot iterator is exhausted.
    try:
        with tf.Session() as sess:
            while True:
                batch_feats = sess.run(feature)
                sums, square, count = utils.update_cmvn_statis(
                    batch_feats, sums, square, count, axis=(0, 1))
    except tf.errors.OutOfRangeError:
        pass

    # compute cmvn
    mean, var = utils.compute_cmvn(sums, square, count)
    logging.info('mean:{}'.format(mean))
    logging.info('var:{}'.format(var))
    if not dry_run:
        np.save(self._cmvn_path, (mean, var))
    logging.info('save cmvn:{}'.format(self._cmvn_path))
    logging.info('generate cmvn done')
def _freq_feat_graph(feat_name, **kwargs):
    """Build the extraction graph for a frequency-domain feature.

    Args:
      feat_name: 'fbank' or 'spec' (magnitude spectrum).
      **kwargs: reads 'winlen', 'winstep', 'feature_size' and 'sr'; 'nfft'
        is looked up but not used.

    Returns:
      (graph, (input tensor name, output tensor name)). `graph` is None when
      `feat_name` is already a key of `_global_sess`; otherwise it is a new
      graph with a 'wavpath' string placeholder and an output named
      `feat_name`.

    Raises:
      ValueError: for an unsupported `feat_name` when assertions are
        disabled (an AssertionError is raised otherwise).
    """
    frame_length = kwargs.get('winlen')
    frame_step = kwargs.get('winstep')
    num_bins = kwargs.get('feature_size')
    sample_rate_hz = kwargs.get('sr')
    unused_nfft = kwargs.get('nfft')
    del unused_nfft

    assert feat_name in ('fbank', 'spec')

    params = speech_ops.speech_params(
        sr=sample_rate_hz,
        bins=num_bins,
        add_delta_deltas=False,
        audio_frame_length=frame_length,
        audio_frame_step=frame_step)

    if feat_name not in ('fbank', 'spec'):
        raise ValueError(f"Not support freq feat: {feat_name}.")

    graph = None
    if feat_name not in _global_sess:
        graph = tf.Graph()
        #pylint: disable=not-context-manager
        with graph.as_default():
            wav_path = tf.placeholder(
                dtype=tf.string, shape=[], name='wavpath')
            waveforms, sample_rate = speech_ops.read_wav(wav_path, params)
            if feat_name == 'fbank':
                del sample_rate
                feat = speech_ops.extract_feature(waveforms, params)
            else:
                # output_type: 1, power spec; 2 log power spec
                power_spec = py_x_ops.spectrum(
                    waveforms[:, 0],
                    tf.cast(sample_rate, tf.dtypes.float32),
                    output_type=1)
                # magnitude spec
                feat = tf.expand_dims(tf.sqrt(power_spec), -1)
            # shape must be [T, D, C]; the name is what callers look up.
            tf.identity(feat, name=feat_name)

    return graph, (_get_out_tensor_name('wavpath', 0),
                   _get_out_tensor_name(feat_name, 0))
def _reshape_mask(mask):
    """Repeat a mask once per attention head.

    Reads `self.head_num` from the enclosing scope.

    Input shape: (batch size, steps)
    Output shape: (batch size * head num, steps)
    A `None` mask is passed through unchanged.
    """
    if mask is None:
        return None
    steps = tf.shape(mask)[1]
    # (batch, steps) -> (batch, 1, steps) -> (batch, head_num, steps)
    per_head = tf.tile(tf.expand_dims(mask, axis=1), [1, self.head_num, 1])
    return tf.reshape(per_head, shape=(-1, steps))
def call(self, inps, training=None, mask=None):
    """Decode in training mode (teacher forcing) or with beam search.

    Args:
      inps: `(decoder input, encoder output)` when `self.is_infer` is
        false, otherwise the encoder output alone.
      training: forwarded to `self.decode`.
      mask: encoder mask forwarded to `self.decode`; in inference mode it
        is tiled once per beam first.

    Returns:
      Training mode: the output of `self.final_dense` for every step.
      Inference mode: the ids of the first returned beam without the
      leading start token.
    """
    if not self.is_infer:
        dec_inp, enc_out = inps
        with tf.name_scope('while'):
            dec_out = self.decode(dec_inp, enc_out, training, mask)
            return self.final_dense(dec_out)

    enc_out = inps
    # One start token per batch entry.
    start_ids = tf.cast(
        tf.ones([utils.shape_list(enc_out)[0]]) * self.sos_id, tf.int32)

    # Beam Search: repeat encoder output and mask for every beam and merge
    # the beam axis into the batch axis.
    enc_shape = utils.shape_list(enc_out)
    flat_batch = enc_shape[0] * self.beam_size
    beam_enc_out = tf.tile(
        tf.expand_dims(enc_out, axis=1), [1, self.beam_size, 1, 1])
    beam_enc_out = tf.reshape(
        beam_enc_out, [flat_batch, enc_shape[1], enc_shape[2]])
    beam_enc_mask = tf.tile(
        tf.expand_dims(mask, axis=1), [1, self.beam_size, 1, 1, 1])
    beam_enc_mask = tf.reshape(beam_enc_mask, [flat_batch, 1, 1, -1])

    def symbols_to_logits_fn(dec_inps):
        """Return the logits of the last decoded position."""
        step_out = self.decode(dec_inps, beam_enc_out, training,
                               beam_enc_mask)
        return self.final_dense(step_out)[:, -1, :]

    decoded_ids, _, _ = self.beam_search(
        symbols_to_logits_fn, start_ids, self.beam_size, self.max_dec_len,
        self.vocab_size, self.length_penalty, self.eos_id)
    return decoded_ids[:, 0, 1:]
def pooling_layer(self, x, time_len):
    '''Collapse the time axis with attention or with max pooling.

    With `self.attention` set, the attention weights are stored in
    `self.alphas` and written to an image summary named 'alignment'.
    Otherwise the input is max-pooled over `time_len` steps and reshaped to
    [-1, 2 * cell_num] (LSTM layer in use) or [-1, linear_num].
    '''
    with tf.variable_scope('time_pooling'):
        if not self.attention:
            uses_lstm = self.netconf['use_lstm_layer']
            if uses_lstm:
                x = tf.concat(x, 2)
            # [batch, seq_len, dim, 1]
            x = tf.expand_dims(x, axis=-1)
            x = common_layers.max_pool(
                x, ksize=[time_len, 1], strides=[time_len, 1])
            if self.netconf['use_lstm_layer']:
                feature_dim = 2 * self.netconf['cell_num']
            else:
                feature_dim = self.netconf['linear_num']
            x = tf.reshape(x, [-1, feature_dim])
        else:
            x, self.alphas = common_layers.attention(
                x, self.netconf['attention_size'], return_alphas=True)
            # alphas shape [batch, time, 1] -> [1, batch, time, 1]
            # -> [1, time, batch, 1]
            alignment = tf.transpose(
                tf.expand_dims(self.alphas, 0), [0, 2, 1, 3])
            tf.summary.image('alignment', alignment)
    return x
def _make_example(uttids, feats, ilens, targets, olens):
    """Pack one batch into the `(features, labels)` pair.

    `feats` gets a trailing axis of size 1. NumPy arrays are handled with
    NumPy, anything else with TensorFlow ops.

    Returns:
      features: dict with 'uttids', 'inputs', 'input_length', 'targets'
        and 'target_length'.
      labels: dict with a 'ctc' vector of ones, one per example (dummy data
        for a dummy loss function).
    """
    if isinstance(feats, np.ndarray):
        inputs = np.expand_dims(feats, axis=-1)
        dummy_ctc = np.ones(feats.shape[0])
    else:
        inputs = tf.expand_dims(feats, axis=-1)
        dummy_ctc = tf.ones(tf.shape(feats)[0])

    features = {
        'uttids': uttids,
        'inputs': inputs,
        'input_length': ilens,
        'targets': targets,
        'target_length': olens,
    }
    labels = {'ctc': dummy_ctc}
    return features, labels
def call(self, inputs, training=None, mask=None):
    """Score a batch of token sequences with parallel convolution branches.

    The embedded input gets a trailing channel axis, goes through every
    layer in `self.conv2ds` and the matching layer in `self.pools`, and the
    pooled outputs are concatenated on axis 1, flattened, passed through
    dropout and a dense layer, and (optionally combined with, or replaced
    by, the projected dense features) through the final dense layer.

    Args:
      inputs: mapping holding the token ids under "input_x" and, when
        `self.use_dense_task` is set, dense features under "input_dense".
      training: forwarded to the dropout layer.
      mask: not read.

    Returns:
      The output of `self.final_dense`.
    """
    token_ids = tf.identity(inputs["input_x"], name="input_x")
    if self.use_dense_task:
        dense_features = inputs["input_dense"]

    embedded = tf.expand_dims(self.embed(token_ids), axis=-1)

    # All convolutions first, then all poolings, as separate passes.
    branch_convs = [branch(embedded) for branch in self.conv2ds]
    branch_pools = [
        pool(conv_out) for conv_out, pool in zip(branch_convs, self.pools)
    ]

    hidden = tf.keras.layers.Concatenate(axis=1)(branch_pools)
    hidden = self.flat(hidden)
    hidden = self.dropout(hidden, training=training)
    hidden = self.dense(hidden)

    if self.use_dense_input:
        dense_hidden = self.dense_input_linear(dense_features)
        if self.only_dense_input:
            hidden = dense_hidden
        else:
            hidden = tf.keras.layers.Concatenate()([hidden, dense_hidden])

    return self.final_dense(hidden)
def _create_make_unique(inputs):
    """Replaces the lower bits of each element with iota.

    The float32 bit pattern of every element keeps its upper bits, while
    its lowest ceil(log2(width)) bits are overwritten with the element's
    column index. Elements whose bits (ignoring the sign bit) are all zero
    first get bit 23 set so that the result is not a zero/subnormal
    pattern.

    Args:
      inputs: rank-2 float32 tensor of shape [height, width].

    Returns:
      A float32 tensor of shape [height, width] with the modified bits.

    Raises:
      ValueError: if `inputs` is not rank 2.
    """
    if inputs.shape.ndims != 2:
        raise ValueError("Input of top_k_with_unique must be rank-2 "
                         "but got: %s" % inputs.shape)

    height = inputs.shape[0]
    width = inputs.shape[1]
    zeros = tf.zeros([height, width], dtype=tf.int32)

    # Mask that clears the low ceil(log2(width)) bits.
    log2_ceiling = int(math.ceil(math.log(int(width), 2)))
    next_power_of_two = 1 << log2_ceiling
    count_mask = ~(next_power_of_two - 1)
    count_mask_r0 = tf.constant(count_mask)
    count_mask_r2 = tf.fill([height, width], count_mask_r0)

    # 1 << 23 is the bit pattern of the smallest normal float32.
    smallest_normal = 1 << 23
    smallest_normal_r0 = tf.constant(smallest_normal, dtype=tf.int32)
    smallest_normal_r2 = tf.fill([height, width], smallest_normal_r0)

    # Every bit except the sign bit.
    low_bit_mask = ~(1 << 31)
    low_bit_mask_r0 = tf.constant(low_bit_mask, dtype=tf.int32)
    low_bit_mask_r2 = tf.fill([height, width], low_bit_mask_r0)

    # Column index of every element.
    iota = tf.tile(
        tf.expand_dims(tf.range(width, dtype=tf.int32), 0), [height, 1])

    # Compare the sign-stripped bits with zero and replace such elements by
    # the smallest normal value, keeping the sign bit.
    input_r2 = tf.bitcast(inputs, tf.int32)
    abs_r2 = tf.bitwise.bitwise_and(input_r2, low_bit_mask_r2)
    if_zero_r2 = tf.equal(abs_r2, zeros)
    smallest_normal_preserving_sign_r2 = tf.bitwise.bitwise_or(
        input_r2, smallest_normal_r2)
    input_no_zeros_r2 = tf.where(if_zero_r2,
                                 smallest_normal_preserving_sign_r2, input_r2)

    # Clear the low bits, then write the column index into them.
    and_r2 = tf.bitwise.bitwise_and(input_no_zeros_r2, count_mask_r2)
    or_r2 = tf.bitwise.bitwise_or(and_r2, iota)
    return tf.bitcast(or_r2, tf.float32)
def call(self, filename, audio_data, sample_rate=None):
    """
    Write a wav file from audio_data[tensor].

    :param filename: filepath of wav.
    :param audio_data: a tensor containing data of a wav.
    :param sample_rate: [optional] the sample rate of the signal we are
                        working with; defaults to the configured sample
                        rate and must be equal to it.
    :return: the write-wav operation.
    """
    p = self.config
    filename = tf.constant(filename)

    # Identity check: `== None` is not a reliable test on a tensor.
    if sample_rate is None:
        sample_rate = tf.constant(p.sample_rate, dtype=tf.int32)

    # Fail at run time if the caller's sample rate differs from config.
    assert_op = tf.assert_equal(
        tf.constant(p.sample_rate), tf.cast(sample_rate, dtype=tf.int32))
    with tf.control_dependencies([assert_op]):
        audio_data = tf.cast(audio_data, dtype=tf.float32)
        # encode_wav gets a channel axis appended to the samples.
        contents = tf.audio.encode_wav(
            tf.expand_dims(audio_data, 1),
            tf.cast(sample_rate, dtype=tf.int32))
        w = tf.io.write_file(filename, contents)
    return w
def call(self, inputs, training=None, mask=None):
    """Compute intent and slot logits from one shared encoder.

    Args:
      inputs: mapping holding the token ids under "input_x".
      training: forwarded to the embedding dropout only.
      mask: ignored; the mask is rebuilt from the computed lengths.

    Returns:
      (intent_out, slots_out): tensors named "intent_logits" and
      "slots_logits".
    """
    # [batch_size, max_len]
    token_ids = inputs["input_x"]
    lengths = compute_sen_lens(token_ids, padding_token=self.padding_token)
    # [batch_size, max_len, 1]
    valid_steps = tf.sequence_mask(lengths, self.max_len, dtype=tf.float32)
    step_mask = tf.expand_dims(valid_steps, axis=-1)

    # [batch_size, max_len, embed_len]
    encoded = self.embed_dropout(self.embed(token_ids), training=training)
    encoded = self.bi_rnn(encoded)

    # Intent head: attention pooling over the steps, then dense.
    intent_hidden = self.dropout(self.attention(encoded, mask=step_mask))
    intent_out = tf.identity(
        self.intent_dense(intent_hidden), name="intent_logits")

    # Slot head: dense on the encoder output.
    slots_hidden = self.dropout(encoded)
    slots_out = tf.identity(
        self.slots_dense(slots_hidden), name="slots_logits")

    return intent_out, slots_out
def extract_logfbank_with_delta(waveforms, params):
    '''Compute log mel filterbank features, optionally with delta-deltas.

    params:
      waveforms: float32 tensor with shape [max_len]
      params: object providing the `audio_*` settings read below.

    returns:
      The filterbank features, passed through `delta_delta` when
      `params.audio_add_delta_deltas` is set and given a trailing axis
      otherwise. Intended shape per the original note:
      [nframes, nbins, nchannels].
    '''
    fbank_options = dict(
        sample_rate=params.audio_sample_rate,
        preemphasis=params.audio_preemphasis,
        frame_length=params.audio_frame_length,
        frame_step=params.audio_frame_step,
        lower_edge_hertz=params.audio_lower_edge_hertz,
        upper_edge_hertz=params.audio_upper_edge_hertz,
        num_mel_bins=params.audio_num_mel_bins,
        apply_mask=False)
    mel_fbanks = compute_mel_filterbank_features(waveforms, **fbank_options)

    if params.audio_add_delta_deltas:
        return delta_delta(mel_fbanks)
    # shape: [nframes, nbins, nchannels]
    return tf.expand_dims(mel_fbanks, axis=-1)
def fbank_feat(powspec,
               sr=8000,
               feature_size=40,
               nfft=512,
               lowfreq=0,
               highfreq=None):
    '''Compute filterbank features from a power spectrum.

    powspec: [audio_channels, spectrogram_length, spectrogram_feat_dim];
      a rank-2 input gets a leading axis of size 1 first.
    sr: sample rate passed to the fbank op.
    feature_size: number of filterbank channels.
    nfft: ignored.
    lowfreq / highfreq: lower / upper frequency limits of the filterbank.
    return : [audio_channels, nframe, nfbank]
    '''
    del nfft

    is_rank_two = tf.equal(tf.rank(powspec), 2)
    channel_spec = tf.cond(is_rank_two,
                           lambda: tf.expand_dims(powspec, 0),
                           lambda: powspec)

    return py_x_ops.fbank(
        channel_spec,
        sr,
        filterbank_channel_count=feature_size,
        lower_frequency_limit=lowfreq,
        upper_frequency_limit=highfreq,
    )
def call(self, inputs: list, **kwargs) -> typing.Any:
    """
    The computation logic of MatchingLayer.

    For `dot` matching the result is the pairwise dot product of the two
    inputs (L2-normalised first when `self._normalize` is set) with a
    trailing axis of size 1. For the other types both inputs are repeated
    to a common grid and combined element-wise (`mul`, `plus`, `minus`) or
    concatenated on the last axis (`concat`).

    :param inputs: two input tensors.
    :raises ValueError: if the matching type is not one of `dot`, `mul`,
        `plus`, `minus` and `concat`.
    """
    x1 = inputs[0]
    x2 = inputs[1]

    if self._matching_type == 'dot':
        if self._normalize:
            x1 = tf.math.l2_normalize(x1, axis=2)
            x2 = tf.math.l2_normalize(x2, axis=2)
        return tf.expand_dims(tf.einsum('abd,acd->abc', x1, x2), 3)

    if self._matching_type == 'mul':

        def func(x, y):
            return x * y
    elif self._matching_type == 'plus':

        def func(x, y):
            return x + y
    elif self._matching_type == 'minus':

        def func(x, y):
            return x - y
    elif self._matching_type == 'concat':

        def func(x, y):
            return tf.concat([x, y], axis=3)
    else:
        # The original fragments were joined without separating spaces and
        # contained the typo "Mut".
        raise ValueError(f"Invalid matching type. "
                         f"{self._matching_type} received. "
                         f"Must be in `dot`, `mul`, `plus`, "
                         f"`minus` and `concat`.")

    # Repeat x1 along a new axis 2 and x2 along a new axis 1 so that every
    # (x1 step, x2 step) pair lines up.
    x1_exp = tf.stack([x1] * self._shape2[1], 2)
    x2_exp = tf.stack([x2] * self._shape1[1], 1)
    return func(x1_exp, x2_exp)
def compute_mel_filterbank_features(waveforms,
                                    sample_rate=16000,
                                    dither=1.0 / np.iinfo(np.int16).max,
                                    preemphasis=0.97,
                                    frame_length=25,
                                    frame_step=10,
                                    fft_length=None,
                                    window_fn=functools.partial(
                                        tf.signal.hann_window, periodic=True),
                                    lower_edge_hertz=80.0,
                                    upper_edge_hertz=7600.0,
                                    num_mel_bins=80,
                                    log_noise_floor=1e-3,
                                    apply_mask=True):
    """Extract log mel-filterbank features with TensorFlow ops.

    Args:
      waveforms: float32 tensor with shape [batch_size, max_len]
      sample_rate: sampling rate of the waveform
      dither: stddev of Gaussian noise added to waveform to prevent
        quantization artefacts
      preemphasis: waveform high-pass filtering constant
      frame_length: frame length in ms
      frame_step: frame step in ms
      fft_length: number of fft bins; defaults to the next power of two of
        the frame length in samples
      window_fn: windowing function
      lower_edge_hertz: lowest frequency of the filterbank
      upper_edge_hertz: highest frequency of the filterbank
      num_mel_bins: filterbank size
      log_noise_floor: lower clip applied to the mel energies before the
        log
      apply_mask: When working on a batch of samples, set padding frames to
        zero

    Returns:
      filterbanks: a float32 tensor with shape [batch_size, len, num_bins, 1]
    """
    # Wave length = largest index whose sample is != 0, plus one. Samples
    # that are exactly 0.0 are quite common inside a waveform, so simply
    # doing sum(waveforms != 0, axis=-1) would not work correctly.
    sample_index = tf.expand_dims(tf.range(tf.shape(waveforms)[1]), 0)
    is_nonzero = tf.to_int32(tf.not_equal(waveforms, 0.0))
    wav_lens = tf.reduce_max(sample_index * is_nonzero, axis=-1) + 1

    if dither > 0:
        waveforms += tf.random_normal(tf.shape(waveforms), stddev=dither)
    if preemphasis > 0:
        waveforms = waveforms[:, 1:] - preemphasis * waveforms[:, :-1]
        # The difference above drops one sample.
        wav_lens -= 1

    # Milliseconds -> samples.
    samples_per_frame = int(frame_length * sample_rate / 1e3)
    samples_per_step = int(frame_step * sample_rate / 1e3)
    if fft_length is None:
        fft_length = int(2**(np.ceil(np.log2(samples_per_frame))))

    # Short-time Fourier transform of each signal, shaped
    # [batch_size, ?, fft_unique_bins] with
    # fft_unique_bins = fft_length // 2 + 1.
    stfts = tf.signal.stft(
        waveforms,
        frame_length=samples_per_frame,
        frame_step=samples_per_step,
        fft_length=fft_length,
        window_fn=window_fn,
        pad_end=True)

    # Frames whose index is <= the per-example frame count are kept.
    stft_lens = (wav_lens + (samples_per_step - 1)) // samples_per_step
    frame_index = tf.expand_dims(tf.range(tf.shape(stfts)[1]), 0)
    masks = tf.to_float(
        tf.less_equal(frame_index, tf.expand_dims(stft_lens, 1)))

    # Magnitude of the complex-valued STFT.
    magnitudes = tf.abs(stfts)

    # Warp the linear-scale, magnitude spectrograms into the mel-scale.
    num_linear_bins = magnitudes.shape[-1].value
    mel_weights = tf.signal.linear_to_mel_weight_matrix(
        num_mel_bins, num_linear_bins, sample_rate, lower_edge_hertz,
        upper_edge_hertz)
    mel = tf.tensordot(magnitudes, mel_weights, 1)
    # Note: Shape inference for tensordot does not currently handle this
    # case.
    mel.set_shape(magnitudes.shape[:-1].concatenate(mel_weights.shape[-1:]))

    log_mel = tf.log(tf.maximum(log_noise_floor, mel))
    if apply_mask:
        log_mel *= tf.expand_dims(tf.to_float(masks), -1)

    return tf.expand_dims(log_mel, -1, name="mel_sgrams")
def beam_search(symbols_to_logits_fn,
                initial_ids,
                beam_size,
                decode_length,
                vocab_size,
                alpha,
                eos_id,
                states=None,
                stop_early=True,
                INF=1. * 1e20):
    """Beam search with length penalties.

    Args:
      symbols_to_logits_fn: callable giving next-token logits. It is called
        as `fn(flat_ids)` when `states` is empty and as
        `fn(flat_ids, i, flat_states)` (returning logits and new states)
        otherwise; `flat_ids` is (batch_size * beam_size, decoded_length).
      initial_ids: ids every beam starts from; its first dimension is taken
        as the batch size.
      beam_size: number of beams kept per batch entry.
      decode_length: maximum number of decoding steps.
      vocab_size: size of the logits' last dimension.
      alpha: exponent of the length penalty `((5 + length) / 6) ** alpha`.
      eos_id: token id that marks a sequence as finished.
      states: optional nested structure of decoding states.
      stop_early: selects which finished score is compared against the
        alive bound in the termination check (see `_is_finished`).
      INF: large constant used to mask out scores.

    Returns:
      (finished_seq, finished_scores, states). For batch entries without
      any finished beam the alive sequences and log probabilities are
      returned instead.
    """
    batch_size = utils.shape_list(initial_ids)[0]

    # Only the first beam starts with log probability 0; the others start
    # at -INF.
    initial_log_probs = tf.constant([[0.] + [-INF] * (beam_size - 1)])
    # (batch_size, beam_size)
    alive_log_probs = tf.tile(initial_log_probs, [batch_size, 1])

    alive_seq = utils.expand_to_beam_size(initial_ids, beam_size)
    # (batch_size, beam_size, 1)
    alive_seq = tf.expand_dims(alive_seq, axis=2)
    if states:
        states = nest.map_structure(
            lambda state: utils.expand_to_beam_size(state, beam_size), states)
    else:
        states = {}

    # (batch_size, beam_size, 1)
    finished_seq = tf.zeros(utils.shape_list(alive_seq), tf.int32)
    # (batch_size, beam_size)
    finished_scores = tf.ones([batch_size, beam_size]) * -INF
    # (batch_size, beam_size)
    finished_flags = tf.zeros([batch_size, beam_size], tf.bool)

    def grow_finished(finished_seq, finished_scores, finished_flags, curr_seq,
                      curr_scores, curr_finished):
        """
        Given sequences and scores from the finished sequences and the
        current candidates, gather the top k=beam size sequences to update
        the finished sequences.
        """
        # padding zero for finished seq
        finished_seq = tf.concat(
            [finished_seq,
             tf.zeros([batch_size, beam_size, 1], tf.int32)],
            axis=2)

        # mask unfinished curr seq
        curr_scores += (1. - tf.to_float(curr_finished)) * -INF

        # concatenating the sequences and scores along beam axis
        # (batch_size, 2xbeam_size, seq_len)
        curr_finished_seq = tf.concat([finished_seq, curr_seq], axis=1)
        curr_finished_scores = tf.concat([finished_scores, curr_scores],
                                         axis=1)
        curr_finished_flags = tf.concat([finished_flags, curr_finished],
                                        axis=1)
        return utils.compute_topk_scores_and_seq(
            curr_finished_seq, curr_finished_scores, curr_finished_scores,
            curr_finished_flags, beam_size, batch_size, "grow_finished")

    def grow_alive(curr_seq, curr_scores, curr_log_probs, curr_finished,
                   states):
        """Given sequences and scores, will gather the top k=beam size sequences."""
        # mask finished candidates so that only unfinished ones stay alive
        curr_scores += tf.to_float(curr_finished) * -INF
        return utils.compute_topk_scores_and_seq(curr_seq, curr_scores,
                                                 curr_log_probs, curr_finished,
                                                 beam_size, batch_size,
                                                 "grow_alive", states)

    def grow_topk(i, alive_seq, alive_log_probs, states):
        """Inner beam search loop."""
        flat_ids = tf.reshape(alive_seq, [batch_size * beam_size, -1])

        # (batch_size * beam_size, decoded_length)
        if states:
            flat_states = nest.map_structure(utils.merge_beam_dim, states)
            flat_logits, flat_states = symbols_to_logits_fn(
                flat_ids, i, flat_states)
            states = nest.map_structure(
                lambda t: utils.unmerge_beam_dim(t, batch_size, beam_size),
                flat_states)
        else:
            flat_logits = symbols_to_logits_fn(flat_ids)

        logits = tf.reshape(flat_logits, [batch_size, beam_size, -1])
        candidate_log_probs = utils.log_prob_from_logits(logits)
        log_probs = candidate_log_probs + tf.expand_dims(alive_log_probs,
                                                         axis=2)

        # length penalty ((5 + length) / 6) ** alpha with length = i + 1
        length_penalty = tf.pow(((5. + tf.to_float(i + 1)) / 6.), alpha)

        curr_scores = log_probs / length_penalty
        flat_curr_scores = tf.reshape(curr_scores,
                                      [-1, beam_size * vocab_size])

        # keep 2 * beam_size candidates per batch entry
        topk_scores, topk_ids = tf.nn.top_k(flat_curr_scores, k=beam_size * 2)

        topk_log_probs = topk_scores * length_penalty

        # a flat id encodes (beam, token) as beam * vocab_size + token
        topk_beam_index = topk_ids // vocab_size
        topk_ids %= vocab_size  # Unflatten the ids

        batch_pos = utils.compute_batch_indices(batch_size, beam_size * 2)

        topk_coordinates = tf.stack([batch_pos, topk_beam_index], axis=2)

        topk_seq = tf.gather_nd(alive_seq, topk_coordinates)
        if states:
            states = nest.map_structure(
                lambda state: tf.gather_nd(state, topk_coordinates), states)

        topk_seq = tf.concat(
            [topk_seq, tf.expand_dims(topk_ids, axis=2)], axis=2)

        topk_finished = tf.equal(topk_ids, eos_id)

        return topk_seq, topk_log_probs, topk_scores, topk_finished, states

    def inner_loop(i, alive_seq, alive_log_probs, finished_seq,
                   finished_scores, finished_flags, states):
        """Inner beam search loop."""
        topk_seq, topk_log_probs, topk_scores, topk_finished, states = grow_topk(
            i, alive_seq, alive_log_probs, states)
        alive_seq, alive_log_probs, _, states = grow_alive(
            topk_seq, topk_scores, topk_log_probs, topk_finished, states)
        finished_seq, finished_scores, finished_flags, _ = grow_finished(
            finished_seq, finished_scores, finished_flags, topk_seq,
            topk_scores, topk_finished)

        return (i + 1, alive_seq, alive_log_probs, finished_seq,
                finished_scores, finished_flags, states)

    def _is_finished(i, unused_alive_seq, alive_log_probs, unused_finished_seq,
                     finished_scores, unused_finished_in_finished,
                     unused_states):
        """Checking termination condition.

        Returns True (keep looping) while `i < decode_length` and the
        finished scores do not yet exceed the alive bound for every batch
        entry.
        """
        # bound for the alive beams: log probability of the first alive
        # beam divided by the penalty at the maximum length
        max_length_penalty = tf.pow(((5. + tf.to_float(decode_length)) / 6.),
                                    alpha)
        lower_bound_alive_scores = alive_log_probs[:, 0] / max_length_penalty

        if not stop_early:
            # minimum over all finished scores of all batch entries
            lowest_score_of_finished_in_finished = tf.reduce_min(
                finished_scores)
        else:
            # best finished score of every batch entry
            lowest_score_of_finished_in_finished = tf.reduce_max(
                finished_scores, axis=1)

        bound_is_met = tf.reduce_all(
            tf.greater(lowest_score_of_finished_in_finished,
                       lower_bound_alive_scores))

        return tf.logical_and(tf.less(i, decode_length),
                              tf.logical_not(bound_is_met))

    # the sequence tensors grow by one position per step, so their shape
    # invariant leaves every dimension open
    inner_shape = tf.TensorShape([None, None, None])

    state_struc = nest.map_structure(utils.get_state_shape_invariants, states)
    (_, alive_seq, alive_log_probs, finished_seq, finished_scores,
     finished_flags, states) = tf.while_loop(
         _is_finished,
         inner_loop, [
             tf.constant(0), alive_seq, alive_log_probs, finished_seq,
             finished_scores, finished_flags, states
         ],
         shape_invariants=[
             tf.TensorShape([]), inner_shape,
             alive_log_probs.get_shape(), inner_shape,
             finished_scores.get_shape(),
             finished_flags.get_shape(), state_struc
         ],
         parallel_iterations=1,
         back_prop=False)

    alive_seq.set_shape((None, beam_size, None))
    finished_seq.set_shape((None, beam_size, None))

    # fall back to the alive beams for batch entries without any finished
    # sequence
    finished_seq = tf.where(tf.reduce_any(finished_flags, 1), finished_seq,
                            alive_seq)
    finished_scores = tf.where(tf.reduce_any(finished_flags, 1),
                               finished_scores, alive_log_probs)
    return finished_seq, finished_scores, states