def tdnn_block(self, inputs):
    """Stack the configured TDNN layers on top of a 4-D input.

    Layer contexts and widths are read from ``self.netconf['tdnn_contexts']``
    and ``self.netconf['tdnn_dims']``; both lists must have the same length.
    The TDNN implementation is ``self.netconf['tdnn_method']`` when that key
    is present, otherwise ``'splice_layer'``.

    Args:
      inputs: tensor whose axes 1, 2 and 3 are read here -- presumably NHWC
        with time on axis 1 (per the reshape comment below); TODO confirm.

    Returns:
      A tuple ``(x, input_len)``: the output of the last layer and
      ``self.input_len``, which is passed through unchanged.
    """
    netconf = self.netconf
    if 'tdnn_method' in netconf:
        tdnn_method = netconf['tdnn_method']
    else:
        # Runs faster, supports discrete context, for now.
        tdnn_method = 'splice_layer'

    contexts = netconf['tdnn_contexts']
    logging.info("tdnn_contexts : {}".format(contexts))
    dims = netconf['tdnn_dims']
    logging.info("tdnn_dims : {}".format(dims))
    num_layers = len(contexts)
    assert num_layers == len(dims)
    channels = [self.input_channels] + dims
    logging.info("tdnn_channels : {}".format(channels))

    time_t = tf.shape(inputs)[1]
    width = inputs.shape[2]
    depth = inputs.shape[3]
    if tdnn_method == 'conv1d':
        # NHWC -> NW'C, W' = H * W
        inputs = tf.reshape(inputs, [-1, time_t * width, depth])
        in_dim = channels[0]
    else:
        inputs = tf.reshape(inputs, [-1, time_t, width * depth])
        in_dim = width * depth

    # No step below changes this value; it is returned as read.
    input_len = self.input_len
    with tf.variable_scope('tdnn'):
        x = tf.identity(inputs)
        for layer_id in range(1, num_layers + 1):
            out_dim = channels[layer_id]
            with tf.variable_scope('unit-' + str(layer_id)):
                x = common_layers.tdnn(x,
                                       'tdnn-' + str(layer_id),
                                       in_dim,
                                       contexts[layer_id - 1],
                                       out_dim,
                                       has_bias=True,
                                       method=tdnn_method)
                in_dim = out_dim
                x = tf.nn.relu(x)
                if netconf['use_bn']:
                    x = tf.layers.batch_normalization(
                        x,
                        axis=-1,
                        momentum=0.9,
                        training=self.train,
                        name='bn' + str(layer_id))
                if netconf['use_dropout']:
                    x = tf.layers.dropout(x,
                                          netconf['dropout_rate'],
                                          training=self.train)
    return x, input_len
def attention(inputs, attention_size, time_major=False, return_alphas=False):
    """Attention pooling over the time axis.

    Args:
      inputs: a (B, T, D) tensor, a (T, B, D) tensor when `time_major` is
        set, or a tuple of such tensors (e.g. the forward and backward
        outputs of a Bi-RNN), which is concatenated along axis 2.
      attention_size: width A of the attention projection.
      time_major: if True, `inputs` is transposed from (T, B, D) to (B, T, D).
      return_alphas: if True, the attention weights are returned as well.

    Returns:
      The (B, D) weighted sum over time, or ``(output, alphas)`` with
      `alphas` of shape (B, T, 1) when `return_alphas` is set.
    """
    if isinstance(inputs, tuple):
        # Bi-RNN case: join forward and backward outputs on the feature axis.
        inputs = tf.concat(inputs, 2)
    if time_major:
        # (T, B, D) => (B, T, D)
        inputs = tf.transpose(inputs, [1, 0, 2])

    # NOTE(review): T and D come from the static shape and are used in the
    # reshapes below, so both presumably must be known at graph build time.
    num_steps = inputs.shape[1].value
    depth = inputs.shape[2].value

    def _init(shape):
        return tf.random_normal(shape, stddev=0.1)

    # Trainable parameters.
    proj_w = tf.get_variable(name='W_omega',
                             initializer=_init([depth, attention_size]))
    proj_b = tf.get_variable(name='b_omega',
                             initializer=_init([attention_size]))
    context_u = tf.get_variable(name='u_omega',
                                initializer=_init([attention_size, 1]))

    # (B, T, D) x (D, A) -> (B, T, A), done as a 2-D matmul on B*T rows.
    logging.info('attention inputs: {}'.format(inputs.shape))
    flat_inputs = tf.reshape(inputs, [-1, depth])
    projected = tf.reshape(tf.matmul(flat_inputs, proj_w),
                           [-1, num_steps, attention_size])
    v = tf.sigmoid(projected + proj_b)
    logging.info(f'attention vector: {v.shape}')

    # Reduce each A-sized vector with `context_u`: (B*T, A) x (A, 1) -> (B, T).
    vu = tf.matmul(tf.reshape(v, [-1, attention_size]), context_u)
    vu = tf.squeeze(vu, axis=-1)
    vu = tf.reshape(vu, [-1, num_steps])
    logging.info(f'attention energe: {vu.shape}')

    # (B, T) weights, expanded to (B, T, 1) so they broadcast over D.
    alphas = tf.expand_dims(tf.nn.softmax(vu), -1)
    # [batch, time, dim] -> [batch, dim]
    output = tf.reduce_sum(inputs * alphas, 1)

    if return_alphas:
        return output, alphas
    return output
def grow_topk(i, alive_seq, alive_log_probs, states):
    """Inner beam search loop.

    Extends every alive sequence by one symbol and keeps the ``2 * beam_size``
    best candidates per batch entry. `batch_size`, `beam_size`, `vocab_size`,
    `alpha`, `eos_id` and `symbols_to_logits_fn` are taken from the enclosing
    scope.

    Args:
      i: current decoding step, used in the length penalty.
      alive_seq: sequences reshaped here to (batch_size * beam_size, length).
      alive_log_probs: log probabilities of the alive sequences; expanded on
        axis 2 so they broadcast against the per-symbol log probabilities.
      states: nested decoding states, or a falsy value when there are none.

    Returns:
      ``(topk_seq, topk_log_probs, topk_scores, topk_finished, states)``.
    """
    # (batch_size * beam_size, decoded_length)
    flat_ids = tf.reshape(alive_seq, [batch_size * beam_size, -1])

    if states:
        merged_states = nest.map_structure(_merge_beam_dim, states)
        flat_logits, merged_states = symbols_to_logits_fn(
            flat_ids, i, merged_states)
        states = nest.map_structure(
            lambda t: _unmerge_beam_dim(t, batch_size, beam_size),
            merged_states)
    else:
        flat_logits = symbols_to_logits_fn(flat_ids)

    logits = tf.reshape(flat_logits, [batch_size, beam_size, -1])
    step_log_probs = log_prob_from_logits(logits)
    total_log_probs = step_log_probs + tf.expand_dims(alive_log_probs, axis=2)

    # Scores are length-normalised log probabilities.
    length_penalty = tf.pow(((5. + tf.to_float(i + 1)) / 6.), alpha)
    scores = total_log_probs / length_penalty
    flat_scores = tf.reshape(scores, [-1, beam_size * vocab_size])

    topk_scores, flat_topk_ids = tf.nn.top_k(flat_scores, k=beam_size * 2)
    # Undo the normalisation to recover the raw log probabilities.
    topk_log_probs = topk_scores * length_penalty

    # A flat index encodes (beam index, symbol id).
    topk_beam_index = flat_topk_ids // vocab_size
    topk_ids = flat_topk_ids % vocab_size

    # Gather the parent sequence (and state) of every selected candidate.
    batch_pos = compute_batch_indices(batch_size, beam_size * 2)
    topk_coordinates = tf.stack([batch_pos, topk_beam_index], axis=2)
    parent_seq = tf.gather_nd(alive_seq, topk_coordinates)
    if states:
        states = nest.map_structure(
            lambda state: tf.gather_nd(state, topk_coordinates), states)

    topk_seq = tf.concat([parent_seq, tf.expand_dims(topk_ids, axis=2)],
                         axis=2)
    topk_finished = tf.equal(topk_ids, eos_id)
    return topk_seq, topk_log_probs, topk_scores, topk_finished, states
def call(self, inputs, training=None, mask=None):
    """Score a batch: embed, max-pool over axis 1, then classify.

    Args:
      inputs: dict with key ``"input_x"`` and, when `self.use_dense_task` is
        set, ``"input_dense"``.
      training: forwarded to `self.embed_d`.
      mask: not used by this method.

    Returns:
      The output of `self.final_dense` -- [batch_size, class_num] according
      to the original comment.
    """
    input_x = inputs["input_x"]
    # NOTE(review): `dense_input` is bound under `use_dense_task` but consumed
    # under `use_dense_input`; presumably the latter implies the former.
    if self.use_dense_task:
        dense_input = inputs["input_dense"]

    # [batch_size, max_len, embed_len]
    out = self.embed(input_x)
    if not self.use_pretrained_model:
        out = tf.reduce_max(out, axis=1)
    else:
        logging.info("use_pretrained_model: {}, {}".format(
            self.pretrained_model_name, self.pretrained_model_mode))
        if self.pretrained_model_name == 'elmo':
            # Append the pretrained features to the embeddings, then pool.
            pretrained = tf.reshape(
                self.get_pre_train_graph(input_x),
                [-1, self.max_len, self.pretrained_model_dim])
            out = tf.reduce_max(tf.concat([out, pretrained], axis=-1), axis=1)
        if self.pretrained_model_name == 'bert':
            # The pretrained graph output replaces the embeddings entirely.
            out = self.get_pre_train_graph(input_x)

    out = self.embed_d(out, training=training)

    if self.use_dense_input:
        dense_out = self.dense_input_linear(dense_input)
        if self.only_dense_input:
            out = dense_out
        else:
            out = tf.keras.layers.Concatenate()([out, dense_out])

    # [batch_size, class_num]
    return self.final_dense(out)
def grad_variance(self):
    """Build the ops that estimate the gradient variance.

    Updates the moving average of the gradients, then sets `self._grad_avg`,
    `self._grad_avg_squared` and `self._grad_var`. The variance is the
    averaged squared gradient norm minus the squared norm of the averaged
    gradient, floored at `EPS`, and scaled by `self._sparsity_avg` when
    `self._sparsity_debias` is set.

    Returns:
      A one-element list holding the moving-average update op.
    """

    def _densify(var, grad):
        # Sparse gradients are summed per index and reshaped like `var`.
        if not isinstance(grad, ops.IndexedSlices):
            return grad
        summed = tf.unsorted_segment_sum(grad.values, grad.indices,
                                         grad.dense_shape[0])
        return tf.reshape(summed, shape=var.get_shape())

    dense_grads = [
        _densify(var, grad) for var, grad in zip(self._tvars, self._grads)
    ]

    avg_op = self._moving_averager.apply(dense_grads)
    # Read the averages only after they have been updated.
    with tf.control_dependencies([avg_op]):
        self._grad_avg = [
            self._moving_averager.average(grad) for grad in dense_grads
        ]
        self._grad_avg_squared = [tf.square(avg) for avg in self._grad_avg]

    floor = tf.constant(EPS, dtype=self._grad_norm_squared_avg.dtype)
    avg_norm_squared = tf.add_n(
        [tf.reduce_sum(squared) for squared in self._grad_avg_squared])
    self._grad_var = tf.maximum(
        floor, self._grad_norm_squared_avg - avg_norm_squared)
    if self._sparsity_debias:
        self._grad_var *= self._sparsity_avg
    return [avg_op]
def tdnn(self, features, n_class, is_train):
    """Conv1d + batch-norm stack followed by a dense classifier.

    Kernel size, strides and layer count are read from
    ``self.cfg['model']['net']``. Every conv layer keeps the input's last
    dimension as its filter count.

    Args:
      features: dict whose ``'inputs'`` entry is the input tensor --
        (batch_size, window_len, feat_dim) per the original docstring.
      n_class: number of output units of the final dense layer.
      is_train: forwarded to batch normalization as `training`.

    Returns:
      Logits produced by the final dense layer.
    """
    net_conf = self.cfg['model']['net']
    kernel_size = net_conf['kernel_size']
    strides = net_conf['strides']
    num_layers = net_conf['num_layers']

    # A single running tensor keeps the flatten step below valid even when
    # `num_layers` is 0 (the input is then flattened and classified directly).
    net = features['inputs']
    filters_num = net.get_shape()[-1]
    for i in range(num_layers):
        net = tf.nn.relu(
            tf.layers.conv1d(net, filters_num, kernel_size, strides=strides))
        net = tf.layers.batch_normalization(net,
                                            training=is_train,
                                            name='bn%d' % i)

    # Flatten axes 1 and 2 using the static shape.
    dim = net.get_shape()[1] * net.get_shape()[2]
    net = tf.reshape(net, [-1, dim])
    logits = tf.layers.dense(net, n_class)
    return logits
def extract_feature(waveforms, params):
    """Extract fbank with delta-delta, apply CMVN and zero-pad in time.

    Args:
      waveforms: audio input -- [batch, samples] per the original docstring.
      params: hyper-parameters; `audio_global_cmvn`, `audio_cmvn_path` and
        `num_zeropad_frames` are read here.

    Returns:
      Normalized features with `params.num_zeropad_frames` zero frames
      appended on axis 1 -- [batch_size, nframes, feature_size, channels]
      per the original comment.
    """
    with tf.variable_scope('feature_extractor'):
        # shape: [1, nframes, nbins, nchannels]
        mel_fbanks = extract_logfbank_with_delta(waveforms, params)
        fbank_size = utils.shape_list(mel_fbanks)

        if params.audio_global_cmvn:
            # Global statistics loaded from disk.
            assert params.audio_cmvn_path, params.audio_cmvn_path
            mean, variance = utils.load_cmvn(params.audio_cmvn_path)
        else:
            # Statistics over axis 1 of this input replace CMVN estimated
            # on data.
            mean = tf.reduce_mean(mel_fbanks, keepdims=True, axis=1)
            variance = tf.reduce_mean(tf.square(mel_fbanks - mean),
                                      keepdims=True,
                                      axis=1)

        mel_fbanks = utils.apply_cmvn(mel_fbanks, mean, variance, 1e-09)

        # Later models like to flatten the two spatial dims. Instead, we add
        # a unit spatial dim and flatten the frequencies and channels.
        batch_size = fbank_size[0]
        nbins, nchannels = fbank_size[2], fbank_size[3]
        reshaped = tf.reshape(mel_fbanks,
                              [batch_size, fbank_size[1], nbins, nchannels])
        zero_pad = tf.zeros(
            (batch_size, params.num_zeropad_frames, nbins, nchannels))
        feats = tf.concat([reshaped, zero_pad], 1)
    return feats
def call(self, inputs, training=None, mask=None):  # pylint: disable=too-many-locals
    """Score a pair of inputs from their encoded similarity matrix.

    Both sides share `self.embed` and are encoded by `self.lstm_left` and
    `self.lstm_right`. The product of the two encodings is flattened and
    passed through dropout, `self.outlayer` and `self.final_dense`.

    Args:
      inputs: dict with keys ``"input_x_left"`` and ``"input_x_right"``.
      training: not used by this method.
      mask: not used by this method.

    Returns:
      The output of `self.final_dense`.
    """
    encoded_left = self.lstm_left(self.embed(inputs["input_x_left"]))
    encoded_right = self.lstm_right(self.embed(inputs["input_x_right"]))

    # Swap the last two axes of the right encoding so matmul pairs every
    # left position with every right position.
    left_right_sim = tf.matmul(encoded_left,
                               tf.transpose(encoded_right, [0, 2, 1]))

    # Flatten the similarity matrix using its static shape.
    sim_shape = left_right_sim.get_shape()
    flat_dim = sim_shape[1] * sim_shape[2]
    sim_matrix = tf.reshape(left_right_sim, [-1, flat_dim], name="sim_matrix")

    hidden = self.outlayer(self.dropout(sim_matrix))
    return self.final_dense(hidden)
def model(self, feats, labels):
    """Build the model.

    Runs `self.resnet`, average-pools the result, projects it to
    `self.params().embedding_size` with a linear layer plus batch
    normalization, and feeds that embedding to `self.logits_layer`.

    Args:
      feats: input features passed to `self.resnet`.
      labels: labels forwarded to `self.logits_layer`.

    Returns:
      A dict with keys ``'logits'`` and ``'embeddings'``.
    """
    x = self.resnet(feats)

    with tf.variable_scope("avg_pooling"):
        # Batch and time are taken dynamically; feat and channel statically.
        dyn_shape = tf.shape(x)
        feat, channel = x.shape.as_list()[2:]
        x = tf.reshape(x, [dyn_shape[0], dyn_shape[1], feat * channel])
        x = self.pooling_layer(x, pooling_type='average')

    with tf.variable_scope("output_layer"):
        in_dim = x.shape.as_list()[-1]
        hidden_dims = self.params().embedding_size
        embedding = common_layers.linear(x,
                                         'dense-matmul', [in_dim, hidden_dims],
                                         has_bias=True)
        embedding = tf.layers.batch_normalization(embedding,
                                                  axis=-1,
                                                  momentum=0.99,
                                                  training=self.train,
                                                  name='dense-bn')

    logits = self.logits_layer(embedding, labels)
    return {'logits': logits, 'embeddings': embedding}
def call(self, audio_data, sample_rate=None):
    """
    Calculate mfcc features of audio data.
    :param audio_data: the audio signal from which to compute spectrum.
                       Should be an (1, N) tensor.
    :param sample_rate: the sample rate of the signal we are working with.
                        Defaults to the configured `sample_rate` when None;
                        a different value fails the equality assertion.
    :return: A float tensor of size (num_channels, num_frames, num_frequencies)
             containing mfcc features of every frame in speech.
    """
    p = self.config
    with tf.name_scope('mfcc'):
        # Identity check: `== None` on a tensor is an equality comparison,
        # not a test for a missing argument.
        if sample_rate is None:
            sample_rate = tf.constant(p.sample_rate, dtype=tf.int32)

        # The given rate must match the configured one.
        assert_op = tf.assert_equal(tf.constant(p.sample_rate),
                                    tf.cast(sample_rate, dtype=tf.int32))
        with tf.control_dependencies([assert_op]):
            fbank_feats = self.fbank(audio_data, sample_rate)
            sample_rate = tf.cast(sample_rate, dtype=tf.int32)

            # Add a leading unit axis: (nframe, nfbank) -> (1, nframe, nfbank).
            shape = tf.shape(fbank_feats)
            nframe = shape[0]
            nfbank = shape[1]
            fbank_feats = tf.reshape(fbank_feats, (1, nframe, nfbank))

            framepow_feats = self.framepow(audio_data, sample_rate)
            mfcc = py_x_ops.mfcc(fbank_feats,
                                 framepow_feats,
                                 sample_rate,
                                 use_energy=p.use_energy,
                                 cepstral_lifter=p.cepstral_lifter,
                                 coefficient_count=p.coefficient_count)
            return mfcc
def call(self, inputs, training=None, mask=None):  # pylint: disable=too-many-locals
    """Score documents with a sentence encoder followed by a doc encoder.

    Args:
      inputs: dict with key ``"input_x"`` and, when `self.use_dense_task` is
        set, ``"input_dense"``.
      training: forwarded to the dropout and encoder layers.
      mask: not used by this method; masks are rebuilt from the padding.

    Returns:
      The output of `self.final_dense` -- [batch_size, class_num] according
      to the original comment.
    """
    input_x = tf.identity(inputs["input_x"], name='input_x')
    # NOTE(review): `dense_input` is bound under `use_dense_task` but consumed
    # under `use_dense_input`; presumably the latter implies the former.
    if self.use_dense_task:
        dense_input = inputs["input_dense"]

    # [batch_size, max_doc_len, max_sen_len]
    if self.use_true_length:
        input_hx = self.pad_to_hier_input_true_len(
            input_x,
            self.max_doc_len,
            self.max_sen_len,
            self.split_token,
            padding_token=self.padding_token)
    else:
        input_hx = self.pad_to_hier_input(input_x,
                                          self.max_doc_len,
                                          self.max_sen_len,
                                          padding_token=self.padding_token)

    # [batch_size, max_doc_len]
    sen_lens = compute_sen_lens(input_hx, padding_token=self.padding_token)
    # [batch_size]
    doc_lens = compute_doc_lens(sen_lens)

    # [batch_size, max_doc_len, max_sen_len, 1]
    sen_mask = tf.expand_dims(tf.sequence_mask(sen_lens,
                                               self.max_sen_len,
                                               dtype=tf.float32),
                              axis=-1)
    # [batch_size, max_doc_len, 1]
    doc_mask = tf.expand_dims(tf.sequence_mask(doc_lens,
                                               self.max_doc_len,
                                               dtype=tf.float32),
                              axis=-1)

    # [batch_size, max_doc_len, max_sen_len, embed_len]
    out = self.embed(input_hx)
    if self.use_pretrained_model:
        # Append the pretrained features on the last axis.
        pretrained = tf.reshape(self.get_pre_train_graph(input_x), [
            -1, self.max_doc_len, self.max_sen_len, self.pretrained_model_dim
        ])
        out = tf.concat([out, pretrained], axis=-1)
    out = self.embed_d(out, training=training)

    # [batch_size, max_doc_len, features]
    sentence_wise = tf.keras.layers.TimeDistributed(self.sen_encoder)
    out = sentence_wise(out, training=training, mask=sen_mask)
    # [batch_size, features]
    out = self.doc_encoder(out, training=training, mask=doc_mask)

    if self.use_dense_input:
        dense_out = self.dense_input_linear(dense_input)
        if self.only_dense_input:
            out = dense_out
        else:
            out = tf.keras.layers.Concatenate()([out, dense_out])

    # [batch_size, class_num]
    return self.final_dense(out)
def pad_to_hier_input(inputs, max_doc_len, max_sen_len, padding_token=0):
    """Fold a flat token sequence into fixed-size sentences.

    The input is cut or padded (with `padding_token`) to
    ``max_doc_len * max_sen_len`` tokens and then reshaped.

    Input shape: [batch_size, max_len]
    Output shape: [batch_size, max_doc_len, max_sen_len]
    """
    flat = cut_or_padding(inputs,
                          max_sen_len * max_doc_len,
                          padding_token=padding_token)
    return tf.reshape(flat, [-1, max_doc_len, max_sen_len])
def linear_block(self, x):
    """Linear layer for dimension reduction.

    Flattens the feat and channel axes, optionally applies dropout, projects
    to ``self.netconf['linear_num']`` units and restores the time axis.

    Args:
      x: tensor of shape [batch, time, feat, channel]. `feat` and `channel`
        must be statically known; `time` may be dynamic.

    Returns:
      Tensor of shape [batch, time, linear_num].
    """
    with tf.variable_scope('linear'):
        times, feat, channel = x.shape.as_list()[1:]
        if times is None:
            # The static time dimension is unknown (variable-length input):
            # a None entry is not a valid reshape size, so use the run-time
            # value instead.
            times = tf.shape(x)[1]

        x = tf.reshape(x, [-1, feat * channel])
        if self.netconf['use_dropout']:
            x = tf.layers.dropout(x,
                                  self.netconf['dropout_rate'],
                                  training=self.train)
        x = common_layers.linear(
            x, 'linear1', [feat * channel, self.netconf['linear_num']])
        x = tf.reshape(x, [-1, times, self.netconf['linear_num']])
    return x
def _loop_body(time, end_time, context, left_context, right_context,
               output_tas):
    """Write one flattened context window into the output TensorArray.

    Takes ``1 + left_context + right_context`` frames of `context` starting
    at index `time`, flattens them to one row per batch entry and writes the
    result at position `time`.

    Returns:
      The loop variables with `time` advanced by one and the updated
      TensorArray.
    """
    dyn_shape = tf.shape(context)
    batch, depth = dyn_shape[0], dyn_shape[2]
    window = 1 + left_context + right_context

    frames = context[:, time:time + window, :]
    flat_frames = tf.reshape(frames, [batch, window * depth])
    updated_tas = output_tas.write(time, flat_frames)
    return (time + 1, end_time, context, left_context, right_context,
            updated_tas)
def test_maxpool(self):
    """Max pooling a 5x5 ramp with a 3x3 window and stride 1."""
    # A 4-D tensor holding 0..24.
    inputs = tf.reshape(tf.range(25), shape=[1, 5, 5, 1])
    output = cl.max_pool(inputs, [3, 3], [1, 1])

    self.assertAllEqual(tf.shape(output), [1, 3, 3, 1])

    # Each window's maximum is its bottom-right element.
    expected = tf.constant([[[[12], [13], [14]],
                             [[17], [18], [19]],
                             [[22], [23], [24]]]])
    self.assertAllEqual(output, expected)
def _reshape_mask(mask):
    """Repeat the mask once per attention head.

    `self.head_num` is taken from the enclosing scope.

    Input shape: (Batch size, steps)
    Output shape: (Batch size * head num, steps)
    """
    if mask is None:
        return None
    steps = tf.shape(mask)[1]
    # (B, steps) -> (B, 1, steps) -> (B, head_num, steps)
    per_head = tf.tile(tf.expand_dims(mask, axis=1), [1, self.head_num, 1])
    return tf.reshape(per_head, shape=(-1, steps))
def linear_block(self, x):
    """Linear layer for dimension reduction.

    When ``self.netconf['linear_num']`` is positive, the feat and channel
    axes are flattened and projected to `linear_num` units (with optional
    dropout before and optional batch normalization after a ReLU).
    Otherwise the two axes are only flattened.

    Args:
      x: tensor of shape [batch, time, feat, channel]; batch and time are
        read dynamically, feat and channel from the static shape.

    Returns:
      Tensor of shape [batch, time, linear_num], or
      [batch, time, feat * channel] when `linear_num` is not positive.
    """
    dyn_shape = tf.shape(x)
    batch_t, time_t = dyn_shape[0], dyn_shape[1]
    feat, channel = x.shape.as_list()[2:]
    flat_dim = feat * channel
    netconf = self.netconf
    linear_num = netconf['linear_num']

    if linear_num > 0:
        with tf.variable_scope('linear'):
            x = tf.reshape(x, [batch_t * time_t, flat_dim])
            if netconf['use_dropout']:
                x = tf.layers.dropout(x,
                                      netconf['dropout_rate'],
                                      training=self.train)
            x = common_layers.linear(x, 'linear1', [flat_dim, linear_num])
            x = tf.nn.relu(x)
            if netconf['use_bn']:
                x = tf.layers.batch_normalization(x,
                                                  axis=-1,
                                                  momentum=0.9,
                                                  training=self.train,
                                                  name='bn_linear')
            x = tf.reshape(x, [batch_t, time_t, linear_num])
    else:
        logging.info('linear_num <= 0, only apply reshape.')
        x = tf.reshape(x, [batch_t, time_t, flat_dim])
    return x
def call(self, inps, training=None, mask=None):
    """Decode in teacher-forcing mode or run beam search at inference.

    Args:
      inps: ``(dec_inp, enc_out)`` when `self.is_infer` is False, otherwise
        the encoder output alone.
      training: forwarded to `self.decode`.
      mask: forwarded to `self.decode`; at inference it is tiled once per
        beam first.

    Returns:
      Scores from `self.final_dense` when not inferring; otherwise the ids of
      the first returned beam with the first position dropped.
    """
    if not self.is_infer:
        dec_inp, enc_out = inps
        with tf.name_scope('while'):
            dec_out = self.decode(dec_inp, enc_out, training, mask)
            scores = self.final_dense(dec_out)
            return scores

    enc_out = inps
    enc_shape = utils.shape_list(enc_out)
    batch = enc_shape[0]
    # One start-of-sequence id per batch entry.
    init_ids = tf.cast(tf.ones([batch]) * self.sos_id, tf.int32)

    # Beam search: repeat encoder output and mask once per beam, then fold
    # the beam axis into the batch axis.
    tiled_out = tf.tile(tf.expand_dims(enc_out, axis=1),
                        [1, self.beam_size, 1, 1])
    enc_out = tf.reshape(
        tiled_out, [batch * self.beam_size, enc_shape[1], enc_shape[2]])
    tiled_mask = tf.tile(tf.expand_dims(mask, axis=1),
                         [1, self.beam_size, 1, 1, 1])
    enc_mask = tf.reshape(tiled_mask, [batch * self.beam_size, 1, 1, -1])

    def symbols_to_logits_fn(dec_inps):
        # Only the scores of the last decoded position are needed.
        dec_out = self.decode(dec_inps, enc_out, training, enc_mask)
        return self.final_dense(dec_out)[:, -1, :]

    decoded_ids, scores, _ = self.beam_search(symbols_to_logits_fn, init_ids,
                                              self.beam_size,
                                              self.max_dec_len,
                                              self.vocab_size,
                                              self.length_penalty,
                                              self.eos_id)
    # Keep beam 0 and drop position 0.
    return decoded_ids[:, 0, 1:]
def split_heads(self, x, batch_size):
    """Split the last axis into `self.num_heads` heads of `self.depth`.

    Args:
      x: (batch_size, seq_len_x, hidden_size)
      batch_size: size of the leading axis of `x`.

    Returns:
      Tensor of shape (batch_size, num_heads, seq_len_x, depth).
    """
    per_head = tf.reshape(x, (batch_size, -1, self.num_heads, self.depth))
    # Move the head axis in front of the sequence axis.
    return tf.transpose(per_head, perm=[0, 2, 1, 3])
def call(self,
         logits=None,
         input_length=None,
         labels=None,
         label_length=None,
         **kwargs):
    """Compute the loss returned by `crf_log_likelihood`.

    Args:
      logits: tag scores, reshaped to
        [-1, model.max_len, model.seq_num_classes].
      input_length: sequence lengths passed to `crf_log_likelihood`.
      labels: gold tags passed to `crf_log_likelihood`.
      label_length: not used by this method.
      **kwargs: must contain ``"model"``, which supplies `max_len`,
        `seq_num_classes` and `transitions`.

    Returns:
      The first value returned by `crf_log_likelihood`.
    """
    assert "model" in kwargs
    model = kwargs["model"]
    score_shape = [-1, model.max_len, model.seq_num_classes]
    tags_scores = tf.reshape(logits, score_shape, name="scores")
    loss, _ = crf_log_likelihood(tags_scores, labels, input_length,
                                 model.transitions)
    return loss
def pooling_layer(self, x, time_len):
    """Pool over time with attention or with a max pool.

    With `self.attention` set, `common_layers.attention` is applied, its
    weights are stored in `self.alphas` and logged as an image summary.
    Otherwise a max pool spanning `time_len` steps is applied and the result
    is flattened to two dimensions.

    Args:
      x: input to pool; concatenated on axis 2 first when
        ``self.netconf['use_lstm_layer']`` is set and attention is off.
      time_len: window and stride of the max pool along axis 1.

    Returns:
      The pooled tensor.
    """
    netconf = self.netconf
    with tf.variable_scope('time_pooling'):
        if self.attention:
            x, self.alphas = common_layers.attention(
                x, netconf['attention_size'], return_alphas=True)
            # alphas: [batch, time, 1] -> [1, batch, time, 1]
            #                          -> [1, time, batch, 1]
            alignment = tf.transpose(tf.expand_dims(self.alphas, 0),
                                     [0, 2, 1, 3])
            tf.summary.image('alignment', alignment)
        else:
            if netconf['use_lstm_layer']:
                x = tf.concat(x, 2)
            # [batch, seq_len, dim, 1]
            x = tf.expand_dims(x, axis=-1)
            x = common_layers.max_pool(x,
                                       ksize=[time_len, 1],
                                       strides=[time_len, 1])
            if netconf['use_lstm_layer']:
                x = tf.reshape(x, [-1, 2 * netconf['cell_num']])
            else:
                x = tf.reshape(x, [-1, netconf['linear_num']])
    return x
def call(self, inputs, training=None, mask=None):
    """Multi-headed attention.

    Args:
      inputs: the tuple ``(q, k, v)``, unpacked in that order:
        q: (batch_size, seq_len_q, hidden_size)
        k: (batch_size, seq_len_k, hidden_size)
        v: (batch_size, seq_len_v, hidden_size)
      training: not used by this method.
      mask: (batch_size, seq_len_q, seq_len_k), forwarded to
        `self.scaled_dot_product_attention`.

    Returns:
      output: (batch_size, seq_len_q, hidden_size)
      attention_weights: (batch_size, num_heads, seq_len_q, seq_len_k)
    """
    query, key, value = inputs
    batch_size = tf.shape(query)[0]

    # Project, then split: (batch_size, seq_len, hidden_size)
    #                   -> (batch_size, num_heads, seq_len, depth)
    query = self.split_heads(self.wq(query), batch_size)
    key = self.split_heads(self.wk(key), batch_size)
    value = self.split_heads(self.wv(value), batch_size)

    # scaled_attention: (batch_size, num_heads, seq_len_q, depth)
    # attention_weights: (batch_size, num_heads, seq_len_q, seq_len_k)
    scaled_attention, attention_weights = self.scaled_dot_product_attention(
        query, key, value, mask)

    # (batch_size, seq_len_q, num_heads, depth)
    scaled_attention = tf.transpose(scaled_attention, perm=[0, 2, 1, 3])
    # Merge the heads back: (batch_size, seq_len_q, hidden_size)
    merged = tf.reshape(scaled_attention, (batch_size, -1, self.hidden_size))

    # (batch_size, seq_len_q, hidden_size)
    output = self.dense(merged)
    return output, attention_weights
def delta_delta(feat, order=2):
    """Compute delta features up to `order` for a feature map.

    Args:
      feat: a tensor of shape [nframe, nfbank] or [nframe, nfbank, 1]; for a
        3-D input only index 0 of the last axis is used.
      order: highest delta order passed to `py_x_ops.delta_delta`.

    Returns:
      The op output reshaped to [nframe, nfbank, order + 1].
    """
    feat = tf.convert_to_tensor(feat)
    rank = feat.shape.ndims
    if rank is None:
        # Rank is only known at run time: choose the branch inside the graph.
        feat = tf.cond(tf.equal(tf.rank(feat), 3),
                       true_fn=lambda: feat[:, :, 0],
                       false_fn=lambda: feat)
    elif rank == 3:
        feat = feat[:, :, 0]
    # Otherwise the input is used as is. Resolving a known rank in Python
    # avoids building the 3-D slice on a 2-D tensor, which graph-mode
    # `tf.cond` would do (it traces both branches) and fail on.

    shape = tf.shape(feat)
    nframe = shape[0]
    nfbank = shape[1]
    # Presumably the op returns [nframe, nfbank * (order + 1)] -- TODO confirm.
    delta = py_x_ops.delta_delta(feat, order=order)
    feat_with_delta_delta = tf.reshape(delta, (nframe, nfbank, (order + 1)))
    return feat_with_delta_delta
def test_splice_layer(self):
    """Splice a [1, 5, 3] ramp with several context sets."""
    inputs = tf.reshape(tf.range(15), shape=[1, 5, 3])

    # (context, expected output) pairs; frames beyond either edge repeat the
    # first or the last frame.
    cases = [
        ([0, 1],
         [[[0, 1, 2, 3, 4, 5],
           [3, 4, 5, 6, 7, 8],
           [6, 7, 8, 9, 10, 11],
           [9, 10, 11, 12, 13, 14],
           [12, 13, 14, 12, 13, 14]]]),
        ([-1, 0, 1],
         [[[0, 1, 2, 0, 1, 2, 3, 4, 5],
           [0, 1, 2, 3, 4, 5, 6, 7, 8],
           [3, 4, 5, 6, 7, 8, 9, 10, 11],
           [6, 7, 8, 9, 10, 11, 12, 13, 14],
           [9, 10, 11, 12, 13, 14, 12, 13, 14]]]),
        ([0, 1, 3],
         [[[0, 1, 2, 3, 4, 5, 9, 10, 11],
           [3, 4, 5, 6, 7, 8, 12, 13, 14],
           [6, 7, 8, 9, 10, 11, 12, 13, 14],
           [9, 10, 11, 12, 13, 14, 12, 13, 14],
           [12, 13, 14, 12, 13, 14, 12, 13, 14]]]),
        ([1, 3],
         [[[3, 4, 5, 9, 10, 11],
           [6, 7, 8, 12, 13, 14],
           [9, 10, 11, 12, 13, 14],
           [12, 13, 14, 12, 13, 14],
           [12, 13, 14, 12, 13, 14]]]),
        ([1, 2, 3],
         [[[3, 4, 5, 6, 7, 8, 9, 10, 11],
           [6, 7, 8, 9, 10, 11, 12, 13, 14],
           [9, 10, 11, 12, 13, 14, 12, 13, 14],
           [12, 13, 14, 12, 13, 14, 12, 13, 14],
           [12, 13, 14, 12, 13, 14, 12, 13, 14]]]),
    ]

    for context, expected in cases:
        output = cl.splice_layer(inputs, 'splice', context)
        output_true = tf.constant(expected)
        self.assertAllEqual(output, output_true)
def call(self, feat, order, window):
    """
    Calculate delta of feats.
    :param feat: a float tensor of size (num_frames, dim_feat).
    :param order: an int.
    :param window: an int.
    :return: A tensor with shape (num_frames, dim_feats, order + 1),
        containing delta of features of every frame in speech.
    """
    with tf.name_scope('delta_delta'):
        delta_delta = py_x_ops.delta_delta(feat, order, window)

        # Use static sizes where they are known and fall back to the
        # run-time size otherwise: a None entry (e.g. a variable number of
        # frames) is not a valid reshape dimension.
        static_dims = feat.get_shape().as_list()
        dynamic_dims = tf.shape(feat)
        n_frame, n_feats = [
            dim if dim is not None else dynamic_dims[axis]
            for axis, dim in enumerate(static_dims)
        ]
        delta_delta = tf.reshape(delta_delta, (n_frame, n_feats, order + 1))
    return delta_delta
def splice_layer(x, name, context):
    """Splice a tensor along the last dimension with context.

    For every offset in `context` the input is shifted along axis 1 by that
    many frames, repeating the first frame (negative offsets) or the last
    frame (other offsets) to fill the gap; the shifted copies are then
    joined on the last axis in `context` order.

    e.g.:
      t = [[[1, 2, 3],
            [4, 5, 6],
            [7, 8, 9]]]
      splice_layer(t, name, [0, 1]) =
          [[[1, 2, 3, 4, 5, 6],
            [4, 5, 6, 7, 8, 9],
            [7, 8, 9, 7, 8, 9]]]

    Args:
      x: a tf.Tensor with shape (B, T, D) a.k.a. (N, H, W)
      name: variable scope under which the ops are created
      context: a list of context offsets

    Returns:
      spliced tensor with shape (..., D * len(context))
    """
    with tf.variable_scope(name):
        dyn_shape = tf.shape(x)
        batch, num_frames = dyn_shape[0], dyn_shape[1]

        pieces = tf.TensorArray(x.dtype, size=len(context))
        for slot, offset in enumerate(context):
            pad_len = abs(offset)
            if offset < 0:
                # Shift right: drop trailing frames, repeat the first frame.
                kept = x[:, 0:num_frames + offset, :]
                padding = tf.tile(x[:, 0:1, :], [1, pad_len, 1])
                shifted = tf.concat((padding, kept), axis=1)
            else:
                # Shift left: drop leading frames, repeat the last frame.
                kept = x[:, offset:num_frames, :]
                padding = tf.tile(x[:, -1:, :], [1, pad_len, 1])
                shifted = tf.concat((kept, padding), axis=1)
            pieces = pieces.write(slot, shifted)

        # (C, B, T, D) -> (B, T, C, D) -> (B, T, C * D)
        spliced = tf.transpose(pieces.stack(), (1, 2, 0, 3))
        spliced = tf.reshape(spliced, (batch, num_frames, -1))
    return spliced
def conv_pool(embedded_chars_expanded, filter_sizes, embedding_size,
              num_filters, sequence_length):
    """Text convolution plus max pooling to one vector per example.

    One convolution branch is built per filter size; each branch is
    max-pooled over all its positions and the branches are concatenated.

    :param embedded_chars_expanded: input to `tf.nn.conv2d`; the filter shape
        implies a single input channel.
    :param filter_sizes: heights of the convolution filters.
    :param embedding_size: width of every convolution filter.
    :param num_filters: number of filters per filter size.
    :param sequence_length: input length used to size the pooling window.
    :return: tensor reshaped to [-1, num_filters * len(filter_sizes)].
    """

    def _branch(filter_size):
        with tf.variable_scope("conv-maxpool-%s" % filter_size):
            # Convolution layer.
            kernel_shape = [filter_size, embedding_size, 1, num_filters]
            kernel = tf.get_variable(name='W',
                                     initializer=tf.truncated_normal(
                                         kernel_shape, stddev=0.1))
            bias = tf.get_variable(name='b',
                                   initializer=tf.constant(
                                       0.1, shape=[num_filters]))
            conv = tf.nn.conv2d(embedded_chars_expanded,
                                kernel,
                                strides=[1, 1, 1, 1],
                                padding="VALID",
                                name="conv")
            # Nonlinearity.
            activated = tf.nn.relu(tf.nn.bias_add(conv, bias), name="relu")
            # The window covers every VALID convolution output position.
            return tf.nn.max_pool(
                activated,
                ksize=[1, sequence_length - filter_size + 1, 1, 1],
                strides=[1, 1, 1, 1],
                padding='VALID',
                name="pool")

    pooled_outputs = [_branch(filter_size) for filter_size in filter_sizes]

    # Combine all the pooled features.
    num_filters_total = num_filters * len(filter_sizes)
    h_pool = tf.concat(pooled_outputs, 3)
    return tf.reshape(h_pool, [-1, num_filters_total])
def compute_batch_indices(batch_size, beam_size):
    """Computes the batch index of every (batch, beam) slot, for gathers.

    Returns:
      A [batch_size, beam_size] tensor whose row ``b`` is filled with ``b``.
    """
    flat_positions = tf.range(batch_size * beam_size)
    return tf.reshape(flat_positions // beam_size, [batch_size, beam_size])
def _unmerge_beam_dim(tensor, batch_size, beam_size):
    """Reshapes first dimension back to [batch_size, beam_size].

    The remaining dimensions of `tensor` are kept as they are.
    """
    trailing_dims = shape_list(tensor)[1:]
    return tf.reshape(tensor, [batch_size, beam_size] + trailing_dims)
def confusion_matrix(logits, labels, num_class):
    """Confusion matrix of the argmax predictions against the labels.

    Args:
      logits: scores; the prediction is the argmax over the last axis.
      labels: reference class ids; flattened before use.
      num_class: number of classes of the matrix.

    Returns:
      The result of `tf.confusion_matrix`.
    """
    flat_labels = tf.reshape(labels, [-1])
    flat_predictions = tf.reshape(tf.argmax(logits, -1), [-1])
    return tf.confusion_matrix(labels=flat_labels,
                               predictions=flat_predictions,
                               num_classes=num_class)