def resnet(self, inputs):
  ''' ResNet block stack. '''
  layers_list = self.netconf['layers_list']
  logging.info("layers_list : {}".format(layers_list))
  filters_list = self.netconf['filters_list']
  logging.info("filters_list : {}".format(filters_list))
  strides_list = self.netconf['strides_list']
  logging.info("strides_list : {}".format(strides_list))
  block_mode = self.netconf['block_mode']
  logging.info("block_mode : {}".format(block_mode))

  with tf.variable_scope('resnet'):
    x = tf.identity(inputs)
    with tf.variable_scope('input_layer'):
      x = common_layers.conv2d(x,
                               'input_conv', (3, 3),
                               self.input_channels,
                               filters_list[0], [1, 1],
                               bias=False)
      x = tf.layers.batch_normalization(x,
                                        axis=-1,
                                        momentum=0.9,
                                        training=self.train,
                                        name='input_bn')
      x = self.prelu_layer(x, 'input_prelu')

    for index, layer_num in enumerate(layers_list):
      unit_name = 'resblock-' + str(index + 1)
      with tf.variable_scope(unit_name):
        x = self.resnet_block(x, block_mode, layer_num, filters_list[index],
                              filters_list[index + 1], strides_list[index])
  return x

def model(self, feats, labels):
  ''' Build the model. '''
  x = self.resnet(feats)

  with tf.variable_scope("avg_pooling"):
    batch_t = tf.shape(x)[0]
    time_t = tf.shape(x)[1]
    feat, channel = x.shape.as_list()[2:]
    x = tf.reshape(x, [batch_t, time_t, feat * channel])
    x = self.pooling_layer(x, pooling_type='average')

  with tf.variable_scope("output_layer"):
    shape = x.shape.as_list()[-1]
    hidden_dims = self.params().embedding_size
    y = x
    y = common_layers.linear(y, 'dense-matmul', [shape, hidden_dims],
                             has_bias=True)
    y = tf.layers.batch_normalization(y,
                                      axis=-1,
                                      momentum=0.99,
                                      training=self.train,
                                      name='dense-bn')
    embedding = y
    dense_output = y

  logits = self.logits_layer(dense_output, labels)
  model_outputs = {'logits': logits, 'embeddings': embedding}
  return model_outputs

def tdnn_block(self, inputs):
  ''' TDNN layers. '''
  if 'tdnn_method' in self.netconf:
    tdnn_method = self.netconf['tdnn_method']
  else:
    # Runs faster and supports discrete contexts, for now.
    tdnn_method = 'splice_layer'
  tdnn_contexts = self.netconf['tdnn_contexts']
  logging.info("tdnn_contexts : {}".format(tdnn_contexts))
  tdnn_dims = self.netconf['tdnn_dims']
  logging.info("tdnn_dims : {}".format(tdnn_dims))

  layer_num = len(tdnn_contexts)
  assert layer_num == len(tdnn_dims)

  channels = [self.input_channels] + tdnn_dims
  logging.info("tdnn_channels : {}".format(channels))

  input_h_t = tf.shape(inputs)[1]
  input_w = inputs.shape[2]
  input_c = inputs.shape[3]
  if tdnn_method == 'conv1d':
    # NHWC -> NW'C, where W' = H * W
    inputs = tf.reshape(inputs, [-1, input_h_t * input_w, input_c])
    last_w = channels[0]
  else:
    inputs = tf.reshape(inputs, [-1, input_h_t, input_w * input_c])
    last_w = input_w * input_c

  downsample_input_len = self.input_len

  with tf.variable_scope('tdnn'):
    x = tf.identity(inputs)
    for index in range(layer_num):
      unit_name = 'unit-' + str(index + 1)
      with tf.variable_scope(unit_name):
        tdnn_name = 'tdnn-' + str(index + 1)
        x = common_layers.tdnn(x,
                               tdnn_name,
                               last_w,
                               tdnn_contexts[index],
                               channels[index + 1],
                               has_bias=True,
                               method=tdnn_method)
        last_w = channels[index + 1]
        x = tf.nn.relu(x)
        if self.netconf['use_bn']:
          bn_name = 'bn' + str(index + 1)
          x = tf.layers.batch_normalization(x,
                                            axis=-1,
                                            momentum=0.9,
                                            training=self.train,
                                            name=bn_name)
        if self.netconf['use_dropout']:
          x = tf.layers.dropout(x,
                                self.netconf['dropout_rate'],
                                training=self.train)
    # TDNN layers do not downsample along time.
    downsample_input_len = downsample_input_len

  return x, downsample_input_len

def conv_block(self, inputs, depthwise=False):
  ''' 2D conv layers. '''
  filters = self.netconf['filters']
  logging.info("filters : {}".format(filters))
  filters_size = self.netconf['filter_size']
  logging.info("filters_size : {}".format(filters_size))
  filters_strides = self.netconf['filter_stride']
  logging.info("filters_strides : {}".format(filters_strides))
  pools_size = self.netconf['pool_size']
  logging.info("pools_size : {}".format(pools_size))

  layer_num = len(filters)
  assert layer_num == len(filters_size)
  assert layer_num == len(filters_strides)
  assert layer_num == len(pools_size)

  channels = [self.input_channels] + filters
  logging.info("channels : {}".format(channels))

  downsample_input_len = self.input_len
  with tf.variable_scope('cnn'):
    x = tf.identity(inputs)
    for index, filt in enumerate(filters):
      unit_name = 'unit-' + str(index + 1)
      with tf.variable_scope(unit_name):
        if depthwise:
          x = tf.layers.separable_conv2d(x,
                                         filters=filt,
                                         kernel_size=filters_size[index],
                                         strides=filters_strides[index],
                                         padding='same',
                                         name=unit_name)
        else:
          cnn_name = 'cnn-' + str(index + 1)
          x = common_layers.conv2d(x, cnn_name, filters_size[index],
                                   channels[index], channels[index + 1],
                                   filters_strides[index])
        x = tf.nn.relu(x)
        if self.netconf['use_bn']:
          bn_name = 'bn' + str(index + 1)
          x = tf.layers.batch_normalization(x,
                                            axis=-1,
                                            momentum=0.9,
                                            training=self.train,
                                            name=bn_name)
        if self.netconf['use_dropout']:
          x = tf.layers.dropout(x,
                                self.netconf['dropout_rate'],
                                training=self.train)
        x = common_layers.max_pool(x, pools_size[index], pools_size[index])
        downsample_input_len = downsample_input_len / pools_size[index][0]

  return x, downsample_input_len

def apply_gradients(self, grads_tvars, global_step=None, name=None):
  self._grads, self._tvars = zip(*[(g, t) for g, t in grads_tvars
                                   if g is not None])

  # for manual gradient clipping
  if self._clip_thresh_var is not None:
    self._grads, self._grads_norm = tf.clip_by_global_norm(
        self._grads, self._clip_thresh_var)

  # loosely adaptive clipping of gradient in case exploding gradient ruins statistics
  if self._use_adapt_grad_clip:
    thresh = tf.cond(
        self._do_tune,
        lambda: tf.sqrt(self._stat_protect_fac * self._adapt_grad_clip_thresh**2),
        lambda: tf.to_float(tf.constant(LARGE_FLOAT_VAL)))
    self._grads, self._grads_norm = tf.clip_by_global_norm(
        self._grads, thresh)

  with tf.variable_scope("before_apply"):
    before_apply_op = self.before_apply()

  with tf.variable_scope("update_hyper"):
    with tf.control_dependencies([before_apply_op]):
      update_hyper_op = self.update_hyper_param()

  with tf.variable_scope("apply_updates"):
    with tf.control_dependencies([update_hyper_op]):
      # clip exploding gradient according to h_max
      if self._use_adapt_grad_clip:
        thresh = tf.cond(
            tf.greater(tf.global_norm(self._grads),
                       self._adapt_grad_clip_thresh),
            lambda: self._adapt_grad_clip_target_val,
            lambda: tf.to_float(tf.constant(LARGE_FLOAT_VAL)))
        self._grads, self._grads_norm = tf.clip_by_global_norm(
            self._grads, thresh)
      apply_grad_op = self._optimizer.apply_gradients(
          zip(self._grads, self._tvars), global_step, name)

  with tf.control_dependencies([apply_grad_op]):
    self._increment_global_step_op = tf.assign(self._global_step,
                                               self._global_step + 1)
    self._adapt_grad_clip_thresh_op = \
      tf.assign(self._adapt_grad_clip_thresh, tf.sqrt(self._h_max))
    self._adapt_grad_clip_target_val_op = \
      tf.assign(self._adapt_grad_clip_target_val, tf.sqrt(self._h_max))
    # self._adapt_grad_clip_target_val_op = \
    #   tf.assign(self._adapt_grad_clip_target_val,
    #             tf.sqrt(tf.sqrt(self._h_max * self._h_min)))

  return tf.group(before_apply_op, update_hyper_op, apply_grad_op,
                  self._adapt_grad_clip_thresh_op,
                  self._adapt_grad_clip_target_val_op,
                  self._increment_global_step_op)

def lstm_layer(self, x):
  ''' LSTM layers. '''
  if self.netconf['use_lstm_layer']:
    with tf.variable_scope('lstm'):
      cell_fw = tf.contrib.rnn.BasicLSTMCell(self.netconf['cell_num'],
                                             forget_bias=1.0)
      if self.netconf['use_dropout']:
        cell_fw = tf.contrib.rnn.DropoutWrapper(
            cell=cell_fw,
            output_keep_prob=(1 - self.netconf['dropout_rate'])
            if self.train else 1.0)

      cell_bw = tf.contrib.rnn.BasicLSTMCell(self.netconf['cell_num'],
                                             forget_bias=1.0)
      if self.netconf['use_dropout']:
        cell_bw = tf.contrib.rnn.DropoutWrapper(
            cell=cell_bw,
            output_keep_prob=(1 - self.netconf['dropout_rate'])
            if self.train else 1.0)

      # Feed `x` into the bidirectional LSTM and take its outputs.
      outputs, _ = tf.nn.bidirectional_dynamic_rnn(cell_fw=cell_fw,
                                                   cell_bw=cell_bw,
                                                   inputs=x,
                                                   dtype=tf.float32,
                                                   time_major=False,
                                                   scope='LSTM1')
  else:
    outputs = x
  return outputs

def preprocess(self, inputs):
  ''' Speech preprocessing. '''
  with tf.variable_scope('feature'):
    if self.input_type == 'samples':
      # FIXME: stub
      feats = None
    else:
      if 'cmvn_type' in self.audioconf:
        cmvn_type = self.audioconf['cmvn_type']
      else:
        cmvn_type = 'global'
      logging.info('cmvn_type: %s' % (cmvn_type))
      if cmvn_type == 'global':
        self.mean, self.std = utils.load_cmvn(self.audioconf['cmvn_path'])
        feats = utils.apply_cmvn(inputs, self.mean, self.std)
      elif cmvn_type == 'local':
        feats = utils.apply_local_cmvn(inputs)
      elif cmvn_type == 'sliding':
        raise ValueError('cmvn_type %s not implemented yet.' % (cmvn_type))
      elif cmvn_type == 'none':
        feats = inputs
      else:
        raise ValueError('Unsupported cmvn_type %s.' % (cmvn_type))
  return feats

def logits_layer(self, x):
  ''' Output layer. '''
  with tf.variable_scope('logits'):
    logits = common_layers.linear(
        x, 'logits-matmul',
        [self.netconf['hidden1'], self.taskconf['classes']['num']])
  return logits

def clip_gradients(self, grads_and_vars, clip_ratio):
  """Clip the gradients."""
  is_zip_obj = False
  if isinstance(grads_and_vars, zip):
    grads_and_vars = list(grads_and_vars)
    is_zip_obj = True

  with tf.variable_scope('grad'):
    for grad, var in grads_and_vars:
      if grad is not None:
        tf.summary.histogram(var.name[:-2], grad)
      else:
        logging.debug('%s gradient is None' % (var.name))

  # do not clip
  if not clip_ratio:
    if is_zip_obj:
      grads, variables = zip(*grads_and_vars)
      grads_and_vars = zip(grads, variables)
    return grads_and_vars

  gradients, variables = zip(*grads_and_vars)
  clipped, global_norm = tf.clip_by_global_norm(gradients, clip_ratio)
  grad_and_var_clipped = zip(clipped, variables)
  tf.summary.scalar('gradient/global_norm', global_norm)
  return grad_and_var_clipped

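# Hedged usage sketch (illustration only, not from the original file): one way
# a clip_gradients-style method is typically wired into a TF 1.x train op.
# `solver` stands for whatever object defines clip_gradients above; Adam and
# clip_ratio=3.0 are arbitrary example choices.
import tensorflow as tf

def _build_train_op_sketch(solver, loss, learning_rate=1e-3):
  optimizer = tf.train.AdamOptimizer(learning_rate)
  grads_and_vars = optimizer.compute_gradients(loss)
  # Adds gradient histograms and, when clip_ratio is truthy, clips by global norm.
  grads_and_vars = solver.clip_gradients(grads_and_vars, clip_ratio=3.0)
  return optimizer.apply_gradients(
      grads_and_vars, global_step=tf.train.get_or_create_global_step())
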
def __call__(self, **kwargs):
  name = kwargs.pop('name')
  with tf.variable_scope(name):
    loss = self.call(**kwargs)
    summary.scalar(name, loss)
  return loss

def conv2d(x, name, filter_size, in_channels, out_channels, strides, bias=True):
  """2D convolution."""
  with tf.variable_scope(name):
    kernel = tf.get_variable(
        name='DW',
        shape=[filter_size[0], filter_size[1], in_channels, out_channels],
        dtype=tf.float32,
        initializer=tf.initializers.glorot_uniform())

    if bias:
      b = tf.get_variable(name='bias',
                          shape=[out_channels],
                          dtype=tf.float32,
                          initializer=tf.constant_initializer(0.0))

    out = tf.nn.conv2d(x, kernel, [1, strides[0], strides[1], 1],
                       padding='SAME')

    if bias:
      out = tf.nn.bias_add(out, b)

    return out

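# Hedged usage sketch (illustration only, not from the original file): applying
# the conv2d helper above to an NHWC feature map; the kernel and stride values
# are arbitrary examples.
import tensorflow as tf

def _conv2d_usage_sketch():
  x = tf.placeholder(tf.float32, [None, None, 40, 3])  # (batch, time, freq, channels)
  # 3x3 kernel, 3 -> 32 channels, stride 2 on both spatial dims; with SAME
  # padding the output is (batch, ceil(time / 2), 20, 32).
  return conv2d(x, 'conv_demo', (3, 3), 3, 32, (2, 2))
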
def extract_feature(waveforms, params):
  '''Extract fbank features with delta-delta and do CMVN.
     waveforms: [batch, samples]
  '''
  p = params
  with tf.variable_scope('feature_extractor'):
    mel_fbanks = extract_logfbank_with_delta(waveforms, params)
    # shape: [1, nframes, nbins, nchannels]
    fbank_size = utils.shape_list(mel_fbanks)
    #assert fbank_size[0] == 1

    # This replaces CMVN estimation on data
    if not p.audio_global_cmvn:
      mean = tf.reduce_mean(mel_fbanks, keepdims=True, axis=1)
      variance = tf.reduce_mean(tf.square(mel_fbanks - mean),
                                keepdims=True,
                                axis=1)
    else:
      assert p.audio_cmvn_path, p.audio_cmvn_path
      mean, variance = utils.load_cmvn(p.audio_cmvn_path)

    var_epsilon = 1e-09
    mel_fbanks = utils.apply_cmvn(mel_fbanks, mean, variance, var_epsilon)

    # Later models like to flatten the two spatial dims. Instead, we add a
    # unit spatial dim and flatten the frequencies and channels.
    batch_size = fbank_size[0]
    feats = tf.concat([
        tf.reshape(mel_fbanks,
                   [batch_size, fbank_size[1], fbank_size[2], fbank_size[3]]),
        tf.zeros(
            (batch_size, p.num_zeropad_frames, fbank_size[2], fbank_size[3]))
    ], 1)
  return feats  # shape [batch_size, nframes, feature_size, channels]

def embedding_look_up(text_inputs, vocab_size, embedding_size):
  """Embedding layer."""
  with tf.variable_scope("embedding"):
    W = tf.get_variable(
        name='W',
        initializer=tf.random_uniform([vocab_size, embedding_size], -1.0, 1.0))
    embedding_chars = tf.nn.embedding_lookup(W, text_inputs)
    embedding_chars_expanded = tf.expand_dims(embedding_chars, -1)
  return embedding_chars_expanded

def call(self, features, **kwargs):
  self.train = kwargs['training']
  feats = tf.identity(features['inputs'], name='feats')
  texts = features['texts']
  with tf.variable_scope('model', reuse=tf.AUTO_REUSE):
    feats, texts = self.preprocess(feats, texts)
    logits = self.model(feats, texts)
  return logits

def tdnn(x, name, in_dim, context, out_dim, has_bias=True,
         method='splice_layer'):
  '''
  TDNN implementation.

  Args:
    context: an int giving the symmetric left/right context, or a list of
      context offsets, e.g. (-2, 0, 2).
    method:
      splice_layer: use column-first patch-based copy.
      splice_op: use row-first while_loop copy.
      conv1d: use conv1d as the TDNN equivalent.
  '''
  if hasattr(context, '__iter__'):
    context_size = len(context)
    if method in ('splice_op', 'conv1d'):
      msg = 'Methods splice_op and conv1d do not support a context list.'
      raise ValueError(msg)
    context_list = context
  else:
    context_size = context * 2 + 1
    context_list = range(-context, context + 1)

  with tf.variable_scope(name):
    if method == 'splice_layer':
      x = splice_layer(x, 'splice', context_list)
      x = linear(x, 'linear', [in_dim * context_size, out_dim],
                 has_bias=has_bias)
    elif method == 'splice_op':
      x = speech_ops.splice(x, context, context)
      x = linear(x, 'linear', [in_dim * context_size, out_dim],
                 has_bias=has_bias)
    elif method == 'conv1d':
      kernel = tf.get_variable(
          name='DW',
          shape=[context, in_dim, out_dim],
          dtype=tf.float32,
          initializer=tf.glorot_uniform_initializer())
      x = tf.nn.conv1d(x, kernel, stride=1, padding='SAME')
      if has_bias:
        b = tf.get_variable(name='bias',
                            shape=[out_dim],
                            dtype=tf.float32,
                            initializer=tf.constant_initializer(0.0))
        x = tf.nn.bias_add(x, b)
    else:
      raise ValueError('Unsupported method: %s.' % (method))
  return x

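# Hedged usage sketch (illustration only, not from the original file): the
# expected shape flow of `tdnn` above. It assumes TF 1.x graph mode and a
# tf.matmul that broadcasts over the leading batch/time dimension, which the
# splice_layer + linear path above relies on.
import tensorflow as tf

def _tdnn_usage_sketch():
  feats = tf.placeholder(tf.float32, [None, None, 40])  # (batch, time, feat)
  # A symmetric context of 2 splices 5 frames, so the linear input dim is 40 * 5.
  y1 = tdnn(feats, 'tdnn_sym', in_dim=40, context=2, out_dim=128,
            method='splice_layer')
  # Discrete context offsets are only supported by the 'splice_layer' method.
  y2 = tdnn(feats, 'tdnn_list', in_dim=40, context=[-2, 0, 2], out_dim=128,
            method='splice_layer')
  return y1, y2  # both (batch, time, 128)
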
def text_layer(self, x, input_text):
  ''' Text embedding layers. '''
  with tf.variable_scope('text'):
    embedding_chars_expanded = common_layers.embedding_look_up(
        input_text, self.vocab_size, self.netconf['embedding_dim'])
    h_pool_flat = common_layers.conv_pool(
        embedding_chars_expanded,
        list(map(int, self.netconf['filter_sizes'])),
        self.netconf['embedding_dim'], self.netconf['num_filters'],
        input_text.shape[1])
    outputs = tf.concat((x, h_pool_flat), axis=1)
  return outputs

def get_eval_hooks(self, labels, logits):
  '''
  labels: [batch]
  logits: [batch, num_classes]
  '''
  eval_hooks = []
  metric_tensor = {}
  with tf.variable_scope('metrics'):
    true_label = labels
    softmax = tf.nn.softmax(logits)
    pred_label = tf.argmax(softmax, -1)
    eval_metrics_ops = {
        'accuracy':
            tf.metrics.accuracy(labels=true_label,
                                predictions=pred_label,
                                weights=None),
        'auc':
            tf.metrics.auc(labels=true_label,
                           predictions=softmax[:, -1],
                           num_thresholds=20,
                           curve='ROC',
                           summation_method='trapezoidal'),
        'precision':
            tf.metrics.precision(labels=true_label,
                                 predictions=pred_label,
                                 weights=None),
        'recall':
            tf.metrics.recall(labels=true_label,
                              predictions=pred_label,
                              weights=None),
        'tp':
            tf.metrics.true_positives(labels=true_label,
                                      predictions=pred_label,
                                      weights=None),
        'fn':
            tf.metrics.false_negatives(labels=true_label,
                                       predictions=pred_label,
                                       weights=None),
        'fp':
            tf.metrics.false_positives(labels=true_label,
                                       predictions=pred_label,
                                       weights=None),
        'tn':
            tf.metrics.true_negatives(labels=true_label,
                                      predictions=pred_label,
                                      weights=None),
    }
    metric_tensor.update(
        {key: val[0] for key, val in eval_metrics_ops.items()})

  metric_hook = tf.train.LoggingTensorHook(tensors=metric_tensor,
                                           every_n_iter=10000,
                                           every_n_secs=None,
                                           at_end=False,
                                           formatter=None)
  eval_hooks.append(metric_hook)
  return eval_hooks, eval_metrics_ops

def call(self, features, **kwargs):
  ''' Implementation of __call__(). '''
  self.train = kwargs['training']
  feats = tf.identity(features['inputs'], name='feats')
  logging.info(features)
  if 'labels' in features:
    labels = features['labels']
  else:
    # serving export mode
    labels = None
  with tf.variable_scope('model', reuse=tf.AUTO_REUSE):
    feats = self.preprocess(feats)
    logits = self.model(feats, labels)
  return logits

def linear(x, names, shapes, has_bias=True):
  """Linear layer."""
  assert len(shapes) == 2
  with tf.variable_scope(names):
    weights = tf.get_variable(name='weights',
                              shape=shapes,
                              initializer=tf.initializers.glorot_uniform())
    if has_bias:
      bias = tf.get_variable(name='bias',
                             shape=shapes[1],
                             initializer=tf.initializers.glorot_uniform())
      return tf.matmul(x, weights) + bias
    else:
      return tf.matmul(x, weights)

def dense_layer(self, x):
  ''' fc layers '''
  with tf.variable_scope('dense'):
    shape = x.shape[-1].value
    y = common_layers.linear(x, 'dense-matmul',
                             [shape, self.netconf['hidden1']])
    if self.netconf['use_bn']:
      y = tf.layers.batch_normalization(y,
                                        axis=-1,
                                        momentum=0.99,
                                        training=self.train,
                                        name='dense-bn')
    y = tf.nn.relu6(y)
    if self.netconf['use_dropout']:
      y = tf.layers.dropout(y,
                            self.netconf['dropout_rate'],
                            training=self.train)
  return y

def linear_block(self, x):
  '''
  Linear layer for dim reduction.
  x: shape [batch, time, feat, channel]
  output: shape [b, t, f]
  '''
  with tf.variable_scope('linear'):
    times, feat, channel = x.shape.as_list()[1:]
    x = tf.reshape(x, [-1, feat * channel])
    if self.netconf['use_dropout']:
      x = tf.layers.dropout(x,
                            self.netconf['dropout_rate'],
                            training=self.train)
    x = common_layers.linear(x, 'linear1',
                             [feat * channel, self.netconf['linear_num']])
    #x = tf.nn.relu6(x)
    x = tf.reshape(x, [-1, times, self.netconf['linear_num']])
  return x

def splice_layer(x, name, context):
  '''
  Splice a tensor along the last dimension with context.
  e.g.:
    t = [[[1, 2, 3], [4, 5, 6], [7, 8, 9]]]
    splice_tensor(t, [0, 1]) =
      [[[1, 2, 3, 4, 5, 6],
        [4, 5, 6, 7, 8, 9],
        [7, 8, 9, 7, 8, 9]]]

  Args:
    tensor: a tf.Tensor with shape (B, T, D) a.k.a. (N, H, W)
    context: a list of context offsets

  Returns:
    spliced tensor with shape (..., D * len(context))
  '''
  with tf.variable_scope(name):
    input_shape = tf.shape(x)
    B, T = input_shape[0], input_shape[1]
    context_len = len(context)
    array = tf.TensorArray(x.dtype, size=context_len)
    for idx, offset in enumerate(context):
      begin = offset
      end = T + offset
      if begin < 0:
        begin = 0
        sliced = x[:, begin:end, :]
        tiled = tf.tile(x[:, 0:1, :], [1, abs(offset), 1])
        final = tf.concat((tiled, sliced), axis=1)
      else:
        end = T
        sliced = x[:, begin:end, :]
        tiled = tf.tile(x[:, -1:, :], [1, abs(offset), 1])
        final = tf.concat((sliced, tiled), axis=1)
      array = array.write(idx, final)
    spliced = array.stack()
    spliced = tf.transpose(spliced, (1, 2, 0, 3))
    spliced = tf.reshape(spliced, (B, T, -1))
  return spliced

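# Hedged check (added for illustration, not from the original file): runs the
# docstring example of `splice_layer` above under a TF 1.x session.
import tensorflow as tf

def _splice_layer_example():
  t = tf.constant([[[1., 2., 3.], [4., 5., 6.], [7., 8., 9.]]])  # (1, 3, 3)
  spliced = splice_layer(t, 'splice_demo', [0, 1])
  with tf.Session() as sess:
    out = sess.run(spliced)
  # Expected, per the docstring:
  # [[[1, 2, 3, 4, 5, 6], [4, 5, 6, 7, 8, 9], [7, 8, 9, 7, 8, 9]]]
  print(out)
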
def _build_attention(self, enc_outputs, enc_seq_len):
  with tf.variable_scope("AttentionMechanism"):
    attention_type = self.params['attention_type']
    if attention_type == 'bahdanau':
      attention_mechanism = seq2seq.BahdanauAttention(
          num_units=2 * self.cell_dim,
          memory=enc_outputs,
          memory_sequence_length=enc_seq_len,
          probability_fn=tf.nn.softmax,
          normalize=True,
          dtype=tf.get_variable_scope().dtype)
    elif attention_type == 'luong':
      attention_mechanism = seq2seq.LuongAttention(
          num_units=2 * self.cell_dim,
          memory=enc_outputs,
          memory_sequence_length=enc_seq_len,
          probability_fn=tf.nn.softmax,
          dtype=tf.get_variable_scope().dtype)
    else:
      raise ValueError('Unknown attention type: %s' % (attention_type))
  return attention_mechanism

def conv_pool(embedded_chars_expanded, filter_sizes, embedding_size,
              num_filters, sequence_length):
  """
  Text convolution and max pooling that produce a one-dimensional
  representation of the text.
  :param filter_sizes:
  :return:
  """
  pooled_outputs = []
  for filter_size in filter_sizes:
    with tf.variable_scope("conv-maxpool-%s" % filter_size):
      # Convolution layer
      filter_shape = [filter_size, embedding_size, 1, num_filters]
      W = tf.get_variable(name='W',
                          initializer=tf.truncated_normal(filter_shape,
                                                          stddev=0.1))
      b = tf.get_variable(name='b',
                          initializer=tf.constant(0.1, shape=[num_filters]))
      conv = tf.nn.conv2d(embedded_chars_expanded,
                          W,
                          strides=[1, 1, 1, 1],
                          padding="VALID",
                          name="conv")
      # Apply nonlinearity
      h = tf.nn.relu(tf.nn.bias_add(conv, b), name="relu")
      # Max-pooling over the outputs
      pooled = tf.nn.max_pool(
          h,
          ksize=[1, sequence_length - filter_size + 1, 1, 1],
          strides=[1, 1, 1, 1],
          padding='VALID',
          name="pool")
      pooled_outputs.append(pooled)

  # Combine all the pooled features
  num_filters_total = num_filters * len(filter_sizes)
  h_pool = tf.concat(pooled_outputs, 3)
  h_pool_flat = tf.reshape(h_pool, [-1, num_filters_total])
  return h_pool_flat

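# Hedged usage sketch (illustration only, not from the original file): feeding
# the embedding layer above into `conv_pool`. The vocabulary size, embedding
# size and filter settings are arbitrary example values.
import tensorflow as tf

def _text_cnn_sketch(token_ids, vocab_size=10000, embedding_size=200,
                     sequence_length=50):
  # token_ids: (batch, sequence_length) int32
  embedded = embedding_look_up(token_ids, vocab_size, embedding_size)
  # With filter sizes [3, 4, 5] and 128 filters each, the pooled text
  # representation has 3 * 128 = 384 dimensions.
  return conv_pool(embedded,
                   filter_sizes=[3, 4, 5],
                   embedding_size=embedding_size,
                   num_filters=128,
                   sequence_length=sequence_length)  # (batch, 384)
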
def linear_block(self, x):
  '''
  Linear layer for dim reduction.
  x: shape [batch, time, feat, channel]
  output: shape [b, t, f]
  '''
  batch_t = tf.shape(x)[0]
  time_t = tf.shape(x)[1]
  feat, channel = x.shape.as_list()[2:]
  linear_num = self.netconf['linear_num']
  if linear_num > 0:
    with tf.variable_scope('linear'):
      x = tf.reshape(x, [batch_t * time_t, feat * channel])
      if self.netconf['use_dropout']:
        x = tf.layers.dropout(x,
                              self.netconf['dropout_rate'],
                              training=self.train)
      x = common_layers.linear(x, 'linear1', [feat * channel, linear_num])
      x = tf.nn.relu(x)
      if self.netconf['use_bn']:
        bn_name = 'bn_linear'
        x = tf.layers.batch_normalization(x,
                                          axis=-1,
                                          momentum=0.9,
                                          training=self.train,
                                          name=bn_name)
      x = tf.reshape(x, [batch_t, time_t, linear_num])
  else:
    logging.info('linear_num <= 0, only apply reshape.')
    x = tf.reshape(x, [batch_t, time_t, feat * channel])
  return x

def dense_layer(self, x):
  ''' Embedding layers. '''
  with tf.variable_scope('dense'):
    shape = x.shape[-1].value
    hidden_dims = self.netconf['hidden_dims']
    y = x
    use_bn = self.netconf['use_bn']
    remove_nonlin = self.netconf['remove_last_nonlinearity']
    for idx, hidden in enumerate(hidden_dims):
      last_layer = idx == (len(hidden_dims) - 1)
      layer_add_nonlin = not last_layer or not remove_nonlin
      y = common_layers.linear(y,
                               'dense-matmul-%d' % (idx + 1), [shape, hidden],
                               has_bias=(layer_add_nonlin or not use_bn))
      shape = hidden
      embedding = y
      if layer_add_nonlin:
        y = tf.nn.relu(y)
      if use_bn:
        y = tf.layers.batch_normalization(y,
                                          axis=-1,
                                          momentum=0.99,
                                          training=self.train,
                                          name='dense-bn-%d' % (idx + 1))
      if self.netconf['use_dropout'] and layer_add_nonlin:
        y = tf.layers.dropout(y,
                              self.netconf['dropout_rate'],
                              training=self.train)
    if self.netconf['embedding_after_linear']:
      logging.info('Output embedding right after linear layer.')
    else:
      logging.info('Output embedding after non-lin, batch norm and dropout.')
      embedding = y
  return embedding, y

def preprocess(self, inputs, input_text):
  '''
  Preprocess speech and text inputs.
  params:
    inputs: speech input
    input_text: text input
  '''
  with tf.variable_scope('feature'):
    if self.input_type == 'samples':
      # speech feature config
      self.hp = speech_params(
          sr=self.taskconf['audio']['sr'],
          bins=self.audioconf['feature_size'],
          dither=self.train,
          use_delta_deltas=self.audioconf['add_delta_deltas'],
          cmvn=self.audioconf['cmvn'],
          cmvn_path=self.audioconf['cmvn_path'])
      feats = extract_feature(inputs, params=self.hp)
    else:
      self.mean, self.std = utils.load_cmvn(self.audioconf['cmvn_path'])
      feats = utils.apply_cmvn(inputs, self.mean, self.std)
  return feats, input_text

def logits_layer(self, x, labels):
  ''' Logits layer to further produce softmax. '''
  if labels is None:
    # serving export mode, no need for logits
    return x

  output_num = self.taskconf['classes']['num']
  logits_type = self.netconf['logits_type']
  logits_shape = [x.shape[-1].value, output_num]

  with tf.variable_scope('logits'):
    init_type = self.netconf['logits_weight_init']['type']
    if init_type == 'truncated_normal':
      stddev = self.netconf['logits_weight_init']['stddev']
      init = tf.truncated_normal_initializer(stddev=stddev)
    elif init_type == 'xavier_uniform':
      init = tf.contrib.layers.xavier_initializer(uniform=True)
    elif init_type == 'xavier_norm':
      init = tf.contrib.layers.xavier_initializer(uniform=False)
    else:
      raise ValueError('Unsupported weight init type: %s' % (init_type))

    weights = tf.get_variable(name='weights',
                              shape=logits_shape,
                              initializer=init)

    if logits_type == 'linear':
      bias = tf.get_variable(name='bias',
                             shape=logits_shape[1],
                             initializer=tf.constant_initializer(0.0))
      return tf.matmul(x, weights) + bias
    elif logits_type == 'linear_no_bias':
      return tf.matmul(x, weights)
    elif logits_type == 'arcface':
      return self.arcface_layer(x, labels, output_num, weights)

def pooling_layer(self, x, time_len):
  ''' pooling layer '''
  with tf.variable_scope('time_pooling'):
    if self.attention:
      x, self.alphas = common_layers.attention(
          x, self.netconf['attention_size'], return_alphas=True)
      # alphas shape [batch, time, 1] -> [1, batch, time, 1] -> [1, time, batch, 1]
      tf.summary.image(
          'alignment',
          tf.transpose(tf.expand_dims(self.alphas, 0), [0, 2, 1, 3]))
    else:
      if self.netconf['use_lstm_layer']:
        x = tf.concat(x, 2)
      # [batch, seq_len, dim, 1]
      x = tf.expand_dims(x, axis=-1)
      seq_len = time_len
      x = common_layers.max_pool(x,
                                 ksize=[seq_len, 1],
                                 strides=[seq_len, 1])
      if self.netconf['use_lstm_layer']:
        x = tf.reshape(x, [-1, 2 * self.netconf['cell_num']])
      else:
        x = tf.reshape(x, [-1, self.netconf['linear_num']])
  return x

def arcface_loss(embedding,
                 labels,
                 out_num,
                 weights=None,
                 s=64.,
                 m=0.5,
                 limit_to_pi=True):
  '''
  https://github.com/auroua/InsightFace_TF/blob/master/losses/face_losses.py
  :param embedding: the input embedding vectors
  :param labels: the input labels, the shape should be e.g. (batch_size, 1)
  :param s: scale value, default is 64
  :param out_num: output class num
  :param weights: a tf.Variable with shape (embedding.shape[-1], out_num),
         or None to create a new one internally. default = None
  :param m: the margin value, default is 0.5
  :return: the final calculated output; this output is sent into tf.nn.softmax directly
  '''
  cos_m = math.cos(m)
  sin_m = math.sin(m)
  mm = sin_m * m  # issue 1
  threshold = math.cos(math.pi - m)
  with tf.variable_scope('arcface_loss'):
    # inputs and weights norm
    embedding_norm = tf.norm(embedding, axis=1, keep_dims=True)
    embedding = tf.div(embedding, embedding_norm, name='norm_embedding')
    if weights is None:
      weights = tf.get_variable(
          name='weights',
          shape=[embedding.shape[-1].value, out_num],
          initializer=tf.initializers.glorot_uniform())
    weights_norm = tf.norm(weights, axis=0, keep_dims=True)
    weights = tf.div(weights, weights_norm, name='norm_weights')
    # cos(theta + m) = cos(theta) * cos(m) - sin(theta) * sin(m)
    cos_t = tf.matmul(embedding, weights, name='cos_t')
    cos_t2 = tf.square(cos_t, name='cos_2')
    sin_t2 = tf.subtract(1., cos_t2, name='sin_2')
    sin_t = tf.sqrt(sin_t2, name='sin_t')
    cos_mt = s * tf.subtract(tf.multiply(cos_t, cos_m),
                             tf.multiply(sin_t, sin_m),
                             name='cos_mt')

    if limit_to_pi:
      # this condition controls that theta + m stays in range [0, pi]
      #   0 <= theta + m <= pi
      #   -m <= theta <= pi - m
      cond_v = cos_t - threshold
      cond = tf.cast(tf.nn.relu(cond_v, name='if_else'), dtype=tf.bool)
      keep_val = s * (cos_t - mm)
      cos_mt_temp = tf.where(cond, cos_mt, keep_val)
    else:
      cos_mt_temp = cos_mt

    mask = tf.one_hot(labels, depth=out_num, name='one_hot_mask')
    # mask = tf.squeeze(mask, 1)
    inv_mask = tf.subtract(1., mask, name='inverse_mask')
    s_cos_t = tf.multiply(s, cos_t, name='scalar_cos_t')
    output = tf.add(tf.multiply(s_cos_t, inv_mask),
                    tf.multiply(cos_mt_temp, mask),
                    name='arcface_loss_output')
  return output

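# Hedged usage sketch (illustration only, not from the original file): wiring
# `arcface_loss` above into a softmax cross-entropy objective under TF 1.x.
# `num_classes` and the (s, m) values are example choices; the embedding
# tensor must have a static last dimension.
import tensorflow as tf

def _arcface_usage_sketch(embeddings, labels, num_classes):
  # embeddings: (batch, embed_dim) float32, labels: (batch,) integer class ids
  logits = arcface_loss(embeddings, labels, out_num=num_classes, s=64., m=0.5)
  return tf.reduce_mean(
      tf.nn.sparse_softmax_cross_entropy_with_logits(labels=labels,
                                                     logits=logits))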