def forward(self, features, labels, mode, params):
    outputs = dict()
    is_training = (mode == tf.estimator.ModeKeys.TRAIN)
    for (feature_key, feature) in features.items():
        if '/' not in feature_key:
            continue
        feature_key_fields = feature_key.split("/")
        feature_namespace = feature_key_fields[1].strip()
        field_name = feature_key_fields[0].strip()
        if feature_namespace == self._vocab_namespace:
            with tf.variable_scope("embedding/" + self._vocab_namespace, reuse=tf.AUTO_REUSE):
                input_ids = feature
                input_mask = None
                if self._mask_namespace:
                    mask_feature_key = field_name + "/" + self._mask_namespace
                    if mask_feature_key in features:
                        input_mask = features[mask_feature_key]
                    else:
                        logger.warning("The mask namespace %s with field name %s is not in features (%s)"
                                       % (self._mask_namespace, field_name, mask_feature_key))
                if input_mask is None:
                    input_length, input_mask = nn.length(input_ids)
                else:
                    input_length, _ = nn.length(input_ids)
                model = BertModel(
                    config=self._bert_config,
                    is_training=is_training,
                    input_ids=input_ids,
                    input_mask=input_mask,
                    use_one_hot_embeddings=self._use_one_hot_embeddings)
                embedding_output = model.get_sequence_output()
                if self._remove_bos_eos:
                    embedding_output = nn.remove_bos_eos(embedding_output, input_length)
                dropout_rate = params.get('dropout_rate')
                if dropout_rate is None:
                    dropout_rate = self._dropout_rate
                emb_drop = tf.layers.dropout(embedding_output, dropout_rate, training=is_training)
                if self._projection_dim:
                    emb_drop = tf.layers.dense(emb_drop, self._projection_dim, use_bias=False,
                                               kernel_initializer=initializers.xavier_initializer())
                outputs[feature_key] = emb_drop
    return outputs
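# --- Illustrative sketch (not part of the original code) ---
# Every forward() in this file relies on nn.length(ids) to recover per-example sequence
# lengths and a float mask from padded id tensors. That helper is defined elsewhere; a
# minimal sketch of the presumed behaviour for a [batch, time] tensor padded with id 0:
def _length_sketch(token_ids):
    """Return (lengths, mask) for a [batch, time] int tensor whose padding id is 0."""
    mask = tf.cast(tf.sign(tf.abs(token_ids)), tf.float32)     # 1.0 on real tokens, 0.0 on padding
    lengths = tf.cast(tf.reduce_sum(mask, axis=1), tf.int32)   # count of non-padding tokens per row
    return lengths, mask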
def forward(self, features, labels, mode, params):
    features_embedding = self._embedding_mapping.forward(features, labels, mode, params)
    with tf.variable_scope(self._model_name):
        is_training = (mode == tf.estimator.ModeKeys.TRAIN)
        premise_tokens_ids = features.get('premise/tokens', None)
        if premise_tokens_ids is None:
            premise_tokens_ids = features.get('premise/elmo_characters', None)
        hypothesis_tokens_ids = features.get('hypothesis/tokens', None)
        if hypothesis_tokens_ids is None:
            hypothesis_tokens_ids = features.get('hypothesis/elmo_characters', None)
        if premise_tokens_ids is None:
            raise ConfigureError("The input features should contain premise with vocabulary namespace tokens "
                                 "or elmo_characters.")
        if hypothesis_tokens_ids is None:
            raise ConfigureError("The input features should contain hypothesis with vocabulary namespace tokens "
                                 "or elmo_characters.")
        prem_seq_lengths, prem_mask = nn.length(premise_tokens_ids)
        hyp_seq_lengths, hyp_mask = nn.length(hypothesis_tokens_ids)
        if features.get('premise/elmo_characters', None) is not None or \
                isinstance(self._embedding_mapping.get_encoder('tokens'), Bert):
            prem_mask = nn.remove_bos_eos(prem_mask, prem_seq_lengths)
            prem_seq_lengths -= 2
        if features.get('hypothesis/elmo_characters', None) is not None or \
                isinstance(self._embedding_mapping.get_encoder('tokens'), Bert):
            hyp_mask = nn.remove_bos_eos(hyp_mask, hyp_seq_lengths)
            hyp_seq_lengths -= 2
        prem_mask = tf.expand_dims(prem_mask, -1)
        hyp_mask = tf.expand_dims(hyp_mask, -1)

        premise_tokens = features_embedding.get('premise/tokens', None)
        if premise_tokens is None:
            premise_tokens = features_embedding.get('premise/elmo_characters', None)
        hypothesis_tokens = features_embedding.get('hypothesis/tokens', None)
        if hypothesis_tokens is None:
            hypothesis_tokens = features_embedding.get('hypothesis/elmo_characters', None)

        h_s, c1 = nn.lstm(premise_tokens, self._hidden_dim, seq_len=prem_seq_lengths, name='premise')
        h_t, c2 = nn.lstm(hypothesis_tokens, self._hidden_dim, seq_len=hyp_seq_lengths, name='hypothesis')

        lstm_m = MatchLSTMCell(self._hidden_dim, h_s, prem_mask)
        k_m, _ = tf.nn.dynamic_rnn(lstm_m, h_t, hyp_seq_lengths, dtype=tf.float32)
        k_valid = select(k_m, hyp_seq_lengths)

        output_dict = self._make_output(k_valid, params)

        if mode == tf.estimator.ModeKeys.TRAIN or mode == tf.estimator.ModeKeys.EVAL:
            if 'label/labels' not in features:
                raise ConfigureError("The input features should contain label with vocabulary namespace "
                                     "labels in %s dataset." % mode)
            labels_embedding = features_embedding['label/labels']
            labels = features['label/labels']

            loss = self._make_loss(labels=labels_embedding, logits=output_dict['logits'], params=params)
            output_dict['loss'] = loss
            metrics = dict()
            metrics['accuracy'] = tf.metrics.accuracy(labels=labels, predictions=output_dict['predictions'])
            metrics['precision'] = tf.metrics.precision(labels=labels, predictions=output_dict['predictions'])
            metrics['recall'] = tf.metrics.recall(labels=labels, predictions=output_dict['predictions'])
            # metrics['auc'] = tf.metrics.auc(labels=labels, predictions=predictions)
            output_dict['metrics'] = metrics
            # output_dict['debugs'] = [hypothesis_tokens, premise_tokens, hypothesis_bi, premise_bi,
            #                          premise_ave, hypothesis_ave, diff, mul, h, h_mlp, logits]
        return output_dict
def forward(self, features, labels, mode, params):
    features_embedding = self._embedding_mapping.forward(features, labels, mode, params)
    with tf.variable_scope(self._model_name):
        is_training = (mode == tf.estimator.ModeKeys.TRAIN)
        premise_tokens_ids = features.get('premise/tokens', None)
        if premise_tokens_ids is None:
            premise_tokens_ids = features.get('premise/elmo_characters', None)
        if premise_tokens_ids is None:
            raise ConfigureError("The input features should contain premise with vocabulary namespace tokens "
                                 "or elmo_characters.")
        prem_seq_lengths, prem_mask = nn.length(premise_tokens_ids)
        if features.get('premise/elmo_characters', None) is not None or \
                isinstance(self._embedding_mapping.get_encoder('tokens'), Bert):
            prem_mask = nn.remove_bos_eos(prem_mask, prem_seq_lengths)
            prem_seq_lengths -= 2
        # prem_mask = tf.expand_dims(prem_mask, -1)
        prem_mask = tf.cast(prem_mask, tf.bool)

        premise_tokens = features_embedding.get('premise/tokens', None)
        if premise_tokens is None:
            premise_tokens = features_embedding.get('premise/elmo_characters', None)

        with tf.variable_scope('san_fb1'):
            x_fw1 = query_encode_san(premise_tokens, prem_mask, 'forward')   # bs, ql, vec
            x_bw1 = query_encode_san(premise_tokens, prem_mask, 'backward')  # bs, ql, vec
            x_fusion = fusion_gate(premise_tokens, prem_mask, x_fw1, x_bw1)  # bs, ql, vec
        with tf.variable_scope('san_md'):
            x_code = query_encode_md(x_fusion, prem_mask)  # bs, vec

        pre_logits = tf.nn.relu(linear(x_code, self._hidden_dim, True,
                                       scope='pre_logits_linear', is_train=True))  # bs, vec
        logits = linear(pre_logits, self._num_classes, False, scope='get_output', is_train=True)  # bs, cn

        output_dict = self._make_output(logits, params)

        if mode == tf.estimator.ModeKeys.TRAIN or mode == tf.estimator.ModeKeys.EVAL:
            if 'label/labels' not in features:
                raise ConfigureError("The input features should contain label with vocabulary namespace "
                                     "labels in %s dataset." % mode)
            labels_embedding = features_embedding['label/labels']
            labels = features['label/labels']

            loss = self._make_loss(labels=labels_embedding, logits=output_dict['logits'], params=params)
            output_dict['loss'] = loss
            metrics = dict()
            metrics['accuracy'] = tf.metrics.accuracy(labels=labels, predictions=output_dict['predictions'])
            metrics['precision'] = tf.metrics.precision(labels=labels, predictions=output_dict['predictions'])
            metrics['recall'] = tf.metrics.recall(labels=labels, predictions=output_dict['predictions'])
            # tf.metrics.auc(labels=labels, predictions=predictions)
            output_dict['metrics'] = metrics
            # output_dict['debugs'] = [hypothesis_tokens, premise_tokens, hypothesis_bi, premise_bi,
            #                          premise_ave, hypothesis_ave, diff, mul, h, h_mlp, logits]
        return output_dict
def forward(self, features, labels, mode, params):
    features_embedding = self._embedding_mapping.forward(features, labels, mode, params)
    with tf.variable_scope(self._model_name):
        is_training = (mode == tf.estimator.ModeKeys.TRAIN)
        premise_tokens_ids = features.get('premise/tokens', None)
        if premise_tokens_ids is None:
            premise_tokens_ids = features.get('premise/elmo_characters', None)
        hypothesis_tokens_ids = features.get('hypothesis/tokens', None)
        if hypothesis_tokens_ids is None:
            hypothesis_tokens_ids = features.get('hypothesis/elmo_characters', None)
        if premise_tokens_ids is None:
            raise ConfigureError("The input features should contain premise with vocabulary namespace tokens "
                                 "or elmo_characters.")
        if hypothesis_tokens_ids is None:
            raise ConfigureError("The input features should contain hypothesis with vocabulary namespace tokens "
                                 "or elmo_characters.")
        prem_seq_lengths, prem_mask = nn.length(premise_tokens_ids)
        hyp_seq_lengths, hyp_mask = nn.length(hypothesis_tokens_ids)
        if features.get('premise/elmo_characters', None) is not None or \
                isinstance(self._embedding_mapping.get_encoder('tokens'), Bert):
            prem_mask = nn.remove_bos_eos(prem_mask, prem_seq_lengths)
            prem_seq_lengths -= 2
        if features.get('hypothesis/elmo_characters', None) is not None or \
                isinstance(self._embedding_mapping.get_encoder('tokens'), Bert):
            hyp_mask = nn.remove_bos_eos(hyp_mask, hyp_seq_lengths)
            hyp_seq_lengths -= 2
        prem_mask = tf.expand_dims(prem_mask, -1)
        hyp_mask = tf.expand_dims(hyp_mask, -1)

        premise_tokens = features_embedding.get('premise/tokens', None)
        if premise_tokens is None:
            premise_tokens = features_embedding.get('premise/elmo_characters', None)
        hypothesis_tokens = features_embedding.get('hypothesis/tokens', None)
        if hypothesis_tokens is None:
            hypothesis_tokens = features_embedding.get('hypothesis/elmo_characters', None)

        # 2. Input Encoder
        # 2.1 Highway Encoder
        query_emb = premise_tokens
        doc_emb = hypothesis_tokens
        query_len = prem_seq_lengths
        doc_len = hyp_seq_lengths
        query_mask = prem_mask
        doc_mask = hyp_mask
        project_dim = premise_tokens.shape[-1].value
        query_length = tf.shape(premise_tokens)[1]
        doc_length = tf.shape(hypothesis_tokens)[1]
        query_output = nn.highway_network(query_emb, 1, dropout_rate=self._dropout_rate,
                                          is_trainging=is_training, scope="query_highway")
        doc_output = nn.highway_network(doc_emb, 1, dropout_rate=self._dropout_rate,
                                        is_trainging=is_training, scope="doc_highway")

        # 2.2 Co-Attention
        M = tf.Variable(tf.random_normal([project_dim, project_dim], stddev=0.1))
        tmp = tf.einsum("ijk,kl->ijl", query_output, M)
        S = tf.matmul(tmp, doc_output, transpose_b=True)  # [batch, q, d]
        S_mask = tf.matmul(query_mask, doc_mask, transpose_b=True)
        S_mean = S * S_mask
        S_align_max = S + (1. - S_mask) * tf.float32.min

        # 2.2.1 Extractive Pooling
        # Max Pooling
        query_score = tf.nn.softmax(tf.reduce_max(S_align_max, axis=2, keepdims=True), axis=1)
        query_maxpooling = tf.reduce_sum(query_score * query_output, axis=1)  # [batch, r]
        doc_score = tf.nn.softmax(tf.reduce_max(S_align_max, axis=1, keepdims=True), axis=2)
        doc_maxpooling = tf.reduce_sum(tf.transpose(doc_score, [0, 2, 1]) * doc_output, axis=1)  # [batch, r]
        # Mean Pooling
        query_score = tf.nn.softmax(tf.reduce_sum(S_mean, axis=2, keepdims=True) /
                                    (tf.expand_dims(tf.expand_dims(tf.cast(doc_len, tf.float32) + self._eps, -1), -1)),
                                    axis=1)
        query_meanpooling = tf.reduce_sum(query_score * query_output, axis=1)  # [batch, r]
        doc_score = tf.nn.softmax(tf.reduce_sum(S_mean, axis=1, keepdims=True) /
                                  (tf.expand_dims(tf.expand_dims(tf.cast(query_len, tf.float32) + self._eps, -1), -1)),
                                  axis=2)
        doc_meanpooling = tf.reduce_sum(tf.transpose(doc_score, [0, 2, 1]) * doc_output, axis=1)  # [batch, r]

        # 2.2.2 Alignment Pooling
        query_alignment = tf.matmul(tf.nn.softmax(S_align_max, axis=2), doc_output)  # [batch, q, r]
        doc_alignment = tf.matmul(tf.nn.softmax(S_align_max, axis=1), query_output, transpose_a=True)  # [batch, d, r]

        # 2.2.3 Intra Attention
        query_selfattn = nn.self_attention(query_output, query_len)
        doc_selfattn = nn.self_attention(doc_output, doc_len)

        # 2.3 Multi-Cast Attention
        query_maxpooling = tf.tile(tf.expand_dims(query_maxpooling, axis=1), [1, query_length, 1])
        query_meanpooling = tf.tile(tf.expand_dims(query_meanpooling, axis=1), [1, query_length, 1])
        doc_maxpooling = tf.tile(tf.expand_dims(doc_maxpooling, axis=1), [1, doc_length, 1])
        doc_meanpooling = tf.tile(tf.expand_dims(doc_meanpooling, axis=1), [1, doc_length, 1])

        query_max_fc, query_max_fm, query_max_fs = self.cast_attention(query_maxpooling, query_emb, self.nn_fc,
                                                                       name="query_max_pooling")
        query_mean_fc, query_mean_fm, query_mean_fs = self.cast_attention(query_meanpooling, query_emb, self.nn_fc,
                                                                          name="query_mean_pooling")
        query_align_fcm, query_align_fm, query_align_fs = self.cast_attention(query_alignment, query_emb, self.nn_fc,
                                                                              name="query_align_pooling")
        query_selfattn_fc, query_selfattn_fm, query_selfattn_fs = self.cast_attention(query_selfattn, query_emb,
                                                                                      self.nn_fc, name="query_self_pooling")
        doc_max_fc, doc_max_fm, doc_max_fs = self.cast_attention(doc_maxpooling, doc_emb, self.nn_fc,
                                                                 name="doc_max_pooling")
        doc_mean_fc, doc_mean_fm, doc_mean_fs = self.cast_attention(doc_meanpooling, doc_emb, self.nn_fc,
                                                                    name="doc_mean_pooling")
        doc_align_fcm, doc_align_fm, doc_align_fs = self.cast_attention(doc_alignment, doc_emb, self.nn_fc,
                                                                        name="doc_align_pooling")
        doc_selfattn_fc, doc_selfattn_fm, doc_selfattn_fs = self.cast_attention(doc_selfattn, doc_emb, self.nn_fc,
                                                                                name="doc_self_pooling")

        query_cast = tf.concat(
            [query_max_fc, query_max_fm, query_max_fs, query_mean_fc, query_mean_fm, query_mean_fs,
             query_align_fcm, query_align_fm, query_align_fs, query_selfattn_fc, query_selfattn_fm,
             query_selfattn_fs, query_output],
            axis=2)
        doc_cast = tf.concat(
            [doc_max_fc, doc_max_fm, doc_max_fs, doc_mean_fc, doc_mean_fm, doc_mean_fs,
             doc_align_fcm, doc_align_fm, doc_align_fs, doc_selfattn_fc, doc_selfattn_fm,
             doc_selfattn_fs, doc_output],
            axis=2)
        # query_cast = tf.concat(
        #     [query_output],
        #     axis=2)
        # doc_cast = tf.concat(
        #     [doc_output], axis=2)

        query_cast = tf.layers.dropout(query_cast, self._dropout_rate, training=is_training)
        doc_cast = tf.layers.dropout(doc_cast, self._dropout_rate, training=is_training)

        query_hidden, _ = nn.bi_lstm(query_cast, self._hidden_dim, name="query_lstm")
        doc_hidden, _ = nn.bi_lstm(doc_cast, self._hidden_dim, name="doc_lstm")
        query_hidden = tf.concat(query_hidden, axis=2)
        doc_hidden = tf.concat(doc_hidden, axis=2)
        query_hidden = tf.layers.dropout(query_hidden, self._dropout_rate, training=is_training)
        doc_hidden = tf.layers.dropout(doc_hidden, self._dropout_rate, training=is_training)

        # query_hidden_max = query_hidden + (1. - query_mask) * tf.float32.min
        # doc_hidden_max = doc_hidden + (1. - doc_mask) * tf.float32.min
        query_hidden_mean = query_hidden * query_mask
        doc_hidden_mean = doc_hidden * doc_mask

        query_sum = tf.reduce_sum(query_hidden_mean, axis=1)
        query_mean = tf.div(query_sum, tf.expand_dims(tf.cast(query_len, tf.float32), -1) + self._eps)
        query_max = tf.reduce_max(query_hidden_mean, axis=1)
        query_final = tf.concat([query_mean, query_max], axis=1)

        doc_sum = tf.reduce_sum(doc_hidden_mean, axis=1)
        doc_mean = tf.div(doc_sum, tf.expand_dims(tf.cast(doc_len, tf.float32), -1) + self._eps)
        doc_max = tf.reduce_max(doc_hidden_mean, axis=1)
        doc_final = tf.concat([doc_mean, doc_max], axis=1)

        final = tf.concat([query_final, doc_final, query_final * doc_final, query_final - doc_final], axis=1)
        # yout = nn.highway_network(final, 2, dropout_rate=self._drop_rate, is_trainging=is_training)

        # MLP layer
        yout = tf.contrib.layers.fully_connected(final, self._hidden_dim, scope='fc1')
        # Dropout applied to classifier
        output_dict = self._make_output(yout, params)

        if mode == tf.estimator.ModeKeys.TRAIN or mode == tf.estimator.ModeKeys.EVAL:
            if 'label/labels' not in features:
                raise ConfigureError("The input features should contain label with vocabulary namespace "
                                     "labels in %s dataset." % mode)
            labels_embedding = features_embedding['label/labels']
            labels = features['label/labels']

            loss = self._make_loss(labels=labels_embedding, logits=output_dict['logits'], params=params)
            output_dict['loss'] = loss
            metrics = dict()
            metrics['accuracy'] = tf.metrics.accuracy(labels=labels, predictions=output_dict['predictions'])
            metrics['precision'] = tf.metrics.precision(labels=labels, predictions=output_dict['predictions'])
            metrics['recall'] = tf.metrics.recall(labels=labels, predictions=output_dict['predictions'])
            output_dict['metrics'] = metrics
            # output_dict['debugs'] = []
            # debug_ops = [query_mean_fs]  # [query_maxpooling, query_max_fc] [query_max_fm, query_max_fs], [query_mean_fc, query_mean_fm]
            # for op in debug_ops:
            #     output_dict['debugs'].append(tf.shape(op))
            # output_dict['debugs'].append(query_length)
        return output_dict
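# --- Illustrative sketch (not part of the original code) ---
# The co-attention above masks padded cells with the pattern `S + (1. - S_mask) * tf.float32.min`
# before a softmax: padded positions become hugely negative and therefore receive ~0 probability,
# while the mean-pooled branch multiplies by the mask instead. A tiny self-contained demo:
def _masked_softmax_demo():
    scores = tf.constant([[2.0, 1.0, 3.0]])          # raw similarities; the last position is padding
    mask = tf.constant([[1.0, 1.0, 0.0]])            # 1.0 = real token, 0.0 = padding
    masked = scores + (1. - mask) * tf.float32.min   # padding -> most negative float32
    return tf.nn.softmax(masked, axis=-1)            # padding weight is ~0 after softmax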
def forward(self, features, labels, mode, params):
    features_embedding = self._embedding_mapping.forward(features, labels, mode, params)
    with tf.variable_scope(self._model_name):
        is_training = (mode == tf.estimator.ModeKeys.TRAIN)
        premise_tokens_ids = features.get('premise/tokens', None)
        if premise_tokens_ids is None:
            premise_tokens_ids = features.get('premise/elmo_characters', None)
        if premise_tokens_ids is None:
            raise ConfigureError("The input features should contain premise with vocabulary namespace tokens "
                                 "or elmo_characters.")
        prem_seq_lengths, prem_mask = nn.length(premise_tokens_ids)
        if features.get('premise/elmo_characters', None) is not None or \
                isinstance(self._embedding_mapping.get_encoder('tokens'), Bert):
            prem_mask = nn.remove_bos_eos(prem_mask, prem_seq_lengths)
            prem_seq_lengths -= 2
        prem_mask = tf.expand_dims(prem_mask, -1)

        premise_tokens = features_embedding.get('premise/tokens', None)
        if premise_tokens is None:
            premise_tokens = features_embedding.get('premise/elmo_characters', None)

        premise_outs, c1 = nn.bi_lstm(premise_tokens, self._hidden_dim, seq_len=prem_seq_lengths, name='premise')
        premise_bi = tf.concat(premise_outs, axis=2)
        premise_bi = premise_bi * prem_mask

        eps = 1e-11
        ### Mean pooling
        premise_sum = tf.reduce_sum(premise_bi, 1)
        premise_ave = tf.div(premise_sum, tf.expand_dims(tf.cast(prem_seq_lengths, tf.float32), -1) + eps)

        # MLP layer
        h_mlp = tf.contrib.layers.fully_connected(premise_ave, self._hidden_dim, scope='fc1')
        # Dropout applied to classifier
        h_drop = tf.layers.dropout(h_mlp, self._dropout_rate, training=is_training)
        # Get prediction
        output_dict = self._make_output(h_drop, params)

        if mode == tf.estimator.ModeKeys.TRAIN or mode == tf.estimator.ModeKeys.EVAL:
            if 'label/labels' not in features:
                raise ConfigureError("The input features should contain label with vocabulary namespace "
                                     "labels in %s dataset." % mode)
            labels_embedding = features_embedding['label/labels']
            labels = features['label/labels']

            loss = self._make_loss(labels=labels_embedding, logits=output_dict['logits'], params=params)
            output_dict['loss'] = loss
            metrics = dict()
            metrics['accuracy'] = tf.metrics.accuracy(labels=labels, predictions=output_dict['predictions'])
            metrics['precision'] = tf.metrics.precision(labels=labels, predictions=output_dict['predictions'])
            metrics['recall'] = tf.metrics.recall(labels=labels, predictions=output_dict['predictions'])
            metrics['map'] = tf.metrics.average_precision_at_k(labels=tf.cast(labels, tf.int64),
                                                               predictions=output_dict['logits'], k=2)
            metrics['precision_1'] = tf.metrics.precision_at_k(labels=tf.cast(labels, tf.int64),
                                                               predictions=output_dict['logits'], k=1, class_id=1)
            # tf.metrics.auc(labels=labels, predictions=predictions)
            output_dict['metrics'] = metrics
            # output_dict['debugs'] = [hypothesis_tokens, premise_tokens, hypothesis_bi, premise_bi,
            #                          premise_ave, hypothesis_ave, diff, mul, h, h_mlp, logits]
        return output_dict
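# --- Illustrative sketch (not part of the original code) ---
# The eps-guarded mean pooling above (sum over time divided by the true length) only gives a
# correct average because premise_bi was already multiplied by the mask, so padded steps
# contribute zeros to the sum. The same idea in isolation:
def _masked_mean_demo(seq, mask, lengths, eps=1e-11):
    """seq: [batch, time, dim]; mask: [batch, time, 1]; lengths: [batch]."""
    summed = tf.reduce_sum(seq * mask, axis=1)                                # zero out padding, then sum over time
    return summed / (tf.expand_dims(tf.cast(lengths, tf.float32), -1) + eps)  # divide by the real length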
def forward(self, features, labels, mode, params):
    global_step = tf.train.get_or_create_global_step()
    dropout_keep_rate = tf.train.exponential_decay(self._keep_prob, global_step, self._dropout_decay_step,
                                                   self._dropout_decay_rate, staircase=False,
                                                   name='dropout_keep_rate')
    tf.summary.scalar('dropout_keep_rate', dropout_keep_rate)
    params.add_hparam('dropout_rate', 1 - dropout_keep_rate)
    features_embedding = self._embedding_mapping.forward(features, labels, mode, params)
    with tf.variable_scope(self._model_name):
        is_training = (mode == tf.estimator.ModeKeys.TRAIN)
        premise_tokens_ids = features.get('premise/tokens', None)
        if premise_tokens_ids is None:
            premise_tokens_ids = features.get('premise/elmo_characters', None)
        hypothesis_tokens_ids = features.get('hypothesis/tokens', None)
        if hypothesis_tokens_ids is None:
            hypothesis_tokens_ids = features.get('hypothesis/elmo_characters', None)
        if premise_tokens_ids is None:
            raise ConfigureError("The input features should contain premise with vocabulary namespace tokens "
                                 "or elmo_characters.")
        if hypothesis_tokens_ids is None:
            raise ConfigureError("The input features should contain hypothesis with vocabulary namespace tokens "
                                 "or elmo_characters.")
        prem_seq_lengths, prem_mask = nn.length(premise_tokens_ids)
        hyp_seq_lengths, hyp_mask = nn.length(hypothesis_tokens_ids)
        if features.get('premise/elmo_characters', None) is not None or \
                isinstance(self._embedding_mapping.get_encoder('tokens'), Bert):
            prem_mask = nn.remove_bos_eos(prem_mask, prem_seq_lengths)
            prem_seq_lengths -= 2
        if features.get('hypothesis/elmo_characters', None) is not None or \
                isinstance(self._embedding_mapping.get_encoder('tokens'), Bert):
            hyp_mask = nn.remove_bos_eos(hyp_mask, hyp_seq_lengths)
            hyp_seq_lengths -= 2
        prem_mask = tf.expand_dims(prem_mask, -1)
        hyp_mask = tf.expand_dims(hyp_mask, -1)

        premise_ins = []
        hypothesis_ins = []
        premise_tokens = features_embedding.get('premise/tokens', None)
        if premise_tokens is None:
            premise_tokens = features_embedding.get('premise/elmo_characters', None)
        hypothesis_tokens = features_embedding.get('hypothesis/tokens', None)
        if hypothesis_tokens is None:
            hypothesis_tokens = features_embedding.get('hypothesis/elmo_characters', None)
        premise_ins.append(premise_tokens)
        hypothesis_ins.append(hypothesis_tokens)

        premise_chars = features_embedding.get('premise/chars', None)
        hypothesis_chars = features_embedding.get('hypothesis/chars', None)
        if premise_chars is not None and hypothesis_chars is not None:
            with tf.variable_scope("conv") as scope:
                conv_pre = nn.multi_conv1d_max(premise_chars, self._char_filter_size, self._char_filter_channel_dims,
                                               "VALID", is_training, dropout_keep_rate, scope='conv')
                scope.reuse_variables()
                conv_hyp = nn.multi_conv1d_max(hypothesis_chars, self._char_filter_size, self._char_filter_channel_dims,
                                               "VALID", is_training, dropout_keep_rate, scope='conv')
                # conv_pre = tf.reshape(conv_pre, [-1, self.sequence_length, config.char_out_size])
                # conv_hyp = tf.reshape(conv_hyp, [-1, self.sequence_length, config.char_out_size])
                premise_ins.append(conv_pre)
                hypothesis_ins.append(conv_hyp)

        premise_pos = features_embedding.get('premise/pos_tags', None)
        hypothesis_pos = features_embedding.get('hypothesis/pos_tags', None)
        if premise_pos is not None and hypothesis_pos is not None:
            premise_ins.append(premise_pos)
            hypothesis_ins.append(hypothesis_pos)

        premise_exact_match = features.get('premise/exact_match_labels', None)
        hypothesis_exact_match = features.get('hypothesis/exact_match_labels', None)
        if premise_exact_match is not None and hypothesis_exact_match is not None:
            premise_ins.append(tf.expand_dims(tf.cast(premise_exact_match, tf.float32), -1))
            hypothesis_ins.append(tf.expand_dims(tf.cast(hypothesis_exact_match, tf.float32), -1))

        premise_in = tf.concat(premise_ins, axis=2)
        hypothesis_in = tf.concat(hypothesis_ins, axis=2)

        with tf.variable_scope("highway") as scope:
            premise_in = nn.highway_network(premise_in, self._highway_num_layers)
            scope.reuse_variables()
            hypothesis_in = nn.highway_network(hypothesis_in, self._highway_num_layers)

        with tf.variable_scope("prepro") as scope:
            pre = premise_in
            hyp = hypothesis_in
            for i in range(self._num_self_att_enc_layers):
                with tf.variable_scope("attention_encoder_%s" % i, reuse=False):
                    pre_att = nn.self_attention(pre, prem_seq_lengths, func='tri_linear',
                                                scope="premise_self_attention")
                    p = nn.fuse_gate(pre, pre_att, scope="premise_fuse_gate")
                    hyp_att = nn.self_attention(hyp, hyp_seq_lengths, func='tri_linear',
                                                scope="hypothesis_self_attention")
                    h = nn.fuse_gate(hyp, hyp_att, scope="hypothesis_fuse_gate")
                    pre = p
                    hyp = h
                    nn.variable_summaries(p, "p_self_enc_summary_layer_{}".format(i))
                    nn.variable_summaries(h, "h_self_enc_summary_layer_{}".format(i))

        with tf.variable_scope("main") as scope:
            pre = p
            hyp = h
            with tf.variable_scope("interaction"):
                pre_length = tf.shape(pre)[1]
                hyp_length = tf.shape(hyp)[1]
                pre_new = tf.tile(tf.expand_dims(pre, 2), [1, 1, hyp_length, 1])
                hyp_new = tf.tile(tf.expand_dims(hyp, 1), [1, pre_length, 1, 1])
                bi_att_mx = pre_new * hyp_new
                # mask = tf.expand_dims(tf.sequence_mask(query_len, tf.shape(query)[1], dtype=tf.float32), axis=2) * \
                #        tf.expand_dims(tf.sequence_mask(key_len, tf.shape(key)[1], dtype=tf.float32), axis=1)
                bi_att_mx = tf.layers.dropout(bi_att_mx, 1 - dropout_keep_rate, training=is_training)

            with tf.variable_scope("dense_net"):
                dim = bi_att_mx.get_shape().as_list()[-1]
                act = tf.nn.relu if self._first_scale_down_layer_relu else None
                fm = tf.contrib.layers.convolution2d(bi_att_mx, int(dim * self._dense_net_first_scale_down_ratio),
                                                     self._first_scale_down_kernel, padding="SAME", activation_fn=act)
                fm = nn.dense_net_block(fm, self._dense_net_growth_rate, self._num_dense_net_layers,
                                        self._dense_net_kernel_size, scope="first_dense_net_block")
                fm = nn.dense_net_transition_layer(fm, self._dense_net_transition_rate, scope='second_transition_layer')
                fm = nn.dense_net_block(fm, self._dense_net_growth_rate, self._num_dense_net_layers,
                                        self._dense_net_kernel_size, scope="second_dense_net_block")
                fm = nn.dense_net_transition_layer(fm, self._dense_net_transition_rate, scope='third_transition_layer')
                fm = nn.dense_net_block(fm, self._dense_net_growth_rate, self._num_dense_net_layers,
                                        self._dense_net_kernel_size, scope="third_dense_net_block")
                fm = nn.dense_net_transition_layer(fm, self._dense_net_transition_rate, scope='fourth_transition_layer')

                shape_list = list(fm.get_shape())
                # print(shape_list)
                premise_final = tf.reshape(fm, [-1, shape_list[1] * shape_list[2] * shape_list[3]])

        output_dict = self._make_output(premise_final, params)

        if mode == tf.estimator.ModeKeys.TRAIN or mode == tf.estimator.ModeKeys.EVAL:
            if 'label/labels' not in features:
                raise ConfigureError("The input features should contain label with vocabulary namespace "
                                     "labels in %s dataset." % mode)
            labels_embedding = features_embedding['label/labels']
            labels = features['label/labels']

            loss = self._make_loss(labels=labels_embedding, logits=output_dict['logits'], params=params)

            ####### l2 loss #################
            if self._l2_loss:
                if self._sigmoid_growing_l2loss:
                    weights_added = tf.add_n([tf.nn.l2_loss(tensor) for tensor in tf.trainable_variables()
                                              if tensor.name.endswith("weights:0") or tensor.name.endswith('kernel:0')
                                              or tensor.name.endswith('filter:0')])
                    full_l2_step = tf.constant(self._weight_l2loss_step_full_reg, dtype=tf.int32, shape=[],
                                               name='full_l2reg_step')
                    full_l2_ratio = tf.constant(self._l2_regularization_ratio, dtype=tf.float32, shape=[],
                                                name='l2_regularization_ratio')
                    gs_flt = tf.cast(global_step, tf.float32)
                    half_l2_step_flt = tf.cast(full_l2_step / 2, tf.float32)
                    # l2loss_ratio = tf.sigmoid(tf.cast((self.global_step - full_l2_step / 2) * 8, tf.float32) /
                    #                           tf.cast(full_l2_step / 2, tf.float32)) * full_l2_ratio
                    l2loss_ratio = tf.sigmoid(((gs_flt - half_l2_step_flt) * 8) / half_l2_step_flt) * full_l2_ratio
                    tf.summary.scalar('l2loss_ratio', l2loss_ratio)
                    l2loss = weights_added * l2loss_ratio
                else:
                    l2loss = tf.add_n([tf.nn.l2_loss(tensor) for tensor in tf.trainable_variables()
                                       if tensor.name.endswith("weights:0") or tensor.name.endswith('kernel:0')]) * \
                             tf.constant(self._l2_regularization_ratio, dtype='float', shape=[],
                                         name='l2_regularization_ratio')
                tf.summary.scalar('l2loss', l2loss)

            ###### diff loss ###############################
            diffs = []
            for i in range(self._num_self_att_enc_layers):
                for tensor in tf.trainable_variables():
                    # print(tensor.name)
                    if tensor.name == "diin/prepro/attention_encoder_{}/premise_self_attention/similar_mat/similar_func/arg/kernel:0".format(i):
                        l_lg = tensor
                    elif tensor.name == "diin/prepro/attention_encoder_{}/hypothesis_self_attention/similar_mat/similar_func/arg/kernel:0".format(i):
                        r_lg = tensor
                    elif tensor.name == "diin/prepro/attention_encoder_{}/premise_fuse_gate/lhs_1/kernel:0".format(i):
                        l_fg_lhs_1 = tensor
                    elif tensor.name == "diin/prepro/attention_encoder_{}/hypothesis_fuse_gate/lhs_1/kernel:0".format(i):
                        r_fg_lhs_1 = tensor
                    elif tensor.name == "diin/prepro/attention_encoder_{}/premise_fuse_gate/rhs_1/kernel:0".format(i):
                        l_fg_rhs_1 = tensor
                    elif tensor.name == "diin/prepro/attention_encoder_{}/hypothesis_fuse_gate/rhs_1/kernel:0".format(i):
                        r_fg_rhs_1 = tensor
                    elif tensor.name == "diin/prepro/attention_encoder_{}/premise_fuse_gate/lhs_2/kernel:0".format(i):
                        l_fg_lhs_2 = tensor
                    elif tensor.name == "diin/prepro/attention_encoder_{}/hypothesis_fuse_gate/lhs_2/kernel:0".format(i):
                        r_fg_lhs_2 = tensor
                    elif tensor.name == "diin/prepro/attention_encoder_{}/premise_fuse_gate/rhs_2/kernel:0".format(i):
                        l_fg_rhs_2 = tensor
                    elif tensor.name == "diin/prepro/attention_encoder_{}/hypothesis_fuse_gate/rhs_2/kernel:0".format(i):
                        r_fg_rhs_2 = tensor
                    if tensor.name == "diin/prepro/attention_encoder_{}/premise_fuse_gate/lhs_3/kernel:0".format(i):
                        l_fg_lhs_3 = tensor
                    elif tensor.name == "diin/prepro/attention_encoder_{}/hypothesis_fuse_gate/lhs_3/kernel:0".format(i):
                        r_fg_lhs_3 = tensor
                    elif tensor.name == "diin/prepro/attention_encoder_{}/premise_fuse_gate/rhs_3/kernel:0".format(i):
                        l_fg_rhs_3 = tensor
                    elif tensor.name == "diin/prepro/attention_encoder_{}/hypothesis_fuse_gate/rhs_3/kernel:0".format(i):
                        r_fg_rhs_3 = tensor
                diffs += [l_lg - r_lg, l_fg_lhs_1 - r_fg_lhs_1, l_fg_rhs_1 - r_fg_rhs_1,
                          l_fg_lhs_2 - r_fg_lhs_2, l_fg_rhs_2 - r_fg_rhs_2]
                diffs += [l_fg_lhs_3 - r_fg_lhs_3, l_fg_rhs_3 - r_fg_rhs_3]
            diff_loss = tf.add_n([tf.nn.l2_loss(tensor) for tensor in diffs]) * tf.constant(
                self._diff_penalty_loss_ratio, dtype='float', shape=[], name='diff_penalty_loss_ratio')
            tf.summary.scalar('diff_loss', diff_loss)
            ###############################

            output_dict['loss'] = loss + l2loss + diff_loss
            metrics = dict()
            metrics['accuracy'] = tf.metrics.accuracy(labels=labels, predictions=output_dict['predictions'])
            metrics['precision'] = tf.metrics.precision(labels=labels, predictions=output_dict['predictions'])
            metrics['recall'] = tf.metrics.recall(labels=labels, predictions=output_dict['predictions'])
            output_dict['metrics'] = metrics
            # output_dict['debugs'] = [hypothesis_tokens, premise_tokens, hypothesis_bi, premise_bi,
            #                          premise_ave, hypothesis_ave, diff, mul, h, h_mlp, logits]
        return output_dict
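# --- Illustrative sketch (not part of the original code) ---
# The "sigmoid growing" L2 ratio above follows
#     ratio(step) = sigmoid(8 * (step - T/2) / (T/2)) * full_ratio
# where T is _weight_l2loss_step_full_reg: regularization is ~0 early in training, reaches half
# of full_ratio at step T/2, and saturates near full_ratio around step T. A plain-Python check
# with illustrative numbers (T=100000, full_ratio=9e-5 are made-up values):
import math

def _l2_ratio_sketch(step, full_step, full_ratio):
    half = full_step / 2.0
    return 1.0 / (1.0 + math.exp(-8.0 * (step - half) / half)) * full_ratio

# _l2_ratio_sketch(0, 100000, 9e-5)      -> ~3e-8   (almost no regularization at the start)
# _l2_ratio_sketch(50000, 100000, 9e-5)  -> 4.5e-5  (half of the full ratio at the midpoint)
# _l2_ratio_sketch(100000, 100000, 9e-5) -> ~9e-5   (saturates at the configured ratio)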
def forward(self, features, labels, mode, params):
    features_embedding = self._embedding_mapping.forward(features, labels, mode, params)
    with tf.variable_scope(self._model_name):
        is_training = (mode == tf.estimator.ModeKeys.TRAIN)
        premise_tokens_ids = features.get('premise/tokens', None)
        if premise_tokens_ids is None:
            premise_tokens_ids = features.get('premise/elmo_characters', None)
        hypothesis_tokens_ids = features.get('hypothesis/tokens', None)
        if hypothesis_tokens_ids is None:
            hypothesis_tokens_ids = features.get('hypothesis/elmo_characters', None)
        if premise_tokens_ids is None:
            raise ConfigureError("The input features should contain premise with vocabulary namespace tokens "
                                 "or elmo_characters.")
        if hypothesis_tokens_ids is None:
            raise ConfigureError("The input features should contain hypothesis with vocabulary namespace tokens "
                                 "or elmo_characters.")
        prem_seq_lengths, prem_mask = nn.length(premise_tokens_ids)
        hyp_seq_lengths, hyp_mask = nn.length(hypothesis_tokens_ids)
        if features.get('premise/elmo_characters', None) is not None or \
                isinstance(self._embedding_mapping.get_encoder('tokens'), Bert):
            prem_mask = nn.remove_bos_eos(prem_mask, prem_seq_lengths)
            prem_seq_lengths -= 2
        if features.get('hypothesis/elmo_characters', None) is not None or \
                isinstance(self._embedding_mapping.get_encoder('tokens'), Bert):
            hyp_mask = nn.remove_bos_eos(hyp_mask, hyp_seq_lengths)
            hyp_seq_lengths -= 2
        # prem_mask = tf.expand_dims(prem_mask, -1)
        # hyp_mask = tf.expand_dims(hyp_mask, -1)

        premise_tokens = features_embedding.get('premise/tokens', None)
        if premise_tokens is None:
            premise_tokens = features_embedding.get('premise/elmo_characters', None)
        hypothesis_tokens = features_embedding.get('hypothesis/tokens', None)
        if hypothesis_tokens is None:
            hypothesis_tokens = features_embedding.get('hypothesis/elmo_characters', None)

        with tf.variable_scope("Attend"):
            F_a_bar = self._feedForwardBlock(premise_tokens, self._hidden_dim, 'F', is_training=is_training)
            F_b_bar = self._feedForwardBlock(hypothesis_tokens, self._hidden_dim, 'F', isReuse=True,
                                             is_training=is_training)
            # e_i,j = F'(a_hat, b_hat) = F(a_hat).T * F(b_hat) (1)
            # alignment_attention = Attention(self.hidden_size, self.hidden_size)
            # alpha = alignment_attention(F_b_bar, F_a_bar, keys_mask=self.query_mask)
            # beta = alignment_attention(F_a_bar, F_b_bar, keys_mask=self.doc_mask)
            alpha, beta = nn.bi_uni_attention(F_a_bar, F_b_bar, query_len=prem_seq_lengths, key_len=hyp_seq_lengths)

        with tf.variable_scope("Compare"):
            a_beta = tf.concat([premise_tokens, alpha], axis=2)
            b_alpha = tf.concat([hypothesis_tokens, beta], axis=2)
            # v_1,i = G([a_bar_i, beta_i])
            # v_2,j = G([b_bar_j, alpha_j]) (3)
            v_1 = self._feedForwardBlock(a_beta, self._hidden_dim, 'G', is_training=is_training)
            v_2 = self._feedForwardBlock(b_alpha, self._hidden_dim, 'G', isReuse=True, is_training=is_training)

        with tf.variable_scope("Aggregate"):
            # v1 = \sum_{i=1}^l_a v_{1,i}
            # v2 = \sum_{j=1}^l_b v_{2,j} (4)
            v1_sum = tf.reduce_sum(v_1, axis=1)
            v2_sum = tf.reduce_sum(v_2, axis=1)
            # y_hat = H([v1, v2]) (5)
            v = tf.concat([v1_sum, v2_sum], axis=1)
            ff_outputs = self._feedForwardBlock(v, self._hidden_dim, 'H', is_training=is_training)

            output_dict = self._make_output(ff_outputs, params)

        if mode == tf.estimator.ModeKeys.TRAIN or mode == tf.estimator.ModeKeys.EVAL:
            if 'label/labels' not in features:
                raise ConfigureError("The input features should contain label with vocabulary namespace "
                                     "labels in %s dataset." % mode)
            labels_embedding = features_embedding['label/labels']
            labels = features['label/labels']

            loss = self._make_loss(labels=labels_embedding, logits=output_dict['logits'], params=params)
            output_dict['loss'] = loss
            metrics = dict()
            metrics['accuracy'] = tf.metrics.accuracy(labels=labels, predictions=output_dict['predictions'])
            metrics['precision'] = tf.metrics.precision(labels=labels, predictions=output_dict['predictions'])
            metrics['recall'] = tf.metrics.recall(labels=labels, predictions=output_dict['predictions'])
            # metrics['auc'] = tf.metrics.auc(labels=labels, predictions=predictions)
            output_dict['metrics'] = metrics
            # output_dict['debugs'] = [tf.shape(hypothesis_tokens), tf.shape(premise_tokens),
            #                          tf.shape(alpha), tf.shape(beta)]
        return output_dict
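# --- Illustrative sketch (not part of the original code) ---
# nn.bi_uni_attention is not defined in this file. For the decomposable-attention model above it
# presumably implements the soft alignment of Parikh et al.: score every premise token against
# every hypothesis token, then summarize each side for the other with row/column softmaxes.
# A minimal unmasked sketch of that computation:
def _soft_align_sketch(f_a, f_b):
    """f_a: [batch, la, dim]; f_b: [batch, lb, dim]."""
    e = tf.matmul(f_a, f_b, transpose_b=True)                             # [batch, la, lb] alignment scores
    b_for_a = tf.matmul(tf.nn.softmax(e, axis=2), f_b)                    # hypothesis summary for each premise token
    a_for_b = tf.matmul(tf.nn.softmax(e, axis=1), f_a, transpose_a=True)  # premise summary for each hypothesis token
    return b_for_a, a_for_b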
def forward(self, features, labels, mode, params):
    features_embedding = self._embedding_mapping.forward(features, labels, mode, params)
    with tf.variable_scope(self._model_name):
        is_training = (mode == tf.estimator.ModeKeys.TRAIN)
        premise_tokens_ids = features.get('premise/tokens', None)
        if premise_tokens_ids is None:
            premise_tokens_ids = features.get('premise/elmo_characters', None)
        hypothesis_tokens_ids = features.get('hypothesis/tokens', None)
        if hypothesis_tokens_ids is None:
            hypothesis_tokens_ids = features.get('hypothesis/elmo_characters', None)
        if premise_tokens_ids is None:
            raise ConfigureError("The input features should contain premise with vocabulary namespace tokens "
                                 "or elmo_characters.")
        if hypothesis_tokens_ids is None:
            raise ConfigureError("The input features should contain hypothesis with vocabulary namespace tokens "
                                 "or elmo_characters.")
        prem_seq_lengths, prem_mask = nn.length(premise_tokens_ids)
        hyp_seq_lengths, hyp_mask = nn.length(hypothesis_tokens_ids)
        if features.get('premise/elmo_characters', None) is not None or \
                isinstance(self._embedding_mapping.get_encoder('tokens'), Bert):
            prem_mask = nn.remove_bos_eos(prem_mask, prem_seq_lengths)
            prem_seq_lengths -= 2
        if features.get('hypothesis/elmo_characters', None) is not None or \
                isinstance(self._embedding_mapping.get_encoder('tokens'), Bert):
            hyp_mask = nn.remove_bos_eos(hyp_mask, hyp_seq_lengths)
            hyp_seq_lengths -= 2
        prem_mask = tf.expand_dims(prem_mask, -1)
        hyp_mask = tf.expand_dims(hyp_mask, -1)

        premise_tokens = features_embedding.get('premise/tokens', None)
        if premise_tokens is None:
            premise_tokens = features_embedding.get('premise/elmo_characters', None)
        hypothesis_tokens = features_embedding.get('hypothesis/tokens', None)
        if hypothesis_tokens is None:
            hypothesis_tokens = features_embedding.get('hypothesis/elmo_characters', None)

        lm_xor = keras.layers.Lambda(self._xor_match)([premise_tokens_ids, hypothesis_tokens_ids])
        lm_conv = keras.layers.Conv1D(self._lm_filters, premise_tokens_ids.shape[1].value, padding='valid',
                                      activation=self._activation_func)(lm_xor)
        lm_conv = keras.layers.Dropout(self._dropout_rate)(lm_conv, training=is_training)
        lm_feat = keras.layers.Reshape((lm_conv.shape[2].value, ))(lm_conv)
        for hidden_size in self._lm_hidden_sizes:
            lm_feat = keras.layers.Dense(hidden_size, activation=self._activation_func)(lm_feat)
        lm_drop = keras.layers.Dropout(self._dropout_rate)(lm_feat, training=is_training)
        lm_score = keras.layers.Dense(1)(lm_drop)

        dm_q_conv = keras.layers.Conv1D(self._dm_filters, self._dm_kernel_size, padding='same',
                                        activation=self._activation_func)(premise_tokens)
        dm_q_conv = keras.layers.Dropout(self._dropout_rate)(dm_q_conv, training=is_training)
        dm_q_mp = keras.layers.MaxPooling1D(pool_size=premise_tokens_ids.shape[1].value)(dm_q_conv)
        dm_q_rep = keras.layers.Reshape((dm_q_mp.shape[2].value, ))(dm_q_mp)
        dm_q_rep = keras.layers.Dense(self._dm_q_hidden_size)(dm_q_rep)
        dm_q_rep = keras.layers.Lambda(lambda x: tf.expand_dims(x, 1))(dm_q_rep)

        dm_d_conv1 = keras.layers.Conv1D(self._dm_filters, self._dm_kernel_size, padding='same',
                                         activation=self._activation_func)(hypothesis_tokens)
        dm_d_conv1 = keras.layers.Dropout(self._dropout_rate)(dm_d_conv1, training=is_training)
        dm_d_mp = keras.layers.MaxPooling1D(pool_size=self._dm_d_mpool)(dm_d_conv1)
        dm_d_conv2 = keras.layers.Conv1D(self._dm_filters, 1, padding='same',
                                         activation=self._activation_func)(dm_d_mp)
        dm_d_conv2 = keras.layers.Dropout(self._dropout_rate)(dm_d_conv2, training=is_training)

        h_dot = dm_q_rep * dm_d_conv2  # keras.layers.Lambda(self._hadamard_dot)([dm_q_rep, dm_d_conv2])
        dm_feat = keras.layers.Reshape((h_dot.shape[1].value * h_dot.shape[2].value, ))(h_dot)
        for hidden_size in self._dm_hidden_sizes:
            dm_feat = keras.layers.Dense(hidden_size)(dm_feat)
        dm_feat_drop = keras.layers.Dropout(self._dropout_rate)(dm_feat, training=is_training)
        dm_score = keras.layers.Dense(1)(dm_feat_drop)

        add = keras.layers.Add()([lm_score, dm_score])

        # Get prediction
        output_dict = self._make_output(add, params)

        if mode == tf.estimator.ModeKeys.TRAIN or mode == tf.estimator.ModeKeys.EVAL:
            if 'label/labels' not in features:
                raise ConfigureError("The input features should contain label with vocabulary namespace "
                                     "labels in %s dataset." % mode)
            labels_embedding = features_embedding['label/labels']
            labels = features['label/labels']

            loss = self._make_loss(labels=labels_embedding, logits=output_dict['logits'], params=params)
            output_dict['loss'] = loss
            metrics = dict()
            metrics['accuracy'] = tf.metrics.accuracy(labels=labels, predictions=output_dict['predictions'])
            metrics['precision'] = tf.metrics.precision(labels=labels, predictions=output_dict['predictions'])
            metrics['recall'] = tf.metrics.recall(labels=labels, predictions=output_dict['predictions'])
            # metrics['map'] = tf.metrics.average_precision_at_k(labels=tf.cast(labels, tf.int64),
            #                                                    predictions=output_dict['logits'], k=2)
            # metrics['precision_1'] = tf.metrics.precision_at_k(labels=tf.cast(labels, tf.int64),
            #                                                    predictions=output_dict['logits'], k=1, class_id=1)
            # tf.metrics.auc(labels=labels, predictions=predictions)
            output_dict['metrics'] = metrics
            # output_dict['debugs'] = [hypothesis_tokens, premise_tokens, hypothesis_bi, premise_bi,
            #                          premise_ave, hypothesis_ave, diff, mul, h, h_mlp, logits]
        return output_dict
def forward(self, features, labels, mode, params):
    features_embedding = self._embedding_mapping.forward(features, labels, mode, params)
    with tf.variable_scope(self._model_name):
        is_training = (mode == tf.estimator.ModeKeys.TRAIN)
        ######### Word Embedding ####################
        premise_tokens_ids = features.get('premise/tokens', None)
        if premise_tokens_ids is None:
            premise_tokens_ids = features.get('premise/elmo_characters', None)
        hypothesis_tokens_ids = features.get('hypothesis/tokens', None)
        if hypothesis_tokens_ids is None:
            hypothesis_tokens_ids = features.get('hypothesis/elmo_characters', None)
        if premise_tokens_ids is None:
            raise ConfigureError("The input features should contain premise with vocabulary namespace tokens "
                                 "or elmo_characters.")
        if hypothesis_tokens_ids is None:
            raise ConfigureError("The input features should contain hypothesis with vocabulary namespace tokens "
                                 "or elmo_characters.")
        prem_seq_lengths, prem_mask = nn.length(premise_tokens_ids)
        hyp_seq_lengths, hyp_mask = nn.length(hypothesis_tokens_ids)
        if features.get('premise/elmo_characters', None) is not None or \
                isinstance(self._embedding_mapping.get_encoder('tokens'), Bert):
            prem_mask = nn.remove_bos_eos(prem_mask, prem_seq_lengths)
            prem_seq_lengths -= 2
        if features.get('hypothesis/elmo_characters', None) is not None or \
                isinstance(self._embedding_mapping.get_encoder('tokens'), Bert):
            hyp_mask = nn.remove_bos_eos(hyp_mask, hyp_seq_lengths)
            hyp_seq_lengths -= 2
        prem_mask = tf.expand_dims(prem_mask, -1)
        hyp_mask = tf.expand_dims(hyp_mask, -1)

        premise_ins = []
        hypothesis_ins = []
        premise_tokens = features_embedding.get('premise/tokens', None)
        if premise_tokens is None:
            premise_tokens = features_embedding.get('premise/elmo_characters', None)
        hypothesis_tokens = features_embedding.get('hypothesis/tokens', None)
        if hypothesis_tokens is None:
            hypothesis_tokens = features_embedding.get('hypothesis/elmo_characters', None)
        premise_ins.append(premise_tokens)
        hypothesis_ins.append(hypothesis_tokens)

        premise_chars = features_embedding.get('premise/chars', None)
        hypothesis_chars = features_embedding.get('hypothesis/chars', None)
        if premise_chars is not None and hypothesis_chars is not None:
            with tf.variable_scope("conv") as scope:
                conv_pre = nn.multi_conv1d_max(premise_chars, self._char_filter_size, self._char_filter_channel_dims,
                                               "VALID", is_training, self._dropout_rate, scope='conv')
                scope.reuse_variables()
                conv_hyp = nn.multi_conv1d_max(hypothesis_chars, self._char_filter_size, self._char_filter_channel_dims,
                                               "VALID", is_training, self._dropout_rate, scope='conv')
                # conv_pre = tf.reshape(conv_pre, [-1, self.sequence_length, config.char_out_size])
                # conv_hyp = tf.reshape(conv_hyp, [-1, self.sequence_length, config.char_out_size])
                premise_ins.append(conv_pre)
                hypothesis_ins.append(conv_hyp)

        premise_pos = features_embedding.get('premise/pos_tags', None)
        hypothesis_pos = features_embedding.get('hypothesis/pos_tags', None)
        if premise_pos is not None and hypothesis_pos is not None:
            premise_ins.append(premise_pos)
            hypothesis_ins.append(hypothesis_pos)

        premise_exact_match = features.get('premise/exact_match_labels', None)
        hypothesis_exact_match = features.get('hypothesis/exact_match_labels', None)
        if premise_exact_match is not None and hypothesis_exact_match is not None:
            premise_ins.append(tf.expand_dims(tf.cast(premise_exact_match, tf.float32), -1))
            hypothesis_ins.append(tf.expand_dims(tf.cast(hypothesis_exact_match, tf.float32), -1))

        premise_in = tf.concat(premise_ins, axis=2)
        hypothesis_in = tf.concat(hypothesis_ins, axis=2)

        premise_in = nn.highway_network(premise_in, 2, output_size=self._hidden_dim,
                                        dropout_rate=self._dropout_rate, is_trainging=is_training,
                                        scope="premise_highway")
        hypothesis_in = nn.highway_network(hypothesis_in, 2, output_size=self._hidden_dim,
                                           dropout_rate=self._dropout_rate, is_trainging=is_training,
                                           scope="hypothesis_highway")

        ######## Attention Stack-GRU ################
        def gru_network(input, input_len, name="gru_network"):
            with tf.variable_scope(name):
                gru_input = input
                for i in range(self._num_rnn_layer):
                    with tf.variable_scope("layer_%s" % i):
                        seq, c1 = nn.gru(gru_input, self._hidden_dim, seq_len=input_len,
                                         initializer=self._initializer)
                        gru_input = tf.concat([gru_input, seq], axis=2)
            return gru_input

        premise_gru = gru_network(premise_in, prem_seq_lengths, name='premise_gru_network')
        hypothesis_gru = gru_network(hypothesis_in, hyp_seq_lengths, name='hypothesis_gru_network')

        premise_gru = premise_gru * prem_mask
        hypothesis_gru = hypothesis_gru * hyp_mask

        #########
        premise_att = nn.attention_pool(premise_gru, self._hidden_dim, seq_len=prem_seq_lengths,
                                        initializer=self._initializer, name='premise_attention_pool')
        hypothesis_att = nn.attention_pool(hypothesis_gru, self._hidden_dim, seq_len=hyp_seq_lengths,
                                           initializer=self._initializer, name='hypothesis_attention_pool')

        ############ Dynamic Re-read Mechanism ################
        def dynamic_reread(h_seq_a, h_a, h_b, h_a_len, name="dymanic_reread"):
            with tf.variable_scope(name):
                h_a_pre = h_a
                # h_a_pre = nn.highway_layer(h_a, self._hidden_dim, initializer=self._initializer,
                #                            scope="h_a_pre_highway")
                # h_seq_a = nn.highway_layer(h_seq_a, self._hidden_dim, initializer=self._initializer,
                #                            scope="h_seq_a_highway")
                # h_b = nn.highway_layer(h_b, self._hidden_dim, initializer=self._initializer,
                #                        scope="h_b_highway")
                #####
                w_d = tf.get_variable("w_d_weights", (h_seq_a.shape[-1].value, h_a_pre.shape[-1].value),
                                      initializer=self._initializer)
                u_d = tf.get_variable("u_d_weights", (h_a_pre.shape[-1].value, h_a_pre.shape[-1].value),
                                      initializer=self._initializer)
                m_d = tf.get_variable("m_d_weights", (h_b.shape[-1].value, h_a_pre.shape[-1].value),
                                      initializer=self._initializer)
                omega_d = tf.get_variable("omega_d_weights", (h_a_pre.shape[-1].value, 1),
                                          initializer=self._initializer)
                ##########
                m_d_h_b = tf.tensordot(h_b, m_d, axes=[-1, 0])
                h_seq_a_w_d = tf.tensordot(h_seq_a, w_d, axes=[-1, 0])
                if h_a_len is not None:
                    mask = tf.expand_dims(tf.sequence_mask(h_a_len, tf.shape(h_seq_a)[1], dtype=tf.float32), axis=2)
                else:
                    mask = None
                gru_cell = tf.nn.rnn_cell.GRUCell(h_a_pre.shape[-1].value, kernel_initializer=self._initializer)
                for i in range(self._reread_length):
                    u_d_h_a_pre = tf.tensordot(h_a_pre, u_d, axes=[-1, 0])
                    m_a = tf.nn.tanh(h_seq_a_w_d + tf.expand_dims(m_d_h_b + u_d_h_a_pre, 1))
                    m_a = tf.tensordot(m_a, omega_d, axes=[-1, 0])
                    if mask is not None:
                        m_a = m_a + (1. - mask) * tf.float32.min
                    alpha = tf.nn.softmax(self._beta * m_a, axis=1)
                    alpha = tf.reduce_sum(alpha * h_seq_a, axis=1)
                    gru_output, gru_state = gru_cell(alpha, h_a_pre)
                    h_a_pre = gru_state
            return gru_output

        premise_v = dynamic_reread(premise_gru, premise_att, hypothesis_att, prem_seq_lengths,
                                   name='premise_dynamic_reread')
        hypothesis_v = dynamic_reread(hypothesis_gru, hypothesis_att, premise_att, hyp_seq_lengths,
                                      name='hypothesis_dynamic_reread')

        ######## label prediction ##############
        h = tf.concat([premise_att, hypothesis_att, hypothesis_att * premise_att, hypothesis_att - premise_att],
                      axis=-1)
        v = tf.concat([premise_v, hypothesis_v, hypothesis_v * premise_v, hypothesis_v - premise_v], axis=-1)

        # h MLP layer
        h_mlp = tf.layers.dense(h, self._hidden_dim, activation=tf.nn.relu, kernel_initializer=self._initializer,
                                name='h_fc1')
        # Dropout applied to classifier
        h_drop = tf.layers.dropout(h_mlp, self._dropout_rate, training=is_training)
        # Get prediction
        h_logits = tf.layers.dense(h_drop, self._num_classes, activation=None,
                                   kernel_initializer=self._initializer, name='h_logits')
        p_h = tf.nn.softmax(h_logits)

        # v MLP layer
        v_mlp = tf.layers.dense(v, self._hidden_dim, activation=tf.nn.relu, kernel_initializer=self._initializer,
                                name='v_fc1')
        # Dropout applied to classifier
        v_drop = tf.layers.dropout(v_mlp, self._dropout_rate, training=is_training)
        # Get prediction
        v_logits = tf.layers.dense(v_drop, self._num_classes, activation=None,
                                   kernel_initializer=self._initializer, name='v_logits')
        p_v = tf.nn.softmax(v_logits)

        ####
        alpha_h = tf.layers.dense(h, 1, activation=tf.nn.sigmoid, kernel_initializer=self._initializer,
                                  bias_initializer=tf.zeros_initializer())
        alpha_v = tf.layers.dense(v, 1, activation=tf.nn.sigmoid, kernel_initializer=self._initializer,
                                  bias_initializer=tf.zeros_initializer())

        # fuse MLP layer
        fuse_mlp = tf.layers.dense(alpha_h * h + alpha_v * v, self._hidden_dim, activation=tf.nn.relu,
                                   kernel_initializer=self._initializer, name='fuse_fc1')
        # Dropout applied to classifier
        fuse_drop = tf.layers.dropout(fuse_mlp, self._dropout_rate, training=is_training)
        # Get prediction
        output_dict = self._make_output(fuse_drop, params)

        if mode == tf.estimator.ModeKeys.TRAIN or mode == tf.estimator.ModeKeys.EVAL:
            if 'label/labels' not in features:
                raise ConfigureError("The input features should contain label with vocabulary namespace "
                                     "labels in %s dataset." % mode)
            labels_embedding = features_embedding['label/labels']
            labels = features['label/labels']

            h_loss = tf.reduce_mean(tf.nn.softmax_cross_entropy_with_logits_v2(labels=labels_embedding,
                                                                               logits=h_logits))
            v_loss = tf.reduce_mean(tf.nn.softmax_cross_entropy_with_logits_v2(labels=labels_embedding,
                                                                               logits=v_logits))
            fuse_loss = self._make_loss(labels=labels_embedding, logits=output_dict['logits'], params=params)
            output_dict['loss'] = v_loss + h_loss + fuse_loss

            metrics = dict()
            metrics['accuracy'] = tf.metrics.accuracy(labels=labels, predictions=output_dict['predictions'])
            metrics['precision'] = tf.metrics.precision(labels=labels, predictions=output_dict['predictions'])
            metrics['recall'] = tf.metrics.recall(labels=labels, predictions=output_dict['predictions'])
            output_dict['metrics'] = metrics
            # output_dict['debugs'] = [hypothesis_tokens, premise_tokens, hypothesis_bi, premise_bi,
            #                          premise_ave, hypothesis_ave, diff, mul, h, h_mlp, logits]
        return output_dict
def forward(self, features, labels, mode, params):
    features_embedding = self._embedding_mapping.forward(features, labels, mode, params)
    with tf.variable_scope(self._model_name):
        is_training = (mode == tf.estimator.ModeKeys.TRAIN)
        premise_tokens_ids = features.get('premise/tokens', None)
        if premise_tokens_ids is None:
            premise_tokens_ids = features.get('premise/elmo_characters', None)
        hypothesis_tokens_ids = features.get('hypothesis/tokens', None)
        if hypothesis_tokens_ids is None:
            hypothesis_tokens_ids = features.get('hypothesis/elmo_characters', None)
        if premise_tokens_ids is None:
            raise ConfigureError("The input features should contain premise with vocabulary namespace tokens "
                                 "or elmo_characters.")
        if hypothesis_tokens_ids is None:
            raise ConfigureError("The input features should contain hypothesis with vocabulary namespace tokens "
                                 "or elmo_characters.")
        prem_seq_lengths, prem_mask = nn.length(premise_tokens_ids)
        hyp_seq_lengths, hyp_mask = nn.length(hypothesis_tokens_ids)
        if features.get('premise/elmo_characters', None) is not None or \
                isinstance(self._embedding_mapping.get_encoder('tokens'), Bert):
            prem_mask = nn.remove_bos_eos(prem_mask, prem_seq_lengths)
            prem_seq_lengths -= 2
        if features.get('hypothesis/elmo_characters', None) is not None or \
                isinstance(self._embedding_mapping.get_encoder('tokens'), Bert):
            hyp_mask = nn.remove_bos_eos(hyp_mask, hyp_seq_lengths)
            hyp_seq_lengths -= 2
        prem_mask = tf.expand_dims(prem_mask, -1)
        hyp_mask = tf.expand_dims(hyp_mask, -1)

        premise_tokens = features_embedding.get('premise/tokens', None)
        if premise_tokens is None:
            premise_tokens = features_embedding.get('premise/elmo_characters', None)
        hypothesis_tokens = features_embedding.get('hypothesis/tokens', None)
        if hypothesis_tokens is None:
            hypothesis_tokens = features_embedding.get('hypothesis/elmo_characters', None)

        dense_output = tf.layers.dense(premise_tokens, 1, use_bias=False)
        dense_output += (1 - prem_mask) * tf.float32.min
        attention_probs = tf.nn.softmax(dense_output, axis=1)

        # Matching histogram of top-k
        # shape = [B, M, N]
        matching_matrix = tf.matmul(tf.nn.l2_normalize(premise_tokens, axis=2),
                                    tf.nn.l2_normalize(hypothesis_tokens, axis=2),
                                    transpose_b=True)
        # shape = [B, M, K]
        matching_topk = tf.nn.top_k(matching_matrix, k=self._top_k, sorted=True)[0]

        # Feedforward matching topk
        # shape = [B, M, 1]
        dense_output = matching_topk
        for i in range(self._mlp_num_layers):
            dense_output = tf.layers.Dense(self._mlp_num_units, activation=self._mlp_activation_func,
                                           use_bias=True)(dense_output)
        dense_output = tf.layers.Dense(self._mlp_num_fan_out, activation=self._mlp_activation_func,
                                       use_bias=True)(dense_output)

        # shape = [B, 1, 1]
        dot_score = tf.matmul(attention_probs, dense_output, transpose_a=True)
        flatten_score = tf.reshape(dot_score, [-1, 1])

        # Get prediction
        output_dict = self._make_output(flatten_score, params)

        if mode == tf.estimator.ModeKeys.TRAIN or mode == tf.estimator.ModeKeys.EVAL:
            if 'label/labels' not in features:
                raise ConfigureError("The input features should contain label with vocabulary namespace "
                                     "labels in %s dataset." % mode)
            labels_embedding = features_embedding['label/labels']
            labels = features['label/labels']

            loss = self._make_loss(labels=labels_embedding, logits=output_dict['logits'], params=params)
            output_dict['loss'] = loss
            metrics = dict()
            metrics['accuracy'] = tf.metrics.accuracy(labels=labels, predictions=output_dict['predictions'])
            metrics['precision'] = tf.metrics.precision(labels=labels, predictions=output_dict['predictions'])
            metrics['recall'] = tf.metrics.recall(labels=labels, predictions=output_dict['predictions'])
            # metrics['map'] = tf.metrics.average_precision_at_k(labels=tf.cast(labels, tf.int64),
            #                                                    predictions=output_dict['logits'], k=2)
            # metrics['precision_1'] = tf.metrics.precision_at_k(labels=tf.cast(labels, tf.int64),
            #                                                    predictions=output_dict['logits'], k=1, class_id=1)
            # tf.metrics.auc(labels=labels, predictions=predictions)
            output_dict['metrics'] = metrics
            # output_dict['debugs'] = [hypothesis_tokens, premise_tokens, hypothesis_bi, premise_bi,
            #                          premise_ave, hypothesis_ave, diff, mul, h, h_mlp, logits]
        return output_dict
def forward(self, features, labels, mode, params):
    if self._sim_func != 'tensor' and self._num_tensor_dim != 1:
        self._num_tensor_dim = 1
        logger.warning(
            "The similarity function is not the tensor layer, so the number of "
            "tensor dims has no effect and is reset to 1.")
    features_embedding = self._embedding_mapping.forward(features, labels, mode, params)
    with tf.variable_scope(self._model_name):
        is_training = (mode == tf.estimator.ModeKeys.TRAIN)
        premise_tokens_ids = features.get('premise/tokens', None)
        if premise_tokens_ids is None:
            premise_tokens_ids = features.get('premise/elmo_characters', None)
        hypothesis_tokens_ids = features.get('hypothesis/tokens', None)
        if hypothesis_tokens_ids is None:
            hypothesis_tokens_ids = features.get('hypothesis/elmo_characters', None)
        if premise_tokens_ids is None:
            raise ConfigureError("The input features should contain premise with vocabulary namespace tokens "
                                 "or elmo_characters.")
        if hypothesis_tokens_ids is None:
            raise ConfigureError("The input features should contain hypothesis with vocabulary namespace tokens "
                                 "or elmo_characters.")

        prem_seq_lengths, prem_mask = nn.length(premise_tokens_ids)
        hyp_seq_lengths, hyp_mask = nn.length(hypothesis_tokens_ids)
        if features.get('premise/elmo_characters', None) is not None \
                or isinstance(self._embedding_mapping.get_encoder('tokens'), Bert):
            prem_mask = nn.remove_bos_eos(prem_mask, prem_seq_lengths)
            prem_seq_lengths -= 2
        if features.get('hypothesis/elmo_characters', None) is not None \
                or isinstance(self._embedding_mapping.get_encoder('tokens'), Bert):
            hyp_mask = nn.remove_bos_eos(hyp_mask, hyp_seq_lengths)
            hyp_seq_lengths -= 2

        prem_mask = tf.expand_dims(prem_mask, -1)
        hyp_mask = tf.expand_dims(hyp_mask, -1)
        prem_hyp_mask = tf.matmul(prem_mask, hyp_mask, transpose_b=True)

        premise_tokens = features_embedding.get('premise/tokens', None)
        if premise_tokens is None:
            premise_tokens = features_embedding.get('premise/elmo_characters', None)
        hypothesis_tokens = features_embedding.get('hypothesis/tokens', None)
        if hypothesis_tokens is None:
            hypothesis_tokens = features_embedding.get('hypothesis/elmo_characters', None)

        premise_outs, c1 = nn.bi_lstm(premise_tokens, self._hidden_dim,
                                      seq_len=prem_seq_lengths, name='premise')
        hypothesis_outs, c2 = nn.bi_lstm(hypothesis_tokens, self._hidden_dim,
                                         seq_len=hyp_seq_lengths, name='hypothesis')
        premise_bi = tf.concat(premise_outs, axis=2)
        hypothesis_bi = tf.concat(hypothesis_outs, axis=2)

        max_premise_length = premise_tokens.shape[1].value
        max_hypothesis_length = hypothesis_tokens.shape[1].value

        if self._sim_func == 'tensor':
            M = tf.Variable(tf.random_normal(
                [self._num_tensor_dim, 2 * self._hidden_dim, 2 * self._hidden_dim], stddev=0.1))
            W = tf.Variable(tf.random_normal([4 * self._hidden_dim, 1], stddev=0.1))
            bias = tf.Variable(tf.zeros([1]), name="tensor_bias")
            premise_ex = tf.tile(tf.expand_dims(premise_bi, axis=2),
                                 [1, 1, max_hypothesis_length, 1])
            hypothesis_ex = tf.tile(tf.expand_dims(hypothesis_bi, axis=1),
                                    [1, max_premise_length, 1, 1])
            tensor = []
            tmp2 = tf.einsum("abcd,df->abcf",
                             tf.concat([premise_ex, hypothesis_ex], axis=3), W)  # [N, L1, L2, 1]
            tmp2 = tf.squeeze(tmp2, axis=3)
            for i in range(self._num_tensor_dim):
                tmp1 = tf.einsum("abc,cd->abd", premise_bi, M[i])  # [N, L1, 2d]
                tmp1 = tf.matmul(tmp1, hypothesis_bi, transpose_b=True)  # [N, L1, L2]
                tensor.append(tf.nn.relu(tmp1 + tmp2 + bias))
            tensor = tf.stack(tensor, axis=0)  # [num_tensor_dim, N, L1, L2]
        elif self._sim_func == 'cosine':
            tensor = tf.matmul(tf.nn.l2_normalize(premise_bi, axis=-1),
                               tf.nn.l2_normalize(hypothesis_bi, axis=-1),
                               transpose_b=True)  # [N, L1, L2]
        elif self._sim_func == 'bilinear':
            M = tf.Variable(tf.random_normal(
                [2 * self._hidden_dim, 2 * self._hidden_dim], stddev=0.1))
            b = tf.Variable(tf.random_normal(
                [max_premise_length, max_hypothesis_length], stddev=0.1))
            bilinear = tf.einsum("abc,cd->abd", premise_bi, M)  # [N, L1, 2d]
            tensor = tf.matmul(bilinear, hypothesis_bi, transpose_b=True) + b  # [N, L1, L2]
        else:
            raise ConfigureError("The similarity function %s is not supported. "
                                 "MVLSTM only supports the similarity functions "
                                 "[cosine, bilinear, tensor]." % self._sim_func)
        tensor *= prem_hyp_mask

        # 3.1 k-Max Pooling
        matrix_in = tf.reshape(tensor, [-1, max_premise_length * max_hypothesis_length])
        values, indices = tf.nn.top_k(matrix_in, k=self._num_k, sorted=False)
        kmax = tf.reshape(values, [-1, self._num_tensor_dim * self._num_k])

        # MLP layer
        h_mlp_1 = tf.contrib.layers.fully_connected(
            kmax, self._num_tensor_dim * self._num_k, scope='fc1')
        h_mlp_1_drop = tf.layers.dropout(h_mlp_1, self._dropout_rate, training=is_training)
        h_mlp_2 = tf.contrib.layers.fully_connected(
            h_mlp_1_drop, self._num_tensor_dim * self._num_k // 2, scope='fc2')
        # Dropout applied to classifier
        h_drop = tf.layers.dropout(h_mlp_2, self._dropout_rate, training=is_training)

        # Get prediction
        output_dict = self._make_output(h_drop, params)

        if mode == tf.estimator.ModeKeys.TRAIN or mode == tf.estimator.ModeKeys.EVAL:
            if 'label/labels' not in features:
                raise ConfigureError("The input features should contain label with vocabulary namespace "
                                     "labels in %s dataset." % mode)
            labels_embedding = features_embedding['label/labels']
            labels = features['label/labels']
            loss = self._make_loss(labels=labels_embedding,
                                   logits=output_dict['logits'], params=params)
            output_dict['loss'] = loss
            metrics = dict()
            metrics['accuracy'] = tf.metrics.accuracy(
                labels=labels, predictions=output_dict['predictions'])
            metrics['precision'] = tf.metrics.precision(
                labels=labels, predictions=output_dict['predictions'])
            metrics['recall'] = tf.metrics.recall(
                labels=labels, predictions=output_dict['predictions'])
            # tf.metrics.auc(labels=labels, predictions=predictions)
            output_dict['metrics'] = metrics
            # output_dict['debugs'] = [hypothesis_tokens, premise_tokens, hypothesis_bi, premise_bi,
            #                          premise_ave, hypothesis_ave, diff, mul, h, h_mlp, logits]
        return output_dict
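# --- Illustrative sketch, not part of the original model code ---
# The "k-Max Pooling" step above flattens the [batch, L1, L2] interaction tensor
# and keeps the k largest interaction scores per example via tf.nn.top_k. The
# helper below isolates just that step for a single similarity matrix; the
# function name `k_max_pooling` is introduced here only for illustration and
# assumes the module-level `import tensorflow as tf` used throughout this file.
def k_max_pooling(sim_matrix, k):
    """sim_matrix: [batch, L1, L2] similarity scores -> [batch, k] strongest matches."""
    batch_size = tf.shape(sim_matrix)[0]
    flat = tf.reshape(sim_matrix, [batch_size, -1])    # [batch, L1 * L2]
    values, _ = tf.nn.top_k(flat, k=k, sorted=False)   # top-k scores per example
    return values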
def forward(self, features, labels, mode, params):
    features_embedding = self._embedding_mapping.forward(features, labels, mode, params)
    with tf.variable_scope(self._model_name):
        is_training = (mode == tf.estimator.ModeKeys.TRAIN)
        premise_tokens_ids = features.get('premise/tokens', None)
        if premise_tokens_ids is None:
            premise_tokens_ids = features.get('premise/elmo_characters', None)
        hypothesis_tokens_ids = features.get('hypothesis/tokens', None)
        if hypothesis_tokens_ids is None:
            hypothesis_tokens_ids = features.get('hypothesis/elmo_characters', None)
        if premise_tokens_ids is None:
            raise ConfigureError("The input features should contain premise with vocabulary namespace tokens "
                                 "or elmo_characters.")
        if hypothesis_tokens_ids is None:
            raise ConfigureError("The input features should contain hypothesis with vocabulary namespace tokens "
                                 "or elmo_characters.")

        prem_seq_lengths, prem_mask = nn.length(premise_tokens_ids)
        hyp_seq_lengths, hyp_mask = nn.length(hypothesis_tokens_ids)
        if features.get('premise/elmo_characters', None) is not None \
                or isinstance(self._embedding_mapping.get_encoder('tokens'), Bert):
            prem_mask = nn.remove_bos_eos(prem_mask, prem_seq_lengths)
            prem_seq_lengths -= 2
        if features.get('hypothesis/elmo_characters', None) is not None \
                or isinstance(self._embedding_mapping.get_encoder('tokens'), Bert):
            hyp_mask = nn.remove_bos_eos(hyp_mask, hyp_seq_lengths)
            hyp_seq_lengths -= 2

        prem_mask = tf.expand_dims(prem_mask, -1)
        hyp_mask = tf.expand_dims(hyp_mask, -1)

        premise_tokens = features_embedding.get('premise/tokens', None)
        if premise_tokens is None:
            premise_tokens = features_embedding.get('premise/elmo_characters', None)
        hypothesis_tokens = features_embedding.get('hypothesis/tokens', None)
        if hypothesis_tokens is None:
            hypothesis_tokens = features_embedding.get('hypothesis/elmo_characters', None)

        premise_outs, c1 = nn.bi_lstm(premise_tokens, self._hidden_dim,
                                      seq_len=prem_seq_lengths, name='premise')
        hypothesis_outs, c2 = nn.bi_lstm(hypothesis_tokens, self._hidden_dim,
                                         seq_len=hyp_seq_lengths, name='hypothesis')
        premise_bi = tf.concat(premise_outs, axis=2)
        hypothesis_bi = tf.concat(hypothesis_outs, axis=2)
        premise_bi *= prem_mask
        hypothesis_bi *= hyp_mask

        ### Attention ###
        premise_attns, hypothesis_attns = nn.bi_uni_attention(
            premise_bi, hypothesis_bi, prem_seq_lengths, hyp_seq_lengths, func="dot")

        # For making attention plots
        prem_diff = tf.subtract(premise_bi, premise_attns)
        prem_mul = tf.multiply(premise_bi, premise_attns)
        hyp_diff = tf.subtract(hypothesis_bi, hypothesis_attns)
        hyp_mul = tf.multiply(hypothesis_bi, hypothesis_attns)

        m_a = tf.concat([premise_bi, premise_attns, prem_diff, prem_mul], 2)
        m_b = tf.concat([hypothesis_bi, hypothesis_attns, hyp_diff, hyp_mul], 2)

        ### Inference Composition ###
        v1_outs, c3 = nn.bi_lstm(m_a, self._hidden_dim, seq_len=prem_seq_lengths, name='v1')
        v2_outs, c4 = nn.bi_lstm(m_b, self._hidden_dim, seq_len=hyp_seq_lengths, name='v2')
        v1_bi = tf.concat(v1_outs, axis=2)
        v2_bi = tf.concat(v2_outs, axis=2)
        v1_bi = v1_bi * prem_mask
        v2_bi = v2_bi * hyp_mask

        ### Pooling Layer ###
        eps = 1e-11
        v_1_sum = tf.reduce_sum(v1_bi, 1)
        v_1_ave = tf.div(v_1_sum,
                         tf.expand_dims(tf.cast(prem_seq_lengths, tf.float32), -1) + eps)
        v_2_sum = tf.reduce_sum(v2_bi, 1)
        v_2_ave = tf.div(v_2_sum,
                         tf.expand_dims(tf.cast(hyp_seq_lengths, tf.float32), -1) + eps)
        v_1_max = tf.reduce_max(v1_bi, 1)
        v_2_max = tf.reduce_max(v2_bi, 1)
        v = tf.concat([v_1_ave, v_2_ave, v_1_max, v_2_max], 1)

        # MLP layer
        h_mlp = tf.contrib.layers.fully_connected(v, self._hidden_dim,
                                                  activation_fn=tf.nn.tanh, scope='fc1')
        # Dropout applied to classifier
        h_drop = tf.layers.dropout(h_mlp, self._dropout_rate, training=is_training)

        # Get prediction
        output_dict = self._make_output(h_drop, params)

        if mode == tf.estimator.ModeKeys.TRAIN or mode == tf.estimator.ModeKeys.EVAL:
            if 'label/labels' not in features:
                raise ConfigureError("The input features should contain label with vocabulary namespace "
                                     "labels in %s dataset." % mode)
            labels_embedding = features_embedding['label/labels']
            labels = features['label/labels']
            loss = self._make_loss(labels=labels_embedding,
                                   logits=output_dict['logits'], params=params)
            output_dict['loss'] = loss
            metrics = dict()
            metrics['accuracy'] = tf.metrics.accuracy(
                labels=labels, predictions=output_dict['predictions'])
            metrics['precision'] = tf.metrics.precision(
                labels=labels, predictions=output_dict['predictions'])
            metrics['recall'] = tf.metrics.recall(
                labels=labels, predictions=output_dict['predictions'])
            # metrics['auc'] = tf.metrics.auc(labels=labels, predictions=predictions)
            output_dict['metrics'] = metrics
            # output_dict['debugs'] = [hypothesis_tokens, premise_tokens, hypothesis_bi, premise_bi,
            #                          v_1_ave, v_2_ave, h_mlp, logits]
        return output_dict
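# --- Illustrative sketch, not part of the original model code ---
# The pooling layer above combines a length-normalised sum (a masked average)
# with a max over time for each sentence before the final MLP. The helper below
# shows the same masked average/max pooling for one [batch, time, dim] sequence;
# the name `masked_avg_max_pool` is introduced here only for illustration and
# assumes the module-level `import tensorflow as tf` used throughout this file.
def masked_avg_max_pool(seq, mask, seq_lengths, eps=1e-11):
    """seq: [batch, time, dim]; mask: [batch, time, 1]; seq_lengths: [batch] -> [batch, 2 * dim]."""
    seq = seq * mask                                        # zero out padded time steps
    avg = tf.reduce_sum(seq, 1) / (
        tf.expand_dims(tf.cast(seq_lengths, tf.float32), -1) + eps)
    mx = tf.reduce_max(seq, 1)                              # max over the time axis
    return tf.concat([avg, mx], axis=-1)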