def _preprocess(self, Xw, Xw_len, Xc, Xc_len, scope="preprocess_layers", reuse=False):
    with tf.variable_scope(scope, reuse=reuse):
        # Look up word- and char-level embeddings.
        Xw_embedded, size_w = embedded(Xw, self.embeddings_w[0], self.embeddings_w[1],
                                       self.config.wv_config["train_w"], scope="embedded_w")
        Xc_embedded, size_c = embedded(Xc, self.embeddings_c[0], self.embeddings_c[1],
                                       self.config.wv_config["train_c"], scope="embedded_c")
        batch_size, seq_len = tf.shape(Xw)[0], tf.shape(Xw)[1]

        # Fold the word dimension into the batch so characters can be encoded per word.
        Xc_embedded = tf.reshape(Xc_embedded, shape=[batch_size * seq_len, -1, size_c])
        Xc_len = tf.reshape(Xc_len, shape=[batch_size * seq_len])

        # Compose characters into word-level vectors with a bi-GRU, then restore the
        # [batch, seq_len, size] layout.
        Xc_embedded, size_c = bi_gru(Xc_embedded, Xc_len, (self.config.char_dim, ), 2,
                                     self.initializer, 1.0, "bi_gru_c2w")
        Xc_embedded = tf.reshape(Xc_embedded, shape=[batch_size, seq_len, size_c])

        # Concatenate word embeddings with the char-composed vectors and encode the
        # full sequence.
        X_embedded = tf.concat([Xw_embedded, Xc_embedded], axis=-1)
        out_w, out_w_size = bi_gru(X_embedded, Xw_len, (self.config.bi_dim, ), 1,
                                   self.initializer, 1.0, "bi_gru__wc")
        return out_w, out_w_size
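# NOTE: `embedded` and `bi_gru` are project helpers whose implementations are not
# shown in this file. As a point of reference only, a minimal sketch of what the
# `bi_gru` contract is assumed to be (inputs [B, T, D], lengths [B], returning the
# concatenated forward/backward outputs and their size) could be built from stock
# TF 1.x ops as below. The name `_bi_gru_sketch`, the single-layer structure, and
# the exact return format are assumptions, not the project's actual helper.
def _bi_gru_sketch(inputs, lengths, dim, initializer, scope, reuse=False):
    with tf.variable_scope(scope, reuse=reuse, initializer=initializer):
        cell_fw = tf.nn.rnn_cell.GRUCell(dim)
        cell_bw = tf.nn.rnn_cell.GRUCell(dim)
        (out_fw, out_bw), _ = tf.nn.bidirectional_dynamic_rnn(
            cell_fw, cell_bw, inputs, sequence_length=lengths, dtype=tf.float32)
        outputs = tf.concat([out_fw, out_bw], axis=-1)  # [B, T, 2 * dim]
        return outputs, 2 * dim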
def _encode(self, Xw, Xw_l, Xc, Xc_l, scope="encode_layers", reuse=False):
    with tf.variable_scope(scope, reuse=reuse):
        Xw_embedded, size_w = embedded(Xw, self.embeddings_w[0], self.embeddings_w[1],
                                       self.config.wv_config["train_w"], scope="embedded_w")
        Xc_embedded, size_c = embedded(Xc, self.embeddings_c[0], self.embeddings_c[1],
                                       self.config.wv_config["train_c"], scope="embedded_c")
        batch_size = tf.shape(Xw)[0]

        # Char-level view: attention pooling plus bi-GRU encoding.
        v0, v0_size = attention_han(Xc_embedded, self.config.un_dim, self.initializer,
                                    "attention_han_c")
        v1, v1_size = bi_gru(Xc_embedded, Xc_l, (self.config.bi_dim, ), 2,
                             self.initializer, 1.0, "bi_gru_c")
        char_v = tf.reshape(tf.concat([v0, v1], axis=-1), [batch_size, v0_size + v1_size])

        # Word-level view.
        v0, v0_size = attention_han(Xw_embedded, self.config.un_dim, self.initializer,
                                    "attention_han_w")
        v1, v1_size = bi_gru(Xw_embedded, Xw_l, (self.config.bi_dim, ), 2,
                             self.initializer, 1.0, "bi_gru_w")
        word_v = tf.reshape(tf.concat([v0, v1], axis=-1), [batch_size, v0_size + v1_size])

        # Phrase-level view: n-gram convolutions (widths 2-5) over the word embeddings,
        # then the same attention + bi-GRU pooling.
        Xp_embedded, size_p = conv_with_max_pool(Xw_embedded, (2, 3, 4, 5), size_w // 4,
                                                 False, tf.nn.selu, self.initializer,
                                                 "conv_w2p")
        v0, v0_size = attention_han(Xp_embedded, self.config.un_dim, self.initializer,
                                    "attention_han_p")
        v1, v1_size = bi_gru(Xp_embedded, Xw_l, (self.config.bi_dim, ), 2,
                             self.initializer, 1.0, "bi_gru_p")
        phrase_v = tf.reshape(tf.concat([v0, v1], axis=-1), [batch_size, v0_size + v1_size])

        return char_v, word_v, phrase_v
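# NOTE: `attention_han` and `conv_with_max_pool` are also project helpers. The
# attention used above is assumed to follow the Hierarchical Attention Networks
# pooling of Yang et al. (2016): project each timestep, score it against a learned
# context vector, and return the attention-weighted sum. A minimal unmasked sketch
# under that assumption (names here are illustrative, not the project's code):
def _attention_han_sketch(inputs, attn_dim, initializer, scope):
    with tf.variable_scope(scope, initializer=initializer):
        hidden_size = inputs.get_shape().as_list()[-1]
        u = tf.layers.dense(inputs, attn_dim, activation=tf.nn.tanh)    # [B, T, A]
        context = tf.get_variable("context_vector", shape=[attn_dim])   # learned query
        scores = tf.nn.softmax(tf.tensordot(u, context, axes=1))        # [B, T]
        pooled = tf.reduce_sum(inputs * tf.expand_dims(scores, -1), axis=1)  # [B, H]
        return pooled, hidden_size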
def build_graph(self):
    self.graph = tf.Graph()
    with self.graph.as_default():
        with tf.variable_scope("placeholders"):
            self.X1w = tf.placeholder(dtype=tf.int32, shape=[None, None], name="sent1w_ph")
            self.X2w = tf.placeholder(dtype=tf.int32, shape=[None, None], name="sent2w_ph")
            self.X1c = tf.placeholder(dtype=tf.int32, shape=[None, None, None], name="sent1c_ph")
            self.X2c = tf.placeholder(dtype=tf.int32, shape=[None, None, None], name="sent2c_ph")
            self.y = tf.placeholder(dtype=tf.int32, shape=[None], name="label_ph")
            self.keep_prob = tf.placeholder_with_default(1.0, shape=[], name="keep_prob_ph")

            # Masks and true sequence lengths derived from the non-zero token ids.
            self.X1w_mask = tf.sign(self.X1w, name="sent1w_mask")
            self.X2w_mask = tf.sign(self.X2w, name="sent2w_mask")
            self.X1c_mask = tf.sign(self.X1c, name="sent1c_mask")
            self.X2c_mask = tf.sign(self.X2c, name="sent2c_mask")
            self.X1w_l = tf.reduce_sum(self.X1w_mask, axis=-1, name="sent1w_len")
            self.X2w_l = tf.reduce_sum(self.X2w_mask, axis=-1, name="sent2w_len")
            self.X1c_l = tf.reduce_sum(self.X1c_mask, axis=-1, name="sent1c_len")
            self.X2c_l = tf.reduce_sum(self.X2c_mask, axis=-1, name="sent2c_len")

        # Context encoding for both sentences, with shared weights.
        X1_f, X1_b = self._preprocess(self.X1w, self.X1w_l, self.X1c, self.X1c_l,
                                      scope="preprocess_layers")
        X2_f, X2_b = self._preprocess(self.X2w, self.X2w_l, self.X2c, self.X2c_l,
                                      scope="preprocess_layers", reuse=True)

        with tf.variable_scope("match_layers"):
            # Shapes: (batch_size, num_sentence_words, 8 * multi-perspective dims)
            match_1_to_2_out, match_2_to_1_out = bilateral_matching(
                X1_f, X1_b, X2_f, X2_b, self.X1w_mask, self.X2w_mask,
                self.keep_prob, self.config.mp_dim)

        # Aggregate the representations from the matching functions.
        with tf.variable_scope("aggregate_layers"):
            seq_1_fb, _ = bi_gru(match_1_to_2_out, self.X1w_l, (self.config.bi_dim, ), 2,
                                 self.initializer, 1.0, "bi_gru")
            seq_2_fb, _ = bi_gru(match_2_to_1_out, self.X2w_l, (self.config.bi_dim, ), 2,
                                 self.initializer, 1.0, "bi_gru", reuse=True)
            combined_aggregated_representation = tf.concat([seq_1_fb, seq_2_fb], -1)

        with tf.variable_scope("fc_layers"):
            h = tf.nn.dropout(combined_aggregated_representation, keep_prob=self.keep_prob)
            h = tf.layers.dense(h, self.config.un_dim, activation=tf.nn.selu,
                                kernel_initializer=self.initializer)
            h = tf.nn.dropout(h, keep_prob=self.keep_prob)
            # Bias the final layer toward a prior positive rate pi at initialization.
            pi = 0.01
            self.logits = tf.layers.dense(
                h, 1, kernel_initializer=self.initializer,
                bias_initializer=tf.constant_initializer(-np.log((1 - pi) / pi)))
            self.pos_prob = tf.nn.sigmoid(self.logits)

        self.var_list = [v for v in tf.global_variables()]
        if self.config.fine_tune:
            self.var_list_trainable = [
                v for v in tf.trainable_variables()
                if "embedded" in v.name or "fc" in v.name
            ]
        else:
            self.var_list_trainable = [v for v in tf.trainable_variables()]

        with tf.name_scope("Loss"):
            self.loss_op = build_loss(labels=self.y, logits=self.logits,
                                      focal=self.config.focal,
                                      alpha=self.config.alpha,
                                      gamma=self.config.gamma)

        with tf.name_scope("Optimize"):
            self.adam_op = tf.train.AdamOptimizer(
                learning_rate=self.config.init_learning_rate).minimize(
                    self.loss_op, var_list=self.var_list_trainable)
            self.sgd_op = tf.train.MomentumOptimizer(
                learning_rate=self.config.init_learning_rate, momentum=0.9).minimize(
                    self.loss_op, var_list=self.var_list_trainable)

        with tf.name_scope("Prediction"):
            self.predicted = tf.cast(
                tf.greater_equal(self.pos_prob, self.config.threshold), dtype=tf.int32)

        with tf.name_scope("Summary"):
            self.summaries = build_summaries()
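# Usage sketch (assumptions: `import tensorflow as tf` and `import numpy as np` at the
# top of the module, a config object carrying the fields referenced above, and a
# hypothetical instance named `model`; the real class name and data pipeline are not
# shown in this file):
#
#     model.build_graph()
#     with tf.Session(graph=model.graph) as sess:
#         sess.run(tf.global_variables_initializer())
#         _, loss = sess.run(
#             [model.adam_op, model.loss_op],
#             feed_dict={model.X1w: x1w, model.X2w: x2w,   # [B, Tw] padded word ids
#                        model.X1c: x1c, model.X2c: x2c,   # [B, Tw, Tc] padded char ids
#                        model.y: y,                       # [B] binary labels
#                        model.keep_prob: 0.7})
#
# The final-layer bias above matches the prior-probability initialization commonly used
# with focal loss: with pi = 0.01, sigmoid(-np.log((1 - pi) / pi)) equals pi, so the
# untrained model starts by assigning roughly 1% probability to the positive class.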