def _init_graph(self) -> None:
    self._init_placeholders()

    # One-hot targets for the answer start and end positions.
    seq_len = tf.shape(self.input_ids_ph)[-1]
    self.y_st = tf.one_hot(self.y_st_ph, depth=seq_len)
    self.y_end = tf.one_hot(self.y_end_ph, depth=seq_len)

    self.bert = BertModel(config=self.bert_config,
                          is_training=self.is_train_ph,
                          input_ids=self.input_ids_ph,
                          input_mask=self.input_masks_ph,
                          token_type_ids=self.token_types_ph,
                          use_one_hot_embeddings=False)

    # Token-level representations from the last BERT layer: [bs, seq_len, hidden_size].
    last_layer = self.bert.get_sequence_output()
    hidden_size = last_layer.get_shape().as_list()[-1]
    bs = tf.shape(last_layer)[0]

    with tf.variable_scope('squad'):
        # Project each token representation to two scores: start and end logits.
        output_weights = tf.get_variable('output_weights', [2, hidden_size],
                                         initializer=tf.truncated_normal_initializer(stddev=0.02))
        output_bias = tf.get_variable('output_bias', [2],
                                      initializer=tf.zeros_initializer())

        last_layer_rs = tf.reshape(last_layer, [-1, hidden_size])

        logits = tf.matmul(last_layer_rs, output_weights, transpose_b=True)
        logits = tf.nn.bias_add(logits, output_bias)

        logits = tf.reshape(logits, [bs, -1, 2])
        logits = tf.transpose(logits, [2, 0, 1])

        logits_st, logits_end = tf.unstack(logits, axis=0)

        # Restrict predictions to context tokens (token_type == 1);
        # the [CLS] token is additionally kept as the "no answer" position.
        logit_mask = self.token_types_ph
        mask = tf.concat([tf.ones((bs, 1), dtype=tf.int32),
                          tf.zeros((bs, seq_len - 1), dtype=tf.int32)], axis=-1)
        logit_mask = logit_mask + mask

        logits_st = softmax_mask(logits_st, logit_mask)
        logits_end = softmax_mask(logits_end, logit_mask)

        start_probs = tf.nn.softmax(logits_st)
        end_probs = tf.nn.softmax(logits_end)

        # Outer product of start and end distributions gives joint span scores.
        outer = tf.matmul(tf.expand_dims(start_probs, axis=2),
                          tf.expand_dims(end_probs, axis=1))
        outer_logits = tf.exp(tf.expand_dims(logits_st, axis=2) +
                              tf.expand_dims(logits_end, axis=1))

        # Keep only spans with start <= end and length of at most 20 tokens
        # by taking a band of the joint score matrix.
        context_max_len = tf.reduce_max(tf.reduce_sum(self.token_types_ph, axis=1))
        max_ans_length = tf.cast(tf.minimum(20, context_max_len), tf.int64)
        outer = tf.matrix_band_part(outer, 0, max_ans_length)
        outer_logits = tf.matrix_band_part(outer_logits, 0, max_ans_length)

        # Probability that an answer exists: 1 - P(start = CLS) * P(end = CLS).
        self.yp_score = 1.0 - start_probs[:, 0] * end_probs[:, 0]

        self.start_probs = start_probs
        self.end_probs = end_probs

        self.start_pred = tf.argmax(tf.reduce_max(outer, axis=2), axis=1)
        self.end_pred = tf.argmax(tf.reduce_max(outer, axis=1), axis=1)
        self.yp_logits = tf.reduce_max(tf.reduce_max(outer_logits, axis=2), axis=1)

    with tf.variable_scope("loss"):
        # Cross-entropy over the start and end position distributions.
        loss_st = tf.nn.softmax_cross_entropy_with_logits(logits=logits_st, labels=self.y_st)
        loss_end = tf.nn.softmax_cross_entropy_with_logits(logits=logits_end, labels=self.y_end)
        self.loss = tf.reduce_mean(loss_st + loss_end)
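
# Note: ``softmax_mask`` is imported from the repo's utilities and is not defined
# in this section. A minimal sketch of its assumed behaviour (the name ``inf`` is
# hypothetical): positions where ``mask`` is truthy keep their logit, masked-out
# positions are pushed towards -inf so that softmax assigns them effectively
# zero probability. The actual helper may differ in detail.
def softmax_mask(val, mask, inf=1e30):
    """Assumed helper: keep logits where ``mask`` is nonzero, else add -inf."""
    return val - inf * (1.0 - tf.cast(mask, tf.float32))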
def __init__(self, n_classes: int = 2, dropout_keep_prob: float = 0.5,
             return_probas: bool = False, **kwargs) -> None:
    """
    Args:
        n_classes: number of target classes for classification
        dropout_keep_prob: probability of keeping a hidden unit, in (0, 1];
            0.5 works well in most cases
        return_probas: whether to return probabilities that the relation is
            appropriate instead of hard class labels
        **kwargs: parameters passed to the parent network class
    """
    kwargs.setdefault('learning_rate_drop_div', 10.0)
    kwargs.setdefault('learning_rate_drop_patience', 5.0)
    kwargs.setdefault('clip_norm', 5.0)

    super().__init__(**kwargs)

    self.n_classes = n_classes
    self.dropout_keep_prob = dropout_keep_prob
    self.return_probas = return_probas

    config = tf.ConfigProto()
    config.gpu_options.allow_growth = True

    # Use the fast cuDNN GRU when a GPU is available, otherwise the
    # CPU-compatible implementation with the same weight layout.
    if check_gpu_existence():
        self.GRU = CudnnGRU
    else:
        self.GRU = CudnnCompatibleGRU

    # Pre-computed 300-d embeddings for question tokens and relation tokens.
    self.question_ph = tf.placeholder(tf.float32, [None, None, 300])
    self.rel_emb_ph = tf.placeholder(tf.float32, [None, None, 300])

    # Recover actual relation token counts from non-zero embedding positions.
    r_mask_2 = tf.cast(self.rel_emb_ph, tf.bool)
    r_len_2 = tf.reduce_sum(tf.cast(r_mask_2, tf.int32), axis=2)
    r_mask = tf.cast(r_len_2, tf.bool)
    r_len = tf.reduce_sum(tf.cast(r_mask, tf.int32), axis=1)

    # Mean-pool relation token embeddings into a single vector per relation.
    rel_emb = tf.math.divide_no_nan(tf.reduce_sum(self.rel_emb_ph, axis=1),
                                    tf.cast(tf.expand_dims(r_len, axis=1), tf.float32))

    self.y_ph = tf.placeholder(tf.int32, shape=(None,))
    self.one_hot_labels = tf.one_hot(self.y_ph, depth=self.n_classes, dtype=tf.float32)

    self.keep_prob_ph = tf.placeholder_with_default(1.0, shape=[], name='keep_prob_ph')

    # Question lengths, recovered the same way as relation lengths above.
    q_mask_2 = tf.cast(self.question_ph, tf.bool)
    q_len_2 = tf.reduce_sum(tf.cast(q_mask_2, tf.int32), axis=2)
    q_mask = tf.cast(q_len_2, tf.bool)
    q_len = tf.reduce_sum(tf.cast(q_mask, tf.int32), axis=1)

    question_dr = variational_dropout(self.question_ph, keep_prob=self.keep_prob_ph)
    b_size = tf.shape(self.question_ph)[0]

    with tf.variable_scope("question_encode"):
        # Two-layer GRU encoder over the question token embeddings.
        rnn = self.GRU(num_layers=2, num_units=75, batch_size=b_size,
                       input_size=300, keep_prob=self.keep_prob_ph)
        q = rnn(question_dr, seq_len=q_len)

    with tf.variable_scope("attention"):
        # Attend over question states, using the pooled relation embedding
        # as the query, to build a relation-aware question summary.
        rel_emb_exp = tf.expand_dims(rel_emb, axis=1)
        dot_products = tf.reduce_sum(tf.multiply(q, rel_emb_exp), axis=2)
        s_mask = softmax_mask(dot_products, q_mask)
        att_weights = tf.expand_dims(tf.nn.softmax(s_mask), axis=2)
        self.s_r = tf.reduce_sum(tf.multiply(att_weights, q), axis=1)

        # Score the (question summary, relation embedding) pair; note the
        # output width uses n_classes rather than a hardcoded 2.
        self.logits = tf.layers.dense(tf.multiply(self.s_r, rel_emb),
                                      self.n_classes, activation=None, use_bias=False)

    self.y_pred = tf.argmax(self.logits, axis=-1)

    loss_tensor = tf.nn.sigmoid_cross_entropy_with_logits(labels=self.one_hot_labels,
                                                          logits=self.logits)
    self.loss = tf.reduce_mean(loss_tensor)
    self.train_op = self.get_train_op(self.loss)

    self.sess = tf.Session(config=config)
    self.sess.run(tf.global_variables_initializer())

    self.load()
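
# Note: ``variational_dropout`` is likewise imported, not defined here. A minimal
# sketch under the usual interpretation, where one dropout mask per (batch,
# feature) pair is shared across all time steps of a [batch, time, features]
# tensor; the actual helper may additionally gate on an is-train flag.
def variational_dropout(x, keep_prob):
    """Assumed helper: dropout with the mask broadcast along the time axis."""
    noise_shape = [tf.shape(x)[0], 1, tf.shape(x)[-1]]
    return tf.nn.dropout(x, keep_prob=keep_prob, noise_shape=noise_shape)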