def _build_rnn(self, units, n_hidden_list, cell_type, intra_layer_dropout): for n, n_hidden in enumerate(n_hidden_list): units, _ = bi_rnn(units, n_hidden, cell_type=cell_type, name='Layer_' + str(n)) units = tf.concat(units, -1) if intra_layer_dropout and n != len(n_hidden_list) - 1: units = variational_dropout(units, self._dropout_ph) return units
def _build_rnn(self, units, n_hidden_list, cell_type, intra_layer_dropout, mask):
    """Stack recurrent layers built by ``bi_rnn``, passing sequence lengths.

    Args:
        units: input fed to the first layer.
        n_hidden_list: hidden size of every layer, one entry per layer.
        cell_type: cell kind forwarded to ``bi_rnn``.
        intra_layer_dropout: when truthy, variational dropout is applied
            after every layer except the last one.
        mask: tensor that is summed over axis 1 to obtain the sequence
            lengths (presumably 1 for real tokens and 0 for padding --
            confirm against the caller).

    Returns:
        The output of the last layer, or ``units`` unchanged when
        ``n_hidden_list`` is empty.
    """
    # tf.cast replaces the deprecated tf.to_int32 alias; the value is the same.
    sequence_lengths = tf.cast(tf.reduce_sum(mask, axis=1), tf.int32)
    last_layer = len(n_hidden_list) - 1
    for n, n_hidden in enumerate(n_hidden_list):
        units, _ = bi_rnn(units, n_hidden,
                          cell_type=cell_type,
                          seq_lengths=sequence_lengths,
                          name='Layer_' + str(n))
        # Join the outputs returned by bi_rnn along the last axis.
        units = tf.concat(units, -1)
        if intra_layer_dropout and n != last_layer:
            units = variational_dropout(units, self._dropout_ph)
    return units
def _build_top(self, units, n_tags, n_hididden, top_dropout, two_dense_on_top):
    """Project ``units`` to per-tag logits.

    Args:
        units: input tensor of the classification head.
        n_tags: width of the final (linear) dense layer.
        n_hididden: width of the optional ReLU dense layer.
        top_dropout: when truthy, apply variational dropout to ``units``
            before any dense layer.
        two_dense_on_top: when truthy, insert the ReLU dense layer before
            the final projection.

    Returns:
        The output of the final dense layer (no activation).
    """

    def _dense(inputs, width, activation):
        # Every head layer uses a fresh INITIALIZER() and an L2 kernel regularizer.
        return tf.layers.dense(inputs, width,
                               activation=activation,
                               kernel_initializer=INITIALIZER(),
                               kernel_regularizer=tf.nn.l2_loss)

    head_input = units
    if top_dropout:
        head_input = variational_dropout(head_input, self._dropout_ph)
    if two_dense_on_top:
        head_input = _dense(head_input, n_hididden, tf.nn.relu)
    return _dense(head_input, n_tags, None)
def _build_cudnn_rnn(self, units, n_hidden_list, cell_type, intra_layer_dropout, mask): sequence_lengths = tf.to_int32(tf.reduce_sum(mask, axis=1)) for n, n_hidden in enumerate(n_hidden_list): with tf.variable_scope(cell_type.upper() + '_' + str(n)): if cell_type.lower() == 'lstm': units, _ = cudnn_bi_lstm(units, n_hidden, sequence_lengths) elif cell_type.lower() == 'gru': units, _ = cudnn_bi_gru(units, n_hidden, sequence_lengths) else: raise RuntimeError('Wrong cell type "{}"! Only "gru" and "lstm"!'.format(cell_type)) units = tf.concat(units, -1) if intra_layer_dropout and n != len(n_hidden_list) - 1: units = variational_dropout(units, self._dropout_ph) return units
def _build_body(self) -> Tuple[tf.Tensor, tf.Tensor]:
    """Assemble the dense -> LSTM -> dense body of the network.

    Returns:
        A pair ``(logits, state)``: the output of the dense layer named
        ``'logits'`` and the final state returned by ``tf.nn.dynamic_rnn``.
    """
    # input projection
    projected = tf.layers.dense(self._features, self.dense_size,
                                kernel_regularizer=tf.nn.l2_loss,
                                kernel_initializer=xav())
    if self.attention_params:
        # Append the attention output to the projected features.
        projected = tf.concat([projected, self._build_attn_body()], -1)
    rnn_input = tf_layers.variational_dropout(
        projected, keep_prob=self._dropout_keep_prob)

    # recurrent network unit
    cell = tf.nn.rnn_cell.LSTMCell(self.hidden_size)
    # Lengths are the utterance mask summed over its last axis.
    lengths = tf.cast(tf.reduce_sum(self._utterance_mask, axis=-1), tf.int32)
    # rnn_output: [batch_size, max_time, hidden_size]
    # final_state: tuple of two [batch_size, hidden_size]
    rnn_output, final_state = tf.nn.dynamic_rnn(
        cell, rnn_input,
        time_major=False,
        initial_state=self._initial_state,
        sequence_length=lengths)
    rnn_output = tf.reshape(
        rnn_output, (self._batch_size, -1, self.hidden_size))
    rnn_output = tf_layers.variational_dropout(
        rnn_output, keep_prob=self._dropout_keep_prob)

    # output projection
    logits = tf.layers.dense(rnn_output, self.action_size,
                             kernel_regularizer=tf.nn.l2_loss,
                             kernel_initializer=xav(),
                             name='logits')
    return logits, final_state
def __init__(self, n_classes: int = 2,
             dropout_keep_prob: float = 0.5,
             return_probas: bool = False,
             **kwargs):
    """Build the question/relation ranking graph and open a session.

    Args:
        n_classes: number of classes for classification. It sets both the
            depth of the one-hot labels and the width of the logits layer.
        dropout_keep_prob: probability of keeping the hidden state, values
            from 0 to 1. 0.5 works well in most cases.
        return_probas: whether to return confidences of the relation to be
            appropriate or not (only stored on the instance here).
        **kwargs: forwarded to the parent constructor after defaults for
            ``learning_rate_drop_div``, ``learning_rate_drop_patience`` and
            ``clip_norm`` have been filled in.
    """
    kwargs.setdefault('learning_rate_drop_div', 10.0)
    kwargs.setdefault('learning_rate_drop_patience', 5.0)
    kwargs.setdefault('clip_norm', 5.0)
    super().__init__(**kwargs)

    self.n_classes = n_classes
    self.dropout_keep_prob = dropout_keep_prob
    self.return_probas = return_probas

    config = tf.ConfigProto()
    config.gpu_options.allow_growth = True

    if check_gpu_existence():
        self.GRU = CudnnGRU
    else:
        self.GRU = CudnnCompatibleGRU

    self.question_ph = tf.placeholder(tf.float32, [None, None, 300])
    self.rel_emb_ph = tf.placeholder(tf.float32, [None, None, 300])

    # A position counts as padding when its whole embedding vector is zero:
    # count non-zero components per position, then count non-empty positions.
    r_mask_2 = tf.cast(self.rel_emb_ph, tf.bool)
    r_len_2 = tf.reduce_sum(tf.cast(r_mask_2, tf.int32), axis=2)
    r_mask = tf.cast(r_len_2, tf.bool)
    r_len = tf.reduce_sum(tf.cast(r_mask, tf.int32), axis=1)
    # Mean of the relation embeddings over non-empty positions
    # (divide_no_nan yields 0 when there are none).
    rel_emb = tf.math.divide_no_nan(
        tf.reduce_sum(self.rel_emb_ph, axis=1),
        tf.cast(tf.expand_dims(r_len, axis=1), tf.float32))

    self.y_ph = tf.placeholder(tf.int32, shape=(None,))
    self.one_hot_labels = tf.one_hot(self.y_ph, depth=self.n_classes, dtype=tf.float32)
    self.keep_prob_ph = tf.placeholder_with_default(1.0, shape=[], name='keep_prob_ph')

    # Same zero-vector padding detection for the question tokens.
    q_mask_2 = tf.cast(self.question_ph, tf.bool)
    q_len_2 = tf.reduce_sum(tf.cast(q_mask_2, tf.int32), axis=2)
    q_mask = tf.cast(q_len_2, tf.bool)
    q_len = tf.reduce_sum(tf.cast(q_mask, tf.int32), axis=1)

    question_dr = variational_dropout(self.question_ph, keep_prob=self.keep_prob_ph)
    b_size = tf.shape(self.question_ph)[0]

    with tf.variable_scope("question_encode"):
        rnn = self.GRU(num_layers=2, num_units=75, batch_size=b_size,
                       input_size=300, keep_prob=self.keep_prob_ph)
        q = rnn(question_dr, seq_len=q_len)

    with tf.variable_scope("attention"):
        rel_emb_exp = tf.expand_dims(rel_emb, axis=1)
        # `keepdims` is the non-deprecated spelling of `keep_dims`.
        dot_products = tf.reduce_sum(tf.multiply(q, rel_emb_exp), axis=2, keepdims=False)
        s_mask = softmax_mask(dot_products, q_mask)
        att_weights = tf.expand_dims(tf.nn.softmax(s_mask), axis=2)
        self.s_r = tf.reduce_sum(tf.multiply(att_weights, q), axis=1)

        # The logits width must match the one-hot label depth; it used to be
        # hard-coded to 2, which broke the loss for any other n_classes.
        self.logits = tf.layers.dense(tf.multiply(self.s_r, rel_emb), self.n_classes,
                                      activation=None, use_bias=False)
        self.y_pred = tf.argmax(self.logits, axis=-1)

        loss_tensor = tf.nn.sigmoid_cross_entropy_with_logits(
            labels=self.one_hot_labels, logits=self.logits)

        self.loss = tf.reduce_mean(loss_tensor)
        self.train_op = self.get_train_op(self.loss)

    self.sess = tf.Session(config=config)
    self.sess.run(tf.global_variables_initializer())
    self.load()
def _init_graph(self):
    """Build the reading-comprehension graph: embeddings, encoders, pointer, loss.

    Reads the placeholders created by ``_init_placeholders`` and sets, among
    others, ``self.yp1``/``self.yp2`` (predicted start/end indices) and
    ``self.loss``. When ``self.weight_decay < 1.0`` an exponential moving
    average of the trainable variables is maintained and ``self.assign_vars``
    holds ops copying the averaged values back into the variables.
    """
    self._init_placeholders()

    self.word_emb = tf.get_variable(
        "word_emb",
        initializer=tf.constant(self.init_word_emb, dtype=tf.float32),
        trainable=False)
    self.char_emb = tf.get_variable(
        "char_emb",
        initializer=tf.constant(self.init_char_emb, dtype=tf.float32),
        trainable=self.opt['train_char_emb'])

    # Non-zero ids are treated as real tokens; lengths are counts of them.
    self.c_mask = tf.cast(self.c_ph, tf.bool)
    self.q_mask = tf.cast(self.q_ph, tf.bool)
    self.c_len = tf.reduce_sum(tf.cast(self.c_mask, tf.int32), axis=1)
    self.q_len = tf.reduce_sum(tf.cast(self.q_mask, tf.int32), axis=1)

    bs = tf.shape(self.c_ph)[0]
    self.c_maxlen = tf.reduce_max(self.c_len)
    self.q_maxlen = tf.reduce_max(self.q_len)
    # Trim every input to the longest sequence present in the batch.
    self.c = tf.slice(self.c_ph, [0, 0], [bs, self.c_maxlen])
    self.q = tf.slice(self.q_ph, [0, 0], [bs, self.q_maxlen])
    self.c_mask = tf.slice(self.c_mask, [0, 0], [bs, self.c_maxlen])
    self.q_mask = tf.slice(self.q_mask, [0, 0], [bs, self.q_maxlen])
    self.cc = tf.slice(self.cc_ph, [0, 0, 0], [bs, self.c_maxlen, self.char_limit])
    self.qc = tf.slice(self.qc_ph, [0, 0, 0], [bs, self.q_maxlen, self.char_limit])
    # Number of non-zero character ids per token, flattened over the batch.
    self.cc_len = tf.reshape(
        tf.reduce_sum(tf.cast(tf.cast(self.cc, tf.bool), tf.int32), axis=2), [-1])
    self.qc_len = tf.reshape(
        tf.reduce_sum(tf.cast(tf.cast(self.qc, tf.bool), tf.int32), axis=2), [-1])

    self.y1 = tf.one_hot(self.y1_ph, depth=self.context_limit)
    self.y2 = tf.one_hot(self.y2_ph, depth=self.context_limit)
    self.y1 = tf.slice(self.y1, [0, 0], [bs, self.c_maxlen])
    self.y2 = tf.slice(self.y2, [0, 0], [bs, self.c_maxlen])

    with tf.variable_scope("emb"):
        with tf.variable_scope("char"):
            cc_emb = tf.reshape(
                tf.nn.embedding_lookup(self.char_emb, self.cc),
                [bs * self.c_maxlen, self.char_limit, self.char_emb_dim])
            qc_emb = tf.reshape(
                tf.nn.embedding_lookup(self.char_emb, self.qc),
                [bs * self.q_maxlen, self.char_limit, self.char_emb_dim])

            cc_emb = variational_dropout(cc_emb, keep_prob=self.keep_prob_ph)
            qc_emb = variational_dropout(qc_emb, keep_prob=self.keep_prob_ph)

            _, (state_fw, state_bw) = cudnn_bi_gru(cc_emb, self.char_hidden_size,
                                                   seq_lengths=self.cc_len,
                                                   trainable_initial_states=True)
            cc_emb = tf.concat([state_fw, state_bw], axis=1)

            # reuse=True: the question characters share the context char-GRU weights.
            _, (state_fw, state_bw) = cudnn_bi_gru(qc_emb, self.char_hidden_size,
                                                   seq_lengths=self.qc_len,
                                                   trainable_initial_states=True,
                                                   reuse=True)
            qc_emb = tf.concat([state_fw, state_bw], axis=1)

            cc_emb = tf.reshape(
                cc_emb, [bs, self.c_maxlen, 2 * self.char_hidden_size])
            qc_emb = tf.reshape(
                qc_emb, [bs, self.q_maxlen, 2 * self.char_hidden_size])

        with tf.name_scope("word"):
            c_emb = tf.nn.embedding_lookup(self.word_emb, self.c)
            q_emb = tf.nn.embedding_lookup(self.word_emb, self.q)

        c_emb = tf.concat([c_emb, cc_emb], axis=2)
        q_emb = tf.concat([q_emb, qc_emb], axis=2)

    with tf.variable_scope("encoding"):
        rnn = CudnnGRU(num_layers=3, num_units=self.hidden_size, batch_size=bs,
                       input_size=c_emb.get_shape().as_list()[-1],
                       keep_prob=self.keep_prob_ph)
        c = rnn(c_emb, seq_len=self.c_len)
        q = rnn(q_emb, seq_len=self.q_len)

    with tf.variable_scope("attention"):
        qc_att = dot_attention(c, q, mask=self.q_mask,
                               att_size=self.attention_hidden_size,
                               keep_prob=self.keep_prob_ph)
        rnn = CudnnGRU(num_layers=1, num_units=self.hidden_size, batch_size=bs,
                       input_size=qc_att.get_shape().as_list()[-1],
                       keep_prob=self.keep_prob_ph)
        att = rnn(qc_att, seq_len=self.c_len)

    with tf.variable_scope("match"):
        self_att = dot_attention(att, att, mask=self.c_mask,
                                 att_size=self.attention_hidden_size,
                                 keep_prob=self.keep_prob_ph)
        rnn = CudnnGRU(num_layers=1, num_units=self.hidden_size, batch_size=bs,
                       input_size=self_att.get_shape().as_list()[-1],
                       keep_prob=self.keep_prob_ph)
        match = rnn(self_att, seq_len=self.c_len)

    with tf.variable_scope("pointer"):
        init = simple_attention(q, self.hidden_size, mask=self.q_mask,
                                keep_prob=self.keep_prob_ph)
        pointer = PtrNet(cell_size=init.get_shape().as_list()[-1],
                         keep_prob=self.keep_prob_ph)
        logits1, logits2 = pointer(init, match, self.hidden_size, self.c_mask)

    with tf.variable_scope("predict"):
        # Outer product of start/end probabilities, restricted to spans with
        # start <= end and at most min(15, c_maxlen) positions between them.
        outer = tf.matmul(tf.expand_dims(tf.nn.softmax(logits1), axis=2),
                          tf.expand_dims(tf.nn.softmax(logits2), axis=1))
        outer = tf.matrix_band_part(
            outer, 0, tf.cast(tf.minimum(15, self.c_maxlen), tf.int64))
        self.yp1 = tf.argmax(tf.reduce_max(outer, axis=2), axis=1)
        self.yp2 = tf.argmax(tf.reduce_max(outer, axis=1), axis=1)
        loss_1 = tf.nn.softmax_cross_entropy_with_logits(logits=logits1, labels=self.y1)
        loss_2 = tf.nn.softmax_cross_entropy_with_logits(logits=logits2, labels=self.y2)
        self.loss = tf.reduce_mean(loss_1 + loss_2)

    if self.weight_decay < 1.0:
        self.var_ema = tf.train.ExponentialMovingAverage(self.weight_decay)
        ema_op = self.var_ema.apply(tf.trainable_variables())
        with tf.control_dependencies([ema_op]):
            # Evaluating the loss now also triggers the moving-average update.
            self.loss = tf.identity(self.loss)

            self.shadow_vars = []
            self.global_vars = []
            for var in tf.global_variables():
                v = self.var_ema.average(var)
                # average() yields None for variables without a shadow copy;
                # compare against None explicitly instead of truth-testing a
                # variable object.
                if v is not None:
                    self.shadow_vars.append(v)
                    self.global_vars.append(var)
            self.assign_vars = [
                tf.assign(g, v)
                for g, v in zip(self.global_vars, self.shadow_vars)
            ]
def _build_body(self):
    """Assemble the dense -> (attention) -> LSTM -> dense body of the network.

    Returns:
        A pair ``(logits, state)``: the output of the dense layer named
        ``'logits'`` and the final state returned by ``tf.nn.dynamic_rnn``.

    Raises:
        ValueError: if ``self.attn.type`` is not one of the supported
            attention mechanism names.
    """
    # input projection
    projected = tf.layers.dense(self._features, self.dense_size,
                                kernel_regularizer=tf.nn.l2_loss,
                                kernel_initializer=xav())
    if self.attn:
        attn_kind = self.attn.type
        with tf.variable_scope("attention_mechanism/{}".format(attn_kind)):
            supported = ('general', 'bahdanau',
                         'cs_general', 'cs_bahdanau',
                         'light_general', 'light_bahdanau')
            if attn_kind not in supported:
                raise ValueError("wrong value for attention mechanism type")
            attn_kwargs = {
                'hidden_size': self.attn.hidden_size,
                'projected_align': self.attn.projected_align,
            }
            # Only the "cs_" variants take a depth argument.
            if attn_kind in ('cs_general', 'cs_bahdanau'):
                attn_kwargs['depth'] = self.attn.depth
            # Every mechanism lives in `am` as "<type>_attention".
            attention_fn = getattr(am, attn_kind + '_attention')
            attn_out = attention_fn(self._key, self._emb_context, **attn_kwargs)
        projected = tf.concat([projected, attn_out], -1)

    rnn_input = tf_layers.variational_dropout(projected,
                                              keep_prob=self._dropout_keep_prob)

    # recurrent network unit
    cell = tf.nn.rnn_cell.LSTMCell(self.hidden_size)
    # Lengths are the utterance mask summed over its last axis.
    lengths = tf.to_int32(tf.reduce_sum(self._utterance_mask, axis=-1))
    rnn_output, final_state = tf.nn.dynamic_rnn(cell, rnn_input,
                                                initial_state=self._initial_state,
                                                sequence_length=lengths)

    # output projection
    logits = tf.layers.dense(rnn_output, self.action_size,
                             kernel_regularizer=tf.nn.l2_loss,
                             kernel_initializer=xav(),
                             name='logits')
    return logits, final_state
def _init_graph(self):
    """Build the reading-comprehension graph with optional 'no answer' token.

    Reads the placeholders created by ``_init_placeholders`` and sets, among
    others, ``self.yp1``/``self.yp2`` (predicted start/end indices),
    ``self.yp_logits``, ``self.loss`` and, when ``self.noans_token`` is
    truthy, ``self.yp_score``.
    """
    self._init_placeholders()

    self.word_emb = tf.get_variable(
        "word_emb",
        initializer=tf.constant(self.init_word_emb, dtype=tf.float32),
        trainable=False)
    self.char_emb = tf.get_variable(
        "char_emb",
        initializer=tf.constant(self.init_char_emb, dtype=tf.float32),
        trainable=self.train_char_emb)

    # Non-zero ids are treated as real tokens; lengths are counts of them.
    self.c_mask = tf.cast(self.c_ph, tf.bool)
    self.q_mask = tf.cast(self.q_ph, tf.bool)
    self.c_len = tf.reduce_sum(tf.cast(self.c_mask, tf.int32), axis=1)
    self.q_len = tf.reduce_sum(tf.cast(self.q_mask, tf.int32), axis=1)

    bs = tf.shape(self.c_ph)[0]
    self.c_maxlen = tf.reduce_max(self.c_len)
    self.q_maxlen = tf.reduce_max(self.q_len)
    # Trim every input to the longest sequence present in the batch.
    self.c = tf.slice(self.c_ph, [0, 0], [bs, self.c_maxlen])
    self.q = tf.slice(self.q_ph, [0, 0], [bs, self.q_maxlen])
    self.c_mask = tf.slice(self.c_mask, [0, 0], [bs, self.c_maxlen])
    self.q_mask = tf.slice(self.q_mask, [0, 0], [bs, self.q_maxlen])
    self.cc = tf.slice(self.cc_ph, [0, 0, 0], [bs, self.c_maxlen, self.char_limit])
    self.qc = tf.slice(self.qc_ph, [0, 0, 0], [bs, self.q_maxlen, self.char_limit])
    # Number of non-zero character ids per token, flattened over the batch.
    self.cc_len = tf.reshape(
        tf.reduce_sum(tf.cast(tf.cast(self.cc, tf.bool), tf.int32), axis=2), [-1])
    self.qc_len = tf.reshape(
        tf.reduce_sum(tf.cast(tf.cast(self.qc, tf.bool), tf.int32), axis=2), [-1])
    # to remove char sequences with len equal zero (padded tokens)
    self.cc_len = tf.maximum(tf.ones_like(self.cc_len), self.cc_len)
    self.qc_len = tf.maximum(tf.ones_like(self.qc_len), self.qc_len)

    self.y1 = tf.one_hot(self.y1_ph, depth=self.context_limit)
    self.y2 = tf.one_hot(self.y2_ph, depth=self.context_limit)
    self.y1 = tf.slice(self.y1, [0, 0], [bs, self.c_maxlen])
    self.y2 = tf.slice(self.y2, [0, 0], [bs, self.c_maxlen])

    if self.noans_token:
        # we use additional 'no answer' token to allow model not to answer on question
        # later we will add 'no answer' token as first token in context question-aware representation
        self.y1 = tf.one_hot(self.y1_ph, depth=self.context_limit + 1)
        self.y2 = tf.one_hot(self.y2_ph, depth=self.context_limit + 1)
        self.y1 = tf.slice(self.y1, [0, 0], [bs, self.c_maxlen + 1])
        self.y2 = tf.slice(self.y2, [0, 0], [bs, self.c_maxlen + 1])

    with tf.variable_scope("emb"):
        with tf.variable_scope("char"):
            cc_emb = tf.reshape(
                tf.nn.embedding_lookup(self.char_emb, self.cc),
                [bs * self.c_maxlen, self.char_limit, self.char_emb_dim])
            qc_emb = tf.reshape(
                tf.nn.embedding_lookup(self.char_emb, self.qc),
                [bs * self.q_maxlen, self.char_limit, self.char_emb_dim])

            cc_emb = variational_dropout(cc_emb, keep_prob=self.keep_prob_ph)
            qc_emb = variational_dropout(qc_emb, keep_prob=self.keep_prob_ph)

            _, (state_fw, state_bw) = cudnn_bi_gru(cc_emb, self.char_hidden_size,
                                                   seq_lengths=self.cc_len,
                                                   trainable_initial_states=True)
            cc_emb = tf.concat([state_fw, state_bw], axis=1)

            # reuse=True: the question characters share the context char-GRU weights.
            _, (state_fw, state_bw) = cudnn_bi_gru(qc_emb, self.char_hidden_size,
                                                   seq_lengths=self.qc_len,
                                                   trainable_initial_states=True,
                                                   reuse=True)
            qc_emb = tf.concat([state_fw, state_bw], axis=1)

            cc_emb = tf.reshape(
                cc_emb, [bs, self.c_maxlen, 2 * self.char_hidden_size])
            qc_emb = tf.reshape(
                qc_emb, [bs, self.q_maxlen, 2 * self.char_hidden_size])

        with tf.name_scope("word"):
            c_emb = tf.nn.embedding_lookup(self.word_emb, self.c)
            q_emb = tf.nn.embedding_lookup(self.word_emb, self.q)

        c_emb = tf.concat([c_emb, cc_emb], axis=2)
        q_emb = tf.concat([q_emb, qc_emb], axis=2)

    with tf.variable_scope("encoding"):
        rnn = self.GRU(num_layers=3, num_units=self.hidden_size, batch_size=bs,
                       input_size=c_emb.get_shape().as_list()[-1],
                       keep_prob=self.keep_prob_ph)
        c = rnn(c_emb, seq_len=self.c_len)
        q = rnn(q_emb, seq_len=self.q_len)

    with tf.variable_scope("attention"):
        qc_att = dot_attention(c, q, mask=self.q_mask,
                               att_size=self.attention_hidden_size,
                               keep_prob=self.keep_prob_ph)
        rnn = self.GRU(num_layers=1, num_units=self.hidden_size, batch_size=bs,
                       input_size=qc_att.get_shape().as_list()[-1],
                       keep_prob=self.keep_prob_ph)
        att = rnn(qc_att, seq_len=self.c_len)

    with tf.variable_scope("match"):
        self_att = dot_attention(att, att, mask=self.c_mask,
                                 att_size=self.attention_hidden_size,
                                 keep_prob=self.keep_prob_ph)
        rnn = self.GRU(num_layers=1, num_units=self.hidden_size, batch_size=bs,
                       input_size=self_att.get_shape().as_list()[-1],
                       keep_prob=self.keep_prob_ph)
        match = rnn(self_att, seq_len=self.c_len)

    with tf.variable_scope("pointer"):
        init = simple_attention(q, self.hidden_size, mask=self.q_mask,
                                keep_prob=self.keep_prob_ph)
        pointer = PtrNet(cell_size=init.get_shape().as_list()[-1],
                         keep_prob=self.keep_prob_ph)
        if self.noans_token:
            # The second positional argument of tf.Variable is `trainable`,
            # not the dtype, so the dtype has to be passed by keyword.
            noans_token = tf.Variable(
                tf.random_uniform((match.get_shape().as_list()[-1],), -0.1, 0.1),
                dtype=tf.float32)
            noans_token = tf.nn.dropout(noans_token, keep_prob=self.keep_prob_ph)
            # Repeat the learned vector for every sample and prepend it as an
            # extra first position of the context representation.
            noans_token = tf.expand_dims(
                tf.tile(tf.expand_dims(noans_token, axis=0), [bs, 1]), axis=1)
            match = tf.concat([noans_token, match], axis=1)
            self.c_mask = tf.concat(
                [tf.ones(shape=(bs, 1), dtype=tf.bool), self.c_mask], axis=1)
        logits1, logits2 = pointer(init, match, self.hidden_size, self.c_mask)

    with tf.variable_scope("predict"):
        max_ans_length = tf.cast(tf.minimum(15, self.c_maxlen), tf.int64)
        # Unnormalised span scores, restricted to the same band as `outer`.
        outer_logits = tf.exp(
            tf.expand_dims(logits1, axis=2) + tf.expand_dims(logits2, axis=1))
        outer_logits = tf.matrix_band_part(outer_logits, 0, max_ans_length)

        # Outer product of start/end probabilities, restricted to spans with
        # start <= end and at most max_ans_length positions between them.
        outer = tf.matmul(tf.expand_dims(tf.nn.softmax(logits1), axis=2),
                          tf.expand_dims(tf.nn.softmax(logits2), axis=1))
        outer = tf.matrix_band_part(outer, 0, max_ans_length)
        self.yp1 = tf.argmax(tf.reduce_max(outer, axis=2), axis=1)
        self.yp2 = tf.argmax(tf.reduce_max(outer, axis=1), axis=1)
        self.yp_logits = tf.reduce_max(tf.reduce_max(outer_logits, axis=2), axis=1)
        if self.noans_token:
            # One minus the probability that both pointers select position 0,
            # the prepended 'no answer' token.
            self.yp_score = 1 - tf.nn.softmax(
                logits1)[:, 0] * tf.nn.softmax(logits2)[:, 0]
        loss_1 = tf.nn.softmax_cross_entropy_with_logits(logits=logits1, labels=self.y1)
        loss_2 = tf.nn.softmax_cross_entropy_with_logits(logits=logits2, labels=self.y2)
        self.loss = tf.reduce_mean(loss_1 + loss_2)
def __init__(self,
             n_tags,  # Features dimensions
             token_emb_dim=None,
             char_emb_dim=None,
             capitalization_dim=None,
             pos_features_dim=None,
             additional_features=None,
             net_type='rnn',  # Net architecture
             cell_type='lstm',
             use_cudnn_rnn=False,
             two_dense_on_top=False,
             n_hidden_list=(128,),
             cnn_filter_width=7,
             use_crf=False,
             token_emb_mat=None,
             char_emb_mat=None,
             use_batch_norm=False,
             dropout_keep_prob=0.5,  # Regularization
             embeddings_dropout=False,
             top_dropout=False,
             intra_layer_dropout=False,
             l2_reg=0.0,
             clip_grad_norm=5.0,
             learning_rate=3e-3,
             gpu=None,
             seed=None,
             lr_drop_patience=5,
             lr_drop_value=0.1,
             **kwargs):
    """Build the tagging network graph, open a session and load weights.

    Args:
        n_tags: number of output tags (width of the logits layer).
        token_emb_dim, token_emb_mat: forwarded to ``_add_word_embeddings``.
        char_emb_dim, char_emb_mat: character embeddings are added only when
            both are not None.
        capitalization_dim: when not None, capitalization features are added.
        pos_features_dim: when not None, part-of-speech features are added.
        additional_features: when not None, forwarded to
            ``_add_additional_features``.
        net_type: ``'rnn'`` or ``'cnn'``.
        cell_type: recurrent cell kind used when ``net_type == 'rnn'``.
        use_cudnn_rnn: build the recurrent part with ``_build_cudnn_rnn``.
        two_dense_on_top: forwarded to ``_build_top``.
        n_hidden_list: layer sizes; the last entry is also passed to
            ``_build_top`` as the hidden size.
        cnn_filter_width, use_batch_norm: forwarded to ``_build_cnn``.
        use_crf: selects ``predict_crf`` over ``predict_no_crf`` and is
            forwarded to ``_build_train_predict``.
        dropout_keep_prob, learning_rate: forwarded to
            ``_add_training_placeholders``.
        embeddings_dropout, top_dropout, intra_layer_dropout: switch on
            variational dropout at the corresponding places.
        l2_reg, clip_grad_norm: forwarded to ``_build_train_predict``.
        gpu: when not None, written to the session's visible device list.
        seed: passed to ``tf.set_random_seed`` and ``np.random.seed``.
        lr_drop_patience, lr_drop_value: stored on the instance.
        **kwargs: forwarded to the parent constructor.

    Raises:
        ValueError: if ``net_type`` is neither ``'rnn'`` nor ``'cnn'``.
    """
    import warnings

    tf.set_random_seed(seed)
    np.random.seed(seed)
    self._learning_rate = learning_rate
    self._lr_drop_patience = lr_drop_patience
    self._lr_drop_value = lr_drop_value
    self._add_training_placeholders(dropout_keep_prob, learning_rate)
    self._xs_ph_list = []
    self._y_ph = tf.placeholder(tf.int32, [None, None], name='y_ph')
    self._input_features = []

    # ================ Building input features =================

    # Token embeddings
    self._add_word_embeddings(token_emb_mat, token_emb_dim)

    # Masks for different lengths utterances
    self.mask_ph = self._add_mask()

    # Char embeddings using highway CNN with max pooling
    if char_emb_mat is not None and char_emb_dim is not None:
        self._add_char_embeddings(char_emb_mat)

    # Capitalization features
    if capitalization_dim is not None:
        self._add_capitalization(capitalization_dim)

    # Part of speech features
    if pos_features_dim is not None:
        self._add_pos(pos_features_dim)

    # Anything you want
    if additional_features is not None:
        self._add_additional_features(additional_features)

    features = tf.concat(self._input_features, axis=2)
    if embeddings_dropout:
        features = variational_dropout(features, self._dropout_ph)

    # ================== Building the network ==================

    if net_type == 'rnn':
        if use_cudnn_rnn:
            if l2_reg > 0:
                # Advisory only: emit a warning instead of aborting construction.
                warnings.warn('cuDNN RNN are not l2 regularizable')
            units = self._build_cudnn_rnn(features, n_hidden_list, cell_type,
                                          intra_layer_dropout, self.mask_ph)
        else:
            units = self._build_rnn(features, n_hidden_list, cell_type,
                                    intra_layer_dropout, self.mask_ph)
    elif net_type == 'cnn':
        units = self._build_cnn(features, n_hidden_list, cnn_filter_width, use_batch_norm)
    else:
        # Fail with a clear message instead of an unbound `units` further down.
        raise ValueError('Unknown net_type "{}": expected "rnn" or "cnn"'.format(net_type))

    self._logits = self._build_top(units, n_tags, n_hidden_list[-1],
                                   top_dropout, two_dense_on_top)

    self.train_op, self.loss = self._build_train_predict(self._logits, self.mask_ph, n_tags,
                                                         use_crf, clip_grad_norm, l2_reg)
    self.predict = self.predict_crf if use_crf else self.predict_no_crf

    # ================= Initialize the session =================

    sess_config = tf.ConfigProto(allow_soft_placement=True)
    sess_config.gpu_options.allow_growth = True
    if gpu is not None:
        sess_config.gpu_options.visible_device_list = str(gpu)
    # The config must reach the session, otherwise `gpu` and allow_growth are ignored.
    self.sess = tf.Session(config=sess_config)
    self.sess.run(tf.global_variables_initializer())
    super().__init__(**kwargs)
    self.load()
def _init_graph(self):
    """Build the reading-comprehension graph: embeddings, encoders, pointer, loss.

    Reads the placeholders created by ``_init_placeholders`` and sets, among
    others, ``self.yp1``/``self.yp2`` (predicted start/end indices),
    ``self.yp_logits`` and ``self.loss``. When ``self.weight_decay < 1.0`` an
    exponential moving average of the trainable variables is maintained and
    ``self.assign_vars`` holds ops copying the averaged values back into the
    variables.
    """
    self._init_placeholders()

    self.word_emb = tf.get_variable(
        "word_emb",
        initializer=tf.constant(self.init_word_emb, dtype=tf.float32),
        trainable=False)
    self.char_emb = tf.get_variable(
        "char_emb",
        initializer=tf.constant(self.init_char_emb, dtype=tf.float32),
        trainable=self.train_char_emb)

    # Non-zero ids are treated as real tokens; lengths are counts of them.
    self.c_mask = tf.cast(self.c_ph, tf.bool)
    self.q_mask = tf.cast(self.q_ph, tf.bool)
    self.c_len = tf.reduce_sum(tf.cast(self.c_mask, tf.int32), axis=1)
    self.q_len = tf.reduce_sum(tf.cast(self.q_mask, tf.int32), axis=1)

    bs = tf.shape(self.c_ph)[0]
    self.c_maxlen = tf.reduce_max(self.c_len)
    self.q_maxlen = tf.reduce_max(self.q_len)
    # Trim every input to the longest sequence present in the batch.
    self.c = tf.slice(self.c_ph, [0, 0], [bs, self.c_maxlen])
    self.q = tf.slice(self.q_ph, [0, 0], [bs, self.q_maxlen])
    self.c_mask = tf.slice(self.c_mask, [0, 0], [bs, self.c_maxlen])
    self.q_mask = tf.slice(self.q_mask, [0, 0], [bs, self.q_maxlen])
    self.cc = tf.slice(self.cc_ph, [0, 0, 0], [bs, self.c_maxlen, self.char_limit])
    self.qc = tf.slice(self.qc_ph, [0, 0, 0], [bs, self.q_maxlen, self.char_limit])
    # Number of non-zero character ids per token, flattened over the batch.
    self.cc_len = tf.reshape(
        tf.reduce_sum(tf.cast(tf.cast(self.cc, tf.bool), tf.int32), axis=2), [-1])
    self.qc_len = tf.reshape(
        tf.reduce_sum(tf.cast(tf.cast(self.qc, tf.bool), tf.int32), axis=2), [-1])

    self.y1 = tf.one_hot(self.y1_ph, depth=self.context_limit)
    self.y2 = tf.one_hot(self.y2_ph, depth=self.context_limit)
    self.y1 = tf.slice(self.y1, [0, 0], [bs, self.c_maxlen])
    self.y2 = tf.slice(self.y2, [0, 0], [bs, self.c_maxlen])

    with tf.variable_scope("emb"):
        with tf.variable_scope("char"):
            cc_emb = tf.reshape(
                tf.nn.embedding_lookup(self.char_emb, self.cc),
                [bs * self.c_maxlen, self.char_limit, self.char_emb_dim])
            qc_emb = tf.reshape(
                tf.nn.embedding_lookup(self.char_emb, self.qc),
                [bs * self.q_maxlen, self.char_limit, self.char_emb_dim])

            cc_emb = variational_dropout(cc_emb, keep_prob=self.keep_prob_ph)
            qc_emb = variational_dropout(qc_emb, keep_prob=self.keep_prob_ph)

            _, (state_fw, state_bw) = cudnn_bi_gru(cc_emb, self.char_hidden_size,
                                                   seq_lengths=self.cc_len,
                                                   trainable_initial_states=True)
            cc_emb = tf.concat([state_fw, state_bw], axis=1)

            # reuse=True: the question characters share the context char-GRU weights.
            _, (state_fw, state_bw) = cudnn_bi_gru(qc_emb, self.char_hidden_size,
                                                   seq_lengths=self.qc_len,
                                                   trainable_initial_states=True,
                                                   reuse=True)
            qc_emb = tf.concat([state_fw, state_bw], axis=1)

            cc_emb = tf.reshape(cc_emb, [bs, self.c_maxlen, 2 * self.char_hidden_size])
            qc_emb = tf.reshape(qc_emb, [bs, self.q_maxlen, 2 * self.char_hidden_size])

        with tf.name_scope("word"):
            c_emb = tf.nn.embedding_lookup(self.word_emb, self.c)
            q_emb = tf.nn.embedding_lookup(self.word_emb, self.q)

        c_emb = tf.concat([c_emb, cc_emb], axis=2)
        q_emb = tf.concat([q_emb, qc_emb], axis=2)

    with tf.variable_scope("encoding"):
        rnn = self.GRU(num_layers=3, num_units=self.hidden_size, batch_size=bs,
                       input_size=c_emb.get_shape().as_list()[-1],
                       keep_prob=self.keep_prob_ph)
        c = rnn(c_emb, seq_len=self.c_len)
        q = rnn(q_emb, seq_len=self.q_len)

    with tf.variable_scope("attention"):
        qc_att = dot_attention(c, q, mask=self.q_mask,
                               att_size=self.attention_hidden_size,
                               keep_prob=self.keep_prob_ph)
        rnn = self.GRU(num_layers=1, num_units=self.hidden_size, batch_size=bs,
                       input_size=qc_att.get_shape().as_list()[-1],
                       keep_prob=self.keep_prob_ph)
        att = rnn(qc_att, seq_len=self.c_len)

    with tf.variable_scope("match"):
        self_att = dot_attention(att, att, mask=self.c_mask,
                                 att_size=self.attention_hidden_size,
                                 keep_prob=self.keep_prob_ph)
        rnn = self.GRU(num_layers=1, num_units=self.hidden_size, batch_size=bs,
                       input_size=self_att.get_shape().as_list()[-1],
                       keep_prob=self.keep_prob_ph)
        match = rnn(self_att, seq_len=self.c_len)

    with tf.variable_scope("pointer"):
        init = simple_attention(q, self.hidden_size, mask=self.q_mask,
                                keep_prob=self.keep_prob_ph)
        pointer = PtrNet(cell_size=init.get_shape().as_list()[-1],
                         keep_prob=self.keep_prob_ph)
        logits1, logits2 = pointer(init, match, self.hidden_size, self.c_mask)

    with tf.variable_scope("predict"):
        # Unnormalised span scores for every (start, end) pair.
        outer_logits = tf.exp(tf.expand_dims(logits1, axis=2) + tf.expand_dims(logits2, axis=1))
        # Outer product of start/end probabilities, restricted to spans with
        # start <= end and at most min(15, c_maxlen) positions between them.
        outer = tf.matmul(tf.expand_dims(tf.nn.softmax(logits1), axis=2),
                          tf.expand_dims(tf.nn.softmax(logits2), axis=1))
        outer = tf.matrix_band_part(outer, 0, tf.cast(tf.minimum(15, self.c_maxlen), tf.int64))
        self.yp1 = tf.argmax(tf.reduce_max(outer, axis=2), axis=1)
        self.yp2 = tf.argmax(tf.reduce_max(outer, axis=1), axis=1)
        self.yp_logits = tf.reduce_max(tf.reduce_max(outer_logits, axis=2), axis=1)
        loss_1 = tf.nn.softmax_cross_entropy_with_logits(logits=logits1, labels=self.y1)
        loss_2 = tf.nn.softmax_cross_entropy_with_logits(logits=logits2, labels=self.y2)
        self.loss = tf.reduce_mean(loss_1 + loss_2)

    if self.weight_decay < 1.0:
        self.var_ema = tf.train.ExponentialMovingAverage(self.weight_decay)
        ema_op = self.var_ema.apply(tf.trainable_variables())
        with tf.control_dependencies([ema_op]):
            # Evaluating the loss now also triggers the moving-average update.
            self.loss = tf.identity(self.loss)

            self.shadow_vars = []
            self.global_vars = []
            for var in tf.global_variables():
                v = self.var_ema.average(var)
                # average() yields None for variables without a shadow copy;
                # compare against None explicitly instead of truth-testing a
                # variable object.
                if v is not None:
                    self.shadow_vars.append(v)
                    self.global_vars.append(var)
            self.assign_vars = [
                tf.assign(g, v)
                for g, v in zip(self.global_vars, self.shadow_vars)
            ]
def _build_body(self):
    """Assemble the dense -> (attention) -> LSTM -> dense body of the network.

    Returns:
        A pair ``(logits, state)``: the output of the dense layer named
        ``'logits'`` and the final state returned by ``tf.nn.dynamic_rnn``.

    Raises:
        ValueError: if ``self.attn.type`` is not one of the supported
            attention mechanism names.
    """
    # input projection
    projected = tf.layers.dense(self._features, self.dense_size,
                                kernel_regularizer=tf.nn.l2_loss,
                                kernel_initializer=xav())
    if self.attn:
        attn_kind = self.attn.type
        # Maps each supported type to whether its builder takes `depth`.
        takes_depth = {
            'general': False,
            'bahdanau': False,
            'cs_general': True,
            'cs_bahdanau': True,
            'light_general': False,
            'light_bahdanau': False,
        }
        with tf.variable_scope("attention_mechanism/{}".format(attn_kind)):
            if attn_kind not in takes_depth:
                raise ValueError("wrong value for attention mechanism type")
            attn_kwargs = {
                'hidden_size': self.attn.hidden_size,
                'projected_align': self.attn.projected_align,
            }
            if takes_depth[attn_kind]:
                attn_kwargs['depth'] = self.attn.depth
            # Every mechanism lives in `am` as "<type>_attention".
            attention_fn = getattr(am, attn_kind + '_attention')
            attn_out = attention_fn(self._key, self._emb_context, **attn_kwargs)
        projected = tf.concat([projected, attn_out], -1)

    rnn_input = tf_layers.variational_dropout(projected,
                                              keep_prob=self._dropout_keep_prob)

    # recurrent network unit
    cell = tf.nn.rnn_cell.LSTMCell(self.hidden_size)
    # Lengths are the utterance mask summed over its last axis.
    lengths = tf.to_int32(tf.reduce_sum(self._utterance_mask, axis=-1))
    rnn_output, final_state = tf.nn.dynamic_rnn(cell, rnn_input,
                                                time_major=False,
                                                initial_state=self._initial_state,
                                                sequence_length=lengths)
    rnn_output = tf.reshape(rnn_output,
                            (self._batch_size, -1, self.hidden_size))
    rnn_output = tf_layers.variational_dropout(rnn_output,
                                               keep_prob=self._dropout_keep_prob)

    # output projection
    logits = tf.layers.dense(rnn_output, self.action_size,
                             kernel_regularizer=tf.nn.l2_loss,
                             kernel_initializer=xav(),
                             name='logits')
    return logits, final_state
def _init_graph(self):
    """Build the reading-comprehension graph with optional 'no answer' token.

    Reads the placeholders created by ``_init_placeholders`` and sets, among
    others, ``self.yp1``/``self.yp2`` (predicted start/end indices),
    ``self.yp_logits``, ``self.loss`` and, when ``self.noans_token`` is
    truthy, ``self.yp_score``.
    """
    self._init_placeholders()

    self.word_emb = tf.get_variable(
        "word_emb",
        initializer=tf.constant(self.init_word_emb, dtype=tf.float32),
        trainable=False)
    self.char_emb = tf.get_variable(
        "char_emb",
        initializer=tf.constant(self.init_char_emb, dtype=tf.float32),
        trainable=self.train_char_emb)

    # Non-zero ids are treated as real tokens; lengths are counts of them.
    self.c_mask = tf.cast(self.c_ph, tf.bool)
    self.q_mask = tf.cast(self.q_ph, tf.bool)
    self.c_len = tf.reduce_sum(tf.cast(self.c_mask, tf.int32), axis=1)
    self.q_len = tf.reduce_sum(tf.cast(self.q_mask, tf.int32), axis=1)

    bs = tf.shape(self.c_ph)[0]
    self.c_maxlen = tf.reduce_max(self.c_len)
    self.q_maxlen = tf.reduce_max(self.q_len)
    # Trim every input to the longest sequence present in the batch.
    self.c = tf.slice(self.c_ph, [0, 0], [bs, self.c_maxlen])
    self.q = tf.slice(self.q_ph, [0, 0], [bs, self.q_maxlen])
    self.c_mask = tf.slice(self.c_mask, [0, 0], [bs, self.c_maxlen])
    self.q_mask = tf.slice(self.q_mask, [0, 0], [bs, self.q_maxlen])
    self.cc = tf.slice(self.cc_ph, [0, 0, 0], [bs, self.c_maxlen, self.char_limit])
    self.qc = tf.slice(self.qc_ph, [0, 0, 0], [bs, self.q_maxlen, self.char_limit])
    # Number of non-zero character ids per token, flattened over the batch.
    self.cc_len = tf.reshape(
        tf.reduce_sum(tf.cast(tf.cast(self.cc, tf.bool), tf.int32), axis=2), [-1])
    self.qc_len = tf.reshape(
        tf.reduce_sum(tf.cast(tf.cast(self.qc, tf.bool), tf.int32), axis=2), [-1])
    # to remove char sequences with len equal zero (padded tokens)
    self.cc_len = tf.maximum(tf.ones_like(self.cc_len), self.cc_len)
    self.qc_len = tf.maximum(tf.ones_like(self.qc_len), self.qc_len)

    self.y1 = tf.one_hot(self.y1_ph, depth=self.context_limit)
    self.y2 = tf.one_hot(self.y2_ph, depth=self.context_limit)
    self.y1 = tf.slice(self.y1, [0, 0], [bs, self.c_maxlen])
    self.y2 = tf.slice(self.y2, [0, 0], [bs, self.c_maxlen])

    if self.noans_token:
        # we use additional 'no answer' token to allow model not to answer on question
        # later we will add 'no answer' token as first token in context question-aware representation
        self.y1 = tf.one_hot(self.y1_ph, depth=self.context_limit + 1)
        self.y2 = tf.one_hot(self.y2_ph, depth=self.context_limit + 1)
        self.y1 = tf.slice(self.y1, [0, 0], [bs, self.c_maxlen + 1])
        self.y2 = tf.slice(self.y2, [0, 0], [bs, self.c_maxlen + 1])

    with tf.variable_scope("emb"):
        with tf.variable_scope("char"):
            cc_emb = tf.reshape(
                tf.nn.embedding_lookup(self.char_emb, self.cc),
                [bs * self.c_maxlen, self.char_limit, self.char_emb_dim])
            qc_emb = tf.reshape(
                tf.nn.embedding_lookup(self.char_emb, self.qc),
                [bs * self.q_maxlen, self.char_limit, self.char_emb_dim])

            cc_emb = variational_dropout(cc_emb, keep_prob=self.keep_prob_ph)
            qc_emb = variational_dropout(qc_emb, keep_prob=self.keep_prob_ph)

            _, (state_fw, state_bw) = cudnn_bi_gru(cc_emb, self.char_hidden_size,
                                                   seq_lengths=self.cc_len,
                                                   trainable_initial_states=True)
            cc_emb = tf.concat([state_fw, state_bw], axis=1)

            # reuse=True: the question characters share the context char-GRU weights.
            _, (state_fw, state_bw) = cudnn_bi_gru(qc_emb, self.char_hidden_size,
                                                   seq_lengths=self.qc_len,
                                                   trainable_initial_states=True,
                                                   reuse=True)
            qc_emb = tf.concat([state_fw, state_bw], axis=1)

            cc_emb = tf.reshape(cc_emb, [bs, self.c_maxlen, 2 * self.char_hidden_size])
            qc_emb = tf.reshape(qc_emb, [bs, self.q_maxlen, 2 * self.char_hidden_size])

        with tf.name_scope("word"):
            c_emb = tf.nn.embedding_lookup(self.word_emb, self.c)
            q_emb = tf.nn.embedding_lookup(self.word_emb, self.q)

        c_emb = tf.concat([c_emb, cc_emb], axis=2)
        q_emb = tf.concat([q_emb, qc_emb], axis=2)

    with tf.variable_scope("encoding"):
        rnn = self.GRU(num_layers=3, num_units=self.hidden_size, batch_size=bs,
                       input_size=c_emb.get_shape().as_list()[-1],
                       keep_prob=self.keep_prob_ph)
        c = rnn(c_emb, seq_len=self.c_len)
        q = rnn(q_emb, seq_len=self.q_len)

    with tf.variable_scope("attention"):
        qc_att = dot_attention(c, q, mask=self.q_mask,
                               att_size=self.attention_hidden_size,
                               keep_prob=self.keep_prob_ph)
        rnn = self.GRU(num_layers=1, num_units=self.hidden_size, batch_size=bs,
                       input_size=qc_att.get_shape().as_list()[-1],
                       keep_prob=self.keep_prob_ph)
        att = rnn(qc_att, seq_len=self.c_len)

    with tf.variable_scope("match"):
        self_att = dot_attention(att, att, mask=self.c_mask,
                                 att_size=self.attention_hidden_size,
                                 keep_prob=self.keep_prob_ph)
        rnn = self.GRU(num_layers=1, num_units=self.hidden_size, batch_size=bs,
                       input_size=self_att.get_shape().as_list()[-1],
                       keep_prob=self.keep_prob_ph)
        match = rnn(self_att, seq_len=self.c_len)

    with tf.variable_scope("pointer"):
        init = simple_attention(q, self.hidden_size, mask=self.q_mask,
                                keep_prob=self.keep_prob_ph)
        pointer = PtrNet(cell_size=init.get_shape().as_list()[-1],
                         keep_prob=self.keep_prob_ph)
        if self.noans_token:
            # The second positional argument of tf.Variable is `trainable`,
            # not the dtype, so the dtype has to be passed by keyword.
            noans_token = tf.Variable(
                tf.random_uniform((match.get_shape().as_list()[-1],), -0.1, 0.1),
                dtype=tf.float32)
            noans_token = tf.nn.dropout(noans_token, keep_prob=self.keep_prob_ph)
            # Repeat the learned vector for every sample and prepend it as an
            # extra first position of the context representation.
            noans_token = tf.expand_dims(
                tf.tile(tf.expand_dims(noans_token, axis=0), [bs, 1]), axis=1)
            match = tf.concat([noans_token, match], axis=1)
            self.c_mask = tf.concat(
                [tf.ones(shape=(bs, 1), dtype=tf.bool), self.c_mask], axis=1)
        logits1, logits2 = pointer(init, match, self.hidden_size, self.c_mask)

    with tf.variable_scope("predict"):
        max_ans_length = tf.cast(tf.minimum(15, self.c_maxlen), tf.int64)
        # Unnormalised span scores, restricted to the same band as `outer`.
        outer_logits = tf.exp(tf.expand_dims(logits1, axis=2) + tf.expand_dims(logits2, axis=1))
        outer_logits = tf.matrix_band_part(outer_logits, 0, max_ans_length)

        # Outer product of start/end probabilities, restricted to spans with
        # start <= end and at most max_ans_length positions between them.
        outer = tf.matmul(tf.expand_dims(tf.nn.softmax(logits1), axis=2),
                          tf.expand_dims(tf.nn.softmax(logits2), axis=1))
        outer = tf.matrix_band_part(outer, 0, max_ans_length)
        self.yp1 = tf.argmax(tf.reduce_max(outer, axis=2), axis=1)
        self.yp2 = tf.argmax(tf.reduce_max(outer, axis=1), axis=1)
        self.yp_logits = tf.reduce_max(tf.reduce_max(outer_logits, axis=2), axis=1)
        if self.noans_token:
            # One minus the probability that both pointers select position 0,
            # the prepended 'no answer' token.
            self.yp_score = 1 - tf.nn.softmax(logits1)[:, 0] * tf.nn.softmax(logits2)[:, 0]
        loss_1 = tf.nn.softmax_cross_entropy_with_logits(logits=logits1, labels=self.y1)
        loss_2 = tf.nn.softmax_cross_entropy_with_logits(logits=logits2, labels=self.y2)
        self.loss = tf.reduce_mean(loss_1 + loss_2)
def __init__(
        self,
        n_tags: int,
        # Features dimensions
        token_emb_dim: int = None,
        char_emb_dim: int = None,
        capitalization_dim: int = None,
        pos_features_dim: int = None,
        additional_features: int = None,
        # Net architecture
        net_type: str = 'rnn',
        cell_type: str = 'lstm',
        use_cudnn_rnn: bool = False,
        two_dense_on_top: bool = False,
        n_hidden_list: Tuple[int] = (128, ),
        cnn_filter_width: int = 7,
        use_crf: bool = False,
        token_emb_mat: np.ndarray = None,
        char_emb_mat: np.ndarray = None,
        use_batch_norm: bool = False,
        # Regularization
        dropout_keep_prob: float = 0.5,
        embeddings_dropout: bool = False,
        top_dropout: bool = False,
        intra_layer_dropout: bool = False,
        l2_reg: float = 0.0,
        gpu: int = None,
        seed: int = None,
        **kwargs) -> None:
    """Build the tagging graph, create the session and load saved weights.

    Args:
        n_tags: number of output tags; must not be 0.
        token_emb_dim: forwarded to ``_add_word_embeddings``.
        char_emb_dim: char embeddings are added only when both this and
            ``char_emb_mat`` are not None.
        capitalization_dim: when not None, ``_add_capitalization`` is called.
        pos_features_dim: when not None, ``_add_pos`` is called.
        additional_features: when not None, ``_add_additional_features``
            is called.
        net_type: ``'rnn'`` or ``'cnn'``; selects the body builder.
        cell_type: forwarded to the RNN builders.
        use_cudnn_rnn: use ``_build_cudnn_rnn`` instead of ``_build_rnn``.
        two_dense_on_top: forwarded to ``_build_top``.
        n_hidden_list: forwarded to the body builder; its last item is
            also forwarded to ``_build_top``.
        cnn_filter_width: forwarded to ``_build_cnn``.
        use_crf: forwarded to ``_build_train_predict`` and selects
            ``predict_crf`` over ``predict_no_crf``.
        token_emb_mat: forwarded to ``_add_word_embeddings``.
        char_emb_mat: forwarded (after resizing) to ``_add_char_embeddings``.
        use_batch_norm: forwarded to ``_build_cnn``.
        dropout_keep_prob: forwarded to ``_add_training_placeholders``.
        embeddings_dropout: apply variational dropout to the concatenated
            input features.
        top_dropout: forwarded to ``_build_top``.
        intra_layer_dropout: forwarded to the RNN builders.
        l2_reg: forwarded to ``_build_train_predict``; a warning is logged
            when it is positive together with ``use_cudnn_rnn``.
        gpu: when not None, becomes the session's visible device list.
        seed: seeds both TensorFlow and NumPy random generators.
        **kwargs: forwarded to the parent constructor after defaults for
            ``learning_rate_drop_div``, ``learning_rate_drop_patience`` and
            ``clip_norm`` are filled in.

    Raises:
        AssertionError: if ``n_tags`` is 0.
        ValueError: if ``net_type`` is neither ``'rnn'`` nor ``'cnn'``.
    """
    tf.set_random_seed(seed)
    np.random.seed(seed)

    # np.resize(None, ...) returns a non-None object array, which would make
    # the "char_emb_mat is not None" guard below always true, so only resize
    # a matrix that was actually supplied.
    # NOTE(review): the (162, 100) target shape is hard-coded - confirm it
    # matches the char vocabulary / embedding size actually in use.
    if char_emb_mat is not None:
        char_emb_mat = np.resize(char_emb_mat, (162, 100))

    assert n_tags != 0, 'Number of classes equal 0! It seems that vocabularies is not loaded.' \
                        ' Check that all vocabulary files are downloaded!'

    # Fail before any graph node is created instead of hitting an unbound
    # `units` variable further down.
    if net_type not in ('rnn', 'cnn'):
        raise ValueError('Wrong net type "{}"! Only "rnn" and "cnn"!'.format(net_type))

    # Defaults for options consumed by the parent constructor.
    kwargs.setdefault('learning_rate_drop_div', 10.0)
    kwargs.setdefault('learning_rate_drop_patience', 5.0)
    kwargs.setdefault('clip_norm', 5.0)
    super().__init__(**kwargs)

    self._add_training_placeholders(dropout_keep_prob)
    self._xs_ph_list = []
    self._y_ph = tf.placeholder(tf.int32, [None, None], name='y_ph')
    self._input_features = []

    # ================ Building input features =================

    # Token embeddings
    self._add_word_embeddings(token_emb_mat, token_emb_dim)

    # Masks for different lengths utterances
    self.mask_ph = self._add_mask()

    # Char embeddings using highway CNN with max pooling
    if char_emb_mat is not None and char_emb_dim is not None:
        self._add_char_embeddings(char_emb_mat)

    # Capitalization features
    if capitalization_dim is not None:
        self._add_capitalization(capitalization_dim)

    # Part of speech features
    if pos_features_dim is not None:
        self._add_pos(pos_features_dim)

    # Anything you want
    if additional_features is not None:
        self._add_additional_features(additional_features)

    features = tf.concat(self._input_features, axis=2)
    if embeddings_dropout:
        features = variational_dropout(features, self._dropout_ph)

    # ================== Building the network ==================

    if net_type == 'rnn':
        if use_cudnn_rnn:
            if l2_reg > 0:
                log.warning('cuDNN RNN are not l2 regularizable')
            units = self._build_cudnn_rnn(features, n_hidden_list, cell_type,
                                          intra_layer_dropout, self.mask_ph)
        else:
            units = self._build_rnn(features, n_hidden_list, cell_type,
                                    intra_layer_dropout, self.mask_ph)
    else:  # net_type == 'cnn', validated above
        units = self._build_cnn(features, n_hidden_list, cnn_filter_width,
                                use_batch_norm)

    self._logits = self._build_top(units, n_tags, n_hidden_list[-1],
                                   top_dropout, two_dense_on_top)

    self.train_op, self.loss = self._build_train_predict(
        self._logits, self.mask_ph, n_tags, use_crf, l2_reg)
    self.predict = self.predict_crf if use_crf else self.predict_no_crf

    # ================= Initialize the session =================

    sess_config = tf.ConfigProto(allow_soft_placement=True)
    sess_config.gpu_options.allow_growth = True
    if gpu is not None:
        sess_config.gpu_options.visible_device_list = str(gpu)
    self.sess = tf.Session(config=sess_config)
    self.sess.run(tf.global_variables_initializer())
    self.load()
def __init__(self,
             n_tags: int,
             # Features dimensions
             token_emb_dim: int = None,
             char_emb_dim: int = None,
             capitalization_dim: int = None,
             pos_features_dim: int = None,
             additional_features: int = None,
             # Net architecture
             net_type: str = 'rnn',
             cell_type: str = 'lstm',
             use_cudnn_rnn: bool = False,
             two_dense_on_top: bool = False,
             n_hidden_list: Tuple[int] = (128,),
             cnn_filter_width: int = 7,
             use_crf: bool = False,
             token_emb_mat: np.ndarray = None,
             char_emb_mat: np.ndarray = None,
             use_batch_norm: bool = False,
             # Regularization
             dropout_keep_prob: float = 0.5,
             embeddings_dropout: bool = False,
             top_dropout: bool = False,
             intra_layer_dropout: bool = False,
             l2_reg: float = 0.0,
             clip_grad_norm: float = 5.0,
             learning_rate: float = 3e-3,
             gpu: int = None,
             seed: int = None,
             lr_drop_patience: int = 5,
             lr_drop_value: float = 0.1,
             **kwargs) -> None:
    """Build the tagging graph, create the session and load saved weights.

    Args:
        n_tags: number of output tags; forwarded to ``_build_top`` and
            ``_build_train_predict``.
        token_emb_dim: forwarded to ``_add_word_embeddings``.
        char_emb_dim: char embeddings are added only when both this and
            ``char_emb_mat`` are not None.
        capitalization_dim: when not None, ``_add_capitalization`` is called.
        pos_features_dim: when not None, ``_add_pos`` is called.
        additional_features: when not None, ``_add_additional_features``
            is called.
        net_type: ``'rnn'`` or ``'cnn'``; selects the body builder.
        cell_type: forwarded to the RNN builders.
        use_cudnn_rnn: use ``_build_cudnn_rnn`` instead of ``_build_rnn``.
        two_dense_on_top: forwarded to ``_build_top``.
        n_hidden_list: forwarded to the body builder; its last item is
            also forwarded to ``_build_top``.
        cnn_filter_width: forwarded to ``_build_cnn``.
        use_crf: forwarded to ``_build_train_predict`` and selects
            ``predict_crf`` over ``predict_no_crf``.
        token_emb_mat: forwarded to ``_add_word_embeddings``.
        char_emb_mat: forwarded to ``_add_char_embeddings``.
        use_batch_norm: forwarded to ``_build_cnn``.
        dropout_keep_prob: forwarded to ``_add_training_placeholders``.
        embeddings_dropout: apply variational dropout to the concatenated
            input features.
        top_dropout: forwarded to ``_build_top``.
        intra_layer_dropout: forwarded to the RNN builders.
        l2_reg: forwarded to ``_build_train_predict``; a warning is issued
            when it is positive together with ``use_cudnn_rnn``.
        clip_grad_norm: forwarded to ``_build_train_predict``.
        learning_rate: stored on the instance and forwarded to
            ``_add_training_placeholders``.
        gpu: when not None, becomes the session's visible device list.
        seed: seeds both TensorFlow and NumPy random generators.
        lr_drop_patience: stored as ``self._lr_drop_patience``.
        lr_drop_value: stored as ``self._lr_drop_value``.
        **kwargs: forwarded to the parent constructor.

    Raises:
        ValueError: if ``net_type`` is neither ``'rnn'`` nor ``'cnn'``.
    """
    import warnings

    tf.set_random_seed(seed)
    np.random.seed(seed)

    # Fail before any graph node is created instead of hitting an unbound
    # `units` variable further down.
    if net_type not in ('rnn', 'cnn'):
        raise ValueError('Wrong net type "{}"! Only "rnn" and "cnn"!'.format(net_type))

    self._learning_rate = learning_rate
    self._lr_drop_patience = lr_drop_patience
    self._lr_drop_value = lr_drop_value
    self._add_training_placeholders(dropout_keep_prob, learning_rate)
    self._xs_ph_list = []
    self._y_ph = tf.placeholder(tf.int32, [None, None], name='y_ph')
    self._input_features = []

    # ================ Building input features =================

    # Token embeddings
    self._add_word_embeddings(token_emb_mat, token_emb_dim)

    # Masks for different lengths utterances
    self.mask_ph = self._add_mask()

    # Char embeddings using highway CNN with max pooling
    if char_emb_mat is not None and char_emb_dim is not None:
        self._add_char_embeddings(char_emb_mat)

    # Capitalization features
    if capitalization_dim is not None:
        self._add_capitalization(capitalization_dim)

    # Part of speech features
    if pos_features_dim is not None:
        self._add_pos(pos_features_dim)

    # Anything you want
    if additional_features is not None:
        self._add_additional_features(additional_features)

    features = tf.concat(self._input_features, axis=2)
    if embeddings_dropout:
        features = variational_dropout(features, self._dropout_ph)

    # ================== Building the network ==================

    if net_type == 'rnn':
        if use_cudnn_rnn:
            if l2_reg > 0:
                # Advisory only: the model is still usable, the l2 term just
                # does not cover the cuDNN weights. Raising here would abort
                # construction.
                warnings.warn('cuDNN RNN are not l2 regularizable')
            units = self._build_cudnn_rnn(features, n_hidden_list, cell_type,
                                          intra_layer_dropout, self.mask_ph)
        else:
            units = self._build_rnn(features, n_hidden_list, cell_type,
                                    intra_layer_dropout, self.mask_ph)
    else:  # net_type == 'cnn', validated above
        units = self._build_cnn(features, n_hidden_list, cnn_filter_width,
                                use_batch_norm)

    self._logits = self._build_top(units, n_tags, n_hidden_list[-1],
                                   top_dropout, two_dense_on_top)

    self.train_op, self.loss = self._build_train_predict(
        self._logits, self.mask_ph, n_tags, use_crf, clip_grad_norm, l2_reg)
    self.predict = self.predict_crf if use_crf else self.predict_no_crf

    # ================= Initialize the session =================

    sess_config = tf.ConfigProto(allow_soft_placement=True)
    sess_config.gpu_options.allow_growth = True
    if gpu is not None:
        sess_config.gpu_options.visible_device_list = str(gpu)
    # Pass the config so soft placement, memory growth and the `gpu`
    # selection actually take effect.
    self.sess = tf.Session(config=sess_config)
    self.sess.run(tf.global_variables_initializer())
    super().__init__(**kwargs)
    self.load()