Ejemplo n.º 1
0
 def _build_rnn(self, units, n_hidden_list, cell_type, intra_layer_dropout):
     """Stack one ``bi_rnn`` layer per entry of ``n_hidden_list``.

     For every layer the first value returned by ``bi_rnn`` is concatenated
     along the last axis and becomes the input of the next layer. When
     ``intra_layer_dropout`` is truthy, variational dropout is applied after
     every layer except the last one.
     """
     for layer_idx, layer_size in enumerate(n_hidden_list):
         layer_name = 'Layer_' + str(layer_idx)
         layer_outputs, _ = bi_rnn(units, layer_size, cell_type=cell_type, name=layer_name)
         units = tf.concat(layer_outputs, -1)
         # No dropout when it is disabled or after the final layer.
         if not intra_layer_dropout or layer_idx == len(n_hidden_list) - 1:
             continue
         units = variational_dropout(units, self._dropout_ph)
     return units
Ejemplo n.º 2
0
 def _build_rnn(self, units, n_hidden_list, cell_type, intra_layer_dropout):
     """Build a stack of ``bi_rnn`` layers with the sizes given in ``n_hidden_list``.

     The first value returned by ``bi_rnn`` is concatenated along the last
     axis after each layer. Variational dropout is inserted between layers
     (never after the last one) when ``intra_layer_dropout`` is truthy.
     """
     for depth, hidden_size in enumerate(n_hidden_list):
         rnn_outputs, _ = bi_rnn(units, hidden_size,
                                 cell_type=cell_type,
                                 name='Layer_' + str(depth))
         units = tf.concat(rnn_outputs, -1)
         if not intra_layer_dropout or depth == len(n_hidden_list) - 1:
             # Dropout is disabled, or this is the last layer.
             continue
         units = variational_dropout(units, self._dropout_ph)
     return units
Ejemplo n.º 3
0
 def _build_rnn(self, units, n_hidden_list, cell_type, intra_layer_dropout, mask):
     """Stack one length-aware ``bi_rnn`` layer per entry of ``n_hidden_list``.

     Sequence lengths are obtained by summing ``mask`` along axis 1 and
     converting the result to int32; they are passed to every layer. The
     first value returned by ``bi_rnn`` is concatenated along the last axis
     after each layer. Variational dropout is applied after every layer but
     the last one when ``intra_layer_dropout`` is truthy.
     """
     lengths = tf.to_int32(tf.reduce_sum(mask, axis=1))
     for layer_idx, layer_size in enumerate(n_hidden_list):
         layer_name = 'Layer_' + str(layer_idx)
         layer_outputs, _ = bi_rnn(units, layer_size, cell_type=cell_type,
                                   seq_lengths=lengths, name=layer_name)
         units = tf.concat(layer_outputs, -1)
         # No dropout when it is disabled or after the final layer.
         if not intra_layer_dropout or layer_idx == len(n_hidden_list) - 1:
             continue
         units = variational_dropout(units, self._dropout_ph)
     return units
Ejemplo n.º 4
0
 def _build_rnn(self, units, n_hidden_list, cell_type, intra_layer_dropout, mask):
     """Build a stack of ``bi_rnn`` layers that receive per-sample sequence lengths.

     ``mask`` is summed along axis 1 and converted to int32 to get the
     lengths. After each layer the first value returned by ``bi_rnn`` is
     concatenated along the last axis; variational dropout follows every
     layer except the last when ``intra_layer_dropout`` is truthy.
     """
     seq_lens = tf.to_int32(tf.reduce_sum(mask, axis=1))
     for depth, hidden_size in enumerate(n_hidden_list):
         rnn_outputs, _ = bi_rnn(units, hidden_size,
                                 cell_type=cell_type,
                                 seq_lengths=seq_lens,
                                 name='Layer_' + str(depth))
         units = tf.concat(rnn_outputs, -1)
         if not intra_layer_dropout or depth == len(n_hidden_list) - 1:
             # Dropout is disabled, or this is the last layer.
             continue
         units = variational_dropout(units, self._dropout_ph)
     return units
Ejemplo n.º 5
0
 def _build_top(self, units, n_tags, n_hididden, top_dropout, two_dense_on_top):
     """Build the dense layer(s) that turn ``units`` into per-tag logits.

     Optionally applies variational dropout first (``top_dropout``) and an
     extra ReLU dense layer of ``n_hididden`` units (``two_dense_on_top``).
     The final dense layer has ``n_tags`` units and no activation. Both dense
     layers use ``INITIALIZER()`` for the kernel and ``tf.nn.l2_loss`` as the
     kernel regularizer.
     """
     hidden = variational_dropout(units, self._dropout_ph) if top_dropout else units
     if two_dense_on_top:
         hidden = tf.layers.dense(hidden, n_hididden,
                                  activation=tf.nn.relu,
                                  kernel_initializer=INITIALIZER(),
                                  kernel_regularizer=tf.nn.l2_loss)
     return tf.layers.dense(hidden, n_tags,
                            activation=None,
                            kernel_initializer=INITIALIZER(),
                            kernel_regularizer=tf.nn.l2_loss)
Ejemplo n.º 6
0
 def _build_top(self, units, n_tags, n_hididden, top_dropout, two_dense_on_top):
     """Project ``units`` to ``n_tags`` logits.

     When ``top_dropout`` is truthy the input first goes through variational
     dropout. When ``two_dense_on_top`` is truthy an intermediate ReLU dense
     layer with ``n_hididden`` units is inserted. The last dense layer is
     linear. Kernels are initialized with ``INITIALIZER()`` and regularized
     with ``tf.nn.l2_loss``.
     """
     top_input = units
     if top_dropout:
         top_input = variational_dropout(top_input, self._dropout_ph)
     if two_dense_on_top:
         top_input = tf.layers.dense(
             top_input, n_hididden, activation=tf.nn.relu,
             kernel_initializer=INITIALIZER(), kernel_regularizer=tf.nn.l2_loss)
     tag_logits = tf.layers.dense(
         top_input, n_tags, activation=None,
         kernel_initializer=INITIALIZER(), kernel_regularizer=tf.nn.l2_loss)
     return tag_logits
Ejemplo n.º 7
0
 def _build_cudnn_rnn(self, units, n_hidden_list, cell_type, intra_layer_dropout, mask):
     """Stack cuDNN bidirectional RNN layers, one per entry of ``n_hidden_list``.

     Args:
         units: input tensor fed to the first layer.
         n_hidden_list: hidden sizes, one per layer.
         cell_type: ``'lstm'`` or ``'gru'`` (case-insensitive).
         intra_layer_dropout: if truthy, variational dropout is applied after
             every layer except the last one.
         mask: summed along axis 1 and converted to int32 to obtain the
             sequence lengths passed to every layer.

     Returns:
         The output of the last layer, or ``units`` unchanged when
         ``n_hidden_list`` is empty.

     Raises:
         RuntimeError: if ``cell_type`` is neither ``'lstm'`` nor ``'gru'``.
     """
     sequence_lengths = tf.to_int32(tf.reduce_sum(mask, axis=1))
     for n, n_hidden in enumerate(n_hidden_list):
         # Each layer gets its own variable scope, e.g. "LSTM_0", "LSTM_1", ...
         with tf.variable_scope(cell_type.upper() + '_' + str(n)):
             if cell_type.lower() == 'lstm':
                 units, _ = cudnn_bi_lstm(units, n_hidden, sequence_lengths)
             elif cell_type.lower() == 'gru':
                 units, _ = cudnn_bi_gru(units, n_hidden, sequence_lengths)
             else:
                 raise RuntimeError('Wrong cell type "{}"! Only "gru" and "lstm"!'.format(cell_type))
             units = tf.concat(units, -1)
             if intra_layer_dropout and n != len(n_hidden_list) - 1:
                 units = variational_dropout(units, self._dropout_ph)
     # The return must follow the loop: returning from inside it would stop
     # after the first layer and ignore the rest of n_hidden_list.
     return units
Ejemplo n.º 8
0
 def _build_cudnn_rnn(self, units, n_hidden_list, cell_type, intra_layer_dropout, mask):
     """Build a stack of cuDNN bidirectional LSTM or GRU layers.

     Args:
         units: input tensor of the first layer.
         n_hidden_list: one hidden size per layer to build.
         cell_type: ``'lstm'`` or ``'gru'``; compared case-insensitively.
         intra_layer_dropout: when truthy, variational dropout is inserted
             between layers (never after the last one).
         mask: its sum along axis 1, converted to int32, is used as the
             sequence lengths for every layer.

     Returns:
         The output of the final layer; ``units`` itself when
         ``n_hidden_list`` is empty.

     Raises:
         RuntimeError: if ``cell_type`` is neither ``'lstm'`` nor ``'gru'``.
     """
     sequence_lengths = tf.to_int32(tf.reduce_sum(mask, axis=1))
     for n, n_hidden in enumerate(n_hidden_list):
         # One variable scope per layer: "<CELL_TYPE>_<layer index>".
         with tf.variable_scope(cell_type.upper() + '_' + str(n)):
             if cell_type.lower() == 'lstm':
                 units, _ = cudnn_bi_lstm(units, n_hidden, sequence_lengths)
             elif cell_type.lower() == 'gru':
                 units, _ = cudnn_bi_gru(units, n_hidden, sequence_lengths)
             else:
                 raise RuntimeError('Wrong cell type "{}"! Only "gru" and "lstm"!'.format(cell_type))
             units = tf.concat(units, -1)
             if intra_layer_dropout and n != len(n_hidden_list) - 1:
                 units = variational_dropout(units, self._dropout_ph)
     # Return once all layers are built; a return inside the loop body would
     # exit after the first iteration and silently drop the remaining layers.
     return units
Ejemplo n.º 9
0
    def _build_body(self) -> Tuple[tf.Tensor, tf.Tensor]:
        """Build the dense -> (attention) -> LSTM -> dense body of the network.

        Returns:
            A pair ``(logits, state)``: the output of the final dense layer
            named ``'logits'`` (``action_size`` units) and the final state
            returned by ``tf.nn.dynamic_rnn``.
        """
        # Input projection to `dense_size` units.
        projected = tf.layers.dense(self._features,
                                    self.dense_size,
                                    kernel_regularizer=tf.nn.l2_loss,
                                    kernel_initializer=xav())

        # When attention is configured, append its output along the last axis.
        if self.attention_params:
            projected = tf.concat([projected, self._build_attn_body()], -1)

        rnn_input = tf_layers.variational_dropout(
            projected, keep_prob=self._dropout_keep_prob)

        # Recurrent unit; lengths are the sum of the utterance mask over the
        # last axis, cast to int32.
        cell = tf.nn.rnn_cell.LSTMCell(self.hidden_size)
        utter_lengths = tf.cast(
            tf.reduce_sum(self._utterance_mask, axis=-1), tf.int32)

        # rnn_output: [batch_size, max_time, hidden_size]
        # final_state: tuple of two [batch_size, hidden_size]
        rnn_output, final_state = tf.nn.dynamic_rnn(
            cell,
            rnn_input,
            time_major=False,
            initial_state=self._initial_state,
            sequence_length=utter_lengths)

        output_shape = (self._batch_size, -1, self.hidden_size)
        rnn_output = tf.reshape(rnn_output, output_shape)
        rnn_output = tf_layers.variational_dropout(
            rnn_output, keep_prob=self._dropout_keep_prob)

        # Output projection to `action_size` units.
        logits = tf.layers.dense(rnn_output,
                                 self.action_size,
                                 kernel_regularizer=tf.nn.l2_loss,
                                 kernel_initializer=xav(),
                                 name='logits')
        return logits, final_state
Ejemplo n.º 10
0
    def __init__(self, n_classes: int = 2,
                 dropout_keep_prob: float = 0.5,
                 return_probas: bool = False, **kwargs):
        """
        Build the graph, create the session, initialize the variables and call ``self.load()``.

        Args:
            n_classes: number of classes for classification
            dropout_keep_prob: Probability of keeping the hidden state, values from 0 to 1. 0.5 works well
                in most cases.
            return_probas: whether to return confidences of the relation to be appropriate or not
            **kwargs: forwarded to the parent constructor; ``learning_rate_drop_div``,
                ``learning_rate_drop_patience`` and ``clip_norm`` receive defaults here when absent
        """
        kwargs.setdefault('learning_rate_drop_div', 10.0)
        kwargs.setdefault('learning_rate_drop_patience', 5.0)
        kwargs.setdefault('clip_norm', 5.0)

        super().__init__(**kwargs)

        self.n_classes = n_classes
        self.dropout_keep_prob = dropout_keep_prob
        self.return_probas = return_probas
        config = tf.ConfigProto()
        config.gpu_options.allow_growth = True

        # Pick the GRU implementation depending on whether a GPU is reported.
        if check_gpu_existence():
            self.GRU = CudnnGRU
        else:
            self.GRU = CudnnCompatibleGRU

        self.question_ph = tf.placeholder(tf.float32, [None, None, 300])
        self.rel_emb_ph = tf.placeholder(tf.float32, [None, None, 300])

        # A position counts as present when its vector has at least one
        # non-zero component; r_len is the number of such positions per sample.
        r_mask_2 = tf.cast(self.rel_emb_ph, tf.bool)
        r_len_2 = tf.reduce_sum(tf.cast(r_mask_2, tf.int32), axis=2)
        r_mask = tf.cast(r_len_2, tf.bool)
        r_len = tf.reduce_sum(tf.cast(r_mask, tf.int32), axis=1)
        # Sum over axis 1 divided by r_len; divide_no_nan yields 0 when r_len is 0.
        rel_emb = tf.math.divide_no_nan(tf.reduce_sum(self.rel_emb_ph, axis=1),
                                        tf.cast(tf.expand_dims(r_len, axis=1), tf.float32))

        self.y_ph = tf.placeholder(tf.int32, shape=(None,))
        self.one_hot_labels = tf.one_hot(self.y_ph, depth=self.n_classes, dtype=tf.float32)
        self.keep_prob_ph = tf.placeholder_with_default(1.0, shape=[], name='keep_prob_ph')

        # Same presence mask and length computation for the question.
        q_mask_2 = tf.cast(self.question_ph, tf.bool)
        q_len_2 = tf.reduce_sum(tf.cast(q_mask_2, tf.int32), axis=2)
        q_mask = tf.cast(q_len_2, tf.bool)
        q_len = tf.reduce_sum(tf.cast(q_mask, tf.int32), axis=1)

        question_dr = variational_dropout(self.question_ph, keep_prob=self.keep_prob_ph)
        b_size = tf.shape(self.question_ph)[0]

        with tf.variable_scope("question_encode"):
            rnn = self.GRU(num_layers=2, num_units=75, batch_size=b_size, input_size=300, keep_prob=self.keep_prob_ph)
            q = rnn(question_dr, seq_len=q_len)

        with tf.variable_scope("attention"):
            rel_emb_exp = tf.expand_dims(rel_emb, axis=1)
            # Dot product of every question position with the relation embedding.
            # The deprecated `keep_dims=False` argument is omitted: it is the default.
            dot_products = tf.reduce_sum(tf.multiply(q, rel_emb_exp), axis=2)
            s_mask = softmax_mask(dot_products, q_mask)
            att_weights = tf.expand_dims(tf.nn.softmax(s_mask), axis=2)
            self.s_r = tf.reduce_sum(tf.multiply(att_weights, q), axis=1)

            # The number of output units must equal the one-hot depth of the
            # labels, otherwise the loss below fails for n_classes != 2.
            self.logits = tf.layers.dense(tf.multiply(self.s_r, rel_emb), self.n_classes,
                                          activation=None, use_bias=False)
            self.y_pred = tf.argmax(self.logits, axis=-1)

            loss_tensor = tf.nn.sigmoid_cross_entropy_with_logits(labels=self.one_hot_labels, logits=self.logits)

            self.loss = tf.reduce_mean(loss_tensor)
            # Kept inside the "attention" scope so that the names of any
            # variables created by get_train_op stay unchanged.
            self.train_op = self.get_train_op(self.loss)

        self.sess = tf.Session(config=config)
        self.sess.run(tf.global_variables_initializer())
        self.load()
Ejemplo n.º 11
0
    def _init_graph(self):
        """Build the TF graph: embeddings, encoder, attention, pointer, prediction and loss.

        The graph is built in the variable scopes ``emb``, ``encoding``,
        ``attention``, ``match``, ``pointer`` and ``predict``. Sets, among
        others, ``self.yp1``, ``self.yp2`` and ``self.loss``. When
        ``self.weight_decay < 1.0`` an exponential moving average of the
        trainable variables is maintained and ``self.assign_vars`` holds ops
        that copy the averaged values back into the variables.
        """
        self._init_placeholders()

        self.word_emb = tf.get_variable("word_emb",
                                        initializer=tf.constant(
                                            self.init_word_emb,
                                            dtype=tf.float32),
                                        trainable=False)
        self.char_emb = tf.get_variable("char_emb",
                                        initializer=tf.constant(
                                            self.init_char_emb,
                                            dtype=tf.float32),
                                        trainable=self.opt['train_char_emb'])

        # Non-zero ids count as real tokens; lengths are the number of them.
        self.c_mask = tf.cast(self.c_ph, tf.bool)
        self.q_mask = tf.cast(self.q_ph, tf.bool)
        self.c_len = tf.reduce_sum(tf.cast(self.c_mask, tf.int32), axis=1)
        self.q_len = tf.reduce_sum(tf.cast(self.q_mask, tf.int32), axis=1)

        # Trim every input to the longest sequence present in the batch.
        bs = tf.shape(self.c_ph)[0]
        self.c_maxlen = tf.reduce_max(self.c_len)
        self.q_maxlen = tf.reduce_max(self.q_len)
        self.c = tf.slice(self.c_ph, [0, 0], [bs, self.c_maxlen])
        self.q = tf.slice(self.q_ph, [0, 0], [bs, self.q_maxlen])
        self.c_mask = tf.slice(self.c_mask, [0, 0], [bs, self.c_maxlen])
        self.q_mask = tf.slice(self.q_mask, [0, 0], [bs, self.q_maxlen])
        self.cc = tf.slice(self.cc_ph, [0, 0, 0],
                           [bs, self.c_maxlen, self.char_limit])
        self.qc = tf.slice(self.qc_ph, [0, 0, 0],
                           [bs, self.q_maxlen, self.char_limit])
        # Number of non-zero char ids per token, flattened over batch and position.
        self.cc_len = tf.reshape(
            tf.reduce_sum(tf.cast(tf.cast(self.cc, tf.bool), tf.int32),
                          axis=2), [-1])
        self.qc_len = tf.reshape(
            tf.reduce_sum(tf.cast(tf.cast(self.qc, tf.bool), tf.int32),
                          axis=2), [-1])
        # One-hot targets, trimmed to the same length as the context.
        self.y1 = tf.one_hot(self.y1_ph, depth=self.context_limit)
        self.y2 = tf.one_hot(self.y2_ph, depth=self.context_limit)
        self.y1 = tf.slice(self.y1, [0, 0], [bs, self.c_maxlen])
        self.y2 = tf.slice(self.y2, [0, 0], [bs, self.c_maxlen])

        with tf.variable_scope("emb"):
            with tf.variable_scope("char"):
                # Every token becomes its own sequence of char embeddings.
                cc_emb = tf.reshape(
                    tf.nn.embedding_lookup(self.char_emb, self.cc),
                    [bs * self.c_maxlen, self.char_limit, self.char_emb_dim])
                qc_emb = tf.reshape(
                    tf.nn.embedding_lookup(self.char_emb, self.qc),
                    [bs * self.q_maxlen, self.char_limit, self.char_emb_dim])

                cc_emb = variational_dropout(cc_emb,
                                             keep_prob=self.keep_prob_ph)
                qc_emb = variational_dropout(qc_emb,
                                             keep_prob=self.keep_prob_ph)

                # The final forward/backward states form the char-level token vector.
                _, (state_fw,
                    state_bw) = cudnn_bi_gru(cc_emb,
                                             self.char_hidden_size,
                                             seq_lengths=self.cc_len,
                                             trainable_initial_states=True)
                cc_emb = tf.concat([state_fw, state_bw], axis=1)

                # Second call passes reuse=True (presumably to share the
                # weights created by the first call - confirm in cudnn_bi_gru).
                _, (state_fw,
                    state_bw) = cudnn_bi_gru(qc_emb,
                                             self.char_hidden_size,
                                             seq_lengths=self.qc_len,
                                             trainable_initial_states=True,
                                             reuse=True)
                qc_emb = tf.concat([state_fw, state_bw], axis=1)

                cc_emb = tf.reshape(
                    cc_emb, [bs, self.c_maxlen, 2 * self.char_hidden_size])
                qc_emb = tf.reshape(
                    qc_emb, [bs, self.q_maxlen, 2 * self.char_hidden_size])

            with tf.name_scope("word"):
                c_emb = tf.nn.embedding_lookup(self.word_emb, self.c)
                q_emb = tf.nn.embedding_lookup(self.word_emb, self.q)

            # Word-level and char-level vectors are concatenated per token.
            c_emb = tf.concat([c_emb, cc_emb], axis=2)
            q_emb = tf.concat([q_emb, qc_emb], axis=2)

        with tf.variable_scope("encoding"):
            # The same rnn object encodes both the context and the question.
            rnn = CudnnGRU(num_layers=3,
                           num_units=self.hidden_size,
                           batch_size=bs,
                           input_size=c_emb.get_shape().as_list()[-1],
                           keep_prob=self.keep_prob_ph)
            c = rnn(c_emb, seq_len=self.c_len)
            q = rnn(q_emb, seq_len=self.q_len)

        with tf.variable_scope("attention"):
            qc_att = dot_attention(c,
                                   q,
                                   mask=self.q_mask,
                                   att_size=self.attention_hidden_size,
                                   keep_prob=self.keep_prob_ph)
            rnn = CudnnGRU(num_layers=1,
                           num_units=self.hidden_size,
                           batch_size=bs,
                           input_size=qc_att.get_shape().as_list()[-1],
                           keep_prob=self.keep_prob_ph)
            att = rnn(qc_att, seq_len=self.c_len)

        with tf.variable_scope("match"):
            self_att = dot_attention(att,
                                     att,
                                     mask=self.c_mask,
                                     att_size=self.attention_hidden_size,
                                     keep_prob=self.keep_prob_ph)
            rnn = CudnnGRU(num_layers=1,
                           num_units=self.hidden_size,
                           batch_size=bs,
                           input_size=self_att.get_shape().as_list()[-1],
                           keep_prob=self.keep_prob_ph)
            match = rnn(self_att, seq_len=self.c_len)

        with tf.variable_scope("pointer"):
            init = simple_attention(q,
                                    self.hidden_size,
                                    mask=self.q_mask,
                                    keep_prob=self.keep_prob_ph)
            pointer = PtrNet(cell_size=init.get_shape().as_list()[-1],
                             keep_prob=self.keep_prob_ph)
            logits1, logits2 = pointer(init, match, self.hidden_size,
                                       self.c_mask)

        with tf.variable_scope("predict"):
            # outer[b, i, j] = softmax(logits1)[b, i] * softmax(logits2)[b, j]
            outer = tf.matmul(tf.expand_dims(tf.nn.softmax(logits1), axis=2),
                              tf.expand_dims(tf.nn.softmax(logits2), axis=1))
            # Keep only pairs with 0 <= j - i <= min(15, c_maxlen).
            outer = tf.matrix_band_part(
                outer, 0, tf.cast(tf.minimum(15, self.c_maxlen), tf.int64))
            self.yp1 = tf.argmax(tf.reduce_max(outer, axis=2), axis=1)
            self.yp2 = tf.argmax(tf.reduce_max(outer, axis=1), axis=1)
            loss_1 = tf.nn.softmax_cross_entropy_with_logits(logits=logits1,
                                                             labels=self.y1)
            loss_2 = tf.nn.softmax_cross_entropy_with_logits(logits=logits2,
                                                             labels=self.y2)
            self.loss = tf.reduce_mean(loss_1 + loss_2)

        if self.weight_decay < 1.0:
            self.var_ema = tf.train.ExponentialMovingAverage(self.weight_decay)
            ema_op = self.var_ema.apply(tf.trainable_variables())
            with tf.control_dependencies([ema_op]):
                # Evaluating the loss now also runs the moving-average update.
                self.loss = tf.identity(self.loss)

                self.shadow_vars = []
                self.global_vars = []
                for var in tf.global_variables():
                    v = self.var_ema.average(var)
                    # average() returns None for variables that are not
                    # tracked; test for that explicitly rather than relying
                    # on the truthiness of a variable object.
                    if v is not None:
                        self.shadow_vars.append(v)
                        self.global_vars.append(var)
                # Ops that overwrite each variable with its averaged value.
                self.assign_vars = []
                for g, v in zip(self.global_vars, self.shadow_vars):
                    self.assign_vars.append(tf.assign(g, v))
Ejemplo n.º 12
0
    def _build_body(self):
        """Build the dense -> (attention) -> LSTM -> dense body of the network.

        Returns:
            A pair ``(logits, state)``: the output of the dense layer named
            ``'logits'`` (``action_size`` units) and the final state returned
            by ``tf.nn.dynamic_rnn``.

        Raises:
            ValueError: if ``self.attn`` is set and ``self.attn.type`` is not
                one of the supported attention mechanism types.
        """
        # Input projection to `dense_size` units.
        projected = tf.layers.dense(self._features, self.dense_size,
                                    kernel_regularizer=tf.nn.l2_loss,
                                    kernel_initializer=xav())
        if self.attn:
            # attention type -> (function name in `am`, whether it takes `depth`)
            attn_builders = {
                'general': ('general_attention', False),
                'bahdanau': ('bahdanau_attention', False),
                'cs_general': ('cs_general_attention', True),
                'cs_bahdanau': ('cs_bahdanau_attention', True),
                'light_general': ('light_general_attention', False),
                'light_bahdanau': ('light_bahdanau_attention', False),
            }
            attn_type = self.attn.type
            with tf.variable_scope("attention_mechanism/{}".format(attn_type)):
                if attn_type not in attn_builders:
                    raise ValueError("wrong value for attention mechanism type")
                builder_name, takes_depth = attn_builders[attn_type]
                extra_kwargs = {'depth': self.attn.depth} if takes_depth else {}
                attn_output = getattr(am, builder_name)(
                    self._key,
                    self._emb_context,
                    hidden_size=self.attn.hidden_size,
                    projected_align=self.attn.projected_align,
                    **extra_kwargs)
            # Append the attention output along the last axis.
            projected = tf.concat([projected, attn_output], -1)

        rnn_input = tf_layers.variational_dropout(
            projected, keep_prob=self._dropout_keep_prob)

        # Recurrent unit; lengths are the utterance mask summed over the last
        # axis and converted to int32.
        cell = tf.nn.rnn_cell.LSTMCell(self.hidden_size)
        utter_lengths = tf.to_int32(
            tf.reduce_sum(self._utterance_mask, axis=-1))
        rnn_output, final_state = tf.nn.dynamic_rnn(
            cell,
            rnn_input,
            initial_state=self._initial_state,
            sequence_length=utter_lengths)

        # Output projection to `action_size` units.
        logits = tf.layers.dense(rnn_output, self.action_size,
                                 kernel_regularizer=tf.nn.l2_loss,
                                 kernel_initializer=xav(), name='logits')
        return logits, final_state
Ejemplo n.º 13
0
    def _init_graph(self):
        """Build the TF graph: embeddings, encoder, attention, pointer, prediction and loss.

        The graph is built in the variable scopes ``emb``, ``encoding``,
        ``attention``, ``match``, ``pointer`` and ``predict``. Sets, among
        others, ``self.yp1``, ``self.yp2``, ``self.yp_logits`` and
        ``self.loss``. When ``self.noans_token`` is truthy, an extra learned
        'no answer' vector is prepended to the matched context representation,
        the targets and the context mask get one more leading position, and
        ``self.yp_score`` is defined as well.
        """
        self._init_placeholders()

        self.word_emb = tf.get_variable("word_emb",
                                        initializer=tf.constant(
                                            self.init_word_emb,
                                            dtype=tf.float32),
                                        trainable=False)
        self.char_emb = tf.get_variable("char_emb",
                                        initializer=tf.constant(
                                            self.init_char_emb,
                                            dtype=tf.float32),
                                        trainable=self.train_char_emb)

        # Non-zero ids count as real tokens; lengths are the number of them.
        self.c_mask = tf.cast(self.c_ph, tf.bool)
        self.q_mask = tf.cast(self.q_ph, tf.bool)
        self.c_len = tf.reduce_sum(tf.cast(self.c_mask, tf.int32), axis=1)
        self.q_len = tf.reduce_sum(tf.cast(self.q_mask, tf.int32), axis=1)

        # Trim every input to the longest sequence present in the batch.
        bs = tf.shape(self.c_ph)[0]
        self.c_maxlen = tf.reduce_max(self.c_len)
        self.q_maxlen = tf.reduce_max(self.q_len)
        self.c = tf.slice(self.c_ph, [0, 0], [bs, self.c_maxlen])
        self.q = tf.slice(self.q_ph, [0, 0], [bs, self.q_maxlen])
        self.c_mask = tf.slice(self.c_mask, [0, 0], [bs, self.c_maxlen])
        self.q_mask = tf.slice(self.q_mask, [0, 0], [bs, self.q_maxlen])
        self.cc = tf.slice(self.cc_ph, [0, 0, 0],
                           [bs, self.c_maxlen, self.char_limit])
        self.qc = tf.slice(self.qc_ph, [0, 0, 0],
                           [bs, self.q_maxlen, self.char_limit])
        # Number of non-zero char ids per token, flattened over batch and position.
        self.cc_len = tf.reshape(
            tf.reduce_sum(tf.cast(tf.cast(self.cc, tf.bool), tf.int32),
                          axis=2), [-1])
        self.qc_len = tf.reshape(
            tf.reduce_sum(tf.cast(tf.cast(self.qc, tf.bool), tf.int32),
                          axis=2), [-1])
        # to remove char sequences with len equal zero (padded tokens)
        self.cc_len = tf.maximum(tf.ones_like(self.cc_len), self.cc_len)
        self.qc_len = tf.maximum(tf.ones_like(self.qc_len), self.qc_len)
        # One-hot targets, trimmed to the same length as the context.
        self.y1 = tf.one_hot(self.y1_ph, depth=self.context_limit)
        self.y2 = tf.one_hot(self.y2_ph, depth=self.context_limit)
        self.y1 = tf.slice(self.y1, [0, 0], [bs, self.c_maxlen])
        self.y2 = tf.slice(self.y2, [0, 0], [bs, self.c_maxlen])

        if self.noans_token:
            # we use additional 'no answer' token to allow model not to answer on question
            # later we will add 'no answer' token as first token in context question-aware representation
            # (these definitions replace the y1/y2 built just above)
            self.y1 = tf.one_hot(self.y1_ph, depth=self.context_limit + 1)
            self.y2 = tf.one_hot(self.y2_ph, depth=self.context_limit + 1)
            self.y1 = tf.slice(self.y1, [0, 0], [bs, self.c_maxlen + 1])
            self.y2 = tf.slice(self.y2, [0, 0], [bs, self.c_maxlen + 1])

        with tf.variable_scope("emb"):
            with tf.variable_scope("char"):
                # Every token becomes its own sequence of char embeddings.
                cc_emb = tf.reshape(
                    tf.nn.embedding_lookup(self.char_emb, self.cc),
                    [bs * self.c_maxlen, self.char_limit, self.char_emb_dim])
                qc_emb = tf.reshape(
                    tf.nn.embedding_lookup(self.char_emb, self.qc),
                    [bs * self.q_maxlen, self.char_limit, self.char_emb_dim])

                cc_emb = variational_dropout(cc_emb,
                                             keep_prob=self.keep_prob_ph)
                qc_emb = variational_dropout(qc_emb,
                                             keep_prob=self.keep_prob_ph)

                # The final forward/backward states form the char-level token vector.
                _, (state_fw,
                    state_bw) = cudnn_bi_gru(cc_emb,
                                             self.char_hidden_size,
                                             seq_lengths=self.cc_len,
                                             trainable_initial_states=True)
                cc_emb = tf.concat([state_fw, state_bw], axis=1)

                # Second call passes reuse=True (presumably to share the
                # weights created by the first call - confirm in cudnn_bi_gru).
                _, (state_fw,
                    state_bw) = cudnn_bi_gru(qc_emb,
                                             self.char_hidden_size,
                                             seq_lengths=self.qc_len,
                                             trainable_initial_states=True,
                                             reuse=True)
                qc_emb = tf.concat([state_fw, state_bw], axis=1)

                cc_emb = tf.reshape(
                    cc_emb, [bs, self.c_maxlen, 2 * self.char_hidden_size])
                qc_emb = tf.reshape(
                    qc_emb, [bs, self.q_maxlen, 2 * self.char_hidden_size])

            with tf.name_scope("word"):
                c_emb = tf.nn.embedding_lookup(self.word_emb, self.c)
                q_emb = tf.nn.embedding_lookup(self.word_emb, self.q)

            # Word-level and char-level vectors are concatenated per token.
            c_emb = tf.concat([c_emb, cc_emb], axis=2)
            q_emb = tf.concat([q_emb, qc_emb], axis=2)

        with tf.variable_scope("encoding"):
            # The same rnn object encodes both the context and the question.
            rnn = self.GRU(num_layers=3,
                           num_units=self.hidden_size,
                           batch_size=bs,
                           input_size=c_emb.get_shape().as_list()[-1],
                           keep_prob=self.keep_prob_ph)
            c = rnn(c_emb, seq_len=self.c_len)
            q = rnn(q_emb, seq_len=self.q_len)

        with tf.variable_scope("attention"):
            qc_att = dot_attention(c,
                                   q,
                                   mask=self.q_mask,
                                   att_size=self.attention_hidden_size,
                                   keep_prob=self.keep_prob_ph)
            rnn = self.GRU(num_layers=1,
                           num_units=self.hidden_size,
                           batch_size=bs,
                           input_size=qc_att.get_shape().as_list()[-1],
                           keep_prob=self.keep_prob_ph)
            att = rnn(qc_att, seq_len=self.c_len)

        with tf.variable_scope("match"):
            self_att = dot_attention(att,
                                     att,
                                     mask=self.c_mask,
                                     att_size=self.attention_hidden_size,
                                     keep_prob=self.keep_prob_ph)
            rnn = self.GRU(num_layers=1,
                           num_units=self.hidden_size,
                           batch_size=bs,
                           input_size=self_att.get_shape().as_list()[-1],
                           keep_prob=self.keep_prob_ph)
            match = rnn(self_att, seq_len=self.c_len)

        with tf.variable_scope("pointer"):
            init = simple_attention(q,
                                    self.hidden_size,
                                    mask=self.q_mask,
                                    keep_prob=self.keep_prob_ph)
            pointer = PtrNet(cell_size=init.get_shape().as_list()[-1],
                             keep_prob=self.keep_prob_ph)
            if self.noans_token:
                # The second positional parameter of tf.Variable is
                # `trainable`, so the dtype has to be passed by keyword.
                noans_token = tf.Variable(
                    tf.random_uniform((match.get_shape().as_list()[-1], ),
                                      -0.1, 0.1), dtype=tf.float32)
                noans_token = tf.nn.dropout(noans_token,
                                            keep_prob=self.keep_prob_ph)
                # Tile the vector over the batch and add a length-1 position axis.
                noans_token = tf.expand_dims(tf.tile(
                    tf.expand_dims(noans_token, axis=0), [bs, 1]),
                                             axis=1)
                # Prepend it as position 0 and mark that position as valid.
                match = tf.concat([noans_token, match], axis=1)
                self.c_mask = tf.concat(
                    [tf.ones(shape=(bs, 1), dtype=tf.bool), self.c_mask],
                    axis=1)
            logits1, logits2 = pointer(init, match, self.hidden_size,
                                       self.c_mask)

        with tf.variable_scope("predict"):
            max_ans_length = tf.cast(tf.minimum(15, self.c_maxlen), tf.int64)
            # outer_logits[b, i, j] = exp(logits1[b, i] + logits2[b, j]),
            # restricted to pairs with 0 <= j - i <= max_ans_length.
            outer_logits = tf.exp(
                tf.expand_dims(logits1, axis=2) +
                tf.expand_dims(logits2, axis=1))
            outer_logits = tf.matrix_band_part(outer_logits, 0, max_ans_length)
            # outer[b, i, j] = softmax(logits1)[b, i] * softmax(logits2)[b, j],
            # restricted to the same band.
            outer = tf.matmul(tf.expand_dims(tf.nn.softmax(logits1), axis=2),
                              tf.expand_dims(tf.nn.softmax(logits2), axis=1))
            outer = tf.matrix_band_part(outer, 0, max_ans_length)
            self.yp1 = tf.argmax(tf.reduce_max(outer, axis=2), axis=1)
            self.yp2 = tf.argmax(tf.reduce_max(outer, axis=1), axis=1)
            self.yp_logits = tf.reduce_max(tf.reduce_max(outer_logits, axis=2),
                                           axis=1)
            if self.noans_token:
                # One minus the product of both softmax values at position 0,
                # the position of the prepended 'no answer' token.
                self.yp_score = 1 - tf.nn.softmax(
                    logits1)[:, 0] * tf.nn.softmax(logits2)[:, 0]
            loss_1 = tf.nn.softmax_cross_entropy_with_logits(logits=logits1,
                                                             labels=self.y1)
            loss_2 = tf.nn.softmax_cross_entropy_with_logits(logits=logits2,
                                                             labels=self.y2)
            self.loss = tf.reduce_mean(loss_1 + loss_2)
Ejemplo n.º 14
0
    def __init__(self,
                 n_tags,  # Features dimensions
                 token_emb_dim=None,
                 char_emb_dim=None,
                 capitalization_dim=None,
                 pos_features_dim=None,
                 additional_features=None,
                 net_type='rnn',  # Net architecture
                 cell_type='lstm',
                 use_cudnn_rnn=False,
                 two_dense_on_top=False,
                 n_hidden_list=(128,),
                 cnn_filter_width=7,
                 use_crf=False,
                 token_emb_mat=None,
                 char_emb_mat=None,
                 use_batch_norm=False,
                 dropout_keep_prob=0.5,  # Regularization
                 embeddings_dropout=False,
                 top_dropout=False,
                 intra_layer_dropout=False,
                 l2_reg=0.0,
                 clip_grad_norm=5.0,
                 learning_rate=3e-3,
                 gpu=None,
                 seed=None,
                 lr_drop_patience=5,
                 lr_drop_value=0.1,
                 **kwargs):
        """Assemble the network graph, open a TensorFlow session and load weights.

        The ``_add_*`` helpers are called for every enabled input feature,
        ``self._input_features`` is concatenated along axis 2 and fed to the
        body selected by ``net_type``; ``_build_top`` then produces the logits
        and ``_build_train_predict`` the train op and the loss.

        Args:
            n_tags: forwarded to ``_build_top`` and ``_build_train_predict``.
            token_emb_dim: forwarded to ``_add_word_embeddings``.
            char_emb_dim: char embeddings are added only when both this and
                ``char_emb_mat`` are not None.
            capitalization_dim: when not None, forwarded to ``_add_capitalization``.
            pos_features_dim: when not None, forwarded to ``_add_pos``.
            additional_features: when not None, forwarded to
                ``_add_additional_features``.
            net_type: ``'rnn'`` or ``'cnn'``; selects the network body.
            cell_type: forwarded to the RNN builders.
            use_cudnn_rnn: use ``_build_cudnn_rnn`` instead of ``_build_rnn``.
            two_dense_on_top: forwarded to ``_build_top``.
            n_hidden_list: forwarded to the body builder; its last element is
                also forwarded to ``_build_top``.
            cnn_filter_width: forwarded to ``_build_cnn``.
            use_crf: forwarded to ``_build_train_predict``; also selects
                ``predict_crf`` over ``predict_no_crf`` as ``self.predict``.
            token_emb_mat: forwarded to ``_add_word_embeddings``.
            char_emb_mat: forwarded to ``_add_char_embeddings``.
            use_batch_norm: forwarded to ``_build_cnn``.
            dropout_keep_prob: forwarded to ``_add_training_placeholders``.
            embeddings_dropout: apply ``variational_dropout`` to the
                concatenated input features.
            top_dropout: forwarded to ``_build_top``.
            intra_layer_dropout: forwarded to the RNN builders.
            l2_reg: forwarded to ``_build_train_predict``.
            clip_grad_norm: forwarded to ``_build_train_predict``.
            learning_rate: stored and forwarded to ``_add_training_placeholders``.
            gpu: when not None, written (as a string) to
                ``gpu_options.visible_device_list`` of the session config.
            seed: passed to ``tf.set_random_seed`` and ``np.random.seed``.
            lr_drop_patience: stored as ``self._lr_drop_patience``.
            lr_drop_value: stored as ``self._lr_drop_value``.
            **kwargs: forwarded to the parent class constructor.

        Raises:
            Warning: if ``use_cudnn_rnn`` is combined with ``l2_reg > 0``.
            ValueError: if ``net_type`` is neither ``'rnn'`` nor ``'cnn'``.
        """
        tf.set_random_seed(seed)
        np.random.seed(seed)
        self._learning_rate = learning_rate
        self._lr_drop_patience = lr_drop_patience
        self._lr_drop_value = lr_drop_value
        self._add_training_placeholders(dropout_keep_prob, learning_rate)
        self._xs_ph_list = []
        self._y_ph = tf.placeholder(tf.int32, [None, None], name='y_ph')
        self._input_features = []

        # ================ Building input features =================

        # Token embeddings
        self._add_word_embeddings(token_emb_mat, token_emb_dim)

        # Masks for different lengths utterances
        self.mask_ph = self._add_mask()

        # Char embeddings using highway CNN with max pooling
        if char_emb_mat is not None and char_emb_dim is not None:
            self._add_char_embeddings(char_emb_mat)

        # Capitalization features
        if capitalization_dim is not None:
            self._add_capitalization(capitalization_dim)

        # Part of speech features
        if pos_features_dim is not None:
            self._add_pos(pos_features_dim)

        # Anything you want
        if additional_features is not None:
            self._add_additional_features(additional_features)

        features = tf.concat(self._input_features, axis=2)
        if embeddings_dropout:
            features = variational_dropout(features, self._dropout_ph)

        # ================== Building the network ==================

        if net_type == 'rnn':
            if use_cudnn_rnn:
                if l2_reg > 0:
                    # NOTE(review): Warning is raised as an exception, so this
                    # combination aborts construction rather than just warning.
                    raise Warning('cuDNN RNN are not l2 regularizable')
                units = self._build_cudnn_rnn(features, n_hidden_list, cell_type, intra_layer_dropout, self.mask_ph)
            else:
                units = self._build_rnn(features, n_hidden_list, cell_type, intra_layer_dropout, self.mask_ph,)
        elif net_type == 'cnn':
            units = self._build_cnn(features, n_hidden_list, cnn_filter_width, use_batch_norm)
        else:
            # Fail with a clear message instead of an UnboundLocalError on `units` below.
            raise ValueError("net_type must be 'rnn' or 'cnn', got {!r}".format(net_type))
        self._logits = self._build_top(units, n_tags, n_hidden_list[-1], top_dropout, two_dense_on_top)

        self.train_op, self.loss = self._build_train_predict(self._logits, self.mask_ph, n_tags,
                                                             use_crf, clip_grad_norm, l2_reg)
        self.predict = self.predict_crf if use_crf else self.predict_no_crf

        # ================= Initialize the session =================

        sess_config = tf.ConfigProto(allow_soft_placement=True)
        sess_config.gpu_options.allow_growth = True
        if gpu is not None:
            sess_config.gpu_options.visible_device_list = str(gpu)
        # The config must be handed to the session, otherwise `gpu` and
        # `allow_growth` have no effect.
        self.sess = tf.Session(config=sess_config)
        self.sess.run(tf.global_variables_initializer())
        super().__init__(**kwargs)
        self.load()
Ejemplo n.º 15
0
    def _init_graph(self):
        """Build the model graph: inputs, embeddings, encoders, attention, pointer and loss.

        Sets (among others) ``self.yp1``/``self.yp2`` (argmax indices over the
        banded outer product of the two softmax distributions),
        ``self.yp_logits`` (maximum of the banded ``exp(logits1 + logits2)``
        matrix) and ``self.loss``. When ``self.weight_decay < 1.0`` an
        exponential moving average of the trainable variables is maintained
        and ``self.assign_vars`` holds ops copying the averages back into
        the variables.
        """
        self._init_placeholders()

        self.word_emb = tf.get_variable("word_emb", initializer=tf.constant(self.init_word_emb, dtype=tf.float32),
                                        trainable=False)
        self.char_emb = tf.get_variable("char_emb", initializer=tf.constant(self.init_char_emb, dtype=tf.float32),
                                        trainable=self.train_char_emb)

        # Masks are True wherever the id is non-zero (presumably 0 is the padding id — confirm).
        self.c_mask = tf.cast(self.c_ph, tf.bool)
        self.q_mask = tf.cast(self.q_ph, tf.bool)
        self.c_len = tf.reduce_sum(tf.cast(self.c_mask, tf.int32), axis=1)
        self.q_len = tf.reduce_sum(tf.cast(self.q_mask, tf.int32), axis=1)

        # Trim every input to the longest sequence actually present in the batch.
        bs = tf.shape(self.c_ph)[0]
        self.c_maxlen = tf.reduce_max(self.c_len)
        self.q_maxlen = tf.reduce_max(self.q_len)
        self.c = tf.slice(self.c_ph, [0, 0], [bs, self.c_maxlen])
        self.q = tf.slice(self.q_ph, [0, 0], [bs, self.q_maxlen])
        self.c_mask = tf.slice(self.c_mask, [0, 0], [bs, self.c_maxlen])
        self.q_mask = tf.slice(self.q_mask, [0, 0], [bs, self.q_maxlen])
        self.cc = tf.slice(self.cc_ph, [0, 0, 0], [bs, self.c_maxlen, self.char_limit])
        self.qc = tf.slice(self.qc_ph, [0, 0, 0], [bs, self.q_maxlen, self.char_limit])
        # Number of non-zero char ids per token, flattened over (batch, token).
        self.cc_len = tf.reshape(tf.reduce_sum(tf.cast(tf.cast(self.cc, tf.bool), tf.int32), axis=2), [-1])
        self.qc_len = tf.reshape(tf.reduce_sum(tf.cast(tf.cast(self.qc, tf.bool), tf.int32), axis=2), [-1])
        # One-hot targets, trimmed to the same length as the context.
        self.y1 = tf.one_hot(self.y1_ph, depth=self.context_limit)
        self.y2 = tf.one_hot(self.y2_ph, depth=self.context_limit)
        self.y1 = tf.slice(self.y1, [0, 0], [bs, self.c_maxlen])
        self.y2 = tf.slice(self.y2, [0, 0], [bs, self.c_maxlen])

        with tf.variable_scope("emb"):
            with tf.variable_scope("char"):
                # Every token becomes its own char sequence: [bs * len, char_limit, char_emb_dim].
                cc_emb = tf.reshape(tf.nn.embedding_lookup(self.char_emb, self.cc),
                                    [bs * self.c_maxlen, self.char_limit, self.char_emb_dim])
                qc_emb = tf.reshape(tf.nn.embedding_lookup(self.char_emb, self.qc),
                                    [bs * self.q_maxlen, self.char_limit, self.char_emb_dim])

                cc_emb = variational_dropout(cc_emb, keep_prob=self.keep_prob_ph)
                qc_emb = variational_dropout(qc_emb, keep_prob=self.keep_prob_ph)

                _, (state_fw, state_bw) = cudnn_bi_gru(cc_emb, self.char_hidden_size, seq_lengths=self.cc_len,
                                                       trainable_initial_states=True)
                cc_emb = tf.concat([state_fw, state_bw], axis=1)

                # reuse=True: the second call shares the char GRU weights with the first.
                _, (state_fw, state_bw) = cudnn_bi_gru(qc_emb, self.char_hidden_size, seq_lengths=self.qc_len,
                                                       trainable_initial_states=True,
                                                       reuse=True)
                qc_emb = tf.concat([state_fw, state_bw], axis=1)

                cc_emb = tf.reshape(cc_emb, [bs, self.c_maxlen, 2 * self.char_hidden_size])
                qc_emb = tf.reshape(qc_emb, [bs, self.q_maxlen, 2 * self.char_hidden_size])

            with tf.name_scope("word"):
                c_emb = tf.nn.embedding_lookup(self.word_emb, self.c)
                q_emb = tf.nn.embedding_lookup(self.word_emb, self.q)

            # Word and char-level representations are concatenated per token.
            c_emb = tf.concat([c_emb, cc_emb], axis=2)
            q_emb = tf.concat([q_emb, qc_emb], axis=2)

        with tf.variable_scope("encoding"):
            # The same encoder object is applied to both inputs.
            rnn = self.GRU(num_layers=3, num_units=self.hidden_size, batch_size=bs,
                           input_size=c_emb.get_shape().as_list()[-1],
                           keep_prob=self.keep_prob_ph)
            c = rnn(c_emb, seq_len=self.c_len)
            q = rnn(q_emb, seq_len=self.q_len)

        with tf.variable_scope("attention"):
            qc_att = dot_attention(c, q, mask=self.q_mask, att_size=self.attention_hidden_size,
                                   keep_prob=self.keep_prob_ph)
            rnn = self.GRU(num_layers=1, num_units=self.hidden_size, batch_size=bs,
                           input_size=qc_att.get_shape().as_list()[-1], keep_prob=self.keep_prob_ph)
            att = rnn(qc_att, seq_len=self.c_len)

        with tf.variable_scope("match"):
            self_att = dot_attention(att, att, mask=self.c_mask, att_size=self.attention_hidden_size,
                                     keep_prob=self.keep_prob_ph)
            rnn = self.GRU(num_layers=1, num_units=self.hidden_size, batch_size=bs,
                           input_size=self_att.get_shape().as_list()[-1], keep_prob=self.keep_prob_ph)
            match = rnn(self_att, seq_len=self.c_len)

        with tf.variable_scope("pointer"):
            init = simple_attention(q, self.hidden_size, mask=self.q_mask, keep_prob=self.keep_prob_ph)
            pointer = PtrNet(cell_size=init.get_shape().as_list()[-1], keep_prob=self.keep_prob_ph)
            logits1, logits2 = pointer(init, match, self.hidden_size, self.c_mask)

        with tf.variable_scope("predict"):
            # Both pair matrices share one band: only (i, j) with
            # 0 <= j - i <= min(15, c_maxlen) survive, so the reported score
            # always belongs to a pair that yp1/yp2 can actually select.
            max_ans_length = tf.cast(tf.minimum(15, self.c_maxlen), tf.int64)
            outer_logits = tf.exp(tf.expand_dims(logits1, axis=2) + tf.expand_dims(logits2, axis=1))
            outer_logits = tf.matrix_band_part(outer_logits, 0, max_ans_length)
            outer = tf.matmul(tf.expand_dims(tf.nn.softmax(logits1), axis=2),
                              tf.expand_dims(tf.nn.softmax(logits2), axis=1))
            outer = tf.matrix_band_part(outer, 0, max_ans_length)
            self.yp1 = tf.argmax(tf.reduce_max(outer, axis=2), axis=1)
            self.yp2 = tf.argmax(tf.reduce_max(outer, axis=1), axis=1)
            self.yp_logits = tf.reduce_max(tf.reduce_max(outer_logits, axis=2), axis=1)
            loss_1 = tf.nn.softmax_cross_entropy_with_logits(logits=logits1, labels=self.y1)
            loss_2 = tf.nn.softmax_cross_entropy_with_logits(logits=logits2, labels=self.y2)
            self.loss = tf.reduce_mean(loss_1 + loss_2)

        if self.weight_decay < 1.0:
            self.var_ema = tf.train.ExponentialMovingAverage(self.weight_decay)
            ema_op = self.var_ema.apply(tf.trainable_variables())
            with tf.control_dependencies([ema_op]):
                # Evaluating the loss now also triggers the moving-average update.
                self.loss = tf.identity(self.loss)

                self.shadow_vars = []
                self.global_vars = []
                for var in tf.global_variables():
                    v = self.var_ema.average(var)
                    # average() returns None for variables that are not tracked;
                    # compare with None instead of relying on Variable truthiness.
                    if v is not None:
                        self.shadow_vars.append(v)
                        self.global_vars.append(var)
                self.assign_vars = []
                for g, v in zip(self.global_vars, self.shadow_vars):
                    self.assign_vars.append(tf.assign(g, v))
Ejemplo n.º 16
0
    def _build_body(self):
        """Build the network body and return ``(logits, final_rnn_state)``.

        Pipeline: dense projection of ``self._features``, optional attention
        output concatenated on the last axis, variational dropout, an LSTM run
        through ``tf.nn.dynamic_rnn`` starting from ``self._initial_state``,
        dropout again, and a dense projection to ``self.action_size``.

        Raises:
            ValueError: if ``self.attn`` is set and ``self.attn.type`` is not
                one of the supported attention mechanism names.
        """
        # input projection
        projected = tf.layers.dense(self._features, self.dense_size,
                                    kernel_regularizer=tf.nn.l2_loss,
                                    kernel_initializer=xav())
        if self.attn:
            attn_type = self.attn.type
            with tf.variable_scope("attention_mechanism/{}".format(attn_type)):
                supported = ('general', 'bahdanau',
                             'cs_general', 'cs_bahdanau',
                             'light_general', 'light_bahdanau')
                if attn_type not in supported:
                    raise ValueError("wrong value for attention mechanism type")

                # Each mechanism lives in `am` as `<type>_attention`; all take
                # the same arguments, and the `cs_*` ones additionally take `depth`.
                attention_fn = getattr(am, attn_type + '_attention')
                attn_kwargs = {'hidden_size': self.attn.hidden_size}
                if attn_type in ('cs_general', 'cs_bahdanau'):
                    attn_kwargs['depth'] = self.attn.depth
                attn_kwargs['projected_align'] = self.attn.projected_align
                attended = attention_fn(self._key, self._emb_context, **attn_kwargs)
            projected = tf.concat([projected, attended], -1)

        projected = tf_layers.variational_dropout(projected,
                                                  keep_prob=self._dropout_keep_prob)

        # recurrent network unit
        cell = tf.nn.rnn_cell.LSTMCell(self.hidden_size)
        # Per-dialog lengths: the utterance mask summed over its last axis.
        lengths = tf.to_int32(tf.reduce_sum(self._utterance_mask, axis=-1))
        rnn_out, final_state = tf.nn.dynamic_rnn(cell,
                                                 projected,
                                                 time_major=False,
                                                 initial_state=self._initial_state,
                                                 sequence_length=lengths)
        rnn_out = tf.reshape(rnn_out, (self._batch_size, -1, self.hidden_size))
        rnn_out = tf_layers.variational_dropout(rnn_out,
                                                keep_prob=self._dropout_keep_prob)
        # output projection
        logits = tf.layers.dense(rnn_out, self.action_size,
                                 kernel_regularizer=tf.nn.l2_loss,
                                 kernel_initializer=xav(), name='logits')
        return logits, final_state
Ejemplo n.º 17
0
    def _init_graph(self):
        """Build the model graph: inputs, embeddings, encoders, attention, pointer and loss.

        Sets (among others) ``self.yp1``/``self.yp2`` (argmax indices over the
        banded outer product of the two softmax distributions),
        ``self.yp_logits`` (maximum of the banded ``exp(logits1 + logits2)``
        matrix) and ``self.loss``. When ``self.noans_token`` is truthy, a
        learned 'no answer' vector is prepended to the context representation,
        targets and mask are widened by one position, and ``self.yp_score`` is
        set to one minus the product of both softmax probabilities at
        position 0.
        """
        self._init_placeholders()

        self.word_emb = tf.get_variable("word_emb", initializer=tf.constant(self.init_word_emb, dtype=tf.float32),
                                        trainable=False)
        self.char_emb = tf.get_variable("char_emb", initializer=tf.constant(self.init_char_emb, dtype=tf.float32),
                                        trainable=self.train_char_emb)

        # Masks are True wherever the id is non-zero (presumably 0 is the padding id — confirm).
        self.c_mask = tf.cast(self.c_ph, tf.bool)
        self.q_mask = tf.cast(self.q_ph, tf.bool)
        self.c_len = tf.reduce_sum(tf.cast(self.c_mask, tf.int32), axis=1)
        self.q_len = tf.reduce_sum(tf.cast(self.q_mask, tf.int32), axis=1)

        # Trim every input to the longest sequence actually present in the batch.
        bs = tf.shape(self.c_ph)[0]
        self.c_maxlen = tf.reduce_max(self.c_len)
        self.q_maxlen = tf.reduce_max(self.q_len)
        self.c = tf.slice(self.c_ph, [0, 0], [bs, self.c_maxlen])
        self.q = tf.slice(self.q_ph, [0, 0], [bs, self.q_maxlen])
        self.c_mask = tf.slice(self.c_mask, [0, 0], [bs, self.c_maxlen])
        self.q_mask = tf.slice(self.q_mask, [0, 0], [bs, self.q_maxlen])
        self.cc = tf.slice(self.cc_ph, [0, 0, 0], [bs, self.c_maxlen, self.char_limit])
        self.qc = tf.slice(self.qc_ph, [0, 0, 0], [bs, self.q_maxlen, self.char_limit])
        # Number of non-zero char ids per token, flattened over (batch, token).
        self.cc_len = tf.reshape(tf.reduce_sum(tf.cast(tf.cast(self.cc, tf.bool), tf.int32), axis=2), [-1])
        self.qc_len = tf.reshape(tf.reduce_sum(tf.cast(tf.cast(self.qc, tf.bool), tf.int32), axis=2), [-1])
        # to remove char sequences with len equal zero (padded tokens)
        self.cc_len = tf.maximum(tf.ones_like(self.cc_len), self.cc_len)
        self.qc_len = tf.maximum(tf.ones_like(self.qc_len), self.qc_len)
        self.y1 = tf.one_hot(self.y1_ph, depth=self.context_limit)
        self.y2 = tf.one_hot(self.y2_ph, depth=self.context_limit)
        self.y1 = tf.slice(self.y1, [0, 0], [bs, self.c_maxlen])
        self.y2 = tf.slice(self.y2, [0, 0], [bs, self.c_maxlen])

        if self.noans_token:
            # we use additional 'no answer' token to allow model not to answer on question
            # later we will add 'no answer' token as first token in context question-aware representation
            self.y1 = tf.one_hot(self.y1_ph, depth=self.context_limit + 1)
            self.y2 = tf.one_hot(self.y2_ph, depth=self.context_limit + 1)
            self.y1 = tf.slice(self.y1, [0, 0], [bs, self.c_maxlen + 1])
            self.y2 = tf.slice(self.y2, [0, 0], [bs, self.c_maxlen + 1])

        with tf.variable_scope("emb"):
            with tf.variable_scope("char"):
                # Every token becomes its own char sequence: [bs * len, char_limit, char_emb_dim].
                cc_emb = tf.reshape(tf.nn.embedding_lookup(self.char_emb, self.cc),
                                    [bs * self.c_maxlen, self.char_limit, self.char_emb_dim])
                qc_emb = tf.reshape(tf.nn.embedding_lookup(self.char_emb, self.qc),
                                    [bs * self.q_maxlen, self.char_limit, self.char_emb_dim])

                cc_emb = variational_dropout(cc_emb, keep_prob=self.keep_prob_ph)
                qc_emb = variational_dropout(qc_emb, keep_prob=self.keep_prob_ph)

                _, (state_fw, state_bw) = cudnn_bi_gru(cc_emb, self.char_hidden_size, seq_lengths=self.cc_len,
                                                       trainable_initial_states=True)
                cc_emb = tf.concat([state_fw, state_bw], axis=1)

                # reuse=True: the second call shares the char GRU weights with the first.
                _, (state_fw, state_bw) = cudnn_bi_gru(qc_emb, self.char_hidden_size, seq_lengths=self.qc_len,
                                                       trainable_initial_states=True,
                                                       reuse=True)
                qc_emb = tf.concat([state_fw, state_bw], axis=1)

                cc_emb = tf.reshape(cc_emb, [bs, self.c_maxlen, 2 * self.char_hidden_size])
                qc_emb = tf.reshape(qc_emb, [bs, self.q_maxlen, 2 * self.char_hidden_size])

            with tf.name_scope("word"):
                c_emb = tf.nn.embedding_lookup(self.word_emb, self.c)
                q_emb = tf.nn.embedding_lookup(self.word_emb, self.q)

            # Word and char-level representations are concatenated per token.
            c_emb = tf.concat([c_emb, cc_emb], axis=2)
            q_emb = tf.concat([q_emb, qc_emb], axis=2)

        with tf.variable_scope("encoding"):
            # The same encoder object is applied to both inputs.
            rnn = self.GRU(num_layers=3, num_units=self.hidden_size, batch_size=bs,
                           input_size=c_emb.get_shape().as_list()[-1],
                           keep_prob=self.keep_prob_ph)
            c = rnn(c_emb, seq_len=self.c_len)
            q = rnn(q_emb, seq_len=self.q_len)

        with tf.variable_scope("attention"):
            qc_att = dot_attention(c, q, mask=self.q_mask, att_size=self.attention_hidden_size,
                                   keep_prob=self.keep_prob_ph)
            rnn = self.GRU(num_layers=1, num_units=self.hidden_size, batch_size=bs,
                           input_size=qc_att.get_shape().as_list()[-1], keep_prob=self.keep_prob_ph)
            att = rnn(qc_att, seq_len=self.c_len)

        with tf.variable_scope("match"):
            self_att = dot_attention(att, att, mask=self.c_mask, att_size=self.attention_hidden_size,
                                     keep_prob=self.keep_prob_ph)
            rnn = self.GRU(num_layers=1, num_units=self.hidden_size, batch_size=bs,
                           input_size=self_att.get_shape().as_list()[-1], keep_prob=self.keep_prob_ph)
            match = rnn(self_att, seq_len=self.c_len)

        with tf.variable_scope("pointer"):
            init = simple_attention(q, self.hidden_size, mask=self.q_mask, keep_prob=self.keep_prob_ph)
            pointer = PtrNet(cell_size=init.get_shape().as_list()[-1], keep_prob=self.keep_prob_ph)
            if self.noans_token:
                # dtype must be a keyword: the second positional parameter of
                # tf.Variable is `trainable`, not `dtype`.
                noans_token = tf.Variable(tf.random_uniform((match.get_shape().as_list()[-1],), -0.1, 0.1),
                                          dtype=tf.float32)
                noans_token = tf.nn.dropout(noans_token, keep_prob=self.keep_prob_ph)
                # Broadcast the vector to [bs, 1, dim] and put it in front of the context.
                noans_token = tf.expand_dims(tf.tile(tf.expand_dims(noans_token, axis=0), [bs, 1]), axis=1)
                match = tf.concat([noans_token, match], axis=1)
                # The extra position is always unmasked.
                self.c_mask = tf.concat([tf.ones(shape=(bs, 1), dtype=tf.bool), self.c_mask], axis=1)
            logits1, logits2 = pointer(init, match, self.hidden_size, self.c_mask)

        with tf.variable_scope("predict"):
            # Only pairs (i, j) with 0 <= j - i <= min(15, c_maxlen) are kept in both matrices.
            max_ans_length = tf.cast(tf.minimum(15, self.c_maxlen), tf.int64)
            outer_logits = tf.exp(tf.expand_dims(logits1, axis=2) + tf.expand_dims(logits2, axis=1))
            outer_logits = tf.matrix_band_part(outer_logits, 0, max_ans_length)
            outer = tf.matmul(tf.expand_dims(tf.nn.softmax(logits1), axis=2),
                              tf.expand_dims(tf.nn.softmax(logits2), axis=1))
            outer = tf.matrix_band_part(outer, 0, max_ans_length)
            self.yp1 = tf.argmax(tf.reduce_max(outer, axis=2), axis=1)
            self.yp2 = tf.argmax(tf.reduce_max(outer, axis=1), axis=1)
            self.yp_logits = tf.reduce_max(tf.reduce_max(outer_logits, axis=2), axis=1)
            if self.noans_token:
                # Position 0 is the 'no answer' slot prepended above.
                self.yp_score = 1 - tf.nn.softmax(logits1)[:, 0] * tf.nn.softmax(logits2)[:, 0]
            loss_1 = tf.nn.softmax_cross_entropy_with_logits(logits=logits1, labels=self.y1)
            loss_2 = tf.nn.softmax_cross_entropy_with_logits(logits=logits2, labels=self.y2)
            self.loss = tf.reduce_mean(loss_1 + loss_2)
Ejemplo n.º 18
0
    def __init__(
            self,
            n_tags: int,  # Features dimensions
            token_emb_dim: int = None,
            char_emb_dim: int = None,
            capitalization_dim: int = None,
            pos_features_dim: int = None,
            additional_features: int = None,
            net_type: str = 'rnn',  # Net architecture
            cell_type: str = 'lstm',
            use_cudnn_rnn: bool = False,
            two_dense_on_top: bool = False,
            n_hidden_list: Tuple[int] = (128, ),
            cnn_filter_width: int = 7,
            use_crf: bool = False,
            token_emb_mat: np.ndarray = None,
            char_emb_mat: np.ndarray = None,
            use_batch_norm: bool = False,
            dropout_keep_prob: float = 0.5,  # Regularization
            embeddings_dropout: bool = False,
            top_dropout: bool = False,
            intra_layer_dropout: bool = False,
            l2_reg: float = 0.0,
            gpu: int = None,
            seed: int = None,
            **kwargs) -> None:
        """Assemble the network graph, open a TensorFlow session and load weights.

        The ``_add_*`` helpers are called for every enabled input feature,
        ``self._input_features`` is concatenated along axis 2 and fed to the
        body selected by ``net_type``; ``_build_top`` then produces the logits
        and ``_build_train_predict`` the train op and the loss.

        Args:
            n_tags: must be non-zero; forwarded to ``_build_top`` and
                ``_build_train_predict``.
            token_emb_dim: forwarded to ``_add_word_embeddings``.
            char_emb_dim: char embeddings are added only when both this and
                ``char_emb_mat`` are not None.
            capitalization_dim: when not None, forwarded to ``_add_capitalization``.
            pos_features_dim: when not None, forwarded to ``_add_pos``.
            additional_features: when not None, forwarded to
                ``_add_additional_features``.
            net_type: ``'rnn'`` or ``'cnn'``; selects the network body.
            cell_type: forwarded to the RNN builders.
            use_cudnn_rnn: use ``_build_cudnn_rnn`` instead of ``_build_rnn``.
            two_dense_on_top: forwarded to ``_build_top``.
            n_hidden_list: forwarded to the body builder; its last element is
                also forwarded to ``_build_top``.
            cnn_filter_width: forwarded to ``_build_cnn``.
            use_crf: forwarded to ``_build_train_predict``; also selects
                ``predict_crf`` over ``predict_no_crf`` as ``self.predict``.
            token_emb_mat: forwarded to ``_add_word_embeddings``.
            char_emb_mat: when not None, resized to ``(162, 100)`` and
                forwarded to ``_add_char_embeddings``.
            use_batch_norm: forwarded to ``_build_cnn``.
            dropout_keep_prob: forwarded to ``_add_training_placeholders``.
            embeddings_dropout: apply ``variational_dropout`` to the
                concatenated input features.
            top_dropout: forwarded to ``_build_top``.
            intra_layer_dropout: forwarded to the RNN builders.
            l2_reg: forwarded to ``_build_train_predict``.
            gpu: when not None, written (as a string) to
                ``gpu_options.visible_device_list`` of the session config.
            seed: passed to ``tf.set_random_seed`` and ``np.random.seed``.
            **kwargs: forwarded to the parent class constructor after
                ``learning_rate_drop_div``, ``learning_rate_drop_patience``
                and ``clip_norm`` receive defaults when absent.

        Raises:
            AssertionError: if ``n_tags`` equals 0.
            ValueError: if ``net_type`` is neither ``'rnn'`` nor ``'cnn'``.
        """
        tf.set_random_seed(seed)
        np.random.seed(seed)
        # Resize only a real matrix: np.resize(None, ...) would yield an object
        # array of Nones and defeat the `char_emb_mat is not None` check below.
        # NOTE(review): the (162, 100) target shape is hard-coded — confirm it
        # matches the char vocabulary and embedding size in use.
        if char_emb_mat is not None:
            char_emb_mat = np.resize(char_emb_mat, (162, 100))
        assert n_tags != 0, 'Number of classes equal 0! It seems that vocabularies is not loaded.' \
                            ' Check that all vocabulary files are downloaded!'

        # Defaults for options consumed by the parent class.
        kwargs.setdefault('learning_rate_drop_div', 10.0)
        kwargs.setdefault('learning_rate_drop_patience', 5.0)
        kwargs.setdefault('clip_norm', 5.0)
        super().__init__(**kwargs)
        self._add_training_placeholders(dropout_keep_prob)
        self._xs_ph_list = []
        self._y_ph = tf.placeholder(tf.int32, [None, None], name='y_ph')
        self._input_features = []

        # ================ Building input features =================

        # Token embeddings
        self._add_word_embeddings(token_emb_mat, token_emb_dim)

        # Masks for different lengths utterances
        self.mask_ph = self._add_mask()

        # Char embeddings using highway CNN with max pooling
        if char_emb_mat is not None and char_emb_dim is not None:
            self._add_char_embeddings(char_emb_mat)

        # Capitalization features
        if capitalization_dim is not None:
            self._add_capitalization(capitalization_dim)

        # Part of speech features
        if pos_features_dim is not None:
            self._add_pos(pos_features_dim)

        # Anything you want
        if additional_features is not None:
            self._add_additional_features(additional_features)

        features = tf.concat(self._input_features, axis=2)
        if embeddings_dropout:
            features = variational_dropout(features, self._dropout_ph)

        # ================== Building the network ==================

        if net_type == 'rnn':
            if use_cudnn_rnn:
                if l2_reg > 0:
                    log.warning('cuDNN RNN are not l2 regularizable')
                units = self._build_cudnn_rnn(features, n_hidden_list,
                                              cell_type, intra_layer_dropout,
                                              self.mask_ph)
            else:
                units = self._build_rnn(features, n_hidden_list, cell_type,
                                        intra_layer_dropout, self.mask_ph)
        elif net_type == 'cnn':
            units = self._build_cnn(features, n_hidden_list, cnn_filter_width,
                                    use_batch_norm)
        else:
            # Fail with a clear message instead of an UnboundLocalError on `units` below.
            raise ValueError("net_type must be 'rnn' or 'cnn', got {!r}".format(net_type))
        self._logits = self._build_top(units, n_tags, n_hidden_list[-1],
                                       top_dropout, two_dense_on_top)

        self.train_op, self.loss = self._build_train_predict(
            self._logits, self.mask_ph, n_tags, use_crf, l2_reg)
        self.predict = self.predict_crf if use_crf else self.predict_no_crf

        # ================= Initialize the session =================

        sess_config = tf.ConfigProto(allow_soft_placement=True)
        sess_config.gpu_options.allow_growth = True
        if gpu is not None:
            sess_config.gpu_options.visible_device_list = str(gpu)
        self.sess = tf.Session(config=sess_config)
        self.sess.run(tf.global_variables_initializer())
        self.load()
Ejemplo n.º 19
0
    def __init__(self,
                 n_tags: int,  # Features dimensions
                 token_emb_dim: int = None,
                 char_emb_dim: int = None,
                 capitalization_dim: int = None,
                 pos_features_dim: int = None,
                 additional_features: int = None,
                 net_type: str = 'rnn',  # Net architecture
                 cell_type: str = 'lstm',
                 use_cudnn_rnn: bool = False,
                 two_dense_on_top: bool = False,
                 n_hidden_list: Tuple[int] = (128,),
                 cnn_filter_width: int = 7,
                 use_crf: bool = False,
                 token_emb_mat: np.ndarray = None,
                 char_emb_mat: np.ndarray = None,
                 use_batch_norm: bool = False,
                 dropout_keep_prob: float = 0.5,  # Regularization
                 embeddings_dropout: bool = False,
                 top_dropout: bool = False,
                 intra_layer_dropout: bool = False,
                 l2_reg: float = 0.0,
                 clip_grad_norm: float = 5.0,
                 learning_rate: float = 3e-3,
                 gpu: int = None,
                 seed: int = None,
                 lr_drop_patience: int = 5,
                 lr_drop_value: float = 0.1,
                 **kwargs) -> None:
        """Assemble the network graph, open a TensorFlow session and load weights.

        The ``_add_*`` helpers are called for every enabled input feature,
        ``self._input_features`` is concatenated along axis 2 and fed to the
        body selected by ``net_type``; ``_build_top`` then produces the logits
        and ``_build_train_predict`` the train op and the loss.

        Args:
            n_tags: forwarded to ``_build_top`` and ``_build_train_predict``.
            token_emb_dim: forwarded to ``_add_word_embeddings``.
            char_emb_dim: char embeddings are added only when both this and
                ``char_emb_mat`` are not None.
            capitalization_dim: when not None, forwarded to ``_add_capitalization``.
            pos_features_dim: when not None, forwarded to ``_add_pos``.
            additional_features: when not None, forwarded to
                ``_add_additional_features``.
            net_type: ``'rnn'`` or ``'cnn'``; selects the network body.
            cell_type: forwarded to the RNN builders.
            use_cudnn_rnn: use ``_build_cudnn_rnn`` instead of ``_build_rnn``.
            two_dense_on_top: forwarded to ``_build_top``.
            n_hidden_list: forwarded to the body builder; its last element is
                also forwarded to ``_build_top``.
            cnn_filter_width: forwarded to ``_build_cnn``.
            use_crf: forwarded to ``_build_train_predict``; also selects
                ``predict_crf`` over ``predict_no_crf`` as ``self.predict``.
            token_emb_mat: forwarded to ``_add_word_embeddings``.
            char_emb_mat: forwarded to ``_add_char_embeddings``.
            use_batch_norm: forwarded to ``_build_cnn``.
            dropout_keep_prob: forwarded to ``_add_training_placeholders``.
            embeddings_dropout: apply ``variational_dropout`` to the
                concatenated input features.
            top_dropout: forwarded to ``_build_top``.
            intra_layer_dropout: forwarded to the RNN builders.
            l2_reg: forwarded to ``_build_train_predict``.
            clip_grad_norm: forwarded to ``_build_train_predict``.
            learning_rate: stored and forwarded to ``_add_training_placeholders``.
            gpu: when not None, written (as a string) to
                ``gpu_options.visible_device_list`` of the session config.
            seed: passed to ``tf.set_random_seed`` and ``np.random.seed``.
            lr_drop_patience: stored as ``self._lr_drop_patience``.
            lr_drop_value: stored as ``self._lr_drop_value``.
            **kwargs: forwarded to the parent class constructor.

        Raises:
            Warning: if ``use_cudnn_rnn`` is combined with ``l2_reg > 0``.
            ValueError: if ``net_type`` is neither ``'rnn'`` nor ``'cnn'``.
        """
        tf.set_random_seed(seed)
        np.random.seed(seed)
        self._learning_rate = learning_rate
        self._lr_drop_patience = lr_drop_patience
        self._lr_drop_value = lr_drop_value
        self._add_training_placeholders(dropout_keep_prob, learning_rate)
        self._xs_ph_list = []
        self._y_ph = tf.placeholder(tf.int32, [None, None], name='y_ph')
        self._input_features = []

        # ================ Building input features =================

        # Token embeddings
        self._add_word_embeddings(token_emb_mat, token_emb_dim)

        # Masks for different lengths utterances
        self.mask_ph = self._add_mask()

        # Char embeddings using highway CNN with max pooling
        if char_emb_mat is not None and char_emb_dim is not None:
            self._add_char_embeddings(char_emb_mat)

        # Capitalization features
        if capitalization_dim is not None:
            self._add_capitalization(capitalization_dim)

        # Part of speech features
        if pos_features_dim is not None:
            self._add_pos(pos_features_dim)

        # Anything you want
        if additional_features is not None:
            self._add_additional_features(additional_features)

        features = tf.concat(self._input_features, axis=2)
        if embeddings_dropout:
            features = variational_dropout(features, self._dropout_ph)

        # ================== Building the network ==================

        if net_type == 'rnn':
            if use_cudnn_rnn:
                if l2_reg > 0:
                    # NOTE(review): Warning is raised as an exception, so this
                    # combination aborts construction rather than just warning.
                    raise Warning('cuDNN RNN are not l2 regularizable')
                units = self._build_cudnn_rnn(features, n_hidden_list, cell_type, intra_layer_dropout, self.mask_ph)
            else:
                units = self._build_rnn(features, n_hidden_list, cell_type, intra_layer_dropout, self.mask_ph,)
        elif net_type == 'cnn':
            units = self._build_cnn(features, n_hidden_list, cnn_filter_width, use_batch_norm)
        else:
            # Fail with a clear message instead of an UnboundLocalError on `units` below.
            raise ValueError("net_type must be 'rnn' or 'cnn', got {!r}".format(net_type))
        self._logits = self._build_top(units, n_tags, n_hidden_list[-1], top_dropout, two_dense_on_top)

        self.train_op, self.loss = self._build_train_predict(self._logits, self.mask_ph, n_tags,
                                                             use_crf, clip_grad_norm, l2_reg)
        self.predict = self.predict_crf if use_crf else self.predict_no_crf

        # ================= Initialize the session =================

        sess_config = tf.ConfigProto(allow_soft_placement=True)
        sess_config.gpu_options.allow_growth = True
        if gpu is not None:
            sess_config.gpu_options.visible_device_list = str(gpu)
        # The config must be handed to the session, otherwise `gpu` and
        # `allow_growth` have no effect.
        self.sess = tf.Session(config=sess_config)
        self.sess.run(tf.global_variables_initializer())
        super().__init__(**kwargs)
        self.load()