Example 1
      def __init__(self, name, dep_reprs, head_reprs, roll_direction=0):
        self.name = name
        with tf.variable_scope(name + '/predictions'):
          # apply hidden layers to the input representations
          arc_dep_hidden = model_helpers.project(
              dep_reprs, config.projection_size, 'arc_dep_hidden')
          arc_head_hidden = model_helpers.project(
              head_reprs, config.projection_size, 'arc_head_hidden')
          arc_dep_hidden = tf.nn.relu(arc_dep_hidden)
          arc_head_hidden = tf.nn.relu(arc_head_hidden)
          arc_head_hidden = tf.nn.dropout(arc_head_hidden, inputs.keep_prob)
          arc_dep_hidden = tf.nn.dropout(arc_dep_hidden, inputs.keep_prob)

          # bilinear classifier excluding the final dot product
          arc_head = tf.layers.dense(
              arc_head_hidden, config.depparse_projection_size, name='arc_head')
          W = tf.get_variable('shared_W',
                              shape=[config.projection_size, n_classes,
                                     config.depparse_projection_size])
          Wr = tf.get_variable('relation_specific_W',
                               shape=[config.projection_size,
                                      config.depparse_projection_size])
          Wr_proj = tf.tile(tf.expand_dims(Wr, axis=-2), [1, n_classes, 1])
          W += Wr_proj
          arc_dep = tf.tensordot(arc_dep_hidden, W, axes=[[-1], [0]])
          shape = tf.shape(arc_dep)
          arc_dep = tf.reshape(arc_dep,
                               [shape[0], -1, config.depparse_projection_size])

          # apply the transformer scaling trick to prevent dot products from
          # getting too large (possibly not necessary)
          scale = np.power(
              config.depparse_projection_size, 0.25).astype('float32')
          scale = tf.get_variable('scale', initializer=scale, dtype=tf.float32)
          arc_dep /= scale
          arc_head /= scale

          # compute the scores for each candidate arc
          word_scores = tf.matmul(arc_head, arc_dep, transpose_b=True)
          root_scores = tf.layers.dense(arc_head, n_classes, name='root_score')
          arc_scores = tf.concat([root_scores, word_scores], axis=-1)

          # disallow the model from making impossible predictions
          mask = inputs.mask
          mask_shape = tf.shape(mask)
          mask = tf.tile(tf.expand_dims(mask, -1), [1, 1, n_classes])
          mask = tf.reshape(mask, [-1, mask_shape[1] * n_classes])
          mask = tf.concat([tf.ones((mask_shape[0], 1)),
                            tf.zeros((mask_shape[0], n_classes - 1)), mask],
                           axis=1)
          mask = tf.tile(tf.expand_dims(mask, 1), [1, mask_shape[1], 1])
          arc_scores += (mask - 1) * 100.0

          self.logits = arc_scores
          self.loss = model_helpers.masked_ce_loss(
              self.logits, labels, inputs.mask,
              roll_direction=roll_direction)
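The final masking step uses an additive-penalty trick: scores for impossible arcs are shifted down by 100, which drives their probability toward zero after the softmax inside the cross-entropy loss. A minimal NumPy sketch of the trick, with a hypothetical row of three candidate arcs:

import numpy as np

scores = np.array([[2.0, 1.0, 0.5]])    # raw scores for 3 candidate arcs
mask = np.array([[1.0, 0.0, 1.0]])      # 1 = legal arc, 0 = impossible arc
scores = scores + (mask - 1.0) * 100.0  # impossible arcs drop by 100
probs = np.exp(scores) / np.exp(scores).sum(axis=-1, keepdims=True)
# probs ~= [[0.8176, 0.0000, 0.1824]]: the masked arc is effectively ruled out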
Example 2
      def __init__(self, name, input_reprs, roll_direction=0, activate=True):
        self.name = name
        with tf.variable_scope(name + '/predictions'):
          projected = model_helpers.project(input_reprs, config.projection_size)
          if activate:
            projected = tf.nn.relu(projected)
          self.logits = tf.layers.dense(projected, n_classes, name='predict')

        targets = labels
        targets *= (1 - inputs.label_smoothing)
        targets += inputs.label_smoothing / n_classes
        self.loss = model_helpers.masked_ce_loss(
            self.logits, targets, inputs.mask, roll_direction=roll_direction)
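The two lines building `targets` are standard label smoothing: each (presumably one-hot) label is mixed with a uniform distribution over the n_classes classes, so the model is never trained toward fully saturated probabilities. A minimal NumPy sketch, assuming one-hot labels and a hypothetical smoothing factor of 0.1:

import numpy as np

n_classes = 4
label_smoothing = 0.1
labels = np.array([[0.0, 1.0, 0.0, 0.0]])  # one-hot gold label
targets = labels * (1 - label_smoothing)   # true class keeps 0.9
targets += label_smoothing / n_classes     # every class gains 0.025
# targets == [[0.025, 0.925, 0.025, 0.025]], still summing to 1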
Example 3
            def __init__(self,
                         name,
                         input_reprs,
                         roll_direction=0,
                         activate=True,
                         is_translate=False,
                         word_in=None,
                         encoder_reprs=encoder.bi_reprs):
                self.name = name
                with tf.variable_scope(name + '/predictions'):
                    #decoder_state = tf.layers.dense(input_reprs, config.projection_size, name='encoder_to_decoder')
                    decoder_state = input_reprs

                    with tf.variable_scope('word_embeddings_vi'):
                        word_embedding_matrix = tf.get_variable(
                            'word_embedding_matrix_vi',
                            initializer=pretrained_embeddings_vi)
                        if is_translate:
                            word_embeddings = tf.nn.embedding_lookup(
                                word_embedding_matrix, word_in)
                        else:
                            word_embeddings = tf.nn.embedding_lookup(
                                word_embedding_matrix, words_tgt_in)
                        word_embeddings = tf.nn.dropout(
                            word_embeddings, inputs.keep_prob)
                        word_embeddings *= tf.get_variable('emb_scale',
                                                           initializer=1.0)

                    decoder_lstm = model_helpers.lstm_cell(
                        config.bidirectional_sizes[0], inputs.keep_prob,
                        config.projection_size)

                    decoder_output_layer = tf.layers.Dense(n_classes,
                                                           name='predict')

                    if not is_translate:
                        attention_mechanism = LuongAttention(
                            num_units=config.attention_units,
                            memory=encoder_reprs,
                            memory_sequence_length=size_sr,
                            scale=True)
                        attention_cell = AttentionWrapper(
                            decoder_lstm,
                            attention_mechanism,
                            attention_layer_size=config.attention_units)

                        batch_size = tf.shape(words_tgt_in)[0]
                        # during training there is no beam, so the zero state
                        # uses the plain batch size (tiling by beam_width here
                        # would not match the encoder state cloned in below)
                        decoder_initial_state = attention_cell.zero_state(
                            dtype=tf.float32, batch_size=batch_size)
                        decoder_state = decoder_initial_state.clone(
                            cell_state=decoder_state)

                        helper = tf.contrib.seq2seq.TrainingHelper(
                            word_embeddings, size_tgt)

                        decoder = tf.contrib.seq2seq.BasicDecoder(
                            attention_cell, helper, decoder_state,
                            decoder_output_layer)

                        outputs, state, _ = tf.contrib.seq2seq.dynamic_decode(
                            decoder)
                        # swap_memory=True)

                        self.logits = outputs.rnn_output
                    else:
                        if config.decode_mode == 'greedy':
                            helper = tf.contrib.seq2seq.GreedyEmbeddingHelper(
                                word_embedding_matrix,
                                [embeddings.START, embeddings.START],
                                embeddings.END)

                            decoder = tf.contrib.seq2seq.BasicDecoder(
                                decoder_lstm, helper, decoder_state,
                                decoder_output_layer)
                        elif config.decode_mode == 'beam':
                            encoder_reprs = tf.contrib.seq2seq.tile_batch(
                                encoder_reprs, multiplier=config.beam_width)
                            decoder_state = tf.contrib.seq2seq.tile_batch(
                                decoder_state, multiplier=config.beam_width)
                            size_src = tf.contrib.seq2seq.tile_batch(
                                size_sr, multiplier=config.beam_width)

                            attention_mechanism = LuongAttention(
                                num_units=config.attention_units,
                                memory=encoder_reprs,
                                memory_sequence_length=size_src,
                                scale=True)
                            attention_cell = AttentionWrapper(
                                decoder_lstm,
                                attention_mechanism,
                                attention_layer_size=config.attention_units)

                            # a fixed batch of 2, matching the two START
                            # tokens passed to the BeamSearchDecoder below
                            batch_size = 2
                            decoder_initial_state = attention_cell.zero_state(
                                dtype=tf.float32,
                                batch_size=batch_size * config.beam_width)
                            decoder_state = decoder_initial_state.clone(
                                cell_state=decoder_state)

                            #decoder_state = tf.contrib.seq2seq.tile_batch(
                            #  decoder_state, multiplier=config.beam_width)

                            decoder = tf.contrib.seq2seq.BeamSearchDecoder(
                                cell=attention_cell,
                                embedding=word_embedding_matrix,
                                start_tokens=[
                                    embeddings.START, embeddings.START
                                ],
                                end_token=embeddings.END,
                                initial_state=decoder_state,
                                beam_width=config.beam_width,
                                output_layer=decoder_output_layer)

                        outputs, state, _ = tf.contrib.seq2seq.dynamic_decode(
                            decoder,
                            maximum_iterations=config.max_translate_length)
                        #swap_memory=True)

                        if config.decode_mode == 'greedy':
                            self.sample_ids = outputs.sample_id
                        elif config.decode_mode == 'beam':
                            self.sample_ids = outputs.predicted_ids
                    '''
                    # earlier variant: a plain LSTM decoder without attention
                    outputs, state = tf.nn.dynamic_rnn(
                        model_helpers.lstm_cell(config.bidirectional_sizes[0],
                                                inputs.keep_prob,
                                                config.projection_size),
                        word_embeddings,
                        initial_state=decoder_state,
                        dtype=tf.float32,
                        sequence_length=size_tgt,
                        scope='predictlstm')
                    '''

                    self.state = state

                    #self.logits = tf.layers.dense(outputs, n_classes, name='predict')
                    #self.logits = tf.layers.dense(outputs.rnn_output, n_classes, name='predict')

                if is_translate:
                    return

                targets = words_tgt_out
                targets *= (1 - inputs.label_smoothing)
                targets += inputs.label_smoothing / n_classes
                self.loss = model_helpers.masked_ce_loss(
                    self.logits, targets, inputs.mask)
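In the beam-search branch, the encoder outputs, their lengths, and the decoder state are all run through tf.contrib.seq2seq.tile_batch so that each of the beam_width hypotheses carries its own copy of the corresponding batch entry. A minimal NumPy sketch of that tiling, with a hypothetical batch of 2 and beam width 3 (np.repeat mirrors tile_batch by keeping all copies of each entry adjacent):

import numpy as np

def tile_batch(tensor, multiplier):
    # repeat each batch entry `multiplier` times along axis 0,
    # keeping all copies of the same example adjacent
    return np.repeat(tensor, multiplier, axis=0)

encoder_reprs = np.array([[1.0, 1.0],
                          [2.0, 2.0]])           # [batch=2, hidden=2]
tiled = tile_batch(encoder_reprs, multiplier=3)  # [batch*beam=6, hidden=2]
# rows: [1,1] [1,1] [1,1] [2,2] [2,2] [2,2]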