Example #1
0
def gather_indexes(sequence_tensor, positions):
    """Selects per-example vectors of a rank-3 tensor at the given positions.

    The input is flattened to two dimensions so that one `tf.gather` serves
    the whole minibatch; every example's positions are shifted by the row at
    which that example starts in the flattened tensor.

    Args:
      sequence_tensor: rank-3 Tensor whose dimensions are read as
        [batch_size, seq_length, width].
      positions: integer Tensor of indices along the seq_length axis. It is
        added to an int32 [batch_size, 1] offset column, so it presumably has
        shape [batch_size, num_positions] -- confirm against callers.

    Returns:
      A 2-D Tensor whose rows are the gathered vectors, in flattened
      `positions` order.
    """
    batch_size, seq_length, width = modeling.get_shape_list(
        sequence_tensor, expected_rank=3)

    # Row index at which each example begins once the batch is flattened.
    example_starts = tf.range(0, batch_size, dtype=tf.int32) * seq_length
    example_starts = tf.reshape(example_starts, [-1, 1])

    row_ids = tf.reshape(positions + example_starts, [-1])
    rows = tf.reshape(sequence_tensor, [batch_size * seq_length, width])
    return tf.gather(rows, row_ids)
Example #2
0
    def gather_indexes(self):
        """Gathers vectors of `self.input_tensor` at `self.masked_lm_positions`.

        `self.input_tensor` must be rank 3; its dimensions are read as
        [batch_size, seq_length, width]. The tensor is flattened to 2-D and
        each example's positions are shifted by that example's starting row,
        so a single `tf.gather` covers the whole minibatch.

        Returns:
          A 2-D Tensor whose rows are the gathered vectors, in flattened
          `self.masked_lm_positions` order.
        """
        batch_size, seq_length, width = modeling.get_shape_list(
            self.input_tensor, expected_rank=3)

        # Row index at which each example begins in the flattened tensor.
        example_starts = tf.range(0, batch_size, dtype=tf.int32) * seq_length
        example_starts = tf.reshape(example_starts, [-1, 1])

        row_ids = tf.reshape(self.masked_lm_positions + example_starts, [-1])
        rows = tf.reshape(self.input_tensor,
                          [batch_size * seq_length, width])
        return tf.gather(rows, row_ids)
def compute_joint_mlp_logits(sequence, max_span_length):
    """Scores every candidate span (start, start + offset) with a joint MLP.

    Each token is projected to a "start" half and an "end" half. For every
    start position and every offset in [0, max_span_length), the start vector
    is added to the end vector found `offset` tokens later (clamped to the
    last token), passed through GELU and layer normalisation, and reduced to
    a single logit.

    Args:
      sequence: rank-3 Tensor read as [batch_size, seq_length, hidden_size].
      max_span_length: number of end offsets scored for each start position.

    Returns:
      Tensor of shape [batch_size, seq_length, max_span_length]; entry
      [b, s, k] is the logit of the span that starts at s and ends at
      min(s + k, seq_length - 1).
    """
    batch_size, seq_length, hidden_size = modeling.get_shape_list(
        sequence, expected_rank=3)

    # The projected width is simply kept equal to the input width.
    projection_size = hidden_size

    with tf.variable_scope("joint_span"):
        joint_projection = tf.layers.dense(
            sequence,
            projection_size * 2,
            activation=None,
            kernel_initializer=tf.truncated_normal_initializer(stddev=0.02),
            name="projection")
        start_half, end_half = tf.split(joint_projection, 2, axis=-1)

        # Start side: repeat each token's start vector once per offset so it
        # lines up with the flattened (position, offset) axis.
        # TODO(danielandor): Use the mask to compute an optimal span list.
        start_vectors = tf.reshape(
            start_half, [batch_size * seq_length, 1, projection_size])
        start_vectors = tf.tile(start_vectors, [1, max_span_length, 1])
        start_vectors = tf.reshape(
            start_vectors,
            [batch_size, seq_length * max_span_length, projection_size])

        # End side: build an index table whose entry (s, k) is s + k (via
        # broadcasting), replicate it across the batch and gather per example.
        position_column = tf.expand_dims(tf.range(seq_length), 1)
        offset_row = tf.expand_dims(tf.range(max_span_length), 0)
        end_indices = position_column + offset_row
        end_indices.shape.assert_is_compatible_with(
            (seq_length, max_span_length))
        end_indices = tf.reshape(end_indices,
                                 [1, seq_length * max_span_length])
        end_indices = tf.tile(end_indices, [batch_size, 1])
        # Offsets running past the sequence are clamped to the last token.
        end_indices = tf.minimum(end_indices, seq_length - 1)
        end_vectors = tf.batch_gather(end_half, end_indices)

        # Combine both sides. No extra bias is added before the
        # non-linearity because the projection layer already carries one.
        end_vectors.shape.assert_is_compatible_with(start_vectors.shape)
        hidden = start_vectors + end_vectors
        hidden = modeling.gelu(hidden)
        hidden = contrib_layers.layer_norm(hidden)
        span_logits = tf.layers.dense(
            hidden,
            1,
            activation=None,
            kernel_initializer=tf.truncated_normal_initializer(stddev=0.02),
            name="logits")
    return tf.reshape(span_logits, [batch_size, seq_length, max_span_length])
def mask_joint_logits(input_mask, start_end_logits):
    """Pushes the logits of invalid (start, span-length) pairs towards -inf.

    Slice k of the mask marks the start positions from which a span covering
    k + 1 consecutive unmasked tokens fits. It is built iteratively: slice 0
    is `input_mask` itself, and every further slice is the previous one
    multiplied by a copy of itself shifted one step to the left (zero-padded
    on the right).

    Args:
      input_mask: 2-D Tensor of 0/1 values; it is sliced as [:, 1:] and padded
        along its second axis -- presumably [batch_size, seq_length].
      start_end_logits: rank-3 float Tensor whose last dimension (the number
        of span lengths) must be statically known.

    Returns:
      `start_end_logits` with 1e6 subtracted wherever the mask is 0.
    """
    _, _, num_lengths = modeling.get_shape_list(start_end_logits,
                                                expected_rank=3)

    per_length_masks = []
    running_mask = input_mask
    for _ in range(num_lengths):
        per_length_masks.append(running_mask)
        # A span one token longer is allowed only where both the current
        # mask and its left-shifted copy are set.
        shifted = tf.pad(running_mask[:, 1:], [[0, 0], [0, 1]])
        running_mask = running_mask * shifted

    # Stack the slices along a new last axis so the mask lines up with the
    # logits' span-length dimension.
    span_mask = tf.stack(per_length_masks, axis=-1)
    span_mask.shape.assert_is_compatible_with(start_end_logits.shape)

    penalty = 1e6 * tf.cast(1 - span_mask, tf.float32)
    return start_end_logits - penalty
Example #5
0
    def _decode_record(record, name_to_features):
        """Parses one serialized tf.Example and prepares its image features.

        The "image" entry is decoded from JPEG, converted to float32 and
        resized with padding to IMG_HEIGHT x IMG_WIDTH. When FLAGS.use_bboxes
        is set, one crop per bounding box is produced via `parse_bounding_box`
        and, if FLAGS.use_bbox_position is also set, the box coordinates are
        rescaled from the original image size to the resized one. Finally all
        int64 features are cast to int32.

        Args:
          record: serialized example, as accepted by `tf.parse_single_example`.
          name_to_features: feature spec passed to `tf.parse_single_example`.

        Returns:
          The parsed feature dict with "image" (and optionally "bboxes" /
          "bbox_pos") replaced as described above.
        """
        example = tf.parse_single_example(record, name_to_features)

        # JPEG bytes -> uint8 image -> float32 image -> fixed-size padded image.
        decoded = tf.image.decode_jpeg(example["image"], channels=3)
        decoded.set_shape([None, None, 3])
        image_float = tf.image.convert_image_dtype(decoded, tf.float32)
        padded = tf.image.resize_image_with_pad(image_float, IMG_HEIGHT,
                                                IMG_WIDTH)
        example["image"] = tf.reshape(padded, [IMG_HEIGHT, IMG_WIDTH, 3])

        if FLAGS.use_bboxes:
            bbox_pos = tf.to_int32(example["bbox_pos"])
            example["bbox_pos"] = bbox_pos
            # One crop per bounding-box row, taken from the un-resized image.
            example["bboxes"] = tf.stack([
                parse_bounding_box(IMG_HEIGHT, IMG_WIDTH, image_float,
                                   bbox_pos[idx, :])
                for idx in range(FLAGS.max_num_bboxes)
            ])

            if FLAGS.use_bbox_position:
                # Rescale (y, x, height, width) to the resized image's
                # coordinate system.
                y, x, bbox_height, bbox_width = tf.unstack(bbox_pos, axis=1)
                orig_height, orig_width = modeling.get_shape_list(
                    image_float)[:2]
                rescaled = tf.stack([
                    IMG_HEIGHT * y / orig_height,
                    IMG_WIDTH * x / orig_width,
                    IMG_HEIGHT * bbox_height / orig_height,
                    IMG_WIDTH * bbox_width / orig_width,
                ], 1)
                example["bbox_pos"] = tf.cast(rescaled, dtype=tf.int32)

        # tf.Example stores integers as int64 while the TPU only handles
        # int32, so every int64 feature is narrowed.
        for name, feature in list(example.items()):
            if feature.dtype == tf.int64:
                example[name] = tf.to_int32(feature)

        return example
Example #6
0
def create_model(bert_config, is_training, input_ids, input_mask, segment_ids,
                 label_ids, num_labels):
    """Builds a BERT model with a per-token classification head.

    A linear layer (variables "output_weights" / "output_bias") is applied to
    the rank-3 sequence output directly, without flattening it first, so the
    logits keep the leading dimensions of the sequence output.

    Args:
      bert_config: configuration forwarded to `modeling.BertModel`.
      is_training: when true, dropout with keep_prob 0.9 is applied to the
        sequence output.
      input_ids: forwarded to `modeling.BertModel`.
      input_mask: forwarded to `modeling.BertModel`; also multiplied into the
        per-token loss so that masked positions contribute zero.
      segment_ids: forwarded to `modeling.BertModel` as token_type_ids.
      label_ids: integer labels, one-hot encoded with depth `num_labels`.
      num_labels: number of output classes.

    Returns:
      Tuple (loss, per_example_loss, logits, probabilities, predictions).
      NOTE: `loss` is the mean of the masked per-token losses over *all*
      positions, i.e. masked positions still count in the denominator.
    """
    model = modeling.BertModel(config=bert_config,
                               is_training=is_training,
                               input_ids=input_ids,
                               input_mask=input_mask,
                               token_type_ids=segment_ids)

    token_states = model.get_sequence_output()
    hidden_size = modeling.get_shape_list(token_states, expected_rank=3)[2]

    output_weights = tf.get_variable(
        "output_weights", [num_labels, hidden_size],
        initializer=tf.truncated_normal_initializer(stddev=0.02))
    output_bias = tf.get_variable("output_bias", [num_labels],
                                  initializer=tf.zeros_initializer())

    with tf.variable_scope("loss"):
        if is_training:
            # Equivalent to a dropout rate of 0.1.
            token_states = tf.nn.dropout(token_states, keep_prob=0.9)

        logits = tf.nn.bias_add(
            tf.matmul(token_states, output_weights, transpose_b=True),
            output_bias)
        probabilities = tf.nn.softmax(logits, axis=-1)
        log_probs = tf.nn.log_softmax(logits, axis=-1)
        predictions = tf.argmax(logits, axis=-1, output_type=tf.int32)

        # Cross-entropy against the one-hot labels, zeroed where the input
        # mask is 0.
        one_hot_labels = tf.one_hot(label_ids,
                                    depth=num_labels,
                                    dtype=tf.float32)
        token_loss = -tf.reduce_sum(one_hot_labels * log_probs, axis=-1)
        per_example_loss = token_loss * tf.cast(input_mask, tf.float32)
        loss = tf.reduce_mean(per_example_loss)

        return (loss, per_example_loss, logits, probabilities, predictions)
def create_model(bert_config, is_training, input_ids, input_mask, segment_ids, use_one_hot_embeddings):
    """Builds a BERT model with span (start/end) and has-answer heads.

    The span head is the same per-token linear layer used for SQuAD-style
    models; an additional linear layer on the pooled output classifies
    whether the paragraph contains an answer at all (2 classes: YES / NO).

    Args:
      bert_config: configuration forwarded to `modeling.BertModel`.
      is_training: forwarded to `modeling.BertModel`.
      input_ids: forwarded to `modeling.BertModel`.
      input_mask: forwarded to `modeling.BertModel`.
      segment_ids: forwarded to `modeling.BertModel` as token_type_ids.
      use_one_hot_embeddings: forwarded to `modeling.BertModel`.

    Returns:
      Tuple (start_logits, end_logits, has_answer_logits). The first two have
      shape [batch_size, seq_length]; the last is the pooled output times a
      [2, hidden] weight matrix, plus bias.
    """
    model = modeling.BertModel(
        config=bert_config,
        is_training=is_training,
        input_ids=input_ids,
        input_mask=input_mask,
        token_type_ids=segment_ids,
        use_one_hot_embeddings=use_one_hot_embeddings)

    # ---- Span head: one (start, end) logit pair per token. ----
    final_hidden = model.get_sequence_output()
    batch_size, seq_length, hidden_size = modeling.get_shape_list(
        final_hidden, expected_rank=3)

    # To initialise from a SQuAD-trained BERT QA checkpoint, replace 'emr' by
    # 'squad' in the two variable names below.
    output_weights = tf.get_variable(
        "cls/emr/output_weights", [2, hidden_size],
        initializer=tf.truncated_normal_initializer(stddev=0.02))
    output_bias = tf.get_variable(
        "cls/emr/output_bias", [2], initializer=tf.zeros_initializer())

    token_matrix = tf.reshape(final_hidden,
                              [batch_size * seq_length, hidden_size])
    span_logits = tf.matmul(token_matrix, output_weights, transpose_b=True)
    span_logits = tf.nn.bias_add(span_logits, output_bias)

    # [batch * seq, 2] -> [batch, seq, 2] -> [2, batch, seq], then split the
    # leading axis into the start and end logits.
    span_logits = tf.reshape(span_logits, [batch_size, seq_length, 2])
    span_logits = tf.transpose(span_logits, [2, 0, 1])
    per_kind = tf.unstack(span_logits, axis=0)
    start_logits = per_kind[0]
    end_logits = per_kind[1]

    # ---- Has-answer head on the pooled output. ----
    pooled = model.get_pooled_output()
    pooled_size = pooled.shape[-1].value
    has_answer_types = 2  # YES, NO
    has_answer_output_weights = tf.get_variable(
        "has_answer_output_weights", [has_answer_types, pooled_size],
        initializer=tf.truncated_normal_initializer(stddev=0.02))
    has_answer_output_bias = tf.get_variable(
        "has_answer_output_bias", [has_answer_types],
        initializer=tf.zeros_initializer())

    has_answer_logits = tf.nn.bias_add(
        tf.matmul(pooled, has_answer_output_weights, transpose_b=True),
        has_answer_output_bias)
    return (start_logits, end_logits, has_answer_logits)
Example #8
0
def create_cnn_model(is_training,
                     token_embeddings,
                     config,
                     batch_size,
                     segment_ids=None,
                     name="CNN"):
    """Builds a convolutional span-prediction head on top of token embeddings.

    The embeddings are treated as a single-channel image, passed through one
    `_conv_layer` per entry of `config.filter_shapes`, and a final linear
    layer maps each position to a (start, end) logit pair.

    Args:
      is_training: not referenced in this function.
      token_embeddings: rank-3 Tensor read as [batch, seq_length, hidden_size].
      config: object providing `filter_shapes`, `pool_shapes` and
        `channels_out`, indexed in parallel.
      batch_size: batch size used for all reshapes (the dynamic batch
        dimension of `token_embeddings` is ignored).
      segment_ids: not referenced in this function.
      name: not referenced in this function.

    Returns:
      Tuple (start_logits, end_logits), each of shape [batch_size, seq_length].
    """
    embedding_shape = get_shape_list(token_embeddings, expected_rank=3)
    seq_length, hidden_size = embedding_shape[1], embedding_shape[2]

    channels_in = 1
    n_positions = 2  # start and end logits

    # Add a trailing channel axis so the embeddings look like a 1-channel image.
    feature_map = tf.reshape(
        token_embeddings, [batch_size, seq_length, hidden_size, channels_in])
    for layer_idx, filter_shape in enumerate(config.filter_shapes):
        feature_map = _conv_layer(feature_map, filter_shape,
                                  config.pool_shapes[layer_idx], channels_in,
                                  config.channels_out[layer_idx],
                                  ('convfilter%d' % layer_idx))

    # Per-position feature width: product of the last two static dimensions.
    w_out = (feature_map.shape[-1] * feature_map.shape[-2]).value

    wd1 = tf.Variable(tf.truncated_normal([w_out, n_positions], stddev=0.03),
                      name='wd1')
    bd1 = tf.Variable(tf.truncated_normal([n_positions], stddev=0.01),
                      name='bd1')

    flat_features = tf.reshape(feature_map, [batch_size * seq_length, w_out])
    logits = tf.nn.bias_add(tf.matmul(flat_features, wd1), bd1)

    # [batch * seq, 2] -> [batch, seq, 2] -> [2, batch, seq].
    logits = tf.reshape(logits, [batch_size, seq_length, n_positions])
    logits = tf.transpose(logits, [2, 0, 1])

    per_kind = tf.unstack(logits, axis=0)
    start_logits = per_kind[0]
    end_logits = per_kind[1]
    return (start_logits, end_logits)
Example #9
0
    def build_block_attn_single_arg(self, scope=None):
        """Builds placeholders and the BERT encoder for the single-arg pipeline.

        Creates the `arg`, `label`, `conn` and `arg_attn_mask` placeholders
        (all int32), runs BERT over `arg` with all-zero segment ids, and
        stores on `self`:

          * `bert_arg`: the sequence output reshaped from
            [batch, seq, width] to [batch // 2, seq * 2, width], i.e. every
            two consecutive rows of the batch are laid end to end along the
            sequence axis.
          * `bert_mask_concat`: `arg_attn_mask` reshaped the same way to
            [batch // 2, seq * 2].

        The fed batch must therefore contain an even number of rows.

        Args:
          scope: not referenced in this part of the method.
        """
        tf.logging.debug("Building single arg pipeline in BERT Embedding")
        self.arg = tf.placeholder(tf.int32, [None, self.max_arg_length],
                                  name='arg')
        self.label = tf.placeholder(tf.int32, [None], name="label")

        # TODO: max_len for conn???
        self.conn = tf.placeholder(tf.int32, [None, self.max_arg_length],
                                   name="conn")

        self.arg_attn_mask = tf.placeholder(tf.int32,
                                            [None, self.max_arg_length],
                                            name="arg_attn_mask")

        segment_ids = tf.zeros_like(self.arg, dtype=tf.int32)

        bert_model = modeling.BertModel(config=self.bert_config,
                                        is_training=self.is_training,
                                        input_ids=self.arg,
                                        input_mask=self.arg_attn_mask,
                                        token_type_ids=segment_ids,
                                        use_one_hot_embeddings=False,
                                        scope='bert')
        bert_arg = bert_model.get_sequence_output()

        # # custom
        # self.build_bert_model(input_ids=self.arg,
        #                       input_mask=self.arg_attn_mask,
        #                       token_type_ids=segment_ids)
        #
        # bert_arg = self.sequence_output

        input_shape = modeling.get_shape_list(bert_arg, expected_rank=3)
        batch_size = input_shape[0]
        seq_length = input_shape[1]
        width = input_shape[2]

        # Floor division keeps the target shape integral. `/` is true
        # division under Python 3 (and on int32 tensors), which would put a
        # float into the shape and make tf.reshape fail.
        paired_batch_size = batch_size // 2
        paired_seq_length = seq_length * 2

        self.bert_arg = tf.reshape(
            bert_arg, shape=[paired_batch_size, paired_seq_length, width])
        self.bert_mask_concat = tf.reshape(
            self.arg_attn_mask, shape=[paired_batch_size, paired_seq_length])
Example #10
0
def run_one_hot_embeddings(one_hot_input_ids, config):
    """Embeds one-hot token vectors with BERT's word-embedding table only.

    The table lives at "bert/embeddings/word_embeddings" (created or reused
    via AUTO_REUSE), mirroring the variable used by
    `modeling.embedding_lookup`. The lookup is done as a matrix product, so
    the last dimension of the input must equal `config.vocab_size`.

    Args:
      one_hot_input_ids: Tensor whose last dimension has size
        `config.vocab_size`; all leading dimensions are preserved.
      config: object providing `vocab_size`, `hidden_size` and
        `initializer_range`.

    Returns:
      Tuple (output, embedding_table), where `output` has the leading
      dimensions of the input followed by `config.hidden_size`.
    """
    with tf.variable_scope("bert", reuse=tf.compat.v1.AUTO_REUSE), \
            tf.variable_scope("embeddings"):
        table_initializer = modeling.create_initializer(
            config.initializer_range)
        embedding_table = tf.get_variable(
            name="word_embeddings",
            shape=[config.vocab_size, config.hidden_size],
            initializer=table_initializer)

        # Collapse all leading dimensions so the lookup is one 2-D matmul.
        vocab_rows = tf.reshape(one_hot_input_ids, [-1, config.vocab_size])
        embedded_rows = tf.matmul(vocab_rows, embedding_table)

        # Restore the leading dimensions, swapping vocab_size for hidden_size.
        leading_dims = modeling.get_shape_list(one_hot_input_ids)[0:-1]
        output = tf.reshape(embedded_rows,
                            leading_dims + [config.hidden_size])

        return (output, embedding_table)
Example #11
0
def create_model(bert_config, is_training, input_ids, input_mask, segment_ids,
                 use_one_hot_embeddings):
    """Builds a BERT model with a SQuAD-style start/end span head.

    A single linear layer ("cls/squad/output_weights" / "cls/squad/output_bias")
    maps every token's final hidden state to two logits.

    Args:
      bert_config: configuration forwarded to `modeling.BertModel`.
      is_training: forwarded to `modeling.BertModel`.
      input_ids: forwarded to `modeling.BertModel`.
      input_mask: forwarded to `modeling.BertModel`.
      segment_ids: forwarded to `modeling.BertModel` as token_type_ids.
      use_one_hot_embeddings: forwarded to `modeling.BertModel`.

    Returns:
      Tuple (start_logits, end_logits), each of shape [batch_size, seq_length].
    """
    model = modeling.BertModel(
        config=bert_config,
        is_training=is_training,
        input_ids=input_ids,
        input_mask=input_mask,
        token_type_ids=segment_ids,
        use_one_hot_embeddings=use_one_hot_embeddings)

    final_hidden = model.get_sequence_output()
    batch_size, seq_length, hidden_size = modeling.get_shape_list(
        final_hidden, expected_rank=3)

    output_weights = tf.get_variable(
        "cls/squad/output_weights", [2, hidden_size],
        initializer=tf.truncated_normal_initializer(stddev=0.02))
    output_bias = tf.get_variable(
        "cls/squad/output_bias", [2], initializer=tf.zeros_initializer())

    # Flatten tokens so the head is one 2-D matmul plus bias.
    token_matrix = tf.reshape(final_hidden,
                              [batch_size * seq_length, hidden_size])
    logits = tf.nn.bias_add(
        tf.matmul(token_matrix, output_weights, transpose_b=True),
        output_bias)

    # [batch * seq, 2] -> [batch, seq, 2] -> [2, batch, seq].
    logits = tf.reshape(logits, [batch_size, seq_length, 2])
    logits = tf.transpose(logits, [2, 0, 1])

    per_kind = tf.unstack(logits, axis=0)
    start_logits = per_kind[0]
    end_logits = per_kind[1]
    return (start_logits, end_logits)
def dense_layer_3d(input_tensor,
                   num_attention_heads,
                   size_per_head,
                   initializer,
                   activation,
                   name=None):
    """Applies a dense layer whose kernel is viewed as a 3-D tensor.

    The kernel is stored as [hidden_size, num_attention_heads * size_per_head]
    and reshaped to [hidden_size, num_attention_heads, size_per_head] before
    being contracted with the input, so the output is already split per head.

    Args:
      input_tensor: float Tensor of shape [batch, seq_length, hidden_size].
      num_attention_heads: Number of attention heads.
      size_per_head: The size per attention head.
      initializer: Kernel initializer.
      activation: Activation function, or None for a linear layer.
      name: The variable scope of this layer.

    Returns:
      float Tensor of shape
      [batch, seq_length, num_attention_heads, size_per_head].
    """
    hidden_size = modeling.get_shape_list(input_tensor)[-1]
    flat_units = num_attention_heads * size_per_head

    with tf.variable_scope(name):
        kernel = tf.get_variable(name="kernel",
                                 shape=[hidden_size, flat_units],
                                 initializer=initializer)
        kernel = tf.reshape(
            kernel, [hidden_size, num_attention_heads, size_per_head])
        bias = tf.get_variable(name="bias",
                               shape=[flat_units],
                               initializer=tf.zeros_initializer)
        bias = tf.reshape(bias, [num_attention_heads, size_per_head])

        # a=batch, b=seq, c=hidden, d=head, e=size_per_head.
        projected = tf.einsum("abc,cde->abde", input_tensor, kernel)
        projected = projected + bias
        return projected if activation is None else activation(projected)
Example #13
0
def create_fully_connected_model(is_training, token_embeddings, config,
                                 batch_size, segment_ids):
    """Builds a linear start/end span head directly on token embeddings.

    Args:
      is_training: not referenced in this function.
      token_embeddings: rank-3 Tensor read as [batch, seq_length, hidden_size].
      config: object providing `mask_questions`; when truthy the embeddings
        are first passed through `mask_questions_batch`.
      batch_size: batch size used for all reshapes (the dynamic batch
        dimension of `token_embeddings` is ignored).
      segment_ids: forwarded to `mask_questions_batch` when masking is on.

    Returns:
      Tuple (start_logits, end_logits), each of shape [batch_size, seq_length].
    """
    embedding_shape = get_shape_list(token_embeddings, expected_rank=3)
    seq_length, hidden_size = embedding_shape[1], embedding_shape[2]

    n_positions = 2  # start and end logits

    output_weights = tf.get_variable(
        "cls/squad/output_weights", [n_positions, hidden_size],
        initializer=tf.truncated_normal_initializer(stddev=0.02))
    output_bias = tf.get_variable("cls/squad/output_bias", [n_positions],
                                  initializer=tf.zeros_initializer())

    if config.mask_questions:
        token_embeddings = mask_questions_batch(token_embeddings, segment_ids,
                                                hidden_size)

    # Flatten tokens so the head is one 2-D matmul plus bias.
    token_matrix = tf.reshape(token_embeddings,
                              [batch_size * seq_length, hidden_size])
    logits = tf.nn.bias_add(
        tf.matmul(token_matrix, output_weights, transpose_b=True),
        output_bias)

    # [batch * seq, 2] -> [batch, seq, 2] -> [2, batch, seq].
    logits = tf.reshape(logits, [batch_size, seq_length, n_positions])
    logits = tf.transpose(logits, [2, 0, 1])

    per_kind = tf.unstack(logits, axis=0)
    start_logits = per_kind[0]
    end_logits = per_kind[1]
    return (start_logits, end_logits)
Example #14
0
    def model_fn(features, labels, mode, params):  # pylint: disable=unused-argument
        """The `model_fn` for TPUEstimator.

        Builds an image + text classifier: a ResNet feature vector for the
        whole image (and optionally one per bounding box) is placed at
        selected token positions and handed to `B2T2Model` as
        `image_embeddings`; a linear layer on the pooled output yields the
        classification logits.

        The names `bert_config`, `use_one_hot_embeddings`, `init_checkpoint`,
        `use_tpu`, `learning_rate`, `num_train_steps` and `num_warmup_steps`
        are not defined here; they are resolved from a surrounding scope that
        is not visible in this block.

        Args:
          features: dict of Tensors. Keys read here: "label", "image",
            "input_ids", "input_mask", "segment_ids"; with FLAGS.use_bboxes
            also "bboxes", "bbox_idx" (and "bbox_pos" with
            FLAGS.use_bbox_position); with FLAGS.mask_lm_loss also
            "masked_lm_positions", "masked_lm_ids", "masked_lm_weights"; in
            PREDICT mode also "img_id", "annot_id" (and "choice_id" with
            FLAGS.has_choice_id).
          labels: unused; the label is read from `features["label"]`.
          mode: a `tf_estimator.ModeKeys` value; only TRAIN and PREDICT are
            handled.
          params: dict; `params["batch_size"]` is used for all reshapes.

        Returns:
          A `contrib_tpu.TPUEstimatorSpec` for the given mode.

        Raises:
          ValueError: if `mode` is neither TRAIN nor PREDICT.
        """

        tf.logging.info("*** Features ***")
        for name in sorted(features.keys()):
            tf.logging.info("  name = %s, shape = %s" %
                            (name, features[name].shape))

        label = features["label"]
        is_training = (mode == tf_estimator.ModeKeys.TRAIN)
        module = hub.Module(
            "https://tfhub.dev/google/imagenet/resnet_v2_152/feature_vector/3",
            trainable=FLAGS.trainable_resnet)
        batch_size = params["batch_size"]
        height, width = hub.get_expected_image_size(module)

        # First extract a global image vector.
        image_vector = module(tf.reshape(features["image"],
                                         [batch_size, height, width, 3]),
                              signature="image_feature_vector",
                              as_dict=True)["default"]
        image_vector = tf.reshape(image_vector,
                                  [batch_size, 1, FLAGS.image_vector_dim])

        # The global image vector is added in the position of the [IMAGE] token,
        # which comes right after the [CLS] token.
        # Resulting shape: [batch_size, FLAGS.max_seq_length,
        # FLAGS.image_vector_dim], zero everywhere except sequence position 1.
        image_embeddings = tf.concat([
            tf.zeros([batch_size, 1, FLAGS.image_vector_dim]), image_vector,
            tf.zeros(
                [batch_size, FLAGS.max_seq_length - 2, FLAGS.image_vector_dim])
        ],
                                     axis=1)

        if FLAGS.use_bboxes:
            # Then extract an image vector for each of the bounding boxes (at most
            # FLAGS.max_num_bboxes).
            # All boxes of all examples are folded into the batch axis so the
            # module is applied once.
            boxes_vectors = module(tf.reshape(
                features["bboxes"],
                [batch_size * FLAGS.max_num_bboxes, height, width, 3]),
                                   signature="image_feature_vector",
                                   as_dict=True)["default"]
            boxes_vectors = tf.reshape(
                boxes_vectors,
                [batch_size, FLAGS.max_num_bboxes, FLAGS.image_vector_dim])

            if FLAGS.use_bbox_position:
                tf.logging.info("Embedding bbox position with 56 positions.")

                # Position embedding for bbox location.
                def _make_position_embedding_table(scope):
                    # 56 is 224 / 4
                    # Each table row is a quarter of the image vector width;
                    # four such rows (x1, x2, y1, y2) are concatenated below.
                    return tf.get_variable(
                        scope, [56, FLAGS.image_vector_dim // 4],
                        initializer=tf.truncated_normal_initializer(
                            stddev=0.02))

                position_x_embeds = _make_position_embedding_table(
                    "position_x_embeddings")
                position_y_embeds = _make_position_embedding_table(
                    "position_y_embeddings")

                # bbox_pos features are top left corner in image, and height and width.
                # Coordinates are bucketed by 4 pixels before the one-hot
                # lookup. NOTE(review): y + height or x + width may reach 56,
                # which is outside the one-hot depth -- confirm this is
                # intended.
                bbox_pos = features["bbox_pos"] // 4
                y1 = tf.one_hot(bbox_pos[:, :, 0], 56)
                x1 = tf.one_hot(bbox_pos[:, :, 1], 56)
                y2 = tf.one_hot(bbox_pos[:, :, 0] + bbox_pos[:, :, 2], 56)
                x2 = tf.one_hot(bbox_pos[:, :, 1] + bbox_pos[:, :, 3], 56)

                # The one-hot matrices are multiplied with the tables, which
                # is an embedding lookup expressed as an einsum.
                bbox_x_embeds = tf.einsum("bixc,cd->bixd",
                                          tf.stack([x1, x2], axis=2),
                                          position_x_embeds)
                bbox_y_embeds = tf.einsum("biyc,cd->biyd",
                                          tf.stack([y1, y2], axis=2),
                                          position_y_embeds)
                # [batch_size, max_num_bboxes, image_vector_size]
                bbox_pos_embeds = tf.concat([
                    tf.reshape(bbox_x_embeds,
                               [batch_size, FLAGS.max_num_bboxes, -1]),
                    tf.reshape(bbox_y_embeds,
                               [batch_size, FLAGS.max_num_bboxes, -1])
                ], -1)
                boxes_vectors += bbox_pos_embeds

            # Now place the image vectors of each bounding box in the position of each
            # special token that references that bounding box. The letters in the
            # einsum mean the following:
            #   b: batch element index
            #   t: token index
            #   i: bounding box index
            #   d: depth of image representation
            image_embeddings += tf.einsum(
                "bti,bid->btd",
                tf.one_hot(features["bbox_idx"], FLAGS.max_num_bboxes),
                boxes_vectors)

        with tf.variable_scope("bert") as scope:
            # This is just like a regular BERT model, but the given image embeddings
            # are added to the input word piece embeddings.
            model = B2T2Model(config=bert_config,
                              is_training=is_training,
                              input_ids=features["input_ids"],
                              image_embeddings=image_embeddings,
                              input_mask=features["input_mask"],
                              token_type_ids=features["segment_ids"],
                              use_one_hot_embeddings=use_one_hot_embeddings,
                              scope=scope)
            # Classification head on the pooled output; its variables are
            # created inside the "bert" scope.
            output_weights = tf.get_variable(
                "output_weights",
                [FLAGS.num_output_labels, bert_config.hidden_size],
                initializer=tf.truncated_normal_initializer(stddev=0.02))
            output_bias = tf.get_variable("output_bias",
                                          [FLAGS.num_output_labels],
                                          initializer=tf.zeros_initializer())
            logits = tf.matmul(model.get_pooled_output(),
                               output_weights,
                               transpose_b=True)
            logits = tf.nn.bias_add(logits, output_bias)

        tvars = tf.trainable_variables()

        initialized_variable_names = {}
        scaffold_fn = None
        if init_checkpoint:
            (assignment_map, initialized_variable_names
             ) = modeling.get_assignment_map_from_checkpoint(
                 tvars, init_checkpoint)
            if use_tpu:

                # On TPU the checkpoint restore is deferred to a scaffold
                # function handed to the TPUEstimatorSpec.
                def tpu_scaffold():
                    tf.train.init_from_checkpoint(init_checkpoint,
                                                  assignment_map)
                    return tf.train.Scaffold()

                scaffold_fn = tpu_scaffold
            else:
                tf.train.init_from_checkpoint(init_checkpoint, assignment_map)

        tf.logging.info("**** Trainable Variables ****")
        for var in tvars:
            init_string = ""
            if var.name in initialized_variable_names:
                init_string = ", *INIT_FROM_CKPT*"
            tf.logging.info("  name = %s, shape = %s%s", var.name, var.shape,
                            init_string)

        output_spec = None
        if mode == tf_estimator.ModeKeys.TRAIN:
            # NOTE(review): if neither FLAGS.negative_loss nor
            # FLAGS.mask_lm_loss is set, `loss` stays the Python float 0.0
            # when it reaches create_optimizer -- confirm at least one flag
            # is always enabled.
            loss = 0.0
            if FLAGS.negative_loss:
                tf.logging.info("Using negative loss.")
                loss += tf.losses.sparse_softmax_cross_entropy(
                    label, logits, reduction=tf.losses.Reduction.MEAN)

            if FLAGS.mask_lm_loss:
                tf.logging.info("Using mask LM loss.")
                # Don't use mask LM for negative captions.
                # Multiplying by the label zeroes the weights of examples
                # whose label is 0.
                masked_lm_label_weights = features["masked_lm_weights"]
                masked_lm_label_weights *= tf.expand_dims(
                    tf.cast(label, tf.float32), -1)
                (masked_lm_loss, _, _) = run_pretraining.get_masked_lm_output(
                    bert_config, model.get_sequence_output(),
                    model.get_embedding_table(),
                    features["masked_lm_positions"], features["masked_lm_ids"],
                    masked_lm_label_weights)
                loss += masked_lm_loss

            train_op = optimization.create_optimizer(loss, learning_rate,
                                                     num_train_steps,
                                                     num_warmup_steps, use_tpu)

            output_spec = contrib_tpu.TPUEstimatorSpec(mode=mode,
                                                       loss=loss,
                                                       train_op=train_op,
                                                       scaffold_fn=scaffold_fn)

        elif mode == tf_estimator.ModeKeys.PREDICT:
            predictions = {
                # Image, annotation and choice identifiers.
                "img_id": features["img_id"],
                "annot_id": features["annot_id"],

                # Gold label.
                "label": label,
            }

            if FLAGS.mask_lm_loss:
                # We don't care about masked_lm_weights for prediction.
                _, _, masked_lm_log_probs = run_pretraining.get_masked_lm_output(
                    bert_config, model.get_sequence_output(),
                    model.get_embedding_table(),
                    features["masked_lm_positions"], features["masked_lm_ids"],
                    features["masked_lm_weights"])
                # [batch_size * max_preds_per_seq]
                masked_lm_preds = tf.argmax(masked_lm_log_probs,
                                            axis=-1,
                                            output_type=tf.int32)
                # Note: this rebinds `batch_size` from the shape of the
                # positions feature rather than from `params`.
                batch_size, max_preds_per_seq = modeling.get_shape_list(
                    features["masked_lm_positions"])
                masked_lm_preds = tf.reshape(masked_lm_preds,
                                             [batch_size, max_preds_per_seq])
                predictions.update({
                    "input_ids": features["input_ids"],
                    "masked_lm_ids": features["masked_lm_ids"],
                    "masked_lm_preds": masked_lm_preds
                })

            if FLAGS.has_choice_id:
                predictions["choice_id"] = features["choice_id"]

            # Emit either the raw logits or the arg-max class, never both.
            if FLAGS.do_output_logits:
                predictions["output_logits"] = logits
            else:
                predictions["output_label"] = tf.argmax(logits, axis=-1)

            output_spec = contrib_tpu.TPUEstimatorSpec(mode=mode,
                                                       predictions=predictions,
                                                       scaffold_fn=scaffold_fn)
        else:
            raise ValueError("Only TRAIN and PREDICT modes are supported: %s" %
                             (mode))

        return output_spec
def create_model(bert_config, is_training, input_ids, input_mask, segment_ids,
                 span_encoding, max_answer_length, use_one_hot_embeddings):
    """Builds a BERT encoder with span and answer-type prediction heads.

    Args:
      bert_config: Configuration object forwarded to `modeling.BertModel`.
      is_training: Forwarded to `modeling.BertModel`; also enables a static
        shape compatibility check in the "concat-mlp" branch.
      input_ids: Token ids forwarded to `modeling.BertModel`.
      input_mask: Input mask forwarded to `modeling.BertModel` and, for
        "concat-mlp", to `mask_joint_logits`.
      segment_ids: Forwarded to `modeling.BertModel` as `token_type_ids`.
      span_encoding: Either "independent" (separate start/end logits from one
        linear layer) or "concat-mlp" (joint span logits).
      max_answer_length: Forwarded to `compute_joint_mlp_logits`; only used
        for "concat-mlp".
      use_one_hot_embeddings: Forwarded to `modeling.BertModel`.

    Returns:
      A tuple `(start_logits, end_logits, answer_type_logits)`. For
      "concat-mlp" the joint span logits are returned in `start_logits` and
      `end_logits` is a zero placeholder of shape `[batch_size]`.

    Raises:
      ValueError: If `span_encoding` is not one of the supported values.
    """
    bert = modeling.BertModel(config=bert_config,
                              is_training=is_training,
                              input_ids=input_ids,
                              input_mask=input_mask,
                              token_type_ids=segment_ids,
                              use_one_hot_embeddings=use_one_hot_embeddings)

    # Per-token encoder output feeds the span head.
    sequence_output = bert.get_sequence_output()
    batch_size, seq_length, hidden_size = modeling.get_shape_list(
        sequence_output, expected_rank=3)

    if span_encoding not in ("independent", "concat-mlp"):
        raise ValueError("Unknown span_encoding: %s" % span_encoding)

    if span_encoding == "independent":
        # One linear layer with two output units: unit 0 scores the span
        # start, unit 1 scores the span end.
        span_weights = tf.get_variable(
            "cls/coqa/output_weights", [2, hidden_size],
            initializer=tf.truncated_normal_initializer(stddev=0.02))
        span_bias = tf.get_variable("cls/coqa/output_bias", [2],
                                    initializer=tf.zeros_initializer())

        flat_hidden = tf.reshape(sequence_output,
                                 [batch_size * seq_length, hidden_size])
        span_logits = tf.nn.bias_add(
            tf.matmul(flat_hidden, span_weights, transpose_b=True), span_bias)
        span_logits = tf.reshape(span_logits, [batch_size, seq_length, 2])
        start_logits, end_logits = tf.unstack(span_logits, axis=2)
    else:
        # span_encoding == "concat-mlp"
        with tf.variable_scope("coqa"):
            if is_training:
                # The batch size can be variable during inference, so the
                # static shape is only checked while training.
                sequence_output.shape.assert_is_compatible_with(
                    (batch_size, seq_length, hidden_size))
            joint_logits = compute_joint_mlp_logits(sequence_output,
                                                    max_answer_length)
            start_logits = mask_joint_logits(input_mask, joint_logits)
            # Placeholder so the return signature matches the other branch.
            end_logits = tf.zeros([batch_size], dtype=tf.float32)

    # Answer-type head on top of the pooled output.
    # TODO(epitler): Try variants here.
    pooled_output = bert.get_pooled_output()
    pooled_width = pooled_output.shape[-1].value

    num_answer_types = 5  # YES, NO, UNKNOWN, EXTRACTIVE, ABSTRACTIVE
    answer_type_output_weights = tf.get_variable(
        "answer_type_output_weights", [num_answer_types, pooled_width],
        initializer=tf.truncated_normal_initializer(stddev=0.02))
    answer_type_output_bias = tf.get_variable(
        "answer_type_output_bias", [num_answer_types],
        initializer=tf.zeros_initializer())

    answer_type_logits = tf.nn.bias_add(
        tf.matmul(pooled_output,
                  answer_type_output_weights,
                  transpose_b=True), answer_type_output_bias)
    return (start_logits, end_logits, answer_type_logits)
Example #16
0
def create_mask_model(bert_config, is_training, input_ids, input_mask,
                      segment_ids, mask_positions, use_one_hot_embeddings):
    """Builds BERT plus one extra transformer layer fed with mask positions.

    The BERT sequence output is widened by 12 channels (the `mask_positions`
    value followed by 11 zeros), passed through a single additional
    transformer layer, and projected to start/end logits. A separate linear
    head on the pooled output produces answer-type logits.

    Args:
      bert_config: Configuration object forwarded to `modeling.BertModel` and
        used to configure the extra transformer layer.
      is_training: Forwarded to `modeling.BertModel`.
      input_ids: Token ids forwarded to `modeling.BertModel`.
      input_mask: Input mask forwarded to `modeling.BertModel` and used to
        build the attention mask of the extra layer.
      segment_ids: Forwarded to `modeling.BertModel` as `token_type_ids`.
      mask_positions: Tensor reshaped to `[batch_size * seq_length, 1]` and
        cast to float32 before being appended to the hidden states.
      use_one_hot_embeddings: Forwarded to `modeling.BertModel`.

    Returns:
      A tuple `(start_logits, end_logits, answer_type_logits)`; the first two
      have shape `[batch_size, seq_length]`.
    """
    # Number of channels appended to every token vector: one carries the mask
    # position flag, the rest are zero padding.
    # NOTE(review): presumably 12 was chosen so that hidden_size + 12 stays
    # divisible by the number of attention heads -- confirm.
    extra_width = 12

    bert = modeling.BertModel(config=bert_config,
                              is_training=is_training,
                              input_ids=input_ids,
                              input_mask=input_mask,
                              token_type_ids=segment_ids,
                              use_one_hot_embeddings=use_one_hot_embeddings)

    sequence_output = bert.get_sequence_output()
    batch_size, seq_length, hidden_size = modeling.get_shape_list(
        sequence_output, expected_rank=3)
    widened_size = hidden_size + extra_width
    num_tokens = batch_size * seq_length

    # Span head parameters operate on the widened representation.
    output_weights = tf.get_variable(
        "cls/nq/output_weights", [2, widened_size],
        initializer=tf.truncated_normal_initializer(stddev=0.02))
    output_bias = tf.get_variable("cls/nq/output_bias", [2],
                                  initializer=tf.zeros_initializer())

    # Append [mask_position, 0, ..., 0] to every token vector.
    flat_hidden = tf.reshape(sequence_output, [num_tokens, hidden_size])
    position_column = tf.cast(tf.reshape(mask_positions, [num_tokens, 1]),
                              dtype=tf.float32)
    zero_columns = tf.zeros([num_tokens, extra_width - 1], dtype=tf.float32)
    position_features = tf.concat([position_column, zero_columns], axis=-1)
    widened_hidden = tf.concat([flat_hidden, position_features], axis=-1)
    widened_hidden = tf.reshape(widened_hidden,
                                [batch_size, seq_length, widened_size])

    attention_mask = modeling.create_attention_mask_from_input_mask(
        input_ids, input_mask)
    encoder_layers = modeling.transformer_model(
        input_tensor=widened_hidden,
        attention_mask=attention_mask,
        hidden_size=bert_config.hidden_size + extra_width,
        # A single extra layer rather than bert_config.num_hidden_layers.
        num_hidden_layers=1,
        num_attention_heads=bert_config.num_attention_heads,
        intermediate_size=bert_config.intermediate_size,
        intermediate_act_fn=modeling.get_activation(bert_config.hidden_act),
        hidden_dropout_prob=bert_config.hidden_dropout_prob,
        attention_probs_dropout_prob=bert_config.attention_probs_dropout_prob,
        initializer_range=bert_config.initializer_range,
        do_return_all_layers=True)

    flat_encoded = tf.reshape(encoder_layers[-1], [num_tokens, widened_size])
    span_logits = tf.nn.bias_add(
        tf.matmul(flat_encoded, output_weights, transpose_b=True),
        output_bias)

    # [batch, seq, 2] -> [2, batch, seq], then split into start and end.
    span_logits = tf.reshape(span_logits, [batch_size, seq_length, 2])
    span_logits = tf.transpose(span_logits, [2, 0, 1])
    start_logits, end_logits = tf.unstack(span_logits, axis=0)

    # Answer-type head on top of the pooled output.
    pooled_output = bert.get_pooled_output()
    pooled_width = pooled_output.shape[-1].value

    num_answer_types = 5  # YES, NO, UNKNOWN, SHORT, LONG
    answer_type_output_weights = tf.get_variable(
        "answer_type_output_weights", [num_answer_types, pooled_width],
        initializer=tf.truncated_normal_initializer(stddev=0.02))
    answer_type_output_bias = tf.get_variable(
        "answer_type_output_bias", [num_answer_types],
        initializer=tf.zeros_initializer())

    answer_type_logits = tf.nn.bias_add(
        tf.matmul(pooled_output,
                  answer_type_output_weights,
                  transpose_b=True), answer_type_output_bias)

    return (start_logits, end_logits, answer_type_logits)
Example #17
0
def create_contextualized_cnn_model(is_training,
                                    token_embeddings,
                                    config,
                                    batch_size,
                                    segment_ids=None,
                                    name="deconv_model"):
    """Builds a CNN whose convolution filters are generated per example.

    An optional downsizing CNN is applied to the token embeddings, a second
    CNN generates filters from the downsized input, the filters are
    optionally pooled, and the (optionally question-masked) input is then
    convolved with its own filters. The result is split in two halves that
    are projected to start and end logits.

    Author's note: impossible to train in practice (observed
    examples/sec: 0.152973). May need a closer look to figure out whether
    something is wrong, or a custom implementation.
    See https://arxiv.org/pdf/1603.07285.pdf

    Args:
      is_training: Forwarded to `apply_conv_layers`.
      token_embeddings: Rank-3 tensor of token embeddings.
      config: Object providing `cnn_downsize`, `filter_generator`,
        `filter_generator_pooling` and `mask_questions`.
      batch_size: Python int; used for the per-sample convolution and for
        reshaping the logits.
      segment_ids: Forwarded to `mask_questions_batch` when
        `config.mask_questions` is set.
      name: Not referenced in the function body.

    Returns:
      A tuple `(start_logits, end_logits)`, each reshaped to
      `[batch_size, seq_length]`.
    """
    print('batch_size: %d' % batch_size)

    # The batch dimension is taken from the `batch_size` argument instead of
    # the static shape, which is not reliable during prediction.
    seq_length = get_shape_list(token_embeddings, expected_rank=3)[1]

    if len(config.cnn_downsize.filter_shapes) > 0:
        downsized_input = apply_conv_layers(
            is_training,
            token_embeddings,
            config.cnn_downsize,
            dropout_rate=0.,
            name='contextualized_cnn/downsizer',
        )
    else:
        downsized_input = token_embeddings

    # Only the sequence axis is checked; the batch axis check does not work
    # during prediction.
    assert downsized_input.shape[1].value == seq_length
    downsized_channels_out = downsized_input.shape[-1].value

    if config.mask_questions:
        paragraphs = mask_questions_batch(downsized_input, segment_ids,
                                          downsized_channels_out)
    else:
        paragraphs = downsized_input

    # Filters are generated from the unmasked downsized input.
    filters = apply_conv_layers(is_training,
                                downsized_input,
                                config.filter_generator,
                                name='contextualized_cnn/filter_generator',
                                dropout_rate=0.)

    assert filters.shape[1].value == seq_length
    n_filters = filters.shape[2].value // downsized_channels_out
    # Must be even so the output can be split into start and end halves.
    assert (n_filters % 2) == 0

    pooling = config.filter_generator_pooling
    window = [pooling['size'], 1, 1]
    stride = [pooling['strides'], 1, 1]
    pooling_type = pooling['type']
    if pooling_type == 'max':
        filters = tf.nn.max_pool1d(filters, window, stride, 'VALID')
    elif pooling_type == 'avg':
        filters = tf.nn.avg_pool1d(filters, window, stride, 'VALID')
    # Any other pooling type leaves the filters unpooled.

    contextualized = apply_per_sample_conv1d(paragraphs, filters, n_filters,
                                             batch_size)

    start_features, end_features = tf.split(contextualized, 2, axis=2)
    feature_channels_out = n_filters // 2

    def project_to_logits(features):
        """Applies a fresh single-unit linear layer to `features`."""
        weights = tf.Variable(tf.truncated_normal([feature_channels_out, 1],
                                                  stddev=0.03),
                              name='wd1')
        bias = tf.Variable(tf.truncated_normal([1], stddev=0.01), name='bd1')

        scores = tf.nn.bias_add(tf.matmul(features, weights), bias)
        return tf.reshape(scores, [batch_size, seq_length])

    # Each call creates its own weight and bias variables.
    start_logits = project_to_logits(start_features)
    end_logits = project_to_logits(end_features)

    return (start_logits, end_logits)
    def _create_model(self, mode, input_ids, input_mask, segment_ids, labels,
                      slot_labels, labels_mask, drop_keep_prob,
                      entity_type_ids, sequence_lengths):
        """Creates a LaserTagger model with joint intent and slot heads.

        The BERT sequence output is combined with a weighted entity-type
        embedding, passed through a bidirectional LSTM, and fed to two dense
        projections: one over the concatenated final LSTM cell states
        (intent) and one over the layer-normalized per-token LSTM outputs
        (slots).

        Args:
          mode: A `tf.estimator.ModeKeys` value. Dropout is applied only in
            TRAIN mode; losses are computed in every mode except PREDICT.
          input_ids: Token ids forwarded to `modeling.BertModel`; its shape
            supplies `batch_size` and `seq_length`.
          input_mask: Forwarded to `modeling.BertModel`.
          segment_ids: Forwarded to `modeling.BertModel` as `token_type_ids`.
          labels: Intent labels for the sparse softmax cross-entropy loss.
          slot_labels: Per-token slot labels for the sparse softmax
            cross-entropy loss.
          labels_mask: Summed over axis 1 to normalize the per-example slot
            loss.
          drop_keep_prob: Keep probability passed to `tf.nn.dropout`.
          entity_type_ids: Reshaped to
            `[batch_size * seq_length, self.entity_type_num]` and cast to
            float32; presumably a multi-hot entity-type indicator per token
            -- TODO confirm against the caller.
          sequence_lengths: Forwarded to `tf.nn.bidirectional_dynamic_rnn`.

        Returns:
          A tuple `(loss, per_example_slot_loss, pred_intent, pred_slot,
          batch_size, entity_type_emb, impact_weight_matrix,
          entity_type_ids_matrix, final_layer, slot_logits)`. `loss` and
          `per_example_slot_loss` are None in PREDICT mode.
        """
        is_training = (mode == tf.estimator.ModeKeys.TRAIN)
        model = modeling.BertModel(
            config=self._config,
            is_training=is_training,
            input_ids=input_ids,
            input_mask=input_mask,
            token_type_ids=segment_ids,
            use_one_hot_embeddings=self._use_one_hot_embeddings)

        final_layer = model.get_sequence_output()
        # final_hidden = model.get_pooled_output()

        if is_training:
            # I.e., 0.1 dropout
            # final_hidden = tf.nn.dropout(final_hidden, keep_prob=drop_keep_prob)
            final_layer = tf.nn.dropout(final_layer, keep_prob=drop_keep_prob)

        # Incorporate entity information.
        batch_size, seq_length = modeling.get_shape_list(input_ids)

        # One embedding vector per entity type, with the same width as the
        # BERT hidden states so it can be added to them directly.
        self.entity_type_embedding = tf.get_variable(
            name="entity_type_embedding",
            shape=(self.entity_type_num, self._config.hidden_size),
            dtype=tf.float32,
            trainable=True,
            initializer=tf.random_uniform_initializer(
                -self._config.initializer_range * 100,
                self._config.initializer_range * 100,
                seed=20))

        # Initial value is uniform: every entity type starts with weight
        # 1 / entity_type_num.
        with tf.init_scope():
            impact_weight_init = tf.constant(1.0 / self.entity_type_num,
                                             dtype=tf.float32,
                                             shape=(1, self.entity_type_num))
        self.impact_weight = tf.Variable(impact_weight_init,
                                         dtype=tf.float32,
                                         name="impact_weight")  # impact weight of each entity type
        # Repeat the (1, entity_type_num) weight row once per token.
        impact_weight_matrix = tf.tile(self.impact_weight,
                                       multiples=[batch_size * seq_length, 1])

        entity_type_ids_matrix1 = tf.cast(tf.reshape(
            entity_type_ids, [batch_size * seq_length, self.entity_type_num]),
                                          dtype=tf.float32)
        # Scale each entity-type indicator by its weight, then mix the
        # entity-type embeddings accordingly.
        entity_type_ids_matrix = tf.multiply(entity_type_ids_matrix1,
                                             impact_weight_matrix)
        entity_type_emb = tf.matmul(entity_type_ids_matrix,
                                    self.entity_type_embedding)
        final_layer = final_layer + tf.reshape(entity_type_emb, [
            batch_size, seq_length, self._config.hidden_size
        ])  # TODO TODO    # 0.7071067811865476 is sqrt(2) / 2
        # final_layer = tf.concat([final_layer, tf.reshape(entity_type_emb, [batch_size, seq_length,self._config.hidden_size])], axis=-1)

        # Second dropout, applied after the entity embedding has been added.
        if is_training:
            final_layer = tf.nn.dropout(final_layer, keep_prob=drop_keep_prob)

        (output_fw_seq,
         output_bw_seq), ((c_fw, h_fw),
                          (c_bw, h_bw)) = tf.nn.bidirectional_dynamic_rnn(
                              cell_fw=LSTMCell(self.lstm_hidden_size),
                              cell_bw=LSTMCell(self.lstm_hidden_size),
                              inputs=final_layer,
                              sequence_length=sequence_lengths,
                              dtype=tf.float32)
        # Per-token features for the slot head.
        layer_matrix = tf.concat([output_fw_seq, output_bw_seq], axis=-1)
        # Sentence-level features for the intent head: the final cell states
        # (c) of both directions are used, not the hidden states (h).
        final_hidden = tf.concat([c_fw, c_bw], axis=-1)

        layer_matrix = tf.contrib.layers.layer_norm(inputs=layer_matrix,
                                                    begin_norm_axis=-1,
                                                    begin_params_axis=-1)

        intent_logits = tf.layers.dense(
            final_hidden,
            self._num_tags,
            kernel_initializer=tf.truncated_normal_initializer(stddev=0.02),
            name="output_projection")
        slot_logits = tf.layers.dense(
            layer_matrix,
            self.num_slot_tags,
            kernel_initializer=tf.truncated_normal_initializer(stddev=0.02),
            name="slot_projection")

        with tf.variable_scope("loss"):
            loss = None
            per_example_intent_loss = None
            per_example_slot_loss = None
            if mode != tf.estimator.ModeKeys.PREDICT:
                per_example_intent_loss = tf.nn.sparse_softmax_cross_entropy_with_logits(
                    labels=labels, logits=intent_logits)
                slot_loss = tf.nn.sparse_softmax_cross_entropy_with_logits(
                    labels=slot_labels, logits=slot_logits)
                # Sum of token losses divided by the sum of labels_mask.
                # NOTE(review): slot_loss is not multiplied by labels_mask
                # before the sum, so every position contributes to the
                # numerator -- confirm this is intended.
                per_example_slot_loss = tf.truediv(
                    tf.reduce_sum(slot_loss, axis=1),
                    tf.cast(tf.reduce_sum(labels_mask, axis=1), tf.float32))

                # Disabled CRF-based alternative for the slot loss:
                # from tensorflow.contrib.crf import crf_log_likelihood
                # from tensorflow.contrib.crf import viterbi_decode
                # batch_size = tf.shape(slot_logits)[0]
                # print(curLine(), batch_size, tf.constant([self._max_seq_length]))
                # length_batch = tf.tile(tf.constant([self._max_seq_length]), [batch_size])
                # print(curLine(), batch_size, "length_batch:", length_batch)
                # per_example_slot_loss, self.transition_params = crf_log_likelihood(inputs=slot_logits,
                #                 tag_indices=slot_labels,sequence_lengths=length_batch)
                # print(curLine(), "per_example_slot_loss:", per_example_slot_loss) # shape=(batch_size,)
                # print(curLine(), "self.transition_params:", self.transition_params) # shape=(9, 9)

                # Weighted sum of intent and slot losses, averaged over the
                # batch.
                loss = tf.reduce_mean(self.intent_ratio *
                                      per_example_intent_loss +
                                      self.slot_ratio * per_example_slot_loss)
            pred_intent = tf.cast(tf.argmax(intent_logits, axis=-1), tf.int32)
            pred_slot = tf.cast(tf.argmax(slot_logits, axis=-1), tf.int32)
            return (loss, per_example_slot_loss, pred_intent, pred_slot,
                    batch_size, entity_type_emb, impact_weight_matrix,
                    entity_type_ids_matrix, final_layer, slot_logits)
Example #19
0
def create_model(bert_config, is_training, input_ids, input_mask, segment_ids,
                 use_one_hot_embeddings):
    """Builds an XLM-RoBERTa encoder with span and answer-type heads.

    Args:
      bert_config: Not referenced in the function body.
      is_training: Forwarded to the encoder as `training`.
      input_ids: Forwarded to the encoder as `inputs`.
      input_mask: Forwarded to the encoder as `attention_mask`.
      segment_ids: Forwarded to the encoder as `token_type_ids`.
      use_one_hot_embeddings: Not referenced in the function body.

    Returns:
      A tuple `(start_logits, end_logits, answer_type_logits)`; the first two
      have shape `[batch_size, seq_length]`.
    """
    # Weights are loaded from the local 'xlm_model' directory, converting
    # from a PyTorch checkpoint.
    encoder = TFXLMRobertaModel.from_pretrained('xlm_model', from_pt=True)
    encoder_outputs = encoder(inputs=input_ids,
                              attention_mask=input_mask,
                              token_type_ids=segment_ids,
                              training=is_training)

    # First output is used as the per-token representation for the span head.
    sequence_output = encoder_outputs[0]
    batch_size, seq_length, hidden_size = bert_modeling.get_shape_list(
        sequence_output, expected_rank=3)

    span_weights = tf.get_variable(
        "cls/tydi/output_weights", [2, hidden_size],
        initializer=tf.truncated_normal_initializer(stddev=0.02))
    span_bias = tf.get_variable("cls/tydi/output_bias", [2],
                                initializer=tf.zeros_initializer())

    flat_hidden = tf.reshape(sequence_output,
                             [batch_size * seq_length, hidden_size])
    span_logits = tf.nn.bias_add(
        tf.matmul(flat_hidden, span_weights, transpose_b=True), span_bias)

    # [batch, seq, 2] -> [2, batch, seq], then split into start and end.
    span_logits = tf.reshape(span_logits, [batch_size, seq_length, 2])
    span_logits = tf.transpose(span_logits, [2, 0, 1])
    start_logits, end_logits = tf.unstack(span_logits, axis=0)

    # Second output is used as the pooled representation for the answer-type
    # head.
    pooled_output = encoder_outputs[1]
    pooled_width = pooled_output.shape[-1]

    num_answer_types = 5  # YES, NO, UNKNOWN, PASSAGE, MINIMAL
    answer_type_output_weights = tf.get_variable(
        "answer_type_output_weights", [num_answer_types, pooled_width],
        initializer=tf.truncated_normal_initializer(stddev=0.02))
    answer_type_output_bias = tf.get_variable(
        "answer_type_output_bias", [num_answer_types],
        initializer=tf.zeros_initializer())

    answer_type_logits = tf.nn.bias_add(
        tf.matmul(pooled_output,
                  answer_type_output_weights,
                  transpose_b=True), answer_type_output_bias)

    return start_logits, end_logits, answer_type_logits
Example #20
0
  def build_attn_layer(self,
                       input_tensor,
                       attn_mask_concat,
                       layer_attn_type,
                       num_attention_heads=1,
                       size_per_head=512,
                       attention_probs_dropout_prob=0.1,
                       initializer_range=0.02,
                       do_return_2d_tensor=False):
    """Builds one attention layer over a concatenated pair of arguments.

    The sequence axis of `input_tensor` is treated as two halves (arg1
    followed by arg2). Depending on `layer_attn_type` the layer attends:
      * "self": over the whole concatenated sequence;
      * "inter": arg1 -> arg2 and arg2 -> arg1;
      * anything else: arg1 -> arg1 and arg2 -> arg2.
    For the two-part variants the two results are concatenated on axis 1.

    Args:
      input_tensor: Rank-3 tensor holding both arguments along axis 1.
      attn_mask_concat: Mask for the concatenated sequence; sliced along
        axis 1 for the per-argument variants.
      layer_attn_type: Attention variant selector, see above.
      num_attention_heads: Forwarded to `modeling.attention_layer`.
      size_per_head: Forwarded to `modeling.attention_layer`.
      attention_probs_dropout_prob: Forwarded to `modeling.attention_layer`.
      initializer_range: Forwarded to `modeling.attention_layer`.
      do_return_2d_tensor: Forwarded to `modeling.attention_layer`.

    Returns:
      The attention output tensor.
    """
    # TODO (May 5): To capture each softmax output, will need a modified
    #  `attention_layer`
    batch_size, total_seq_length = modeling.get_shape_list(
      input_tensor, expected_rank=3)[:2]
    arg_seq_length = int(total_seq_length / 2)

    def attend(source, target, mask, seq_length):
      """Runs `modeling.attention_layer` with this layer's shared settings."""
      return modeling.attention_layer(
        from_tensor=source,
        to_tensor=target,
        attention_mask=mask,
        num_attention_heads=num_attention_heads,
        size_per_head=size_per_head,
        attention_probs_dropout_prob=attention_probs_dropout_prob,
        initializer_range=initializer_range,
        do_return_2d_tensor=do_return_2d_tensor,
        batch_size=batch_size,
        from_seq_length=seq_length,
        to_seq_length=seq_length
      )

    if layer_attn_type == "self":
      full_mask = modeling.create_attention_mask_from_input_mask(
        input_tensor, attn_mask_concat)
      return attend(input_tensor, input_tensor, full_mask, total_seq_length)

    # Split the concatenated input and its mask into the two arguments.
    arg1 = input_tensor[:, :arg_seq_length, :]
    arg2 = input_tensor[:, arg_seq_length:, :]

    arg1_attn_mask = modeling.create_attention_mask_from_input_mask(
      arg1, attn_mask_concat[:, :arg_seq_length])
    arg2_attn_mask = modeling.create_attention_mask_from_input_mask(
      arg2, attn_mask_concat[:, arg_seq_length:])

    # (variable scope, from, to, mask of the attended-to argument)
    if layer_attn_type == "inter":
      plan = (
        ("arg1_arg2", arg1, arg2, arg2_attn_mask),
        ("arg2_arg1", arg2, arg1, arg1_attn_mask),
      )
    else:
      plan = (
        ("arg1_arg1", arg1, arg1, arg1_attn_mask),
        ("arg2_arg2", arg2, arg2, arg2_attn_mask),
      )

    heads = []
    for scope_name, source, target, mask in plan:
      with tf.variable_scope(scope_name):
        heads.append(attend(source, target, mask, arg_seq_length))

    return tf.concat(heads, axis=1)
def create_original_varmisuse_model(
    bert_config,
    is_training,
    enable_sequence_masking,
    input_ids,
    input_mask,
    segment_ids,
    candidate_mask,
    target_mask,
    error_location_mask,
    use_one_hot_embeddings,
    multi_head_count = 2,
):
  """Creates a two-headed pointer model on top of BERT.

  One head points at the error location, the other at the repair target.

  Args:
    bert_config: Configuration object forwarded to `modeling.BertModel`.
    is_training: Forwarded to `modeling.BertModel`.
    enable_sequence_masking: If true, the sequence output is multiplied by
      `candidate_mask` before entering the pointer network.
    input_ids: Token ids forwarded to `modeling.BertModel`.
    input_mask: Forwarded to `modeling.BertModel`.
    segment_ids: Forwarded to `modeling.BertModel` as `token_type_ids`.
    candidate_mask: Per-token mask; expanded on axis 2 and multiplied into
      the logits (and optionally into the sequence output).
    target_mask: Labels for the repair head.
    error_location_mask: Labels for the localization head.
    use_one_hot_embeddings: Forwarded to `modeling.BertModel`.
    multi_head_count: Number of pointer heads. The probabilities are unpacked
      into exactly two heads, so only the default of 2 works.

  Returns:
    A tuple `(loss, per_example_loss, logits_masked, probabilities)`.
  """
  bert = modeling.BertModel(
      config=bert_config,
      is_training=is_training,
      input_ids=input_ids,
      input_mask=input_mask,
      token_type_ids=segment_ids,
      use_one_hot_embeddings=use_one_hot_embeddings)

  final_sequence = bert.get_sequence_output()
  batch_size, sequence_length, hidden_size = modeling.get_shape_list(
      final_sequence, expected_rank=3)
  cls_output = bert.get_pooled_output()

  # Pointer probabilities are the attention vector over program tokens:
  #   M = tanh(Y * Wy + h * Wh)
  #   multi_headed_alpha = softmax(M * w)
  # with
  #   Y:  [batch_size, sequence_length, hidden_size]  (sequence output)
  #   h:  [batch_size, hidden_size]                   (pooled output)
  #   Wy: [hidden_size, hidden_size]
  #   Wh: [hidden_size, hidden_size]
  #   w:  [hidden_size, multi_head_count]
  #   M:  [batch_size, sequence_length, hidden_size]
  #   multi_headed_alpha: [batch_size, sequence_length, multi_head_count]
  # Weight matrices are tiled over the batch and h is tiled over the sequence
  # so that every product is a plain batched matmul.
  def pointer_variable(variable_name, shape):
    """Creates a float32 variable with a Xavier initializer."""
    return tf.get_variable(
        variable_name,
        shape=shape,
        dtype=tf.float32,
        initializer=contrib.layers.xavier_initializer())

  def tile_over_batch(matrix):
    """Repeats a 2D matrix along a new leading batch axis."""
    return tf.tile(tf.expand_dims(matrix, 0), [batch_size, 1, 1])

  wy = pointer_variable("Wy", [hidden_size, hidden_size])
  wh = pointer_variable("Wh", [hidden_size, hidden_size])
  w = pointer_variable("w", [hidden_size, multi_head_count])

  # Both: [batch_size, hidden_size, hidden_size]
  wy_extend = tile_over_batch(wy)
  wh_extend = tile_over_batch(wh)
  # [batch_size, sequence_length, hidden_size]
  cls_output_extend = tf.tile(
      tf.expand_dims(cls_output, 1), [1, sequence_length, 1])

  candidate_mask_expanded = tf.expand_dims(candidate_mask, 2)

  pointer_input = final_sequence
  if enable_sequence_masking:
    # Zero out the hidden vectors of non-candidate tokens.
    hidden_mask = tf.tile(candidate_mask_expanded, [1, 1, hidden_size])
    pointer_input = tf.multiply(final_sequence, tf.to_float(hidden_mask))

  m = tf.tanh(
      tf.matmul(pointer_input, wy_extend) +
      tf.matmul(cls_output_extend, wh_extend))

  # [batch_size, hidden_size, multi_head_count]
  w_extend = tile_over_batch(w)

  # [batch_size, sequence_length, multi_head_count]
  logits = tf.matmul(m, w_extend)

  # Mask logits using `candidate_mask`, tiled to one column per head.
  # NOTE(review): masked positions get a logit of 0 rather than a large
  # negative value, so they still receive softmax mass -- confirm intended.
  head_mask = tf.tile(candidate_mask_expanded, [1, 1, multi_head_count])
  logits_masked = tf.multiply(logits, tf.to_float(head_mask))

  # Normalize over the sequence axis, independently for each head.
  probabilities = tf.nn.softmax(logits_masked, axis=1)
  location_probabilities, repair_probabilities = tf.unstack(
      probabilities, axis=2)

  def label_cross_entropy(label_mask, head_probabilities):
    """Sums -log(p) over the positions selected by `label_mask`."""
    # Clip to avoid log(0).
    log_probabilities = tf.log(
        tf.clip_by_value(head_probabilities, 1e-10, 1.0))
    return -tf.reduce_sum(
        tf.multiply(tf.to_float(label_mask), log_probabilities), axis=1)

  localization_loss = label_cross_entropy(error_location_mask,
                                          location_probabilities)
  repair_loss = label_cross_entropy(target_mask, repair_probabilities)

  per_example_loss = localization_loss + repair_loss
  loss = tf.reduce_mean(per_example_loss)

  return loss, per_example_loss, logits_masked, probabilities
Example #22
0
    def __init__(self,
                 config,
                 is_training,
                 input_ids,
                 image_embeddings,
                 input_mask=None,
                 token_type_ids=None,
                 use_one_hot_embeddings=False,
                 scope=None):
        """Constructor for a visually grounded BertModel.

    Args:
      config: `BertConfig` instance.
      is_training: bool. true for training model, false for eval model. Controls
        whether dropout will be applied.
      input_ids: int32 Tensor of shape [batch_size, seq_length].
      image_embeddings: float32 Tensor of shape [batch_size, seq_length, depth].
      input_mask: (optional) int32 Tensor of shape [batch_size, seq_length].
      token_type_ids: (optional) int32 Tensor of shape [batch_size, seq_length].
      use_one_hot_embeddings: (optional) bool. Whether to use one-hot word
        embeddings or tf.embedding_lookup() for the word embeddings.
      scope: (optional) variable scope. Defaults to "bert".
    Raises:
      ValueError: The config is invalid or one of the input tensor shapes
        is invalid.
    """
        config = copy.deepcopy(config)
        if not is_training:
            config.hidden_dropout_prob = 0.0
            config.attention_probs_dropout_prob = 0.0

        text_input_shape = modeling.get_shape_list(input_ids, expected_rank=2)
        batch_size = text_input_shape[0]
        text_seq_length = text_input_shape[1]

        if input_mask is None:
            input_mask = tf.ones(shape=[batch_size, text_seq_length],
                                 dtype=tf.int32)

        if token_type_ids is None:
            token_type_ids = tf.zeros(shape=[batch_size, text_seq_length],
                                      dtype=tf.int32)

        with tf.variable_scope(scope, default_name="bert"):
            with tf.variable_scope("embeddings"):
                # Perform embedding lookup on the word ids.
                (self.embedding_output,
                 self.embedding_table) = modeling.embedding_lookup(
                     input_ids=input_ids,
                     vocab_size=config.vocab_size,
                     embedding_size=config.hidden_size,
                     initializer_range=config.initializer_range,
                     word_embedding_name="word_embeddings",
                     use_one_hot_embeddings=use_one_hot_embeddings)

                # Add positional embeddings and token type embeddings, then layer
                # normalize and perform dropout.
                self.embedding_output = modeling.embedding_postprocessor(
                    input_tensor=self.embedding_output,
                    use_token_type=True,
                    token_type_ids=token_type_ids,
                    token_type_vocab_size=config.type_vocab_size,
                    token_type_embedding_name="token_type_embeddings",
                    use_position_embeddings=True,
                    position_embedding_name="position_embeddings",
                    initializer_range=config.initializer_range,
                    max_position_embeddings=config.max_position_embeddings,
                    dropout_prob=config.hidden_dropout_prob)

                # Add image embeddings the rest of the input embeddings.
                self.embedding_output += tf.layers.dense(
                    image_embeddings,
                    config.hidden_size,
                    activation=tf.tanh,
                    kernel_initializer=modeling.create_initializer(
                        config.initializer_range))

            with tf.variable_scope("encoder"):
                # This converts a 2D mask of shape [batch_size, seq_length] to a 3D
                # mask of shape [batch_size, seq_length, seq_length] which is used
                # for the attention scores.
                attention_mask = modeling.create_attention_mask_from_input_mask(
                    self.embedding_output, input_mask)

                # Run the stacked transformer.
                # `sequence_output` shape = [batch_size, seq_length, hidden_size].
                self.all_encoder_layers = modeling.transformer_model(
                    input_tensor=self.embedding_output,
                    attention_mask=attention_mask,
                    hidden_size=config.hidden_size,
                    num_hidden_layers=config.num_hidden_layers,
                    num_attention_heads=config.num_attention_heads,
                    intermediate_size=config.intermediate_size,
                    intermediate_act_fn=modeling.get_activation(
                        config.hidden_act),
                    hidden_dropout_prob=config.hidden_dropout_prob,
                    attention_probs_dropout_prob=config.
                    attention_probs_dropout_prob,
                    initializer_range=config.initializer_range,
                    do_return_all_layers=True)

            self.sequence_output = self.all_encoder_layers[-1]
            # The "pooler" converts the encoded sequence tensor of shape
            # [batch_size, seq_length, hidden_size] to a tensor of shape
            # [batch_size, hidden_size]. This is necessary for segment-level
            # (or segment-pair-level) classification tasks where we need a fixed
            # dimensional representation of the segment.
            with tf.variable_scope("pooler"):
                # We "pool" the model by simply taking the hidden state corresponding
                # to the first token. We assume that this has been pre-trained
                first_token_tensor = tf.squeeze(self.sequence_output[:,
                                                                     0:1, :],
                                                axis=1)
                self.pooled_output = tf.layers.dense(
                    first_token_tensor,
                    config.hidden_size,
                    activation=tf.tanh,
                    kernel_initializer=modeling.create_initializer(
                        config.initializer_range))
Example #23
0
    def __init__(
            self,
            bert_config,
            char_config,
            is_training,  # is_evaluation,
            input_token_ids,
            input_char_ids,
            labels,
            num_labels,
            use_char_representation=True,
            input_mask=None,
            segment_ids=None,
            use_one_hot_embeddings=False,  # set to True for TPU acceleration
            scope=None):
        """Builds a BERT encoder, an optional char-CNN branch and a CRF head.

        :param bert_config: configuration forwarded to `modeling.BertModel`.
        :param char_config: mapping read for the keys 'char_embed_dim',
            'filters', 'alphabet_size', 'activations', 'n_highway',
            'projection_dim' and 'char_dropout_rate'; only consulted when
            `use_char_representation` is true.
        :param is_training: True when running in the estimator's train mode.
        :param input_token_ids: token ids fed to BERT.
        :param input_char_ids: character ids fed to `CharRepresentation`.
        :param labels: ground-truth labels, forwarded to the CRF.
        :param num_labels: number of labels (used for the CRF transition
            matrix).
        :param use_char_representation: when true, the char-CNN output is
            concatenated to the BERT output on the last axis; otherwise only
            the BERT output is used.
        :param input_mask: optional mask forwarded to BERT and summed over the
            last axis to obtain the sequence lengths given to the CRF. When
            None, every position of `input_token_ids` is counted as valid.
        :param segment_ids: forwarded to BERT as `token_type_ids`. Of little
            use here because this is a single-sequence NER task.
        :param use_one_hot_embeddings: whether to use one-hot embeddings
            (intended for TPU runs).
        :param scope: not used by this constructor.
        :raises ValueError: if the token and char representations disagree on
            their second (time-step) dimension.
        """
        self.bert_model = modeling.BertModel(
            config=bert_config,
            is_training=is_training,
            input_ids=input_token_ids,
            input_mask=input_mask,
            token_type_ids=segment_ids,
            use_one_hot_embeddings=use_one_hot_embeddings)
        self.token_output = self.bert_model.get_sequence_output()

        if use_char_representation:
            char_embed_dim = char_config['char_embed_dim']
            filters = char_config['filters']
            alphabet_size = char_config['alphabet_size']
            activations = char_config['activations']
            n_highway = char_config['n_highway']
            projection_dim = char_config['projection_dim']
            # NOTE(review): 1.0 is used outside training, which only disables
            # dropout if `CharRepresentation` interprets this value as a keep
            # probability -- confirm against that class.
            char_dropout_rate = char_config[
                'char_dropout_rate'] if is_training else 1.0

            self.charcnn_model = CharRepresentation(
                char_input=input_char_ids,
                alphabet_size=alphabet_size,
                filters=filters,
                projection_dim=projection_dim,
                char_embed_dim=char_embed_dim,
                activations=activations,
                n_highway=n_highway,
                dropout_rate=char_dropout_rate)
            self.char_output = self.charcnn_model.get_highway_output()

            token_shape = modeling.get_shape_list(self.token_output,
                                                  expected_rank=3)
            char_shape = modeling.get_shape_list(self.char_output,
                                                 expected_rank=3)

            # Both branches must cover the same number of time steps before
            # they can be concatenated feature-wise.
            if token_shape[1] != char_shape[1]:
                raise ValueError(
                    "The time steps of token representation (%d) is not the same as char representation (%d) "
                    % (token_shape[1], char_shape[1]))

            self.final_output = tf.concat(
                [self.token_output, self.char_output], axis=-1)
        else:
            tf.logging.info(
                "****************BERT representation only***************")
            self.final_output = self.token_output

        # `input_mask` is optional in the signature, but the CRF always needs
        # per-example lengths: fall back to an all-ones mask so the default
        # argument does not crash inside `tf.reduce_sum`.
        if input_mask is None:
            length_mask = tf.ones_like(input_token_ids, dtype=tf.int32)
        else:
            length_mask = input_mask
        sequence_lengths = tf.reduce_sum(length_mask, axis=-1)

        self.crf = CRF(
            input=self.final_output,
            labels=labels,
            num_labels=num_labels,
            lengths=sequence_lengths,
            is_training=is_training,
            # is_evaluation=is_evaluation  # the estimator's evaluate mode still needs the loss to be returned
        )
Example #24
0
        for feature in features:
            feature.unique_id = feature.example_index + feature.context_index + feature.doc_span_index
            output_fn(feature)

    return num_spans_to_ids


def create_model(bert_config, is_training, input_ids, input_mask, segment_ids, use_one_hot_embeddings):
    """Creates a classification model to classify position of start and end token.

    Runs BERT over the inputs and projects every token's final hidden state
    onto two logits using the `cls/emr/output_weights` and
    `cls/emr/output_bias` variables.

    NOTE(review): no `return` statement is visible after the logits are
    computed; the tail of this function appears to be missing, so callers
    currently receive `None`.
    """
    model = modeling.BertModel(config=bert_config, is_training=is_training, input_ids=input_ids,
                               input_mask=input_mask, token_type_ids=segment_ids,
                               use_one_hot_embeddings=use_one_hot_embeddings)
    final_hidden = model.get_sequence_output()

    final_hidden_shape = modeling.get_shape_list(final_hidden, expected_rank=3)
    batch_size = final_hidden_shape[0]
    seq_length = final_hidden_shape[1]
    hidden_size = final_hidden_shape[2]

    output_weights = tf.get_variable("cls/emr/output_weights", [2, hidden_size],
                                     initializer=tf.truncated_normal_initializer(stddev=0.02))
    output_bias = tf.get_variable("cls/emr/output_bias", [2], initializer=tf.zeros_initializer())

    # to be able to use checkpoints from SQUAD trained BERT QA model, replace 'emr' by 'squad' in names of above variables

    # Flatten batch and sequence axes so a single matmul scores every token.
    final_hidden_matrix = tf.reshape(final_hidden, [batch_size * seq_length, hidden_size])

    logits = tf.matmul(final_hidden_matrix, output_weights, transpose_b=True)
    logits = tf.nn.bias_add(logits, output_bias)
Example #25
0
  def encode_concat_context(self,
                            input_tensor,
                            segment_ids,
                            segment_vocab_size=16,
                            max_position_embeddings=512,
                            hidden_dropout_prob=0.1,
                            initializer_range=0.02,
                            use_segment_ids=False,
                            use_position_embedding=False):
    """Adds optional segment/position embeddings, then layer norm and dropout.

    Modelled on `embedding_postprocessor` in `bert/modeling.py`.

    Args:
      input_tensor: rank-3 tensor; its dimensions are read as
        (batch_size, seq_length, width).
      segment_ids: ids looked up in the segment table; only read when
        `use_segment_ids` is true.
      segment_vocab_size: number of rows of the segment embedding table.
      max_position_embeddings: number of rows of the position embedding table.
      hidden_dropout_prob: passed to `modeling.layer_norm_and_dropout`.
      initializer_range: passed to `modeling.create_initializer` for both
        embedding tables.
      use_segment_ids: whether to add segment embeddings.
      use_position_embedding: whether to add position embeddings.

    Returns:
      The result of `modeling.layer_norm_and_dropout` applied to the input plus
      whichever embeddings were enabled.
    """
    batch_size, seq_length, width = modeling.get_shape_list(
        input_tensor, expected_rank=3)

    output = input_tensor

    if use_segment_ids:
      segment_table = tf.get_variable(
          name="segment_embeddings",
          shape=[segment_vocab_size, width],
          initializer=modeling.create_initializer(initializer_range))

      # Look the ids up through a one-hot matmul on the flattened ids, then
      # restore the [batch_size, seq_length, width] layout.
      one_hot_ids = tf.one_hot(
          tf.reshape(segment_ids, [-1]), depth=segment_vocab_size)
      segment_embeddings = tf.reshape(
          tf.matmul(one_hot_ids, segment_table),
          [batch_size, seq_length, width])
      output = output + segment_embeddings

    if use_position_embedding:
      position_table = tf.get_variable(
          name="position_embeddings",
          shape=[max_position_embeddings, width],
          initializer=modeling.create_initializer(initializer_range))

      # The table is created with `max_position_embeddings` rows, while the
      # current sequence only covers positions [0, seq_length - 1], so the
      # first `seq_length` rows are sliced out.
      position_embeddings = tf.slice(position_table, [0, 0], [seq_length, -1])

      # Only the trailing (`seq_length`, `width`) dimensions matter; every
      # leading dimension (typically just the batch) is broadcast over.
      num_dims = len(output.shape.as_list())
      position_broadcast_shape = [1] * (num_dims - 2) + [seq_length, width]
      position_embeddings = tf.reshape(position_embeddings,
                                       position_broadcast_shape)
      output = output + position_embeddings

    return modeling.layer_norm_and_dropout(output, hidden_dropout_prob)
def create_model(bert_config, is_training, input_ids, input_mask, segment_ids,
                 num_labels, use_one_hot_embeddings, membership_features_str):
    """Creates a membership classifier on top of a BERT classification model.

    BERT itself is always built with `is_training=False`; `is_training` only
    controls the dropout applied to the pooled output before the label
    logits are computed.

    Args:
      bert_config: configuration forwarded to `modeling.BertModel`.
      is_training: whether to apply dropout (keep_prob=0.9) to the pooled
        output.
      input_ids: forwarded to BERT.
      input_mask: forwarded to BERT.
      segment_ids: forwarded to BERT as `token_type_ids`.
      num_labels: number of classes of the label classifier.
      use_one_hot_embeddings: forwarded to BERT.
      membership_features_str: selects the membership classifier's input:
        "last" (flattened final hidden states), "logits" (pooled output plus
        label logits) or "last_plus_logits" (all of the above).

    Returns:
      A tuple `(membership_logits, [membership_weights, membership_bias])`.
      The variables are returned so the caller can optimise only them.

    Raises:
      ValueError: if `membership_features_str` is not one of the supported
        values.
    """
    model = modeling.BertModel(config=bert_config,
                               is_training=False,
                               input_ids=input_ids,
                               input_mask=input_mask,
                               token_type_ids=segment_ids,
                               use_one_hot_embeddings=use_one_hot_embeddings)

    # Both the pooled (segment-level) output and the token-level sequence
    # output are used: the former for the label logits, the latter as
    # membership features.
    output_layer = model.get_pooled_output()
    final_hidden = model.get_sequence_output()
    final_hidden_shape = modeling.get_shape_list(final_hidden, expected_rank=3)
    batch_size = final_hidden_shape[0]
    seq_length = final_hidden_shape[1]

    hidden_size = output_layer.shape[-1].value

    output_weights = tf.get_variable(
        "output_weights", [num_labels, hidden_size],
        initializer=tf.truncated_normal_initializer(stddev=0.02))

    output_bias = tf.get_variable("output_bias", [num_labels],
                                  initializer=tf.zeros_initializer())

    with tf.variable_scope("loss"):
        if is_training:
            # I.e., 0.1 dropout
            output_layer = tf.nn.dropout(output_layer, keep_prob=0.9)

        logits = tf.matmul(output_layer, output_weights, transpose_b=True)
        logits = tf.nn.bias_add(logits, output_bias)

    # One flat feature vector per example from all final hidden states.
    hidden_stacked = tf.reshape(final_hidden,
                                [batch_size, seq_length * hidden_size])

    # depending on flags, choose the input feature set for classification
    if membership_features_str == "last_plus_logits":
        membership_features = tf.concat([hidden_stacked, output_layer, logits],
                                        axis=1)
    elif membership_features_str == "last":
        membership_features = hidden_stacked
    elif membership_features_str == "logits":
        membership_features = tf.concat([output_layer, logits], axis=1)
    else:
        # Fail with a clear message instead of an UnboundLocalError below.
        raise ValueError(
            "Unknown membership_features_str: %r (expected "
            "'last_plus_logits', 'last' or 'logits')" %
            (membership_features_str,))

    num_membership_features = modeling.get_shape_list(membership_features,
                                                      expected_rank=2)[1]

    membership_weights = tf.get_variable(
        "cls/membership/weights", [2, num_membership_features],
        initializer=tf.truncated_normal_initializer(stddev=0.02))

    membership_bias = tf.get_variable("cls/membership/bias", [2],
                                      initializer=tf.zeros_initializer())

    membership_logits = tf.matmul(membership_features,
                                  membership_weights,
                                  transpose_b=True)
    membership_logits = tf.nn.bias_add(membership_logits, membership_bias)

    # return the weights since we only want to optimize them
    return membership_logits, [membership_weights, membership_bias]
    def model_fn(features, labels, mode, params, global_step):  # pylint: disable=unused-argument
        """Builds the QA model and, in TRAIN mode, returns variables and loss.

        `tf.equal(global_step, 0)` is used directly as a Python condition so
        that checkpoint initialisation and variable logging only happen on
        the first step (the original comment describes this as eager mode).

        Returns:
          `(tvars, total_loss)` when `mode` is TRAIN, where `total_loss` is the
          mean of the start, end and answer-type losses; `None` otherwise.
        """
        if tf.equal(global_step, 0):
            '''logging.info("*** Features ***")
        for name in sorted(features.keys()):
          logging.info("  name = %s, shape = %s", name, features[name].shape)'''

        unique_ids = features["unique_ids"]  # read but not used further
        input_ids = features["input_ids"]
        input_mask = features["input_mask"]
        segment_ids = features["segment_ids"]

        is_training = (mode == tf.estimator.ModeKeys.TRAIN)

        start_logits, end_logits, answer_type_logits = create_model(
            bert_config=bert_config,
            is_training=is_training,
            input_ids=input_ids,
            input_mask=input_mask,
            segment_ids=segment_ids,
            use_one_hot_embeddings=use_one_hot_embeddings)

        tvars = tf.trainable_variables()

        print('tvars', tvars)
        initialized_variable_names = {}
        scaffold_fn = None

        # Only initialise from the checkpoint (and log the variables) on the
        # very first step.
        if tf.equal(global_step, 0):
            if init_checkpoint:
                assignment_map, initialized_variable_names = (
                    bert_modeling.get_assignment_map_from_checkpoint(
                        tvars, init_checkpoint))
                if not use_tpu:
                    tf.train.init_from_checkpoint(init_checkpoint,
                                                  assignment_map)
                else:

                    def tpu_scaffold():
                        tf.train.init_from_checkpoint(init_checkpoint,
                                                      assignment_map)
                        return tf.train.Scaffold()

                    scaffold_fn = tpu_scaffold

            logging.info("**** Trainable Variables ****")
            for var in tvars:
                restored = var.name in initialized_variable_names
                init_string = ", *INIT_FROM_CKPT*" if restored else ""
                logging.info("  name = %s, shape = %s%s", var.name, var.shape,
                             init_string)

        # Nothing is produced outside TRAIN mode.
        if mode != tf.estimator.ModeKeys.TRAIN:
            return None

        seq_length = bert_modeling.get_shape_list(input_ids)[1]

        def mean_nll(logits, targets, depth):
            """Mean negative log-likelihood of `targets` under `logits`."""
            one_hot_targets = tf.one_hot(targets,
                                         depth=depth,
                                         dtype=tf.float32)
            log_probs = tf.nn.log_softmax(logits, axis=-1)
            per_example = tf.reduce_sum(one_hot_targets * log_probs, axis=-1)
            return -tf.reduce_mean(per_example)

        start_positions = features["start_positions"]
        end_positions = features["end_positions"]
        answer_types = features["answer_types"]

        # Position losses are taken over the sequence length, the label loss
        # over the number of answer types.
        start_loss = mean_nll(start_logits, start_positions, seq_length)
        end_loss = mean_nll(end_logits, end_positions, seq_length)
        answer_type_loss = mean_nll(answer_type_logits, answer_types,
                                    len(data.AnswerType))

        total_loss = (start_loss + end_loss + answer_type_loss) / 3.0

        return tvars, total_loss
Example #28
0
  def build_attn_layers(self,
                        input_tensor,
                        attn_mask_concat,
                        intermediate_size=2048,
                        intermediate_act_fn=modeling.gelu,
                        hidden_dropout_prob=0.1,
                        attention_probs_dropout_prob=0.1,
                        initializer_range=0.02,
                        do_return_all_layers=False):
    """Stacks `self.num_hidden_layers` attention + feed-forward layers.

    Follows the structure of `transformer_model` / `attention_layer` defined
    in `bert/modeling.py`: each layer runs attention over the previous layer's
    output, projects it back to `self.hidden_size` with a residual and layer
    norm, then applies an intermediate dense layer and a second projection
    with its own residual and layer norm.

    Args:
      input_tensor: rank-3 input, e.g. [batch, arg_length, BERT_hidden_size].
      attn_mask_concat: attention mask forwarded to `self.build_attn_layer`.
      intermediate_size: width of the intermediate dense layer.
      intermediate_act_fn: activation of the intermediate dense layer.
      hidden_dropout_prob: dropout applied after both projections; forced to
        0.0 when `self.is_training` is false.
      attention_probs_dropout_prob: dropout forwarded to the attention layer;
        forced to 0.0 when `self.is_training` is false.
      initializer_range: passed to `modeling.create_initializer` for every
        dense kernel.
      do_return_all_layers: return every layer's output instead of the last.

    Returns:
      The final layer's output reshaped with `modeling.reshape_from_matrix`,
      or a list with one such tensor per layer when `do_return_all_layers`.
    """
    if not self.is_training:
      hidden_dropout_prob = 0.0
      attention_probs_dropout_prob = 0.0

    # input tensor shape: [batch, arg_length, BERT_hidden_size]
    # for example, using default hparams vals: [64, 128, 768]
    attention_head_size = int(self.hidden_size / self.num_attention_heads)
    input_shape = modeling.get_shape_list(input_tensor, expected_rank=3)
    prev_output = input_tensor

    attention_type_split = self.attention_type.split("_")

    all_layer_outputs = []
    for layer_idx in range(self.num_hidden_layers):
      with tf.variable_scope(f"layer_{layer_idx}"):
        layer_input = prev_output

        # With three name parts the first two alternate between even and odd
        # layers; otherwise every layer uses the first part.
        if len(attention_type_split) == 3:
          indexer = layer_idx % 2
        else:  # len(attention_type_split) == 2:
          indexer = 0
        layer_attn_type = attention_type_split[indexer]

        tf.logging.info(
          f"{layer_attn_type.capitalize()} Attention at {layer_idx}th Layer")

        with tf.variable_scope(f"{layer_attn_type}_attn"):
          # Attend over the previous layer's output. The original code passed
          # the stack's raw `input_tensor` here, so layers after the first
          # never saw the representation built by earlier layers.
          attention_output = self.build_attn_layer(
            input_tensor=layer_input,
            attn_mask_concat=attn_mask_concat,
            layer_attn_type=layer_attn_type,
            num_attention_heads=self.num_attention_heads,
            size_per_head=attention_head_size,
            attention_probs_dropout_prob=attention_probs_dropout_prob,
            initializer_range=initializer_range,
            do_return_2d_tensor=False
          )

          # Run a linear projection of `hidden_size` then add a residual
          # with `layer_input`.
          with tf.variable_scope("output"):
            attention_output = tf.layers.dense(
              attention_output,
              self.hidden_size,
              kernel_initializer=modeling.create_initializer(initializer_range))
            attention_output = modeling.dropout(attention_output,
                                                hidden_dropout_prob)
            attention_output = modeling.layer_norm(attention_output + layer_input)

        # The activation is only applied to the "intermediate" hidden layer.
        with tf.variable_scope("intermediate"):
          intermediate_output = tf.layers.dense(
            attention_output,
            intermediate_size,
            activation=intermediate_act_fn,
            kernel_initializer=modeling.create_initializer(initializer_range))

        # Down-project back to `hidden_size` then add the residual.
        with tf.variable_scope("output"):
          layer_output = tf.layers.dense(
            intermediate_output,
            self.hidden_size,
            kernel_initializer=modeling.create_initializer(initializer_range))
          layer_output = modeling.dropout(layer_output, hidden_dropout_prob)
          layer_output = modeling.layer_norm(layer_output + attention_output)
          prev_output = layer_output
          all_layer_outputs.append(layer_output)

    if do_return_all_layers:
      return [
        modeling.reshape_from_matrix(layer_output, input_shape)
        for layer_output in all_layer_outputs
      ]
    return modeling.reshape_from_matrix(prev_output, input_shape)
Example #29
0
    def model_fn(features, labels, mode, params):  # pylint: disable=unused-argument
        """The `model_fn` for TPUEstimator.

        Builds the model from `features`, optionally initialises its variables
        from `init_checkpoint`, and returns a `TPUEstimatorSpec`: with a loss
        and train op in TRAIN mode, or with the raw logits in PREDICT mode.

        Raises:
          ValueError: if `mode` is neither TRAIN nor PREDICT.
        """

        logging.info("*** Features ***")
        for name in sorted(features.keys()):
            logging.info("  name = %s, shape = %s", name, features[name].shape)

        unique_ids = features["unique_ids"]
        input_ids = features["input_ids"]
        input_mask = features["input_mask"]
        segment_ids = features["segment_ids"]

        is_training = (mode == tf.estimator.ModeKeys.TRAIN)

        start_logits, end_logits, answer_type_logits = create_model(
            bert_config=bert_config,
            is_training=is_training,
            input_ids=input_ids,
            input_mask=input_mask,
            segment_ids=segment_ids,
            use_one_hot_embeddings=use_one_hot_embeddings)

        tvars = tf.trainable_variables()

        initialized_variable_names = {}
        scaffold_fn = None
        if init_checkpoint:
            assignment_map, initialized_variable_names = (
                bert_modeling.get_assignment_map_from_checkpoint(
                    tvars, init_checkpoint))
            if not use_tpu:
                tf.train.init_from_checkpoint(init_checkpoint, assignment_map)
            else:
                # On TPU the restore is deferred to a scaffold function.

                def tpu_scaffold():
                    tf.train.init_from_checkpoint(init_checkpoint,
                                                  assignment_map)
                    return tf.train.Scaffold()

                scaffold_fn = tpu_scaffold

        logging.info("**** Trainable Variables ****")
        for var in tvars:
            restored = var.name in initialized_variable_names
            init_string = ", *INIT_FROM_CKPT*" if restored else ""
            logging.info("  name = %s, shape = %s%s", var.name, var.shape,
                         init_string)

        output_spec = None
        if mode == tf.estimator.ModeKeys.TRAIN:
            seq_length = bert_modeling.get_shape_list(input_ids)[1]

            def mean_nll(logits, targets, depth):
                """Mean negative log-likelihood of `targets` under `logits`."""
                one_hot_targets = tf.one_hot(targets,
                                             depth=depth,
                                             dtype=tf.float32)
                log_probs = tf.nn.log_softmax(logits, axis=-1)
                per_example = tf.reduce_sum(one_hot_targets * log_probs,
                                            axis=-1)
                return -tf.reduce_mean(per_example)

            start_positions = features["start_positions"]
            end_positions = features["end_positions"]
            answer_types = features["answer_types"]

            # Position losses are taken over the sequence length, the label
            # loss over the number of answer types.
            start_loss = mean_nll(start_logits, start_positions, seq_length)
            end_loss = mean_nll(end_logits, end_positions, seq_length)
            answer_type_loss = mean_nll(answer_type_logits, answer_types,
                                        len(data.AnswerType))

            total_loss = (start_loss + end_loss + answer_type_loss) / 3.0

            train_op = bert_optimization.create_optimizer(
                total_loss, learning_rate, num_train_steps, num_warmup_steps,
                use_tpu)

            output_spec = tf_contrib.tpu.TPUEstimatorSpec(
                mode=mode,
                loss=total_loss,
                train_op=train_op,
                scaffold_fn=scaffold_fn)
        elif mode == tf.estimator.ModeKeys.PREDICT:
            output_spec = tf_contrib.tpu.TPUEstimatorSpec(
                mode=mode,
                predictions={
                    "unique_ids": unique_ids,
                    "start_logits": start_logits,
                    "end_logits": end_logits,
                    "answer_type_logits": answer_type_logits,
                },
                scaffold_fn=scaffold_fn)
        else:
            raise ValueError("Only TRAIN and PREDICT modes are supported: %s" %
                             (mode))

        return output_spec
  def model_fn(features, labels, mode, params):  # pylint: disable=unused-argument
    """The `model_fn` for TPUEstimator.

    Builds a word-prediction model under the "bert" variable scope. In TRAIN
    mode the loss is a sparse softmax cross-entropy between
    `input_ids[:, 2:query_length]` and the predicted word logits. Outside
    training, the model is re-run inside a `tf.while_loop` (reusing the same
    variables) to fill in `output_ids` one position per iteration.

    Args:
      features: dict read for "unique_ids", "input_ids" and "segment_ids";
        PREDICT mode additionally reads "start_positions", "end_positions"
        and "answer_types".
      labels: unused.
      mode: a `tf.estimator.ModeKeys` value; only TRAIN and PREDICT are
        supported.
      params: dict read for "batch_size".

    Returns:
      A `TPUEstimatorSpec` for the given mode.

    Raises:
      ValueError: if `mode` is neither TRAIN nor PREDICT.
    """

    tf.logging.info("*** Features ***")
    for name in sorted(features.keys()):
      tf.logging.info("  name = %s, shape = %s" % (name, features[name].shape))

    unique_ids = features["unique_ids"]
    input_ids = features["input_ids"]
    segment_ids = features["segment_ids"]

    is_training = (mode == tf.estimator.ModeKeys.TRAIN)
    seq_length = modeling.get_shape_list(input_ids)[1]
    query_length = FLAGS.max_query_length
    batch_size = params["batch_size"]

    # Only the second value returned by `make_attention_mask` is used; it is
    # passed to the model as its input mask.
    _, attention_mask = make_attention_mask(batch_size, query_length,
                                            seq_length)

    with tf.variable_scope("bert") as scope:
      word_logits = create_model(
          bert_config=bert_config,
          is_training=is_training,
          input_ids=input_ids,
          input_mask=attention_mask,
          segment_ids=segment_ids,
          use_one_hot_embeddings=use_one_hot_embeddings,
          scope=scope)

    # Decoding path. `output_ids` is only defined here, i.e. when not
    # training; the PREDICT branch below relies on that.
    if not is_training:
      # Re-enter the "bert" scope with reuse=True so the decoding steps share
      # the variables created above.
      with tf.variable_scope("bert", reuse=True) as scope:
        # This initial assignment is overwritten a few lines below.
        output_ids = input_ids
        word_id = tf.argmax(word_logits, axis=2, output_type=tf.int32)

        # This operation implements: output_ids[:, 2] = word_id[:, 0]
        word_id = tf.pad(word_id, [[0, 0], [2, seq_length - query_length]])
        output_ids = input_ids + word_id * tf.one_hot(
            2, seq_length, dtype=tf.int32)

        def body(i, ids):
          """A decoding step."""
          word_logits = create_model(
              bert_config=bert_config,
              is_training=is_training,
              input_ids=ids,
              input_mask=attention_mask,
              segment_ids=segment_ids,
              use_one_hot_embeddings=use_one_hot_embeddings,
              scope=scope)

          word_id = tf.argmax(word_logits, axis=2, output_type=tf.int32)

          # This operation implements: output_ids[:, 1 + i] = word_id[:, i - 1]
          word_id = tf.pad(word_id, [[0, 0], [2, seq_length - query_length]])
          return [
              i + 1,
              ids + word_id * tf.one_hot(i + 1, seq_length, dtype=tf.int32)
          ]

        # The loop counter starts at 2 and the loop runs while
        # i < query_length - 1.
        i0 = tf.constant(2)
        c = lambda i, _: i < query_length - 1
        _, output_ids = tf.while_loop(c, body, loop_vars=[i0, output_ids])

    tvars = tf.trainable_variables()

    initialized_variable_names = {}
    scaffold_fn = None
    if init_checkpoint:
      (assignment_map, initialized_variable_names
      ) = modeling.get_assignment_map_from_checkpoint(tvars, init_checkpoint)
      if use_tpu:

        # On TPU the checkpoint restore is deferred to a scaffold function.
        def tpu_scaffold():
          tf.train.init_from_checkpoint(init_checkpoint, assignment_map)
          return tf.train.Scaffold()

        scaffold_fn = tpu_scaffold
      else:
        tf.train.init_from_checkpoint(init_checkpoint, assignment_map)

    tf.logging.info("**** Trainable Variables ****")
    for var in tvars:
      init_string = ""
      if var.name in initialized_variable_names:
        init_string = ", *INIT_FROM_CKPT*"
      tf.logging.info("  name = %s, shape = %s%s", var.name, var.shape,
                      init_string)

    output_spec = None
    if mode == tf.estimator.ModeKeys.TRAIN:
      # Computes the loss for word prediction.
      loss = tf.losses.sparse_softmax_cross_entropy(
          input_ids[:, 2:query_length],
          word_logits,
          reduction=tf.losses.Reduction.MEAN)

      train_op = optimization.create_optimizer(loss, learning_rate,
                                               num_train_steps,
                                               num_warmup_steps, use_tpu)

      output_spec = tf.estimator.tpu.TPUEstimatorSpec(
          mode=mode, loss=loss, train_op=train_op, scaffold_fn=scaffold_fn)

    elif mode == tf.estimator.ModeKeys.PREDICT:
      # The decoded ids are emitted as "input_ids"; "input_mask" marks their
      # non-zero positions and segment ids are capped at 1.
      predictions = {
          "unique_ids": tf.identity(unique_ids),
          "input_ids": output_ids,
          "segment_ids": tf.minimum(segment_ids, 1),
          "input_mask": tf.to_int32(tf.not_equal(output_ids, 0)),
          "start_positions": tf.identity(features["start_positions"]),
          "end_positions": tf.identity(features["end_positions"]),
          "answer_types": tf.identity(features["answer_types"])
      }
      output_spec = tf.estimator.tpu.TPUEstimatorSpec(
          mode=mode, predictions=predictions, scaffold_fn=scaffold_fn)
    else:
      raise ValueError("Only TRAIN and PREDICT modes are supported: %s" %
                       (mode))

    return output_spec