def create_lm_model(self,
                      vocab_size,
                      sequence_length,
                      hidden_size,
                      num_predictions,
                      output="predictions"):
    # First, create a transformer stack that we can use to get the LM's
    # vocabulary weights.
    xformer_stack = networks.TransformerEncoder(
        vocab_size=vocab_size,
        num_layers=1,
        sequence_length=sequence_length,
        hidden_size=hidden_size,
        num_attention_heads=4,
    )
    word_ids = tf.keras.Input(shape=(sequence_length,), dtype=tf.int32)
    mask = tf.keras.Input(shape=(sequence_length,), dtype=tf.int32)
    type_ids = tf.keras.Input(shape=(sequence_length,), dtype=tf.int32)
    lm_outputs, _ = xformer_stack([word_ids, mask, type_ids])

    # Create a MaskedLM network from the transformer stack.
    test_network = networks.MaskedLM(
        num_predictions=num_predictions,
        input_width=lm_outputs.shape[-1],
        source_network=xformer_stack,
        output=output)

    # Create a model from the masked LM layer.
    lm_input_tensor = tf.keras.Input(shape=(sequence_length, hidden_size))
    masked_lm_positions = tf.keras.Input(
        shape=(num_predictions,), dtype=tf.int32)
    # Use a distinct name so the layer output doesn't shadow the `output` arg.
    masked_lm_output = test_network([lm_input_tensor, masked_lm_positions])
    return tf.keras.Model([lm_input_tensor, masked_lm_positions],
                          masked_lm_output)
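
For context, a minimal sketch of how the helper above might be driven. It assumes the snippet's module-level imports (`import tensorflow as tf` and the TF Model Garden's `networks` module at a version whose `MaskedLM` still accepts `source_network` and `output` arguments); since `self` is unused in the body, it is passed as `None`. Shapes below are illustrative only.

import numpy as np

# Hypothetical driver for create_lm_model above.
model = create_lm_model(
    None,  # `self` is unused in the helper's body
    vocab_size=100,
    sequence_length=16,
    hidden_size=32,
    num_predictions=2)

# Random "encoder outputs" and gather positions; with output="predictions" the
# MaskedLM head should return per-position vocabulary scores of shape
# (batch, num_predictions, vocab_size), i.e. (4, 2, 100) here.
lm_input = np.random.rand(4, 16, 32).astype(np.float32)
positions = np.random.randint(0, 16, size=(4, 2)).astype(np.int32)
print(model.predict([lm_input, positions]).shape)
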
    def __init__(
            self,
            num_masked_tokens: int,
            encoder_network: tf.keras.Model,
            mlm_initializer='glorot_uniform',
            classification_heads: Optional[List[tf.keras.layers.Layer]] = None,
            name: str = 'bert',
            **kwargs):
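        # Keep the constructor arguments for get_config()/serialization, and
        # disable Keras attribute tracking so this plain dict is not wrapped.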
        self._self_setattr_tracking = False
        self._config = {
            'encoder_network': encoder_network,
            'num_masked_tokens': num_masked_tokens,
            'mlm_initializer': mlm_initializer,
            'classification_heads': classification_heads,
            'name': name,
        }

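        # Reuse the encoder's own functional inputs and invoke the encoder once
        # to get the sequence output that feeds the pretraining heads below.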
        self.encoder_network = encoder_network
        inputs = copy.copy(self.encoder_network.inputs)
        sequence_output, _ = self.encoder_network(inputs)

        self.classification_heads = classification_heads or []
        if len(set([cls.name for cls in self.classification_heads])) != len(
                self.classification_heads):
            raise ValueError('Classification heads should have unique names.')

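        # Build the output dict: a masked-LM head (only when num_masked_tokens
        # is positive) plus one named output per classification head.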
        outputs = dict()
        if num_masked_tokens > 0:
            self.masked_lm = networks.MaskedLM(
                num_predictions=num_masked_tokens,
                input_width=sequence_output.shape[-1],
                source_network=self.encoder_network,
                initializer=mlm_initializer,
                name='masked_lm')
            masked_lm_positions = copy.copy(self.masked_lm.inputs[-1])
            inputs.append(masked_lm_positions)
            outputs['lm_output'] = self.masked_lm(
                [sequence_output, masked_lm_positions])
        for cls_head in self.classification_heads:
            outputs[cls_head.name] = cls_head(sequence_output)

        super(BertPretrainerV2, self).__init__(inputs=inputs,
                                               outputs=outputs,
                                               name=name,
                                               **kwargs)
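
A hypothetical instantiation of the constructor above, assuming it belongs to a `BertPretrainerV2(tf.keras.Model)` subclass as in the TF Model Garden, and that `ClassificationHead` is available from `official.nlp.modeling.layers`; both of those names are assumptions based on that layout, not taken from this snippet.

from official.nlp.modeling import layers, networks

# Small encoder matching the TransformerEncoder arguments used earlier.
encoder = networks.TransformerEncoder(
    vocab_size=30522,
    num_layers=2,
    sequence_length=128,
    hidden_size=64,
    num_attention_heads=4)

pretrainer = BertPretrainerV2(
    num_masked_tokens=20,
    encoder_network=encoder,
    classification_heads=[
        layers.ClassificationHead(
            inner_dim=64, num_classes=2, name='next_sentence')])

# The functional model maps the encoder inputs plus masked_lm_positions to a
# dict with 'lm_output' and one entry per classification head.
pretrainer.summary()
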
Example #3
  def __init__(self,
               network,
               num_classes,
               num_token_predictions,
               float_type,
               activation=None,
               output_activation=None,
               initializer='glorot_uniform',
               output='logits',
               **kwargs):
    self._self_setattr_tracking = False
    self._config = {
        'network': network,
        'num_classes': num_classes,
        'num_token_predictions': num_token_predictions,
        'activation': activation,
        'output_activation': output_activation,
        'initializer': initializer,
        'output': output,
    }

    # We want to use the inputs of the passed network as the inputs to this
    # Model. To do this, we need to keep a copy of the network inputs for use
    # when we construct the Model object at the end of init. (We keep a copy
    # because we'll be adding another tensor to the copy later.)
    network_inputs = network.inputs
    inputs = copy.copy(network_inputs)

    # Because we have a copy of inputs to create this Model object, we can
    # invoke the Network object with its own input tensors to start the Model.
    # Note that, because of how deferred construction happens, we can't use
    # the copy of the list here - by the time the network is invoked, the list
    # object contains the additional input added below.
    sequence_output, cls_output = network(network_inputs)

    sequence_output_length = sequence_output.shape.as_list()[1]
    if sequence_output_length < num_token_predictions:
      raise ValueError(
          "The passed network's output length is %s, which is less than the "
          'requested num_token_predictions %s.' %
          (sequence_output_length, num_token_predictions))

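    # Add an extra input for the positions of the masked tokens, then attach a
    # MaskedLM head that gathers those positions from the sequence output.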
    masked_lm_positions = tf.keras.layers.Input(
        shape=(num_token_predictions,),
        name='masked_lm_positions',
        dtype=tf.int32)
    inputs.append(masked_lm_positions)

    self.masked_lm = networks.MaskedLM(
        num_predictions=num_token_predictions,
        input_width=sequence_output.shape[-1],
        source_network=network,
        float_type=float_type,
        activation=activation,
        initializer=initializer,
        output=output,
        name='masked_lm')
    lm_outputs = self.masked_lm([sequence_output, masked_lm_positions])

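    # Create the sentence-level classification network on top of the encoder's
    # pooled [CLS] output.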
    self.classification = networks.Classification(
        input_width=cls_output.shape[-1],
        num_classes=num_classes,
        initializer=initializer,
        output=output,
        name='classification')
    sentence_outputs = self.classification(cls_output)

    super(BertPretrainer, self).__init__(
        inputs=inputs, outputs=[lm_outputs, sentence_outputs], **kwargs)
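
Likewise, a hedged usage sketch for this older constructor, assuming the surrounding class is the Model Garden's `BertPretrainer(tf.keras.Model)` at a version matching this snippet (one whose `MaskedLM` still takes `source_network` and `float_type`); the import path and encoder arguments mirror the earlier snippets rather than any particular release.

import tensorflow as tf
from official.nlp.modeling import networks

encoder = networks.TransformerEncoder(
    vocab_size=30522,
    num_layers=2,
    sequence_length=128,
    hidden_size=64,
    num_attention_heads=4)

pretrainer = BertPretrainer(
    network=encoder,
    num_classes=2,                 # e.g. next-sentence prediction
    num_token_predictions=20,
    float_type=tf.float32,
    output='predictions')

# Inputs are the encoder's word_ids/mask/type_ids plus masked_lm_positions;
# outputs are [masked-LM predictions, sentence-level classification output].
pretrainer.summary()
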