# Imports assumed by the method excerpts below (TF Model Garden module paths).
import copy
from typing import List, Optional

import tensorflow as tf

from official.nlp.modeling import networks


def create_lm_model(self,
                    vocab_size,
                    sequence_length,
                    hidden_size,
                    num_predictions,
                    output="predictions"):
  # First, create a transformer stack that we can use to get the LM's
  # vocabulary weight.
  xformer_stack = networks.TransformerEncoder(
      vocab_size=vocab_size,
      num_layers=1,
      sequence_length=sequence_length,
      hidden_size=hidden_size,
      num_attention_heads=4,
  )
  word_ids = tf.keras.Input(shape=(sequence_length,), dtype=tf.int32)
  mask = tf.keras.Input(shape=(sequence_length,), dtype=tf.int32)
  type_ids = tf.keras.Input(shape=(sequence_length,), dtype=tf.int32)
  lm_outputs, _ = xformer_stack([word_ids, mask, type_ids])

  # Create a MaskedLM network from the transformer stack.
  test_network = networks.MaskedLM(
      num_predictions=num_predictions,
      input_width=lm_outputs.shape[-1],
      source_network=xformer_stack,
      output=output)

  # Create a model from the masked LM network.
  lm_input_tensor = tf.keras.Input(shape=(sequence_length, hidden_size))
  masked_lm_positions = tf.keras.Input(
      shape=(num_predictions,), dtype=tf.int32)
  output = test_network([lm_input_tensor, masked_lm_positions])
  return tf.keras.Model([lm_input_tensor, masked_lm_positions], output)
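# Hedged usage sketch (not from the source): how the helper above might be
# exercised from a test. The numpy inputs, shapes, and the _example_* name
# are illustrative assumptions.
import numpy as np


def _example_lm_model_usage(test_case):
  # `test_case` is an instance of the test class that defines create_lm_model.
  model = test_case.create_lm_model(
      vocab_size=100,
      sequence_length=32,
      hidden_size=64,
      num_predictions=21)
  lm_input = np.random.rand(3, 32, 64).astype('float32')            # (batch, seq, hidden)
  positions = np.random.randint(0, 32, size=(3, 21)).astype('int32')  # masked token positions
  # Returns per-position predictions of shape (3, 21, vocab_size).
  return model.predict([lm_input, positions])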
def __init__(
    self,
    num_masked_tokens: int,
    encoder_network: tf.keras.Model,
    mlm_initializer='glorot_uniform',
    classification_heads: Optional[List[tf.keras.layers.Layer]] = None,
    name: str = 'bert',
    **kwargs):
  self._self_setattr_tracking = False
  self._config = {
      'encoder_network': encoder_network,
      'num_masked_tokens': num_masked_tokens,
      'mlm_initializer': mlm_initializer,
      'classification_heads': classification_heads,
      'name': name,
  }

  # Reuse the encoder's own inputs as the pretrainer's inputs; keep a copy
  # because the masked LM positions input may be appended below.
  self.encoder_network = encoder_network
  inputs = copy.copy(self.encoder_network.inputs)
  sequence_output, _ = self.encoder_network(inputs)

  self.classification_heads = classification_heads or []
  if len(set([cls.name for cls in self.classification_heads])) != len(
      self.classification_heads):
    raise ValueError('Classification heads should have unique names.')

  # Only build the masked LM head when token predictions are requested.
  outputs = dict()
  if num_masked_tokens > 0:
    self.masked_lm = networks.MaskedLM(
        num_predictions=num_masked_tokens,
        input_width=sequence_output.shape[-1],
        source_network=self.encoder_network,
        initializer=mlm_initializer,
        name='masked_lm')
    masked_lm_positions = copy.copy(self.masked_lm.inputs[-1])
    inputs.append(masked_lm_positions)
    outputs['lm_output'] = self.masked_lm(
        [sequence_output, masked_lm_positions])

  # Each classification head consumes the sequence output and contributes a
  # named output.
  for cls_head in self.classification_heads:
    outputs[cls_head.name] = cls_head(sequence_output)

  super(BertPretrainerV2, self).__init__(
      inputs=inputs, outputs=outputs, name=name, **kwargs)
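# Hedged usage sketch (not from the source): constructing a BertPretrainerV2
# around a small encoder. Encoder sizes and the _example_* name are
# illustrative assumptions.
def _example_bert_pretrainer_v2():
  encoder = networks.TransformerEncoder(
      vocab_size=100,
      num_layers=2,
      sequence_length=32,
      hidden_size=64,
      num_attention_heads=4)
  pretrainer = BertPretrainerV2(
      num_masked_tokens=20, encoder_network=encoder)
  # The resulting Keras model accepts the encoder's inputs plus
  # masked_lm_positions and returns a dict containing 'lm_output'.
  return pretrainer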
def __init__(self,
             network,
             num_classes,
             num_token_predictions,
             float_type,
             activation=None,
             output_activation=None,
             initializer='glorot_uniform',
             output='logits',
             **kwargs):
  self._self_setattr_tracking = False
  self._config = {
      'network': network,
      'num_classes': num_classes,
      'num_token_predictions': num_token_predictions,
      'activation': activation,
      'output_activation': output_activation,
      'initializer': initializer,
      'output': output,
  }

  # We want to use the inputs of the passed network as the inputs to this
  # Model. To do this, we need to keep a copy of the network inputs for use
  # when we construct the Model object at the end of init. (We keep a copy
  # because we'll be adding another tensor to the copy later.)
  network_inputs = network.inputs
  inputs = copy.copy(network_inputs)

  # Because we have a copy of inputs to create this Model object, we can
  # invoke the Network object with its own input tensors to start the Model.
  # Note that, because of how deferred construction happens, we can't use
  # the copy of the list here - by the time the network is invoked, the list
  # object contains the additional input added below.
  sequence_output, cls_output = network(network_inputs)
  sequence_output_length = sequence_output.shape.as_list()[1]
  if sequence_output_length < num_token_predictions:
    raise ValueError(
        "The passed network's output length is %s, which is less than the "
        'requested num_token_predictions %s.' %
        (sequence_output_length, num_token_predictions))

  masked_lm_positions = tf.keras.layers.Input(
      shape=(num_token_predictions,),
      name='masked_lm_positions',
      dtype=tf.int32)
  inputs.append(masked_lm_positions)

  self.masked_lm = networks.MaskedLM(
      num_predictions=num_token_predictions,
      input_width=sequence_output.shape[-1],
      source_network=network,
      float_type=float_type,
      activation=activation,
      initializer=initializer,
      output=output,
      name='masked_lm')
  lm_outputs = self.masked_lm([sequence_output, masked_lm_positions])

  self.classification = networks.Classification(
      input_width=cls_output.shape[-1],
      num_classes=num_classes,
      initializer=initializer,
      output=output,
      name='classification')
  sentence_outputs = self.classification(cls_output)

  super(BertPretrainer, self).__init__(
      inputs=inputs,
      outputs=[lm_outputs, sentence_outputs],
      **kwargs)
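# Hedged usage sketch (not from the source): constructing the original
# BertPretrainer with both the masked LM and classification outputs. Sizes,
# num_classes, and the _example_* name are illustrative assumptions.
def _example_bert_pretrainer():
  encoder = networks.TransformerEncoder(
      vocab_size=100,
      num_layers=2,
      sequence_length=32,
      hidden_size=64,
      num_attention_heads=4)
  pretrainer = BertPretrainer(
      network=encoder,
      num_classes=2,                 # e.g. next-sentence prediction
      num_token_predictions=20,
      float_type=tf.float32)
  # The model outputs [lm_outputs, sentence_outputs]: per-position predictions
  # for the masked tokens and sentence-level classification outputs.
  return pretrainer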