def test_inference_no_head_long(self):
    """Check summary statistics of the base model output on a long input.

    Runs the pretrained ``allenai/longformer-base-4096`` checkpoint on a
    4002-token sequence with global attention switched on at three positions,
    then compares the sum and the mean of the first model output against
    stored reference values.
    """
    model = TFLongformerModel.from_pretrained("allenai/longformer-base-4096")

    # 'Hello world! ' repeated 1000 times, wrapped in the ids 0 and 2
    token_ids = [0] + [20920, 232, 328, 1437] * 1000 + [2]
    input_ids = tf.convert_to_tensor([token_ids], dtype=tf.dtypes.int32)
    input_shape = shape_list(input_ids)

    attention_mask = tf.ones(input_shape, dtype=tf.dtypes.int32)

    # Set global attention on a few random positions
    global_positions = tf.constant([[0, 1], [0, 4], [0, 21]])
    global_attention_mask = tf.tensor_scatter_nd_update(
        tf.zeros(input_shape, dtype=tf.dtypes.int32),
        global_positions,
        tf.constant([1, 1, 1]),
    )

    model_outputs = model(
        input_ids,
        attention_mask=attention_mask,
        global_attention_mask=global_attention_mask,
    )
    output = model_outputs[0]

    expected_output_sum = tf.constant(74585.875)
    expected_output_mean = tf.constant(0.024267)

    # assert close
    tf.debugging.assert_near(tf.reduce_sum(output), expected_output_sum, rtol=1e-4)
    tf.debugging.assert_near(tf.reduce_mean(output), expected_output_mean, rtol=1e-4)
def test_diagonalize(self):
    """Check the output layout of `TFLongformerSelfAttention._pad_and_diagonalize`.

    The fixture is reshaped to (1, 8, 4) and chunked with `window_overlap=2`.
    After padding, the last dimension must grow by `window_overlap_size - 1`;
    the first row must be the chunk's first row followed by three zeros, and
    the last row must be three zeros followed by the chunk's last row.
    """
    hidden_states = self._get_hidden_states()
    hidden_states = tf.reshape(hidden_states, (1, 8, 4))  # set seq length = 8, hidden dim = 4

    chunked_hidden_states = TFLongformerSelfAttention._chunk(hidden_states, window_overlap=2)
    window_overlap_size = shape_list(chunked_hidden_states)[2]
    # assertEqual (instead of assertTrue on a comparison) reports both values on failure.
    self.assertEqual(window_overlap_size, 4)

    padded_hidden_states = TFLongformerSelfAttention._pad_and_diagonalize(chunked_hidden_states)
    self.assertEqual(
        shape_list(padded_hidden_states)[-1],
        shape_list(chunked_hidden_states)[-1] + window_overlap_size - 1,
    )

    three_zeros = tf.zeros((3,), dtype=tf.dtypes.float32)

    # first row => [0.4983, 2.6918, -0.0071, 1.0492, 0.0000, 0.0000, 0.0000]
    tf.debugging.assert_near(
        padded_hidden_states[0, 0, 0, :4], chunked_hidden_states[0, 0, 0], rtol=1e-3
    )
    tf.debugging.assert_near(padded_hidden_states[0, 0, 0, 4:], three_zeros, rtol=1e-3)

    # last row => [0.0000, 0.0000, 0.0000, 2.0514, -1.1600, 0.5372, 0.2629]
    tf.debugging.assert_near(
        padded_hidden_states[0, 0, -1, 3:], chunked_hidden_states[0, 0, -1], rtol=1e-3
    )
    tf.debugging.assert_near(padded_hidden_states[0, 0, -1, :3], three_zeros, rtol=1e-3)
def create_and_check_model_with_global_attention_mask(
    self,
    config,
    input_ids,
    token_type_ids,
    input_mask,
    sequence_labels,
    token_labels,
    choice_labels,
):
    """Run `TFLongformerModel` with a global attention mask and check output shapes.

    The global attention mask is zero over the first half of the sequence and
    one over the second half. The model is called with three different
    argument combinations; shapes are asserted on the result of the last call.
    """
    config.return_dict = True
    model = TFLongformerModel(config=config)

    # Global attention off for the first half of the positions, on for the rest.
    split_at = shape_list(input_mask)[-1] // 2
    no_global = tf.zeros_like(input_mask)[:, :split_at]
    with_global = tf.ones_like(input_mask)[:, split_at:]
    global_attention_mask = tf.concat([no_global, with_global], axis=-1)

    # The first two calls only verify that these argument combinations run.
    model(
        input_ids,
        attention_mask=input_mask,
        global_attention_mask=global_attention_mask,
        token_type_ids=token_type_ids,
    )
    model(
        input_ids,
        token_type_ids=token_type_ids,
        global_attention_mask=global_attention_mask,
    )
    result = model(input_ids, global_attention_mask=global_attention_mask)

    expected_hidden_shape = [self.batch_size, self.seq_length, self.hidden_size]
    expected_pooled_shape = [self.batch_size, self.hidden_size]
    self.parent.assertListEqual(shape_list(result.last_hidden_state), expected_hidden_shape)
    self.parent.assertListEqual(shape_list(result.pooler_output), expected_pooled_shape)
def create_and_check_gpt2_model_attention_mask_past(
    self, config, input_ids, input_mask, head_mask, token_type_ids, *args
):
    """Check that cached decoding matches a full pass when part of the input is masked.

    The second half of the sequence is masked out in the attention mask. One
    position inside that masked half is overwritten with other token ids, a
    new token is appended, and the hidden state of the new token is compared
    between a full forward pass and a pass that reuses `past`.
    """
    model = TFGPT2Model(config=config)

    # create attention mask: ones over the first half, zeros over the rest
    half_seq_length = self.seq_length // 2
    visible_part = tf.ones((self.batch_size, half_seq_length), dtype=tf.int32)
    masked_part = tf.zeros(
        (self.batch_size, self.seq_length - half_seq_length), dtype=tf.int32
    )
    attn_mask = tf.concat([visible_part, masked_part], axis=1)

    # first forward pass
    _, past = model(input_ids, attention_mask=attn_mask).to_tuple()

    # create hypothetical next token and extent to next_input_ids
    next_tokens = ids_tensor((self.batch_size, 1), config.vocab_size)

    # change a random masked slice from input_ids
    random_seq_idx_to_change = ids_tensor((1,), half_seq_length).numpy() + 1
    random_other_next_tokens = ids_tensor(
        (self.batch_size, self.seq_length), config.vocab_size
    )
    target_position = self.seq_length - random_seq_idx_to_change
    vector_condition = tf.range(self.seq_length) == target_position
    # Expand the per-position flag to a (batch_size, seq_length) boolean mask.
    per_position = tf.expand_dims(vector_condition, -1)
    condition = tf.transpose(
        tf.broadcast_to(per_position, (self.seq_length, self.batch_size))
    )
    input_ids = tf.where(condition, random_other_next_tokens, input_ids)

    # append to next input_ids and attn_mask
    next_input_ids = tf.concat([input_ids, next_tokens], axis=-1)
    new_token_mask = tf.ones((shape_list(attn_mask)[0], 1), dtype=tf.int32)
    attn_mask = tf.concat([attn_mask, new_token_mask], axis=1)

    # get two different outputs
    output_from_no_past = model(next_input_ids, attention_mask=attn_mask)["last_hidden_state"]
    output_from_past = model(next_tokens, past=past, attention_mask=attn_mask)[
        "last_hidden_state"
    ]

    # select random slice
    hidden_dim = shape_list(output_from_past)[-1]
    random_slice_idx = int(ids_tensor((1,), hidden_dim))
    output_from_no_past_slice = output_from_no_past[:, -1, random_slice_idx]
    output_from_past_slice = output_from_past[:, 0, random_slice_idx]

    # test that outputs are equal for slice
    tf.debugging.assert_near(output_from_past_slice, output_from_no_past_slice, rtol=1e-12)
def create_and_check_model(
    self,
    config,
    input_ids,
    token_type_ids,
    input_mask,
    sequence_labels,
    token_labels,
    choice_labels,
):
    """Run `TFLongformerModel` with three argument combinations and check output shapes.

    Shapes of `last_hidden_state` and `pooler_output` are asserted on the
    result of the final call, which passes `input_ids` only.
    """
    config.return_dict = True
    model = TFLongformerModel(config=config)

    # The first two calls only verify that these argument combinations run.
    model(input_ids, attention_mask=input_mask, token_type_ids=token_type_ids)
    model(input_ids, token_type_ids=token_type_ids)
    result = model(input_ids)

    expected_hidden_shape = [self.batch_size, self.seq_length, self.hidden_size]
    expected_pooled_shape = [self.batch_size, self.hidden_size]
    self.parent.assertListEqual(shape_list(result.last_hidden_state), expected_hidden_shape)
    self.parent.assertListEqual(shape_list(result.pooler_output), expected_pooled_shape)
def create_and_check_gpt2_model_past(
    self, config, input_ids, input_mask, head_mask, token_type_ids, *args
):
    """Check that decoding one token with `past` matches a full forward pass.

    Also verifies that the output has one more element with caching enabled
    (explicitly or through the default) than with `use_cache=False`.
    """
    model = TFGPT2Model(config=config)

    # first forward pass
    outputs = model(input_ids, token_type_ids=token_type_ids, use_cache=True)
    outputs_use_cache_conf = model(input_ids, token_type_ids=token_type_ids)
    outputs_no_past = model(input_ids, token_type_ids=token_type_ids, use_cache=False)

    self.parent.assertTrue(len(outputs) == len(outputs_use_cache_conf))
    self.parent.assertTrue(len(outputs) == len(outputs_no_past) + 1)

    _, past = outputs.to_tuple()

    # create hypothetical next token and extent to next_input_ids
    next_tokens = ids_tensor((self.batch_size, 1), config.vocab_size)
    next_token_types = ids_tensor([self.batch_size, 1], self.type_vocab_size)

    # append to next input_ids and token_type_ids
    next_input_ids = tf.concat([input_ids, next_tokens], axis=-1)
    next_token_type_ids = tf.concat([token_type_ids, next_token_types], axis=-1)

    full_pass = model(next_input_ids, token_type_ids=next_token_type_ids)
    output_from_no_past = full_pass["last_hidden_state"]
    cached_pass = model(next_tokens, token_type_ids=next_token_types, past=past)
    output_from_past = cached_pass["last_hidden_state"]

    # select random slice
    hidden_dim = shape_list(output_from_past)[-1]
    random_slice_idx = int(ids_tensor((1,), hidden_dim))
    output_from_no_past_slice = output_from_no_past[:, -1, random_slice_idx]
    output_from_past_slice = output_from_past[:, 0, random_slice_idx]

    # test that outputs are equal for slice
    tf.debugging.assert_near(output_from_past_slice, output_from_no_past_slice, rtol=1e-6)
def call(
    self,
    hidden_states: tf.Tensor,
    head_mask: tf.Tensor,
    output_attentions: bool,
    relative_position_bias: Optional["TFData2VecVisionRelativePositionBias"] = None,
    training: bool = False,
) -> Tuple[tf.Tensor]:
    """Compute self-attention over `hidden_states`.

    Args:
        hidden_states: Input tensor fed to the query, key and value projections.
        head_mask: When not None, multiplied element-wise into the attention
            probabilities.
        output_attentions: Whether to also return the attention probabilities.
        relative_position_bias: Optional shared bias added to the attention
            scores when provided.
        training: Forwarded to the dropout layer.

    Returns:
        `(attention_output,)`, or `(attention_output, attention_probs)` when
        `output_attentions` is truthy.
    """
    batch_size = shape_list(hidden_states)[0]

    # Project the input, then rearrange each projection with `transpose_for_scores`.
    query_layer = self.transpose_for_scores(self.query(inputs=hidden_states), batch_size)
    key_layer = self.transpose_for_scores(self.key(inputs=hidden_states), batch_size)
    value_layer = self.transpose_for_scores(self.value(inputs=hidden_states), batch_size)

    # Take the dot product between "query" and "key" to get the raw attention scores.
    # (batch size, num_heads, seq_len_q, seq_len_k)
    raw_scores = tf.matmul(query_layer, key_layer, transpose_b=True)
    attention_scores = raw_scores / self.sqrt_att_head_size

    # Add relative position bias if present.
    if self.relative_position_bias is not None:
        # Passing `0.0` to the `relative_position_bias()` layer because otherwise Keras
        # might complain about `Layer.call()` not being invoked properly. In this case this input
        # i.e., 0.0 is not going to be used in any calculations so we're safe.
        own_bias = self.relative_position_bias(0.0)[None, ...]
        attention_scores = attention_scores + own_bias

    # Add shared relative position bias if provided.
    if relative_position_bias is not None:
        attention_scores = attention_scores + relative_position_bias

    # Normalize the attention scores to probabilities.
    attention_probs = stable_softmax(logits=attention_scores, axis=-1)

    # This is actually dropping out entire tokens to attend to, which might
    # seem a bit unusual, but is taken from the original Transformer paper.
    attention_probs = self.dropout(inputs=attention_probs, training=training)

    # Mask heads if we want to
    if head_mask is not None:
        attention_probs = tf.multiply(attention_probs, head_mask)

    context = tf.matmul(attention_probs, value_layer)
    context = tf.transpose(context, perm=[0, 2, 1, 3])

    # (batch_size, seq_len_q, all_head_size)
    attention_output = tf.reshape(
        tensor=context, shape=(batch_size, -1, self.all_head_size)
    )

    if output_attentions:
        return (attention_output, attention_probs)
    return (attention_output,)
def create_and_check_for_question_answering(
    self,
    config,
    input_ids,
    token_type_ids,
    input_mask,
    sequence_labels,
    token_labels,
    choice_labels,
):
    """Run `TFLongformerForQuestionAnswering` and check the start/end logits shapes.

    `sequence_labels` is used for both the start and the end positions.
    """
    config.return_dict = True
    model = TFLongformerForQuestionAnswering(config=config)

    result = model(
        input_ids,
        attention_mask=input_mask,
        token_type_ids=token_type_ids,
        start_positions=sequence_labels,
        end_positions=sequence_labels,
    )

    expected_shape = [self.batch_size, self.seq_length]
    self.parent.assertListEqual(shape_list(result.start_logits), expected_shape)
    self.parent.assertListEqual(shape_list(result.end_logits), expected_shape)
def create_and_check_for_sequence_classification(
    self,
    config,
    input_ids,
    token_type_ids,
    input_mask,
    sequence_labels,
    token_labels,
    choice_labels,
):
    """Run `TFLongformerForSequenceClassification` and check the logits shape.

    The logits must have shape `[batch_size, num_labels]`.
    """
    config.num_labels = self.num_labels
    model = TFLongformerForSequenceClassification(config=config)

    result = model(
        input_ids,
        attention_mask=input_mask,
        token_type_ids=token_type_ids,
        labels=sequence_labels,
    )
    output = result.logits

    self.parent.assertListEqual(shape_list(output), [self.batch_size, self.num_labels])
def create_and_check_for_masked_lm(
    self,
    config,
    input_ids,
    token_type_ids,
    input_mask,
    sequence_labels,
    token_labels,
    choice_labels,
):
    """Run `TFLongformerForMaskedLM` and check the logits shape.

    The logits must have shape `[batch_size, seq_length, vocab_size]`.
    """
    config.return_dict = True
    model = TFLongformerForMaskedLM(config=config)

    result = model(
        input_ids,
        attention_mask=input_mask,
        token_type_ids=token_type_ids,
        labels=token_labels,
    )

    expected_shape = [self.batch_size, self.seq_length, self.vocab_size]
    self.parent.assertListEqual(shape_list(result.logits), expected_shape)
def create_and_check_gpt2_model_past_large_inputs(
    self, config, input_ids, input_mask, head_mask, token_type_ids, *args
):
    """Check that decoding three tokens at once with `past` matches a full pass.

    Only the first example of the batch is used. Note that this method
    overwrites `self.batch_size` with 1.
    """
    model = TFGPT2Model(config=config)

    # Keep only the first example of each input.
    input_ids = input_ids[:1, :]
    input_mask = input_mask[:1, :]
    token_type_ids = token_type_ids[:1, :]
    self.batch_size = 1

    # first forward pass
    outputs = model(
        input_ids,
        attention_mask=input_mask,
        token_type_ids=token_type_ids,
        use_cache=True,
    )
    _, past = outputs.to_tuple()

    # create hypothetical next token and extent to next_input_ids
    continuation_shape = (self.batch_size, 3)
    next_tokens = ids_tensor(continuation_shape, config.vocab_size)
    next_attn_mask = ids_tensor(continuation_shape, 2)
    next_token_types = ids_tensor(continuation_shape, self.type_vocab_size)

    # append to next input_ids and token_type_ids
    next_input_ids = tf.concat([input_ids, next_tokens], axis=-1)
    next_attention_mask = tf.concat([input_mask, next_attn_mask], axis=-1)
    next_token_type_ids = tf.concat([token_type_ids, next_token_types], axis=-1)

    full_pass = model(
        next_input_ids,
        token_type_ids=next_token_type_ids,
        attention_mask=next_attention_mask,
    )
    output_from_no_past = full_pass["last_hidden_state"]
    cached_pass = model(
        next_tokens,
        token_type_ids=next_token_types,
        attention_mask=next_attention_mask,
        past=past,
    )
    output_from_past = cached_pass["last_hidden_state"]
    self.parent.assertTrue(output_from_past.shape[1] == next_tokens.shape[1])

    # select random slice
    hidden_dim = shape_list(output_from_past)[-1]
    random_slice_idx = int(ids_tensor((1,), hidden_dim))
    output_from_no_past_slice = output_from_no_past[:, -3:, random_slice_idx]
    output_from_past_slice = output_from_past[:, :, random_slice_idx]

    # test that outputs are equal for slice
    tf.debugging.assert_near(output_from_past_slice, output_from_no_past_slice, rtol=1e-3)
def test_pad_and_transpose_last_two_dims(self):
    """Check `TFLongformerSelfAttention._pad_and_transpose_last_two_dims`.

    The fixture is chunked with `window_overlap=2`, padded by one entry along
    the third dimension, and the result must have shape [1, 1, 8, 5]. The
    last row of the result must be all zeros, and elements 24..31 of the
    flattened result must equal the last row of the chunked input.
    """
    hidden_states = self._get_hidden_states()
    # The previous `assertTrue(shape, [1, 8, 4])` treated the list as the failure
    # message and never compared anything. The assertions below (an 8-element last
    # row and a [1, 1, 8, 5] result after padding one row) imply a (1, 4, 8) fixture.
    # NOTE(review): confirm against `_get_hidden_states`.
    self.assertEqual(shape_list(hidden_states), [1, 4, 8])

    # pad along seq length dim
    paddings = tf.constant([[0, 0], [0, 0], [0, 1], [0, 0]], dtype=tf.dtypes.int32)

    hidden_states = TFLongformerSelfAttention._chunk(hidden_states, window_overlap=2)
    padded_hidden_states = TFLongformerSelfAttention._pad_and_transpose_last_two_dims(
        hidden_states, paddings
    )
    self.assertEqual(shape_list(padded_hidden_states), [1, 1, 8, 5])

    expected_added_dim = tf.zeros((5,), dtype=tf.dtypes.float32)
    tf.debugging.assert_near(expected_added_dim, padded_hidden_states[0, 0, -1, :], rtol=1e-6)
    tf.debugging.assert_near(
        hidden_states[0, 0, -1, :],
        tf.reshape(padded_hidden_states, (1, -1))[0, 24:32],
        rtol=1e-6,
    )
def test_inference_no_head(self):
    """Check a slice of the base model output on a short input.

    Runs the pretrained ``allenai/longformer-base-4096`` checkpoint with and
    without an all-ones attention mask; in both cases the last five values of
    the first position must match the stored reference slice.
    """
    model = TFLongformerModel.from_pretrained("allenai/longformer-base-4096")

    # 'Hello world!'
    input_ids = tf.convert_to_tensor(
        [[0, 20920, 232, 328, 1437, 2]], dtype=tf.dtypes.int32
    )
    attention_mask = tf.ones(shape_list(input_ids), dtype=tf.dtypes.int32)

    output = model(input_ids, attention_mask=attention_mask)[0]
    output_without_mask = model(input_ids)[0]

    expected_output_slice = tf.convert_to_tensor(
        [0.0549, 0.1087, -0.1119, -0.0368, 0.0250], dtype=tf.dtypes.float32
    )

    tf.debugging.assert_near(output[0, 0, -5:], expected_output_slice, rtol=1e-3)
    tf.debugging.assert_near(output_without_mask[0, 0, -5:], expected_output_slice, rtol=1e-3)
def test_chunk(self):
    """Check the shape and two reference slices of `TFLongformerSelfAttention._chunk`.

    The fixture is reshaped to (1, 8, 4) and chunked with `window_overlap=2`;
    the result must have shape [1, 3, 4, 4].
    """
    batch_size = 1
    seq_length = 8
    hidden_size = 4

    hidden_states = self._get_hidden_states()
    hidden_states = tf.reshape(hidden_states, (batch_size, seq_length, hidden_size))

    chunked_hidden_states = TFLongformerSelfAttention._chunk(hidden_states, window_overlap=2)

    # expected slices across chunk and seq length dim
    expected_slice_along_seq_length = tf.convert_to_tensor(
        [0.4983, -0.7584, -1.6944], dtype=tf.dtypes.float32
    )
    expected_slice_along_chunk = tf.convert_to_tensor(
        [0.4983, -1.8348, -0.7584, 2.0514], dtype=tf.dtypes.float32
    )

    # assertEqual (instead of assertTrue on a comparison) reports both shapes on failure.
    self.assertEqual(shape_list(chunked_hidden_states), [1, 3, 4, 4])
    tf.debugging.assert_near(
        chunked_hidden_states[0, :, 0, 0], expected_slice_along_seq_length, rtol=1e-3
    )
    tf.debugging.assert_near(
        chunked_hidden_states[0, 0, :, 0], expected_slice_along_chunk, rtol=1e-3
    )
def call(self, pixel_values: tf.Tensor, training: bool = False) -> tf.Tensor:
    """Project an image batch into a sequence of patch embeddings.

    Args:
        pixel_values: Tensor unpacked as (batch_size, num_channels, height, width).
        training: Accepted for interface compatibility; not used in this method.

    Returns:
        The projection reshaped to (batch_size, num_patches, -1).

    Raises:
        ValueError: If the size check runs and the input height/width differ
            from `self.image_size`.
    """
    batch_size, _, height, width = shape_list(pixel_values)

    # The size check only runs when both dimensions expose a `numpy` attribute.
    dims_have_numpy = getattr(height, "numpy", None) and getattr(width, "numpy", None)
    if dims_have_numpy and (height != self.image_size[0] or width != self.image_size[1]):
        raise ValueError(
            f"Input image size ({height}*{width}) doesn't match model"
            f" ({self.image_size[0]}*{self.image_size[1]})."
        )

    # When running on CPU, `tf.keras.layers.Conv2D` doesn't support `NCHW` format.
    # So change the input format from `NCHW` to `NHWC`.
    # shape = (batch_size, in_height, in_width, in_channels=num_channels)
    channels_last = tf.transpose(pixel_values, perm=(0, 2, 3, 1))
    projection = self.projection(channels_last)

    # Change the 2D spatial dimensions to a single temporal dimension.
    # shape = (batch_size, num_patches, out_channels=embed_dim)
    patches_per_row = width // self.patch_size[1]
    patches_per_column = height // self.patch_size[0]
    num_patches = patches_per_row * patches_per_column

    return tf.reshape(tensor=projection, shape=(batch_size, num_patches, -1))
def call(self, pixel_values: tf.Tensor, bool_masked_pos: Optional[tf.Tensor] = None) -> tf.Tensor:
    """Build the embedding sequence for a batch of images.

    Args:
        pixel_values: Input passed to `self.patch_embeddings`.
        bool_masked_pos: Optional mask; where it is set, the patch embedding
            is replaced by `self.mask_token`.

    Returns:
        The patch embeddings with `self.cls_token` prepended along axis 1,
        plus `self.position_embeddings` when that attribute is not None,
        passed through `self.dropout`.
    """
    embeddings = self.patch_embeddings(pixel_values)
    batch_size, seq_len, projection_dim = shape_list(embeddings)

    if bool_masked_pos is not None:
        mask_tokens = tf.broadcast_to(self.mask_token, (batch_size, seq_len, projection_dim))
        # replace the masked visual tokens by mask_tokens
        mask_weight = tf.cast(bool_masked_pos[..., None], mask_tokens.dtype)
        # since TF doesn't support eager tensor assignment
        embeddings = embeddings * (1 - mask_weight) + mask_tokens * mask_weight

    # One copy of the class token per example, placed in front of the patches.
    cls_tokens = tf.tile(self.cls_token, (batch_size, 1, 1))
    embeddings = tf.concat([cls_tokens, embeddings], axis=1)

    if self.position_embeddings is not None:
        embeddings = embeddings + self.position_embeddings

    return self.dropout(embeddings)