def get_framework(model, revision: Optional[str] = None):
    """
    Select framework (TensorFlow or PyTorch) to use.

    Args:
        model (:obj:`str`, :class:`~transformers.PreTrainedModel` or :class:`~transformers.TFPreTrainedModel`):
            If both frameworks are installed, picks the one corresponding to the model passed (either a model class or
            the model name). If no specific model is provided, defaults to using PyTorch.
    """
    warnings.warn(
        "`get_framework` is deprecated and will be removed in v5, use `infer_framework_from_model` instead.",
        FutureWarning,
    )
    if not is_tf_available() and not is_torch_available():
        raise RuntimeError(
            "At least one of TensorFlow 2.0 or PyTorch should be installed. "
            "To install TensorFlow 2.0, read the instructions at https://www.tensorflow.org/install/ "
            "To install PyTorch, read the instructions at https://pytorch.org/."
        )
    if isinstance(model, str):
        if is_torch_available() and not is_tf_available():
            model = AutoModel.from_pretrained(model, revision=revision)
        elif is_tf_available() and not is_torch_available():
            model = TFAutoModel.from_pretrained(model, revision=revision)
        else:
            try:
                model = AutoModel.from_pretrained(model, revision=revision)
            except OSError:
                model = TFAutoModel.from_pretrained(model, revision=revision)

    framework = "tf" if model.__class__.__name__.startswith("TF") else "pt"
    return framework
def infer_framework_from_model(
    model,
    model_classes: Optional[Dict[str, type]] = None,
    revision: Optional[str] = None,
    task: Optional[str] = None,
    onnx: bool = True,
    graph_path: Optional[Path] = None,
):
    """
    Select framework (TensorFlow or PyTorch) to use from the :obj:`model` passed. Returns a tuple (framework, model).

    If :obj:`model` is instantiated, this function will just infer the framework from the model class. Otherwise
    :obj:`model` is actually a checkpoint name and this method will try to instantiate it using :obj:`model_classes`.
    Since we don't want to instantiate the model twice, this model is returned for use by the pipeline.

    If both frameworks are installed and available for :obj:`model`, PyTorch is selected.

    Args:
        model (:obj:`str`, :class:`~transformers.PreTrainedModel` or :class:`~transformers.TFPreTrainedModel`):
            The model to infer the framework from. If :obj:`str`, a checkpoint name.
        model_classes (dictionary :obj:`str` to :obj:`type`, `optional`):
            A mapping framework to class.
        revision (:obj:`str`, `optional`):
            The specific model version to use. It can be a branch name, a tag name, or a commit id, since we use a
            git-based system for storing models and other artifacts on huggingface.co, so ``revision`` can be any
            identifier allowed by git.
        task (:obj:`str`, `optional`):
            The task passed along by the calling pipeline (not used directly in this function).
        onnx (:obj:`bool`, `optional`, defaults to :obj:`True`):
            Whether the model is meant to be run through an ONNX graph.
        graph_path (:obj:`Path`, `optional`):
            Path to the ONNX graph. If ``onnx`` is :obj:`True` and this graph already exists, the checkpoint is not
            instantiated and the framework is inferred from the installed backends only.

    Returns:
        :obj:`Tuple`: A tuple framework, model.
    """
    if not is_tf_available() and not is_torch_available():
        raise RuntimeError(
            "At least one of TensorFlow 2.0 or PyTorch should be installed. "
            "To install TensorFlow 2.0, read the instructions at https://www.tensorflow.org/install/ "
            "To install PyTorch, read the instructions at https://pytorch.org/."
        )

    # Guard against `model_classes` being None so the `.get` lookups below don't fail.
    model_classes = model_classes if model_classes is not None else {}

    if (onnx and not os.path.exists(graph_path)) or not onnx:
        if isinstance(model, str):
            if is_torch_available() and not is_tf_available():
                model_class = model_classes.get("pt", AutoModel)
                model = model_class.from_pretrained(model, revision=revision)
            elif is_tf_available() and not is_torch_available():
                model_class = model_classes.get("tf", TFAutoModel)
                model = model_class.from_pretrained(model, revision=revision)
            else:
                try:
                    model_class = model_classes.get("pt", AutoModel)
                    model = model_class.from_pretrained(model, revision=revision)
                except OSError:
                    model_class = model_classes.get("tf", TFAutoModel)
                    model = model_class.from_pretrained(model, revision=revision)

        framework = "tf" if model.__class__.__name__.startswith("TF") else "pt"
    else:
        framework = "pt" if is_torch_available() else "tf"
    return framework, model
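# Usage sketch for `infer_framework_from_model` (not part of the library code above): the
# checkpoint name is a placeholder and at least one of PyTorch / TensorFlow must be installed.
# With `onnx=False` the checkpoint is instantiated and the framework is read off the model class.
framework, loaded_model = infer_framework_from_model(
    "distilbert-base-uncased",  # placeholder checkpoint name
    model_classes={},           # fall back to AutoModel / TFAutoModel
    onnx=False,
)
print(framework)  # "pt" if PyTorch is installed, otherwise "tf"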
class TFCLIPTextModelTest(TFModelTesterMixin, unittest.TestCase):

    all_model_classes = (TFCLIPTextModel,) if is_tf_available() else ()
    test_pruning = False
    test_head_masking = False
    test_onnx = False

    def setUp(self):
        self.model_tester = TFCLIPTextModelTester(self)
        self.config_tester = ConfigTester(self, config_class=CLIPTextConfig, hidden_size=37)

    def test_config(self):
        self.config_tester.run_common_tests()

    def test_model(self):
        config_and_inputs = self.model_tester.prepare_config_and_inputs()
        self.model_tester.create_and_check_model(*config_and_inputs)

    def test_inputs_embeds(self):
        # CLIP does not use inputs_embeds
        pass

    @slow
    def test_model_from_pretrained(self):
        for model_name in TF_CLIP_PRETRAINED_MODEL_ARCHIVE_LIST[:1]:
            model = TFCLIPTextModel.from_pretrained(model_name)
            self.assertIsNotNone(model)
def relevent_dialog_convert_examples_to_features(
    examples: Union[List[InputExample], "tf.data.Dataset"],
    tokenizer: PreTrainedTokenizer,
    max_length: Optional[int] = None,
    task=None,
    label_list=None,
    output_mode=None,
):
    """
    Loads a data file into a list of ``InputFeatures``

    Args:
        examples: List of ``InputExamples`` or ``tf.data.Dataset`` containing the examples.
        tokenizer: Instance of a tokenizer that will tokenize the examples
        max_length: Maximum example length. Defaults to the tokenizer's max_len
        task: GLUE task
        label_list: List of labels. Can be obtained from the processor using the ``processor.get_labels()`` method
        output_mode: String indicating the output mode. Either ``regression`` or ``classification``

    Returns:
        If the ``examples`` input is a ``tf.data.Dataset``, will return a ``tf.data.Dataset`` containing the
        task-specific features. If the input is a list of ``InputExamples``, will return a list of task-specific
        ``InputFeatures`` which can be fed to the model.
    """
    if is_tf_available() and isinstance(examples, tf.data.Dataset):
        if task is None:
            raise ValueError(
                "When calling relevent_dialog_convert_examples_to_features from TF, the task parameter is required."
            )
        return _tf_relevent_dialog_convert_examples_to_features(examples, tokenizer, max_length=max_length, task=task)
    return _relevent_dialog_convert_examples_to_features(
        examples, tokenizer, max_length=max_length, task=task, label_list=label_list, output_mode=output_mode
    )
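# Hedged usage sketch for the helper above; the `InputExample` fields follow the GLUE-style
# processors and the tokenizer checkpoint is a placeholder.
from transformers import AutoTokenizer

example_tokenizer = AutoTokenizer.from_pretrained("bert-base-uncased")
example_inputs = [InputExample(guid="dlg-0", text_a="How can I help you?", text_b="I need a refund.", label="0")]
example_features = relevent_dialog_convert_examples_to_features(
    example_inputs,
    example_tokenizer,
    max_length=64,
    label_list=["0", "1"],
    output_mode="classification",
)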
def get_default_model(targeted_task: Dict, framework: Optional[str], task_options: Optional[Any]) -> str:
    """
    Select a default model to use for a given task. Defaults to pytorch if ambiguous.

    Args:
        targeted_task (:obj:`Dict`):
            Dictionary representing the given task, that should contain default models

        framework (:obj:`str`, None)
            "pt", "tf" or None, representing a specific framework if it was specified, or None if we don't know yet.

        task_options (:obj:`Any`, None)
            Any further value required by the task to get fully specified, for instance (SRC, TGT) languages for
            translation task.

    Returns

        :obj:`str` The model string representing the default model for this pipeline
    """
    if is_torch_available() and not is_tf_available():
        framework = "pt"
    elif is_tf_available() and not is_torch_available():
        framework = "tf"

    defaults = targeted_task["default"]
    if task_options:
        if task_options not in defaults:
            raise ValueError(f"The task does not provide any default models for options {task_options}")
        default_models = defaults[task_options]["model"]
    elif "model" in defaults:
        default_models = targeted_task["default"]["model"]
    else:
        # XXX This error message needs to be updated to be more generic if more tasks are going to become
        # parametrized
        raise ValueError('The task defaults can\'t be correctly selected. You probably meant "translation_XX_to_YY"')

    if framework is None:
        framework = "pt"

    return default_models[framework]
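# Illustrative only: a minimal, made-up `targeted_task` mapping in the shape this helper expects.
example_task = {"default": {"model": {"pt": "distilbert-base-uncased", "tf": "distilbert-base-uncased"}}}
print(get_default_model(example_task, framework=None, task_options=None))
# -> "distilbert-base-uncased" for whichever backend happens to be installed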
def export(
    tokenizer: PreTrainedTokenizer,
    model: Union[PreTrainedModel, TFPreTrainedModel],
    config: OnnxConfig,
    opset: int,
    output: Path,
) -> Tuple[List[str], List[str]]:
    """
    Export a Pytorch or TensorFlow model to an ONNX Intermediate Representation (IR)

    Args:
        tokenizer ([`PreTrainedTokenizer`]):
            The tokenizer used for encoding the data.
        model ([`PreTrainedModel`] or [`TFPreTrainedModel`]):
            The model to export.
        config ([`~onnx.config.OnnxConfig`]):
            The ONNX configuration associated with the exported model.
        opset (`int`):
            The version of the ONNX operator set to use.
        output (`Path`):
            Directory to store the exported ONNX model.

    Returns:
        `Tuple[List[str], List[str]]`: A tuple with an ordered list of the model's inputs, and the named inputs from
        the ONNX configuration.
    """
    if not (is_torch_available() or is_tf_available()):
        raise ImportError(
            "Cannot convert because neither PyTorch nor TensorFlow is installed. "
            "Please install torch or tensorflow first."
        )

    if is_torch_available():
        from transformers.file_utils import torch_version

        if not is_torch_onnx_dict_inputs_support_available():
            raise AssertionError(f"Unsupported PyTorch version, minimum required is 1.8.0, got: {torch_version}")

    if is_torch_available() and issubclass(type(model), PreTrainedModel):
        return export_pytorch(tokenizer, model, config, opset, output)
    elif is_tf_available() and issubclass(type(model), TFPreTrainedModel):
        return export_tensorflow(tokenizer, model, config, opset, output)
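# Hedged usage sketch for `export`; `MyOnnxConfig` is a hypothetical stand-in for whichever
# `OnnxConfig` subclass matches the architecture, and the checkpoint/opset values are placeholders.
from pathlib import Path
from transformers import AutoModel, AutoTokenizer

export_tokenizer = AutoTokenizer.from_pretrained("distilbert-base-uncased")
export_model = AutoModel.from_pretrained("distilbert-base-uncased")
onnx_config = MyOnnxConfig(export_model.config)  # hypothetical OnnxConfig implementation
onnx_inputs, onnx_outputs = export(export_tokenizer, export_model, onnx_config, opset=11, output=Path("model.onnx"))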
def require_retrieval(test_case):
    """
    Decorator marking a test that requires a set of dependencies necessary for performing retrieval with
    :class:`~transformers.RagRetriever`.

    These tests are skipped when respective libraries are not installed.
    """
    if not (is_tf_available() and is_datasets_available() and is_faiss_available()):
        test_case = unittest.skip("test requires tensorflow, datasets and faiss")(test_case)
    return test_case
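# Sketch of how the decorator is typically applied; the test class below is illustrative only.
@require_retrieval
class ExampleRagRetrievalTest(unittest.TestCase):
    def test_retrieve(self):
        ...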
def set_seed(seed):
    random.seed(seed)
    np.random.seed(seed)
    if is_torch_available():
        torch.manual_seed(seed)
        # torch.cuda.manual_seed_all(seed)
    if is_tf_available():
        import tensorflow as tf

        tf.random.set_seed(seed)
def convert_tensorflow(nlp: Pipeline, opset: int, output: Path):
    """
    Export a TensorFlow backed pipeline to ONNX Intermediate Representation (IR)

    Args:
        nlp: The pipeline to be exported
        opset: The actual version of the ONNX operator set to use
        output: Path where the generated ONNX model will be stored

    Notes:
        TensorFlow cannot export models bigger than 2GB due to an internal constraint from TensorFlow
    """
    if not is_tf_available():
        raise Exception("Cannot convert because TF is not installed. Please install tensorflow first.")

    print("/!\\ Please note TensorFlow doesn't support exporting model > 2Gb /!\\")

    try:
        import tensorflow as tf
        import tf2onnx
        from tf2onnx import __version__ as t2ov

        print(f"Using framework TensorFlow: {tf.version.VERSION}, tf2onnx: {t2ov}")

        # Build
        input_names, output_names, dynamic_axes, tokens = infer_shapes(nlp, "tf")

        # Forward
        nlp.model.predict(tokens.data)
        input_signature = [tf.TensorSpec.from_tensor(tensor, name=key) for key, tensor in tokens.items()]
        model_proto, _ = tf2onnx.convert.from_keras(
            nlp.model, input_signature, opset=opset, output_path=output.as_posix()
        )

    except ImportError as e:
        raise Exception(
            f"Cannot import {e.name} required to convert TF model to ONNX. Please install {e.name} first. {e}"
        )
def get_models(module):
    """Get the objects in module that are models."""
    models = []
    model_classes = ()
    if is_torch_available():
        model_classes += (transformers.PreTrainedModel,)
    if is_tf_available():
        model_classes += (transformers.TFPreTrainedModel,)
    for attr_name in dir(module):
        if "Pretrained" in attr_name or "PreTrained" in attr_name:
            continue
        attr = getattr(module, attr_name)
        if isinstance(attr, type) and issubclass(attr, model_classes) and attr.__module__ == module.__name__:
            models.append((attr_name, attr))
    return models
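# Hedged example of calling `get_models` on one modeling module; the import path below matches
# recent transformers layouts and may differ in older versions.
from importlib import import_module

bert_module = import_module("transformers.models.bert.modeling_bert")
for model_name, model_cls in get_models(bert_module):
    print(model_name)  # e.g. "BertModel", "BertForSequenceClassification", ...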
def __init__(self, args: Namespace):
    self.logger = logging.get_logger("transformers-cli/training")

    self.framework = "tf" if is_tf_available() else "torch"

    os.makedirs(args.output, exist_ok=True)
    self.output = args.output

    self.column_label = args.column_label
    self.column_text = args.column_text
    self.column_id = args.column_id

    self.logger.info("Loading {} pipeline for {}".format(args.task, args.model))
    if args.task == "text_classification":
        self.pipeline = TextClassificationPipeline.from_pretrained(args.model)
    elif args.task == "token_classification":
        raise NotImplementedError
    elif args.task == "question_answering":
        raise NotImplementedError

    self.logger.info("Loading dataset from {}".format(args.train_data))
    self.train_dataset = Processor.create_from_csv(
        args.train_data,
        column_label=args.column_label,
        column_text=args.column_text,
        column_id=args.column_id,
        skip_first_row=args.skip_first_row,
    )
    self.valid_dataset = None
    if args.validation_data:
        self.logger.info("Loading validation dataset from {}".format(args.validation_data))
        self.valid_dataset = Processor.create_from_csv(
            args.validation_data,
            column_label=args.column_label,
            column_text=args.column_text,
            column_id=args.column_id,
            skip_first_row=args.skip_first_row,
        )

    self.validation_split = args.validation_split
    self.train_batch_size = args.train_batch_size
    self.valid_batch_size = args.valid_batch_size
    self.learning_rate = args.learning_rate
    self.adam_epsilon = args.adam_epsilon
def convert_tensorflow(nlp: Pipeline, opset: int, output: Path):
    """
    Export a TensorFlow backed pipeline to ONNX Intermediate Representation (IR)

    Args:
        nlp: The pipeline to be exported
        opset: The actual version of the ONNX operator set to use
        output: Path where the generated ONNX model will be stored

    Notes:
        TensorFlow cannot export models bigger than 2GB due to an internal constraint from TensorFlow
    """
    if not is_tf_available():
        raise Exception("Cannot convert because TF is not installed. Please install tensorflow first.")

    print("/!\\ Please note TensorFlow doesn't support exporting model > 2Gb /!\\")

    try:
        import tensorflow as tf

        from keras2onnx import __version__ as k2ov
        from keras2onnx import convert_keras, save_model

        print(f"Using framework TensorFlow: {tf.version.VERSION}, keras2onnx: {k2ov}")

        # Build
        input_names, output_names, dynamic_axes, tokens = infer_shapes(nlp, "tf")

        # Forward
        nlp.model.predict(tokens.data)
        onnx_model = convert_keras(nlp.model, nlp.model.name, target_opset=opset)
        save_model(onnx_model, output.as_posix())

    except ImportError as e:
        raise Exception(
            f"Cannot import {e.name} required to convert TF model to ONNX. Please install {e.name} first."
        )
def get_all_auto_configured_models():
    """Return the list of all models in at least one auto class."""
    result = set()  # To avoid duplicates we concatenate all model classes in a set.
    if is_torch_available():
        for attr_name in dir(transformers.modeling_auto):
            if attr_name.startswith("MODEL_") and attr_name.endswith("MAPPING"):
                result = result | set(getattr(transformers.modeling_auto, attr_name).values())
    if is_tf_available():
        for attr_name in dir(transformers.modeling_tf_auto):
            if attr_name.startswith("TF_MODEL_") and attr_name.endswith("MAPPING"):
                result = result | set(getattr(transformers.modeling_tf_auto, attr_name).values())
    return [cls.__name__ for cls in result]
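# Quick illustrative call, assuming a transformers version that still exposes the flat
# `transformers.modeling_auto` / `transformers.modeling_tf_auto` modules used above;
# the returned names also depend on which backends are installed.
auto_model_names = get_all_auto_configured_models()
print(len(auto_model_names), sorted(auto_model_names)[:3])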
def set_seed(seed: int):
    """
    Helper function for reproducible behavior to set the seed in ``random``, ``numpy``, ``torch`` and/or ``tf`` (if
    installed).

    Args:
        seed (:obj:`int`): The seed to set.
    """
    random.seed(seed)
    np.random.seed(seed)
    if is_torch_available():
        torch.manual_seed(seed)
        torch.cuda.manual_seed_all(seed)  # safe to call this function even if cuda is not available
    if is_tf_available():
        import tensorflow as tf

        tf.random.set_seed(seed)
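# Minimal reproducibility check for the helper above (only NumPy is exercised here;
# the torch / tf branches run only when those libraries are installed).
set_seed(42)
first_draw = np.random.rand(3)
set_seed(42)
second_draw = np.random.rand(3)
assert np.allclose(first_draw, second_draw)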
def load_graph_from_args(
    pipeline_name: str, framework: str, model: str, tokenizer: Optional[str] = None, **models_kwargs
) -> Pipeline:
    """
    Convert the set of arguments provided through the CLI to an actual pipeline reference (tokenizer + model)

    Args:
        pipeline_name: The kind of pipeline to use (ner, question-answering, etc.)
        framework: The framework to load the model and pipeline with ("pt" or "tf")
        model: The model name which will be loaded by the pipeline
        tokenizer: The tokenizer name which will be loaded by the pipeline, default to the model's value

    Returns: Pipeline object
    """
    # If no tokenizer provided
    if tokenizer is None:
        tokenizer = model

    # Check the wanted framework is available
    if framework == "pt" and not is_torch_available():
        raise Exception("Cannot convert because PyTorch is not installed. Please install torch first.")
    if framework == "tf" and not is_tf_available():
        raise Exception("Cannot convert because TF is not installed. Please install tensorflow first.")

    print(f"Loading pipeline (model: {model}, tokenizer: {tokenizer})")

    # Allocate tokenizer and model
    return pipeline(pipeline_name, model=model, tokenizer=tokenizer, framework=framework, model_kwargs=models_kwargs)
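# Hedged usage sketch: build a PyTorch-backed feature-extraction pipeline from CLI-style arguments;
# the checkpoint name is a placeholder and the chosen backend must be installed.
example_pipeline = load_graph_from_args("feature-extraction", framework="pt", model="distilbert-base-uncased")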
def convert_examples_to_features(examples, tokenizer, max_length=512, label_list=None, pad_on_left=False, pad_token=0, pad_token_segment_id=0, mask_padding_with_zero=True, sample_negatives=False): """ Loads a data file into a list of ``InputFeatures`` Args: examples: List of ``InputExamples`` or ``tf.data.Dataset`` containing the examples. tokenizer: Instance of a tokenizer that will tokenize the examples max_length: Maximum example length task: GLUE task label_list: List of labels. Can be obtained from the processor using the ``processor.get_labels()`` method pad_on_left: If set to ``True``, the examples will be padded on the left rather than on the right (default) pad_token: Padding token pad_token_segment_id: The segment ID for the padding token (It is usually 0, but can vary such as for XLNet where it is 4) mask_padding_with_zero: If set to ``True``, the attention mask will be filled by ``1`` for actual values and by ``0`` for padded values. If set to ``False``, inverts it (``1`` for padded values, ``0`` for actual values) Returns: If the ``examples`` input is a ``tf.data.Dataset``, will return a ``tf.data.Dataset`` containing the task-specific features. If the input is a list of ``InputExamples``, will return a list of task-specific ``InputFeatures`` which can be fed to the model. """ is_tf_dataset = False if is_tf_available() and isinstance(examples, tf.data.Dataset): is_tf_dataset = True label_map = {label: i for i, label in enumerate(label_list)} features = [] if examples[0].text_b is not None: k = len(examples[0].text_b) if sample_negatives: neg_indices = [np.random.choice(len(examples), size=len(examples), replace=False) for i in range(k)] for (ex_index, example) in enumerate(examples): if ex_index % 10000 == 0: logger.info("Writing example %d" % (ex_index)) if is_tf_dataset: example = processor.get_example_from_tensor_dict(example) if type(example.text_a) is list: text_a = example.text_a text_b = [example.text_b]*len(text_a) elif type(example.text_b) is list: text_b = example.text_b if sample_negatives: label_idx = label_map[example.label] text_b_neg = [(examples[neg_indices[i][ex_index]]).text_b[label_idx] for i in range(k)] text_b_neg[label_idx] = text_b[label_idx] text_a = [example.text_a]*len(text_b) else: text_a = [example.text_a] text_b = [example.text_b] if 0: #sample_negatives: print ('Created negative samples') print ('Original example: label:{} text_a: {} text_b1: {}, 2: {}, 3:{}'.format(example.label, text_a[0], text_b[0], text_b[1], text_b[2])) print ('Converted example: text_a: {} text_b1: {}, 2: {}, 3:{}'.format(text_a[0], text_b_neg[0], text_b_neg[1], text_b_neg[2])) def get_indices(t1, t2): out = [] for a,b in zip(t1, t2): inputs = tokenizer.encode_plus( a, b, add_special_tokens=True, max_length=max_length, ) input_ids, token_type_ids = inputs["input_ids"], inputs["token_type_ids"] # The mask has 1 for real tokens and 0 for padding tokens. Only real # tokens are attended to. attention_mask = [1 if mask_padding_with_zero else 0] * len(input_ids) # Zero-pad up to the sequence length. 
padding_length = max_length - len(input_ids) if pad_on_left: input_ids = ([pad_token] * padding_length) + input_ids attention_mask = ([0 if mask_padding_with_zero else 1] * padding_length) + attention_mask token_type_ids = ([pad_token_segment_id] * padding_length) + token_type_ids else: input_ids = input_ids + ([pad_token] * padding_length) attention_mask = attention_mask + ([0 if mask_padding_with_zero else 1] * padding_length) token_type_ids = token_type_ids + ([pad_token_segment_id] * padding_length) assert len(input_ids) == max_length, "Error with input length {} vs {}".format(len(input_ids), max_length) assert len(attention_mask) == max_length, "Error with input length {} vs {}".format(len(attention_mask), max_length) assert len(token_type_ids) == max_length, "Error with input length {} vs {}".format(len(token_type_ids), max_length) out.append((input_ids, attention_mask, token_type_ids)) if len(t1) == 1: input_ids, attention_mask, token_type_ids = out[0] else: input_ids, attention_mask, token_type_ids = zip(*out) return input_ids, attention_mask, token_type_ids input_ids, attention_mask, token_type_ids = get_indices(text_a, text_b) if sample_negatives: input_ids_n, attention_mask_n, token_type_ids_n = get_indices(text_a, text_b_neg) label = label_map[example.label] if ex_index < 5: logger.info("*** Example ***") logger.info("guid: %s" % (example.guid)) logger.info("input_ids: %s" % " ".join([str(x) for x in input_ids])) logger.info("attention_mask: %s" % " ".join([str(x) for x in attention_mask])) logger.info("token_type_ids: %s" % " ".join([str(x) for x in token_type_ids])) logger.info("label: %s (id = %d)" % (example.label, label)) features.append( InputFeatures(input_ids=input_ids, attention_mask=attention_mask, token_type_ids=token_type_ids, label=label)) if sample_negatives: features.append( InputFeatures(input_ids=input_ids_n, attention_mask=attention_mask_n, token_type_ids=token_type_ids_n, label=label)) if is_tf_available() and is_tf_dataset: def gen(): for ex in features: yield ({'input_ids': ex.input_ids, 'attention_mask': ex.attention_mask, 'token_type_ids': ex.token_type_ids}, ex.label) return tf.data.Dataset.from_generator(gen, ({'input_ids': tf.int32, 'attention_mask': tf.int32, 'token_type_ids': tf.int32}, tf.int64), ({'input_ids': tf.TensorShape([None]), 'attention_mask': tf.TensorShape([None]), 'token_type_ids': tf.TensorShape([None])}, tf.TensorShape([]))) return features
class TFViTMAEModelTest(TFModelTesterMixin, unittest.TestCase): """ Here we also overwrite some of the tests of test_modeling_common.py, as ViTMAE does not use input_ids, inputs_embeds, attention_mask and seq_length. """ all_model_classes = (TFViTMAEModel, TFViTMAEForPreTraining) if is_tf_available() else () test_pruning = False test_onnx = False test_resize_embeddings = False test_head_masking = False def setUp(self): self.model_tester = TFViTMAEModelTester(self) self.config_tester = ConfigTester(self, config_class=ViTMAEConfig, has_text_modality=False, hidden_size=37) def test_config(self): self.config_tester.run_common_tests() @unittest.skip(reason="ViTMAE does not use inputs_embeds") def test_inputs_embeds(self): # ViTMAE does not use inputs_embeds pass def test_model_common_attributes(self): config, _ = self.model_tester.prepare_config_and_inputs_for_common() for model_class in self.all_model_classes: model = model_class(config) self.assertIsInstance(model.get_input_embeddings(), (tf.keras.layers.Layer)) x = model.get_output_embeddings() self.assertTrue(x is None or isinstance(x, tf.keras.layers.Layer)) def test_forward_signature(self): config, _ = self.model_tester.prepare_config_and_inputs_for_common() for model_class in self.all_model_classes: model = model_class(config) signature = inspect.signature(model.call) # signature.parameters is an OrderedDict => so arg_names order is deterministic arg_names = [*signature.parameters.keys()] expected_arg_names = ["pixel_values"] self.assertListEqual(arg_names[:1], expected_arg_names) def test_model(self): config_and_inputs = self.model_tester.prepare_config_and_inputs() self.model_tester.create_and_check_model(*config_and_inputs) def test_for_pretraining(self): config_and_inputs = self.model_tester.prepare_config_and_inputs() self.model_tester.create_and_check_for_pretraining(*config_and_inputs) # overwrite from common since TFViTMAEForPretraining has random masking, we need to fix the noise # to generate masks during test def test_keyword_and_dict_args(self): # make the mask reproducible np.random.seed(2) config, inputs_dict = self.model_tester.prepare_config_and_inputs_for_common() num_patches = int((config.image_size // config.patch_size) ** 2) noise = np.random.uniform(size=(self.model_tester.batch_size, num_patches)) for model_class in self.all_model_classes: model = model_class(config) inputs = self._prepare_for_class(inputs_dict, model_class) outputs_dict = model(inputs, noise=noise) inputs_keywords = copy.deepcopy(self._prepare_for_class(inputs_dict, model_class)) outputs_keywords = model(**inputs_keywords, noise=noise) output_dict = outputs_dict[0].numpy() output_keywords = outputs_keywords[0].numpy() self.assertLess(np.sum(np.abs(output_dict - output_keywords)), 1e-6) # overwrite from common since TFViTMAEForPretraining has random masking, we need to fix the noise # to generate masks during test def test_numpy_arrays_inputs(self): # make the mask reproducible np.random.seed(2) config, inputs_dict = self.model_tester.prepare_config_and_inputs_for_common() num_patches = int((config.image_size // config.patch_size) ** 2) noise = np.random.uniform(size=(self.model_tester.batch_size, num_patches)) def prepare_numpy_arrays(inputs_dict): inputs_np_dict = {} for k, v in inputs_dict.items(): if tf.is_tensor(v): inputs_np_dict[k] = v.numpy() else: inputs_np_dict[k] = np.array(k) return inputs_np_dict for model_class in self.all_model_classes: model = model_class(config) inputs = self._prepare_for_class(inputs_dict, model_class) inputs_np = 
prepare_numpy_arrays(inputs) output_for_dict_input = model(inputs_np, noise=noise) output_for_kw_input = model(**inputs_np, noise=noise) self.assert_outputs_same(output_for_dict_input, output_for_kw_input) def test_attention_outputs(self): config, inputs_dict = self.model_tester.prepare_config_and_inputs_for_common() config.return_dict = True # in ViTMAE, the seq_len equals (number of patches + 1) * (1 - mask_ratio), rounded above image_size = to_2tuple(self.model_tester.image_size) patch_size = to_2tuple(self.model_tester.patch_size) num_patches = (image_size[1] // patch_size[1]) * (image_size[0] // patch_size[0]) seq_len = int(math.ceil((1 - config.mask_ratio) * (num_patches + 1))) encoder_seq_length = getattr(self.model_tester, "encoder_seq_length", seq_len) encoder_key_length = getattr(self.model_tester, "key_length", encoder_seq_length) chunk_length = getattr(self.model_tester, "chunk_length", None) if chunk_length is not None and hasattr(self.model_tester, "num_hashes"): encoder_seq_length = encoder_seq_length * self.model_tester.num_hashes for model_class in self.all_model_classes: inputs_dict["output_attentions"] = True inputs_dict["output_hidden_states"] = False config.return_dict = True model = model_class(config) outputs = model(**self._prepare_for_class(inputs_dict, model_class), training=False) attentions = outputs.encoder_attentions if config.is_encoder_decoder else outputs.attentions self.assertEqual(len(attentions), self.model_tester.num_hidden_layers) # check that output_attentions also work using config del inputs_dict["output_attentions"] config.output_attentions = True model = model_class(config) outputs = model(**self._prepare_for_class(inputs_dict, model_class), training=False) attentions = outputs.encoder_attentions if config.is_encoder_decoder else outputs.attentions self.assertEqual(len(attentions), self.model_tester.num_hidden_layers) if chunk_length is not None: self.assertListEqual( list(attentions[0].shape[-4:]), [self.model_tester.num_attention_heads, encoder_seq_length, chunk_length, encoder_key_length], ) else: self.assertListEqual( list(attentions[0].shape[-3:]), [self.model_tester.num_attention_heads, encoder_seq_length, encoder_key_length], ) out_len = len(outputs) # Check attention is always last and order is fine inputs_dict["output_attentions"] = True inputs_dict["output_hidden_states"] = True model = model_class(config) outputs = model(**self._prepare_for_class(inputs_dict, model_class), training=False) if hasattr(self.model_tester, "num_hidden_states_types"): added_hidden_states = self.model_tester.num_hidden_states_types elif self.is_encoder_decoder: added_hidden_states = 2 else: added_hidden_states = 1 self.assertEqual(out_len + added_hidden_states, len(outputs)) self_attentions = outputs.encoder_attentions if config.is_encoder_decoder else outputs.attentions self.assertEqual(len(self_attentions), self.model_tester.num_hidden_layers) if chunk_length is not None: self.assertListEqual( list(self_attentions[0].shape[-4:]), [self.model_tester.num_attention_heads, encoder_seq_length, chunk_length, encoder_key_length], ) else: self.assertListEqual( list(self_attentions[0].shape[-3:]), [self.model_tester.num_attention_heads, encoder_seq_length, encoder_key_length], ) def test_hidden_states_output(self): def check_hidden_states_output(inputs_dict, config, model_class): model = model_class(config) outputs = model(**self._prepare_for_class(inputs_dict, model_class)) hidden_states = outputs.encoder_hidden_states if config.is_encoder_decoder else 
outputs.hidden_states expected_num_layers = getattr( self.model_tester, "expected_num_hidden_layers", self.model_tester.num_hidden_layers + 1 ) self.assertEqual(len(hidden_states), expected_num_layers) # ViTMAE has a different seq_length image_size = to_2tuple(self.model_tester.image_size) patch_size = to_2tuple(self.model_tester.patch_size) num_patches = (image_size[1] // patch_size[1]) * (image_size[0] // patch_size[0]) seq_length = int(math.ceil((1 - config.mask_ratio) * (num_patches + 1))) self.assertListEqual( list(hidden_states[0].shape[-2:]), [seq_length, self.model_tester.hidden_size], ) config, inputs_dict = self.model_tester.prepare_config_and_inputs_for_common() for model_class in self.all_model_classes: inputs_dict["output_hidden_states"] = True check_hidden_states_output(inputs_dict, config, model_class) # check that output_hidden_states also work using config del inputs_dict["output_hidden_states"] config.output_hidden_states = True check_hidden_states_output(inputs_dict, config, model_class) # overwrite from common since TFViTMAEForPretraining has random masking, we need to fix the noise # to generate masks during test def check_pt_tf_models(self, tf_model, pt_model, tf_inputs_dict): # make masks reproducible np.random.seed(2) num_patches = int((tf_model.config.image_size // tf_model.config.patch_size) ** 2) noise = np.random.uniform(size=(self.model_tester.batch_size, num_patches)) tf_noise = tf.constant(noise) # Add `noise` argument. # PT inputs will be prepared in `super().check_pt_tf_models()` with this added `noise` argument tf_inputs_dict["noise"] = tf_noise super().check_pt_tf_models(tf_model, pt_model, tf_inputs_dict) # overwrite from common since TFViTMAEForPretraining outputs loss along with # logits and mask indices. loss and mask indicies are not suitable for integration # with other keras modules. def test_compile_tf_model(self): config, inputs_dict = self.model_tester.prepare_config_and_inputs_for_common() optimizer = tf.keras.optimizers.Adam(learning_rate=3e-5, epsilon=1e-08, clipnorm=1.0) loss = tf.keras.losses.SparseCategoricalCrossentropy(from_logits=True) metric = tf.keras.metrics.SparseCategoricalAccuracy("accuracy") for model_class in self.all_model_classes: # `pixel_values` implies that the input is an image inputs = tf.keras.Input( batch_shape=( 3, self.model_tester.num_channels, self.model_tester.image_size, self.model_tester.image_size, ), name="pixel_values", dtype="float32", ) # Prepare our model model = model_class(config) model(self._prepare_for_class(inputs_dict, model_class)) # Model must be called before saving. # Let's load it from the disk to be sure we can use pretrained weights with tempfile.TemporaryDirectory() as tmpdirname: model.save_pretrained(tmpdirname, saved_model=False) model = model_class.from_pretrained(tmpdirname) outputs_dict = model(inputs) hidden_states = outputs_dict[0] # `TFViTMAEForPreTraining` outputs are not recommended to be used for # downstream application. This is just to check if the outputs of # `TFViTMAEForPreTraining` can be integrated with other keras modules. 
if model_class.__name__ == "TFViTMAEForPreTraining": hidden_states = outputs_dict["logits"] # Add a dense layer on top to test integration with other keras modules outputs = tf.keras.layers.Dense(2, activation="softmax", name="outputs")(hidden_states) # Compile extended model extended_model = tf.keras.Model(inputs=[inputs], outputs=[outputs]) extended_model.compile(optimizer=optimizer, loss=loss, metrics=[metric]) # overwrite from common since TFViTMAEForPretraining has random masking, we need to fix the noise # to generate masks during test def test_keras_save_load(self): # make mask reproducible np.random.seed(2) config, inputs_dict = self.model_tester.prepare_config_and_inputs_for_common() tf_main_layer_classes = set( module_member for model_class in self.all_model_classes for module in (import_module(model_class.__module__),) for module_member_name in dir(module) if module_member_name.endswith("MainLayer") # This condition is required, since `modeling_tf_clip.py` has 3 classes whose names end with `MainLayer`. and module_member_name[: -len("MainLayer")] == model_class.__name__[: -len("Model")] for module_member in (getattr(module, module_member_name),) if isinstance(module_member, type) and tf.keras.layers.Layer in module_member.__bases__ and getattr(module_member, "_keras_serializable", False) ) num_patches = int((config.image_size // config.patch_size) ** 2) noise = np.random.uniform(size=(self.model_tester.batch_size, num_patches)) noise = tf.convert_to_tensor(noise) inputs_dict.update({"noise": noise}) for main_layer_class in tf_main_layer_classes: main_layer = main_layer_class(config) symbolic_inputs = { name: tf.keras.Input(tensor.shape[1:], dtype=tensor.dtype) for name, tensor in inputs_dict.items() } model = tf.keras.Model(symbolic_inputs, outputs=main_layer(symbolic_inputs)) outputs = model(inputs_dict) with tempfile.TemporaryDirectory() as tmpdirname: filepath = os.path.join(tmpdirname, "keras_model.h5") model.save(filepath) model = tf.keras.models.load_model( filepath, custom_objects={main_layer_class.__name__: main_layer_class} ) assert isinstance(model, tf.keras.Model) after_outputs = model(inputs_dict) self.assert_outputs_same(after_outputs, outputs) # overwrite from common since TFViTMAEForPretraining has random masking, we need to fix the noise # to generate masks during test def test_save_load(self): # make mask reproducible np.random.seed(2) config, inputs_dict = self.model_tester.prepare_config_and_inputs_for_common() num_patches = int((config.image_size // config.patch_size) ** 2) noise = np.random.uniform(size=(self.model_tester.batch_size, num_patches)) for model_class in self.all_model_classes: model = model_class(config) model_input = self._prepare_for_class(inputs_dict, model_class) outputs = model(model_input, noise=noise) if model_class.__name__ == "TFViTMAEModel": out_2 = outputs.last_hidden_state.numpy() out_2[np.isnan(out_2)] = 0 else: out_2 = outputs.logits.numpy() out_2[np.isnan(out_2)] = 0 with tempfile.TemporaryDirectory() as tmpdirname: model.save_pretrained(tmpdirname, saved_model=True) saved_model_dir = os.path.join(tmpdirname, "saved_model", "1") model = tf.keras.models.load_model(saved_model_dir) after_outputs = model(model_input, noise=noise) if model_class.__name__ == "TFViTMAEModel": out_1 = after_outputs["last_hidden_state"].numpy() out_1[np.isnan(out_1)] = 0 else: out_1 = after_outputs["logits"].numpy() out_1[np.isnan(out_1)] = 0 max_diff = np.amax(np.abs(out_1 - out_2)) self.assertLessEqual(max_diff, 1e-5) # overwrite from common since 
TFViTMAEForPretraining has random masking, we need to fix the noise # to generate masks during test def test_save_load_config(self): # make mask reproducible np.random.seed(2) config, inputs_dict = self.model_tester.prepare_config_and_inputs_for_common() num_patches = int((config.image_size // config.patch_size) ** 2) noise = np.random.uniform(size=(self.model_tester.batch_size, num_patches)) for model_class in self.all_model_classes: model = model_class(config) model_inputs = self._prepare_for_class(inputs_dict, model_class) outputs = model(model_inputs, noise=noise) model_config = model.get_config() # make sure that returned config is jsonifiable, which is required by keras json.dumps(model_config) new_model = model_class.from_config(model.get_config()) # make sure it also accepts a normal config _ = model_class.from_config(model.config) _ = new_model(model_inputs) # Build model new_model.set_weights(model.get_weights()) after_outputs = new_model(model_inputs, noise=noise) self.assert_outputs_same(after_outputs, outputs) @unittest.skip( reason="""ViTMAE returns a random mask + ids_restore in each forward pass. See test_save_load to get deterministic results.""" ) def test_determinism(self): pass @unittest.skip(reason="""ViTMAE returns a random mask + ids_restore in each forward pass. See test_save_load""") def test_model_outputs_equivalence(self): pass @slow def test_model_from_pretrained(self): model = TFViTMAEModel.from_pretrained("google/vit-base-patch16-224") self.assertIsNotNone(model)
def glue_convert_examples_to_features( examples, tokenizer, max_length=512, task=None, label_list=None, output_mode=None, pad_on_left=False, pad_token=0, pad_token_segment_id=0, mask_padding_with_zero=True, load_id=False, load_element_id=False, ): """ Loads a data_utils file into a list of ``InputFeatures`` Args: examples: List of ``InputExamples`` or ``tf.data_utils.Dataset`` containing the examples. tokenizer: Instance of a tokenizer that will tokenize the examples max_length: Maximum example length task: GLUE task label_list: List of labels. Can be obtained from the processor using the ``processor.get_labels()`` method output_mode: String indicating the output mode. Either ``regression`` or ``classification`` pad_on_left: If set to ``True``, the examples will be padded on the left rather than on the right (default) pad_token: Padding token pad_token_segment_id: The segment ID for the padding token (It is usually 0, but can vary such as for XLNet where it is 4) mask_padding_with_zero: If set to ``True``, the attention mask will be filled by ``1`` for actual values and by ``0`` for padded values. If set to ``False``, inverts it (``1`` for padded values, ``0`` for actual values) load_id: add id for each example Returns: If the ``examples`` input is a ``tf.data_utils.Dataset``, will return a ``tf.data_utils.Dataset`` containing the task-specific features. If the input is a list of ``InputExamples``, will return a list of task-specific ``InputFeatures`` which can be fed to the model. """ is_tf_dataset = False if is_tf_available() and isinstance(examples, tf.data.Dataset): is_tf_dataset = True # logger.info('load_id' + load_id) # logger.info('is_tf_available()' + is_tf_available()) # logger.info('is_tf_dataset' + is_tf_dataset) if task is not None: processor = glue_processors[task]() if label_list is None: label_list = processor.get_labels() logger.info("Using label list %s for task %s" % (label_list, task)) if output_mode is None: output_mode = glue_output_modes[task] logger.info("Using output mode %s for task %s" % (output_mode, task)) label_map = {label: i for i, label in enumerate(label_list)} features = [] for (ex_index, example) in enumerate(examples): if ex_index % 10000 == 0: logger.info("Writing example %d" % (ex_index)) if is_tf_dataset: example = processor.get_example_from_tensor_dict(example) example = processor.tfds_map(example) inputs = tokenizer.encode_plus(example.text_a, example.text_b, add_special_tokens=True, max_length=max_length, return_element_type_ids=load_element_id) input_ids, token_type_ids = inputs["input_ids"], inputs["token_type_ids"] if load_element_id: element_type_ids = inputs["element_type_ids"] # The mask has 1 for real tokens and 0 for padding tokens. Only real # tokens are attended to. attention_mask = [1 if mask_padding_with_zero else 0] * len(input_ids) # Zero-pad up to the sequence length. 
padding_length = max_length - len(input_ids) if pad_on_left: input_ids = ([pad_token] * padding_length) + input_ids attention_mask = ([0 if mask_padding_with_zero else 1] * padding_length) + attention_mask token_type_ids = ([pad_token_segment_id] * padding_length) + token_type_ids if load_element_id: element_type_ids = ([pad_token_segment_id] * padding_length) + element_type_ids else: input_ids = input_ids + ([pad_token] * padding_length) attention_mask = attention_mask + ([0 if mask_padding_with_zero else 1] * padding_length) token_type_ids = token_type_ids + ([pad_token_segment_id] * padding_length) if load_element_id: element_type_ids = element_type_ids + ([pad_token_segment_id] * padding_length) assert len(input_ids) == max_length, "Error with input length {} vs {}".format(len(input_ids), max_length) assert len(attention_mask) == max_length, "Error with input length {} vs {}".format( len(attention_mask), max_length ) assert len(token_type_ids) == max_length, "Error with input length {} vs {}".format( len(token_type_ids), max_length ) if load_element_id: assert len(element_type_ids) == max_length, "Error with input length {} vs {}".format( len(element_type_ids), max_length ) if output_mode == "classification": label = label_map[example.label] elif output_mode == "regression": label = float(example.label) else: raise KeyError(output_mode) if ex_index < 6: logger.info("*** Example ***") logger.info("guid: %s" % (example.guid)) logger.info("input_ids: %s" % " ".join([str(x) for x in input_ids])) logger.info("attention_mask: %s" % " ".join([str(x) for x in attention_mask])) logger.info("token_type_ids: %s" % " ".join([str(x) for x in token_type_ids])) if load_element_id: logger.info("element_type_ids: %s" % " ".join([str(x) for x in element_type_ids])) logger.info("label: %s (id = %d)" % (example.label, label)) if load_id: if load_element_id: features.append( InputFeatures( input_ids=input_ids, attention_mask=attention_mask, token_type_ids=token_type_ids, label=label, example_id=example.guid, element_type_ids=element_type_ids ) ) else: features.append( InputFeatures( input_ids=input_ids, attention_mask=attention_mask, token_type_ids=token_type_ids, label=label, example_id=example.guid ) ) else: if load_element_id: features.append( InputFeatures( input_ids=input_ids, attention_mask=attention_mask, token_type_ids=token_type_ids, label=label, element_type_ids=element_type_ids ) ) else: features.append( InputFeatures( input_ids=input_ids, attention_mask=attention_mask, token_type_ids=token_type_ids, label=label ) ) if is_tf_available() and is_tf_dataset: def gen(): for ex in features: yield ( { "input_ids": ex.input_ids, "attention_mask": ex.attention_mask, "token_type_ids": ex.token_type_ids, }, ex.label, ) return tf.data.Dataset.from_generator( gen, ({"input_ids": tf.int32, "attention_mask": tf.int32, "token_type_ids": tf.int32}, tf.int64), ( { "input_ids": tf.TensorShape([None]), "attention_mask": tf.TensorShape([None]), "token_type_ids": tf.TensorShape([None]), }, tf.TensorShape([]), ), ) return features
class TFCLIPVisionModelTest(TFModelTesterMixin, unittest.TestCase): """ Here we also overwrite some of the tests of test_modeling_common.py, as CLIP does not use input_ids, inputs_embeds, attention_mask and seq_length. """ all_model_classes = (TFCLIPVisionModel,) if is_tf_available() else () test_pruning = False test_resize_embeddings = False test_head_masking = False test_onnx = False def setUp(self): self.model_tester = TFCLIPVisionModelTester(self) self.config_tester = ConfigTester(self, config_class=CLIPVisionConfig, has_text_modality=False, hidden_size=37) def test_config(self): self.config_tester.run_common_tests() def test_inputs_embeds(self): # CLIP does not use inputs_embeds pass def test_graph_mode_with_inputs_embeds(self): # CLIP does not use inputs_embeds pass def test_model_common_attributes(self): config, _ = self.model_tester.prepare_config_and_inputs_for_common() for model_class in self.all_model_classes: model = model_class(config) self.assertIsInstance(model.get_input_embeddings(), (tf.keras.layers.Layer)) x = model.get_output_embeddings() self.assertTrue(x is None or isinstance(x, tf.keras.layers.Layer)) def test_forward_signature(self): config, _ = self.model_tester.prepare_config_and_inputs_for_common() for model_class in self.all_model_classes: model = model_class(config) signature = inspect.signature(model.call) # signature.parameters is an OrderedDict => so arg_names order is deterministic arg_names = [*signature.parameters.keys()] expected_arg_names = ["pixel_values"] self.assertListEqual(arg_names[:1], expected_arg_names) def test_model(self): config_and_inputs = self.model_tester.prepare_config_and_inputs() self.model_tester.create_and_check_model(*config_and_inputs) def test_attention_outputs(self): config, inputs_dict = self.model_tester.prepare_config_and_inputs_for_common() config.return_dict = True # in CLIP, the seq_len equals the number of patches + 1 (we add 1 for the [CLS] token) image_size = (self.model_tester.image_size, self.model_tester.image_size) patch_size = (self.model_tester.patch_size, self.model_tester.patch_size) num_patches = (image_size[1] // patch_size[1]) * (image_size[0] // patch_size[0]) seq_len = num_patches + 1 for model_class in self.all_model_classes: inputs_dict["output_attentions"] = True inputs_dict["output_hidden_states"] = False config.return_dict = True model = model_class(config) outputs = model(**self._prepare_for_class(inputs_dict, model_class), training=False) attentions = outputs.attentions self.assertEqual(len(attentions), self.model_tester.num_hidden_layers) # check that output_attentions also work using config del inputs_dict["output_attentions"] config.output_attentions = True model = model_class(config) outputs = model(**self._prepare_for_class(inputs_dict, model_class), training=False) attentions = outputs.attentions self.assertEqual(len(attentions), self.model_tester.num_hidden_layers) out_len = len(outputs) # Check attention is always last and order is fine inputs_dict["output_attentions"] = True inputs_dict["output_hidden_states"] = True model = model_class(config) outputs = model(**self._prepare_for_class(inputs_dict, model_class), training=False) added_hidden_states = 1 self.assertEqual(out_len + added_hidden_states, len(outputs)) self_attentions = outputs.attentions self.assertEqual(len(self_attentions), self.model_tester.num_hidden_layers) self.assertListEqual( list(self_attentions[0].shape[-3:]), [self.model_tester.num_attention_heads, seq_len, seq_len], ) def test_hidden_states_output(self): def 
check_hidden_states_output(inputs_dict, config, model_class): model = model_class(config) outputs = model(**self._prepare_for_class(inputs_dict, model_class), training=False) hidden_states = outputs.encoder_hidden_states if config.is_encoder_decoder else outputs.hidden_states expected_num_layers = getattr( self.model_tester, "expected_num_hidden_layers", self.model_tester.num_hidden_layers + 1 ) self.assertEqual(len(hidden_states), expected_num_layers) # CLIP has a different seq_length image_size = (self.model_tester.image_size, self.model_tester.image_size) patch_size = (self.model_tester.patch_size, self.model_tester.patch_size) num_patches = (image_size[1] // patch_size[1]) * (image_size[0] // patch_size[0]) seq_length = num_patches + 1 self.assertListEqual( list(hidden_states[0].shape[-2:]), [seq_length, self.model_tester.hidden_size], ) config, inputs_dict = self.model_tester.prepare_config_and_inputs_for_common() for model_class in self.all_model_classes: inputs_dict["output_hidden_states"] = True check_hidden_states_output(inputs_dict, config, model_class) # check that output_hidden_states also work using config del inputs_dict["output_hidden_states"] config.output_hidden_states = True check_hidden_states_output(inputs_dict, config, model_class) @slow def test_model_from_pretrained(self): for model_name in TF_CLIP_PRETRAINED_MODEL_ARCHIVE_LIST[:1]: model = TFCLIPVisionModel.from_pretrained(model_name) self.assertIsNotNone(model)
get_default_model, infer_framework_from_model, ) from .conversational import Conversation, ConversationalPipeline from .feature_extraction import FeatureExtractionPipeline from .fill_mask import FillMaskPipeline from .question_answering import QuestionAnsweringArgumentHandler, QuestionAnsweringPipeline from .table_question_answering import TableQuestionAnsweringArgumentHandler, TableQuestionAnsweringPipeline from .text2text_generation import SummarizationPipeline, Text2TextGenerationPipeline, TranslationPipeline from .text_classification import TextClassificationPipeline from .text_generation import TextGenerationPipeline from .token_classification import NerPipeline, TokenClassificationArgumentHandler, TokenClassificationPipeline from .zero_shot_classification import ZeroShotClassificationArgumentHandler, ZeroShotClassificationPipeline if is_tf_available(): import tensorflow as tf from transformers.models.auto.modeling_tf_auto import ( TF_MODEL_FOR_QUESTION_ANSWERING_MAPPING, TF_MODEL_FOR_SEQ_TO_SEQ_CAUSAL_LM_MAPPING, TF_MODEL_FOR_SEQUENCE_CLASSIFICATION_MAPPING, TF_MODEL_FOR_TOKEN_CLASSIFICATION_MAPPING, TF_MODEL_WITH_LM_HEAD_MAPPING, TFAutoModel, TFAutoModelForCausalLM, TFAutoModelForMaskedLM, TFAutoModelForQuestionAnswering, TFAutoModelForSeq2SeqLM, TFAutoModelForSequenceClassification, TFAutoModelForTokenClassification,
class TFCLIPModelTest(TFModelTesterMixin, unittest.TestCase): all_model_classes = (TFCLIPModel,) if is_tf_available() else () test_head_masking = False test_pruning = False test_resize_embeddings = False test_attention_outputs = False test_onnx = False def setUp(self): self.model_tester = TFCLIPModelTester(self) def test_model(self): config_and_inputs = self.model_tester.prepare_config_and_inputs() self.model_tester.create_and_check_model(*config_and_inputs) # hidden_states are tested in individual model tests def test_hidden_states_output(self): pass # input_embeds are tested in individual model tests def test_inputs_embeds(self): pass # CLIPModel does not have input/output embeddings def test_model_common_attributes(self): pass # overwrite from common since `TFCLIPModelTester` set `return_loss` to `True` and causes the preparation of # `symbolic_inputs` failed. def test_keras_save_load(self): config, inputs_dict = self.model_tester.prepare_config_and_inputs_for_common() # remove `return_loss` to make code work if self.__class__.__name__ == "TFCLIPModelTest": inputs_dict.pop("return_loss", None) tf_main_layer_classes = set( module_member for model_class in self.all_model_classes for module in (import_module(model_class.__module__),) for module_member_name in dir(module) if module_member_name.endswith("MainLayer") # This condition is required, since `modeling_tf_clip.py` has 3 classes whose names end with `MainLayer`. and module_member_name[: -len("MainLayer")] == model_class.__name__[: -len("Model")] for module_member in (getattr(module, module_member_name),) if isinstance(module_member, type) and tf.keras.layers.Layer in module_member.__bases__ and getattr(module_member, "_keras_serializable", False) ) for main_layer_class in tf_main_layer_classes: # T5MainLayer needs an embed_tokens parameter when called without the inputs_embeds parameter if "T5" in main_layer_class.__name__: # Take the same values than in TFT5ModelTester for this shared layer shared = TFSharedEmbeddings(99, 32, name="shared") config.use_cache = inputs_dict.pop("use_cache", None) main_layer = main_layer_class(config, embed_tokens=shared) else: main_layer = main_layer_class(config) symbolic_inputs = { name: tf.keras.Input(tensor.shape[1:], dtype=tensor.dtype) for name, tensor in inputs_dict.items() } model = tf.keras.Model(symbolic_inputs, outputs=main_layer(symbolic_inputs)) outputs = model(inputs_dict) with tempfile.TemporaryDirectory() as tmpdirname: filepath = os.path.join(tmpdirname, "keras_model.h5") model.save(filepath) if "T5" in main_layer_class.__name__: model = tf.keras.models.load_model( filepath, custom_objects={ main_layer_class.__name__: main_layer_class, "TFSharedEmbeddings": TFSharedEmbeddings, }, ) else: model = tf.keras.models.load_model( filepath, custom_objects={main_layer_class.__name__: main_layer_class} ) assert isinstance(model, tf.keras.Model) after_outputs = model(inputs_dict) self.assert_outputs_same(after_outputs, outputs) # overwrite from common since CLIPModel/TFCLIPModel return CLIPOutput/TFCLIPOutput @is_pt_tf_cross_test def test_pt_tf_model_equivalence(self): import torch import transformers config, inputs_dict = self.model_tester.prepare_config_and_inputs_for_common() for model_class in self.all_model_classes: pt_model_class_name = model_class.__name__[2:] # Skip the "TF" at the beginning pt_model_class = getattr(transformers, pt_model_class_name) config.output_hidden_states = True tf_model = model_class(config) pt_model = pt_model_class(config) # Check we can load pt model in tf and 
vice-versa with model => model functions tf_model = transformers.load_pytorch_model_in_tf2_model( tf_model, pt_model, tf_inputs=self._prepare_for_class(inputs_dict, model_class) ) pt_model = transformers.load_tf2_model_in_pytorch_model(pt_model, tf_model) # Check predictions on first output (logits/hidden-states) are close enought given low-level computational differences pt_model.eval() pt_inputs_dict = {} for name, key in self._prepare_for_class(inputs_dict, model_class).items(): if type(key) == bool: pt_inputs_dict[name] = key elif name == "input_values": pt_inputs_dict[name] = torch.from_numpy(key.numpy()).to(torch.float32) elif name == "pixel_values": pt_inputs_dict[name] = torch.from_numpy(key.numpy()).to(torch.float32) else: pt_inputs_dict[name] = torch.from_numpy(key.numpy()).to(torch.long) # need to rename encoder-decoder "inputs" for PyTorch if "inputs" in pt_inputs_dict and self.is_encoder_decoder: pt_inputs_dict["input_ids"] = pt_inputs_dict.pop("inputs") with torch.no_grad(): pto = pt_model(**pt_inputs_dict) tfo = tf_model(self._prepare_for_class(inputs_dict, model_class), training=False) self.assertEqual(len(tfo), len(pto), "Output lengths differ between TF and PyTorch") for tf_output, pt_output in zip(tfo.to_tuple(), pto.to_tuple()): if not (isinstance(tf_output, tf.Tensor) and isinstance(pt_output, torch.Tensor)): continue tf_out = tf_output.numpy() pt_out = pt_output.numpy() self.assertEqual(tf_out.shape, pt_out.shape, "Output component shapes differ between TF and PyTorch") if len(tf_out.shape) > 0: tf_nans = np.copy(np.isnan(tf_out)) pt_nans = np.copy(np.isnan(pt_out)) pt_out[tf_nans] = 0 tf_out[tf_nans] = 0 pt_out[pt_nans] = 0 tf_out[pt_nans] = 0 max_diff = np.amax(np.abs(tf_out - pt_out)) self.assertLessEqual(max_diff, 4e-2) # Check we can load pt model in tf and vice-versa with checkpoint => model functions with tempfile.TemporaryDirectory() as tmpdirname: pt_checkpoint_path = os.path.join(tmpdirname, "pt_model.bin") torch.save(pt_model.state_dict(), pt_checkpoint_path) tf_model = transformers.load_pytorch_checkpoint_in_tf2_model(tf_model, pt_checkpoint_path) tf_checkpoint_path = os.path.join(tmpdirname, "tf_model.h5") tf_model.save_weights(tf_checkpoint_path) pt_model = transformers.load_tf2_checkpoint_in_pytorch_model(pt_model, tf_checkpoint_path) # Check predictions on first output (logits/hidden-states) are close enought given low-level computational differences pt_model.eval() pt_inputs_dict = {} for name, key in self._prepare_for_class(inputs_dict, model_class).items(): if type(key) == bool: key = np.array(key, dtype=bool) pt_inputs_dict[name] = torch.from_numpy(key).to(torch.long) elif name == "input_values": pt_inputs_dict[name] = torch.from_numpy(key.numpy()).to(torch.float32) elif name == "pixel_values": pt_inputs_dict[name] = torch.from_numpy(key.numpy()).to(torch.float32) else: pt_inputs_dict[name] = torch.from_numpy(key.numpy()).to(torch.long) # need to rename encoder-decoder "inputs" for PyTorch if "inputs" in pt_inputs_dict and self.is_encoder_decoder: pt_inputs_dict["input_ids"] = pt_inputs_dict.pop("inputs") with torch.no_grad(): pto = pt_model(**pt_inputs_dict) tfo = tf_model(self._prepare_for_class(inputs_dict, model_class)) self.assertEqual(len(tfo), len(pto), "Output lengths differ between TF and PyTorch") for tf_output, pt_output in zip(tfo.to_tuple(), pto.to_tuple()): if not (isinstance(tf_output, tf.Tensor) and isinstance(pt_output, torch.Tensor)): continue tf_out = tf_output.numpy() pt_out = pt_output.numpy() self.assertEqual(tf_out.shape, 
pt_out.shape, "Output component shapes differ between TF and PyTorch") if len(tf_out.shape) > 0: tf_nans = np.copy(np.isnan(tf_out)) pt_nans = np.copy(np.isnan(pt_out)) pt_out[tf_nans] = 0 tf_out[tf_nans] = 0 pt_out[pt_nans] = 0 tf_out[pt_nans] = 0 max_diff = np.amax(np.abs(tf_out - pt_out)) self.assertLessEqual(max_diff, 4e-2) @slow def test_model_from_pretrained(self): for model_name in TF_CLIP_PRETRAINED_MODEL_ARCHIVE_LIST[:1]: model = TFCLIPModel.from_pretrained(model_name) self.assertIsNotNone(model)
def squad_convert_examples_to_features( examples, tokenizer, max_seq_length, doc_stride, max_query_length, is_training, return_dataset=False, threads=1 ): """ Converts a list of examples into a list of features that can be directly given as input to a model. It is model-dependent and takes advantage of many of the tokenizer's features to create the model's inputs. Args: examples: list of :class:`~transformers.data.processors.squad.SquadExample` tokenizer: an instance of a child of :class:`~transformers.PreTrainedTokenizer` max_seq_length: The maximum sequence length of the inputs. doc_stride: The stride used when the context is too large and is split across several features. max_query_length: The maximum length of the query. is_training: whether to create features for model evaluation or model training. return_dataset: Default False. Either 'pt' or 'tf'. if 'pt': returns a torch.data.TensorDataset, if 'tf': returns a tf.data.Dataset threads: number of processes used to convert the examples Returns: list of :class:`~transformers.data.processors.squad.SquadFeatures` Example:: processor = SquadV2Processor() examples = processor.get_dev_examples(data_dir) features = squad_convert_examples_to_features( examples=examples, tokenizer=tokenizer, max_seq_length=args.max_seq_length, doc_stride=args.doc_stride, max_query_length=args.max_query_length, is_training=not evaluate, ) """ # Defining helper methods features = [] threads = min(threads, cpu_count()) if threads == 1: print("squad_convert_examples_to_features") features = [] for eg in tqdm(examples, total=len(examples), desc="convert squad examples to features"): feat = squad_convert_example_to_features_sp( eg, max_seq_length=max_seq_length, doc_stride=doc_stride, max_query_length=max_query_length, is_training=is_training, tokenizer_for_convert=tokenizer) features.append(feat) else: print("squad_convert_examples_to_features w/ {} threads".format(threads)) with Pool(threads, initializer=squad_convert_example_to_features_init, initargs=(tokenizer,)) as p: annotate_ = partial( squad_convert_example_to_features, max_seq_length=max_seq_length, doc_stride=doc_stride, max_query_length=max_query_length, is_training=is_training, ) features = list( tqdm( p.imap(annotate_, examples, chunksize=32), total=len(examples), desc="convert squad examples to features", ) ) new_features = [] unique_id = 1000000000 example_index = 0 for example_features in tqdm(features, total=len(features), desc="add example index and unique id"): if not example_features: continue for example_feature in example_features: example_feature.example_index = example_index example_feature.unique_id = unique_id new_features.append(example_feature) unique_id += 1 example_index += 1 features = new_features del new_features if return_dataset == "pt": if not is_torch_available(): raise RuntimeError("PyTorch must be installed to return a PyTorch dataset.") # Convert to Tensors and build dataset all_input_ids = torch.tensor([f.input_ids for f in features], dtype=torch.long) all_attention_masks = torch.tensor([f.attention_mask for f in features], dtype=torch.long) all_token_type_ids = torch.tensor([f.token_type_ids for f in features], dtype=torch.long) all_cls_index = torch.tensor([f.cls_index for f in features], dtype=torch.long) all_p_mask = torch.tensor([f.p_mask for f in features], dtype=torch.float) if not is_training: all_example_index = torch.arange(all_input_ids.size(0), dtype=torch.long) dataset = TensorDataset( all_input_ids, all_attention_masks, all_token_type_ids, all_example_index, all_cls_index,
all_p_mask ) else: all_start_positions = torch.tensor([f.start_position for f in features], dtype=torch.long) all_end_positions = torch.tensor([f.end_position for f in features], dtype=torch.long) dataset = TensorDataset( all_input_ids, all_attention_masks, all_token_type_ids, all_start_positions, all_end_positions, all_cls_index, all_p_mask, ) return features, dataset elif return_dataset == "tf": if not is_tf_available(): raise RuntimeError("TensorFlow must be installed to return a TensorFlow dataset.") def gen(): for ex in features: yield ( { "input_ids": ex.input_ids, "attention_mask": ex.attention_mask, "token_type_ids": ex.token_type_ids, }, { "start_position": ex.start_position, "end_position": ex.end_position, "cls_index": ex.cls_index, "p_mask": ex.p_mask, }, ) return tf.data.Dataset.from_generator( gen, ( {"input_ids": tf.int32, "attention_mask": tf.int32, "token_type_ids": tf.int32}, {"start_position": tf.int64, "end_position": tf.int64, "cls_index": tf.int64, "p_mask": tf.int32}, ), ( { "input_ids": tf.TensorShape([None]), "attention_mask": tf.TensorShape([None]), "token_type_ids": tf.TensorShape([None]), }, { "start_position": tf.TensorShape([]), "end_position": tf.TensorShape([]), "cls_index": tf.TensorShape([]), "p_mask": tf.TensorShape([None]), }, ), ) return features
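A hedged usage sketch of the function above, following the pattern in its docstring. The checkpoint name, data directory, and hyper-parameters are placeholders, not values taken from the original code.

from transformers import AutoTokenizer
from transformers.data.processors.squad import SquadV2Processor

tokenizer = AutoTokenizer.from_pretrained("bert-base-uncased")   # placeholder checkpoint
processor = SquadV2Processor()
examples = processor.get_dev_examples("path/to/squad")           # placeholder data directory

features, dataset = squad_convert_examples_to_features(
    examples=examples,
    tokenizer=tokenizer,
    max_seq_length=384,
    doc_stride=128,
    max_query_length=64,
    is_training=False,
    return_dataset="pt",  # exercises the TensorDataset branch shown above
)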
class TFSpeech2TextModelTest(TFModelTesterMixin, unittest.TestCase): all_model_classes = ( TFSpeech2TextModel, TFSpeech2TextForConditionalGeneration) if is_tf_available() else () all_generative_model_classes = ( TFSpeech2TextForConditionalGeneration, ) if is_tf_available() else () is_encoder_decoder = True test_pruning = False test_missing_keys = False test_onnx = False input_name = "input_ids" def setUp(self): self.model_tester = TFSpeech2TextModelTester(self) self.config_tester = ConfigTester(self, config_class=Speech2TextConfig) self.maxDiff = 3000 def test_config(self): self.config_tester.run_common_tests() def test_decoder_model_past_with_large_inputs(self): config_and_inputs = self.model_tester.prepare_config_and_inputs() self.model_tester.create_and_check_decoder_model_past_large_inputs( *config_and_inputs) # not implemented currently def test_inputs_embeds(self): pass # training is not supported yet def test_training(self): pass def test_training_gradient_checkpointing(self): pass def test_generate_fp16(self): pass def test_hidden_states_output(self): def check_hidden_states_output(inputs_dict, config, model_class): model = model_class(config) outputs = model( **self._prepare_for_class(inputs_dict, model_class)) hidden_states = outputs.encoder_hidden_states if config.is_encoder_decoder else outputs.hidden_states expected_num_layers = getattr( self.model_tester, "expected_num_hidden_layers", self.model_tester.num_hidden_layers + 1) self.assertEqual(len(hidden_states), expected_num_layers) if hasattr(self.model_tester, "encoder_seq_length"): seq_length = self.model_tester.encoder_seq_length else: seq_length = self.model_tester.seq_length subsampled_seq_length = model._get_feat_extract_output_lengths( seq_length) self.assertListEqual( list(hidden_states[0].shape[-2:]), [subsampled_seq_length, self.model_tester.hidden_size], ) if config.is_encoder_decoder: hidden_states = outputs.decoder_hidden_states self.assertIsInstance(hidden_states, (list, tuple)) self.assertEqual(len(hidden_states), expected_num_layers) seq_len = getattr(self.model_tester, "seq_length", None) decoder_seq_length = getattr(self.model_tester, "decoder_seq_length", seq_len) self.assertListEqual( list(hidden_states[0].shape[-2:]), [decoder_seq_length, self.model_tester.hidden_size], ) config, inputs_dict = self.model_tester.prepare_config_and_inputs_for_common( ) for model_class in self.all_model_classes: inputs_dict["output_hidden_states"] = True check_hidden_states_output(inputs_dict, config, model_class) # check that output_hidden_states also work using config del inputs_dict["output_hidden_states"] config.output_hidden_states = True check_hidden_states_output(inputs_dict, config, model_class) def test_attention_outputs(self): config, inputs_dict = self.model_tester.prepare_config_and_inputs_for_common( ) config.return_dict = True seq_len = getattr(self.model_tester, "seq_length", None) decoder_seq_length = getattr(self.model_tester, "decoder_seq_length", seq_len) encoder_seq_length = getattr(self.model_tester, "encoder_seq_length", seq_len) decoder_key_length = getattr(self.model_tester, "decoder_key_length", decoder_seq_length) encoder_key_length = getattr(self.model_tester, "key_length", encoder_seq_length) for model_class in self.all_model_classes: inputs_dict["output_attentions"] = True inputs_dict["output_hidden_states"] = False config.return_dict = True model = model_class(config) subsampled_encoder_seq_length = model._get_feat_extract_output_lengths( encoder_seq_length) subsampled_encoder_key_length = 
model._get_feat_extract_output_lengths( encoder_key_length) outputs = model( **self._prepare_for_class(inputs_dict, model_class)) attentions = outputs.encoder_attentions if config.is_encoder_decoder else outputs.attentions self.assertEqual(len(attentions), self.model_tester.num_hidden_layers) # check that output_attentions also work using config del inputs_dict["output_attentions"] config.output_attentions = True model = model_class(config) outputs = model( **self._prepare_for_class(inputs_dict, model_class)) attentions = outputs.encoder_attentions if config.is_encoder_decoder else outputs.attentions self.assertEqual(len(attentions), self.model_tester.num_hidden_layers) self.assertListEqual( list(attentions[0].shape[-3:]), [ self.model_tester.num_attention_heads, subsampled_encoder_seq_length, subsampled_encoder_key_length ], ) out_len = len(outputs) correct_outlen = 5 # loss is at first position if "labels" in inputs_dict: correct_outlen += 1 # loss is added to beginning if "past_key_values" in outputs: correct_outlen += 1 # past_key_values have been returned self.assertEqual(out_len, correct_outlen) # decoder attentions decoder_attentions = outputs.decoder_attentions self.assertIsInstance(decoder_attentions, (list, tuple)) self.assertEqual(len(decoder_attentions), self.model_tester.num_hidden_layers) self.assertListEqual( list(decoder_attentions[0].shape[-3:]), [ self.model_tester.num_attention_heads, decoder_seq_length, decoder_key_length ], ) # cross attentions cross_attentions = outputs.cross_attentions self.assertIsInstance(cross_attentions, (list, tuple)) self.assertEqual(len(cross_attentions), self.model_tester.num_hidden_layers) self.assertListEqual( list(cross_attentions[0].shape[-3:]), [ self.model_tester.num_attention_heads, decoder_seq_length, subsampled_encoder_key_length, ], ) # Check attention is always last and order is fine inputs_dict["output_attentions"] = True inputs_dict["output_hidden_states"] = True model = model_class(config) outputs = model( **self._prepare_for_class(inputs_dict, model_class)) added_hidden_states = 2 self.assertEqual(out_len + added_hidden_states, len(outputs)) self_attentions = outputs.encoder_attentions if config.is_encoder_decoder else outputs.attentions self.assertEqual(len(self_attentions), self.model_tester.num_hidden_layers) self.assertListEqual( list(self_attentions[0].shape[-3:]), [ self.model_tester.num_attention_heads, subsampled_encoder_seq_length, subsampled_encoder_key_length ], ) def test_resize_token_embeddings(self): # Overwritten method from parent; see `test_resize_embeddings_untied` pass def test_resize_tokens_embeddings(self): # see `test_resize_embeddings_untied` pass def test_resize_embeddings_untied(self): # TODO: copy test from PT. Not working at the moment because the test relies on `model.resize_token_embeddings`, # whose TF implementation assumes the use of `TFWrappedEmbeddings`. But with a `TFWrappedEmbeddings` we can't # load the weights from PT (also, it induces TF1 behavior, so we might want to rework how # `model.resize_token_embeddings` operates). 
pass def test_generate_without_input_ids(self): pass @staticmethod def _get_encoder_outputs(model, input_ids, attention_mask, output_attentions=None, output_hidden_states=None, num_interleave=1): encoder = model.get_encoder() encoder_outputs = encoder( input_ids, attention_mask=attention_mask, output_attentions=output_attentions, output_hidden_states=output_hidden_states, ) encoder_outputs["last_hidden_state"] = tf.repeat( encoder_outputs.last_hidden_state, num_interleave, axis=0) input_ids = input_ids[:, :, 0] input_ids = tf.zeros_like( input_ids[:, :1], dtype=tf.int64) + model._get_decoder_start_token_id() attention_mask = None return encoder_outputs, input_ids, attention_mask def _check_outputs(self, output, input_ids, config, use_cache=False, num_return_sequences=1): batch_size, seq_length = input_ids.shape[:2] subsampled_seq_length = self.model_tester.get_subsampled_output_lengths( seq_length) num_sequences_in_output = batch_size * num_return_sequences gen_len = (output.sequences.shape[-1] - 1 if config.is_encoder_decoder else output.sequences.shape[-1] - seq_length) # scores self._check_scores(num_sequences_in_output, output.scores, length=gen_len, config=config) # Attentions # encoder self._check_encoder_attention_for_generate(output.encoder_attentions, batch_size, config, subsampled_seq_length) # decoder self._check_attentions_for_generate( num_sequences_in_output, output.decoder_attentions, min_length=1, max_length=output.sequences.shape[-1], config=config, use_cache=use_cache, ) # Hidden States # encoder self._check_encoder_hidden_states_for_generate( output.encoder_hidden_states, batch_size, config, subsampled_seq_length) # decoder self._check_hidden_states_for_generate( num_sequences_in_output, output.decoder_hidden_states, min_length=1, max_length=output.sequences.shape[-1], config=config, use_cache=use_cache, ) # overwritten from parent due to the inability to work when non-text inputs are not passed AND because the input is # `input_features` def test_lm_head_model_random_no_beam_search_generate(self): config, inputs_dict = self.model_tester.prepare_config_and_inputs_for_common( ) input_features = inputs_dict.get("input_features", None) # iterate over all generative models for model_class in self.all_generative_model_classes: model = model_class(config) if config.bos_token_id is None: # if bos token id is not defined model needs input_features with self.assertRaises(AssertionError): model.generate(do_sample=True, max_length=5) # num_return_sequences = 1 self._check_generated_ids( model.generate(input_features, do_sample=True)) with self.assertRaises(AssertionError): # generating multiple sequences when no beam search generation # is not allowed as it would always generate the same sequences model.generate(input_features, do_sample=False, num_return_sequences=2) # num_return_sequences > 1, sample self._check_generated_ids( model.generate(input_features, do_sample=True, num_return_sequences=2)) # check bad words tokens language generation # create list of 1-seq bad token and list of 2-seq of bad tokens bad_words_ids = [ self._generate_random_bad_tokens(1, model), self._generate_random_bad_tokens(2, model) ] output_tokens = model.generate(input_features, do_sample=True, bad_words_ids=bad_words_ids, num_return_sequences=2) # only count generated tokens generated_ids = output_tokens[:, input_features.shape[-1]:] self.assertFalse( self._check_match_tokens(generated_ids.numpy().tolist(), bad_words_ids)) # overwritten from parent due to the inability to work when non-text inputs are 
not passed AND because the input is # `input_features` def test_lm_head_model_random_beam_search_generate(self): config, inputs_dict = self.model_tester.prepare_config_and_inputs_for_common( ) input_features = inputs_dict.get("input_features", None) for model_class in self.all_generative_model_classes: model = model_class(config) if config.bos_token_id is None: # if bos token id is not defined model needs input_ids, num_return_sequences = 1 self._check_generated_ids( model.generate(input_features, do_sample=True, num_beams=2)) with self.assertRaises(AssertionError): # generating more sequences than having beams leads is not possible model.generate(input_features, do_sample=False, num_return_sequences=3, num_beams=2) # num_return_sequences > 1, sample self._check_generated_ids( model.generate( input_features, do_sample=True, num_beams=2, num_return_sequences=2, )) # num_return_sequences > 1, greedy self._check_generated_ids( model.generate(input_features, do_sample=False, num_beams=2, num_return_sequences=2)) # check bad words tokens language generation # create list of 1-seq bad token and list of 2-seq of bad tokens bad_words_ids = [ self._generate_random_bad_tokens(1, model), self._generate_random_bad_tokens(2, model) ] output_tokens = model.generate(input_features, do_sample=False, bad_words_ids=bad_words_ids, num_beams=2, num_return_sequences=2) # only count generated tokens generated_ids = output_tokens[:, input_features.shape[-1]:] self.assertFalse( self._check_match_tokens(generated_ids.numpy().tolist(), bad_words_ids)) # overwritten from parent -- the input is `input_features`, not `input_ids` def test_forward_signature(self): config, _ = self.model_tester.prepare_config_and_inputs_for_common() for model_class in self.all_model_classes: model = model_class(config) signature = inspect.signature(model.call) # signature.parameters is an OrderedDict => so arg_names order is deterministic arg_names = [*signature.parameters.keys()] expected_arg_names = [ "input_features", "attention_mask", "decoder_input_ids", "decoder_attention_mask", ] self.assertListEqual(arg_names[:len(expected_arg_names)], expected_arg_names)
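The hidden-state and attention tests above rely on the model's `_get_feat_extract_output_lengths` to account for convolutional subsampling of the audio features. As a rough, hedged illustration only (not the library implementation), each strided 1D convolution shrinks the sequence roughly by its stride:

def approx_subsampled_length(seq_length: int, num_conv_layers: int = 2, stride: int = 2) -> int:
    # Illustrative only: roughly one output frame per `stride` input frames, per conv layer.
    for _ in range(num_conv_layers):
        seq_length = (seq_length - 1) // stride + 1
    return seq_length

print(approx_subsampled_length(80))  # -> 20 with two stride-2 conv layers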
class TFViTModelTest(TFModelTesterMixin, unittest.TestCase): """ Here we also overwrite some of the tests of test_modeling_tf_common.py, as ViT does not use input_ids, inputs_embeds, attention_mask and seq_length. """ all_model_classes = ( TFViTModel, TFViTForImageClassification) if is_tf_available() else () test_resize_embeddings = False test_head_masking = False test_onnx = False def setUp(self): self.model_tester = TFViTModelTester(self) self.config_tester = ConfigTester(self, config_class=ViTConfig, has_text_modality=False, hidden_size=37) def test_config(self): self.config_tester.run_common_tests() def test_inputs_embeds(self): # ViT does not use inputs_embeds pass def test_graph_mode_with_inputs_embeds(self): # ViT does not use inputs_embeds pass def test_model_common_attributes(self): config, _ = self.model_tester.prepare_config_and_inputs_for_common() for model_class in self.all_model_classes: model = model_class(config) self.assertIsInstance(model.get_input_embeddings(), (tf.keras.layers.Layer)) x = model.get_output_embeddings() self.assertTrue(x is None or isinstance(x, tf.keras.layers.Layer)) def test_forward_signature(self): config, _ = self.model_tester.prepare_config_and_inputs_for_common() for model_class in self.all_model_classes: model = model_class(config) signature = inspect.signature(model.call) # signature.parameters is an OrderedDict => so arg_names order is deterministic arg_names = [*signature.parameters.keys()] expected_arg_names = ["pixel_values"] self.assertListEqual(arg_names[:1], expected_arg_names) def test_model(self): config_and_inputs = self.model_tester.prepare_config_and_inputs() self.model_tester.create_and_check_model(*config_and_inputs) # overwrite from common since `encoder_seq_length` and `encoder_key_length` are calculated # in a different way than in text models. 
@tooslow def test_saved_model_creation_extended(self): config, inputs_dict = self.model_tester.prepare_config_and_inputs_for_common( ) config.output_hidden_states = True config.output_attentions = True if hasattr(config, "use_cache"): config.use_cache = True # in ViT, the seq_len equals the number of patches + 1 (we add 1 for the [CLS] token) image_size = to_2tuple(self.model_tester.image_size) patch_size = to_2tuple(self.model_tester.patch_size) num_patches = (image_size[1] // patch_size[1]) * (image_size[0] // patch_size[0]) seq_len = num_patches + 1 encoder_seq_length = getattr(self.model_tester, "encoder_seq_length", seq_len) encoder_key_length = getattr(self.model_tester, "key_length", encoder_seq_length) for model_class in self.all_model_classes: class_inputs_dict = self._prepare_for_class( inputs_dict, model_class) model = model_class(config) num_out = len(model(class_inputs_dict)) with tempfile.TemporaryDirectory() as tmpdirname: model.save_pretrained(tmpdirname, saved_model=True) saved_model_dir = os.path.join(tmpdirname, "saved_model", "1") model = tf.keras.models.load_model(saved_model_dir) outputs = model(class_inputs_dict) if self.is_encoder_decoder: output_hidden_states = outputs["encoder_hidden_states"] output_attentions = outputs["encoder_attentions"] else: output_hidden_states = outputs["hidden_states"] output_attentions = outputs["attentions"] self.assertEqual(len(outputs), num_out) expected_num_layers = getattr( self.model_tester, "expected_num_hidden_layers", self.model_tester.num_hidden_layers + 1) self.assertEqual(len(output_hidden_states), expected_num_layers) self.assertListEqual( list(output_hidden_states[0].shape[-2:]), [seq_len, self.model_tester.hidden_size], ) self.assertEqual(len(output_attentions), self.model_tester.num_hidden_layers) self.assertListEqual( list(output_attentions[0].shape[-3:]), [ self.model_tester.num_attention_heads, encoder_seq_length, encoder_key_length ], ) def test_attention_outputs(self): config, inputs_dict = self.model_tester.prepare_config_and_inputs_for_common( ) config.return_dict = True # in ViT, the seq_len equals the number of patches + 1 (we add 1 for the [CLS] token) image_size = to_2tuple(self.model_tester.image_size) patch_size = to_2tuple(self.model_tester.patch_size) num_patches = (image_size[1] // patch_size[1]) * (image_size[0] // patch_size[0]) seq_len = num_patches + 1 encoder_seq_length = getattr(self.model_tester, "encoder_seq_length", seq_len) encoder_key_length = getattr(self.model_tester, "key_length", encoder_seq_length) for model_class in self.all_model_classes: inputs_dict["output_attentions"] = True inputs_dict["output_hidden_states"] = False config.return_dict = True model = model_class(config) outputs = model(**self._prepare_for_class(inputs_dict, model_class), training=False) attentions = outputs.encoder_attentions if config.is_encoder_decoder else outputs.attentions self.assertEqual(len(attentions), self.model_tester.num_hidden_layers) # check that output_attentions also work using config del inputs_dict["output_attentions"] config.output_attentions = True model = model_class(config) outputs = model(**self._prepare_for_class(inputs_dict, model_class), training=False) attentions = outputs.encoder_attentions if config.is_encoder_decoder else outputs.attentions self.assertEqual(len(attentions), self.model_tester.num_hidden_layers) self.assertListEqual( list(attentions[0].shape[-3:]), [ self.model_tester.num_attention_heads, encoder_seq_length, encoder_key_length ], ) out_len = len(outputs) # Check attention 
is always last and order is fine inputs_dict["output_attentions"] = True inputs_dict["output_hidden_states"] = True model = model_class(config) outputs = model(**self._prepare_for_class(inputs_dict, model_class), training=False) if hasattr(self.model_tester, "num_hidden_states_types"): added_hidden_states = self.model_tester.num_hidden_states_types elif self.is_encoder_decoder: added_hidden_states = 2 else: added_hidden_states = 1 self.assertEqual(out_len + added_hidden_states, len(outputs)) self_attentions = outputs.encoder_attentions if config.is_encoder_decoder else outputs.attentions self.assertEqual(len(self_attentions), self.model_tester.num_hidden_layers) self.assertListEqual( list(self_attentions[0].shape[-3:]), [ self.model_tester.num_attention_heads, encoder_seq_length, encoder_key_length ], ) def test_hidden_states_output(self): def check_hidden_states_output(inputs_dict, config, model_class): model = model_class(config) outputs = model( **self._prepare_for_class(inputs_dict, model_class)) hidden_states = outputs.encoder_hidden_states if config.is_encoder_decoder else outputs.hidden_states expected_num_layers = getattr( self.model_tester, "expected_num_hidden_layers", self.model_tester.num_hidden_layers + 1) self.assertEqual(len(hidden_states), expected_num_layers) # ViT has a different seq_length image_size = to_2tuple(self.model_tester.image_size) patch_size = to_2tuple(self.model_tester.patch_size) num_patches = (image_size[1] // patch_size[1]) * (image_size[0] // patch_size[0]) seq_length = num_patches + 1 self.assertListEqual( list(hidden_states[0].shape[-2:]), [seq_length, self.model_tester.hidden_size], ) config, inputs_dict = self.model_tester.prepare_config_and_inputs_for_common( ) for model_class in self.all_model_classes: inputs_dict["output_hidden_states"] = True check_hidden_states_output(inputs_dict, config, model_class) # check that output_hidden_states also work using config del inputs_dict["output_hidden_states"] config.output_hidden_states = True check_hidden_states_output(inputs_dict, config, model_class) def test_for_image_classification(self): config_and_inputs = self.model_tester.prepare_config_and_inputs() self.model_tester.create_and_check_for_image_classification( *config_and_inputs) @slow def test_model_from_pretrained(self): model = TFViTModel.from_pretrained("google/vit-base-patch16-224") self.assertIsNotNone(model)
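For reference, the patch-count arithmetic used throughout these ViT tests, worked for the standard 224x224 input with 16x16 patches (the tester itself uses smaller toy sizes):

image_size, patch_size = (224, 224), (16, 16)
num_patches = (image_size[1] // patch_size[1]) * (image_size[0] // patch_size[0])  # 14 * 14 = 196
seq_len = num_patches + 1  # 197; the extra position is the [CLS] token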
import tempfile import unittest from transformers.file_utils import cached_property, is_datasets_available, is_faiss_available, is_tf_available from transformers.testing_utils import require_sentencepiece, require_tf, require_tokenizers, slow if is_tf_available() and is_datasets_available() and is_faiss_available(): import tensorflow as tf from transformers import ( AutoConfig, BartTokenizer, DPRQuestionEncoderTokenizer, RagConfig, RagRetriever, RagTokenizer, TFBartForConditionalGeneration, TFDPRQuestionEncoder, TFRagModel, TFRagSequenceForGeneration, TFRagTokenForGeneration, ) def require_retrieval(test_case): """ Decorator marking a test that requires a set of dependencies necessary to perform retrieval with :class:`~transformers.RagRetriever`. These tests are skipped when respective libraries are not installed. """ if not (is_tf_available() and is_datasets_available() and is_faiss_available()): test_case = unittest.skip("test requires tensorflow, datasets and faiss")(test_case) return test_case
class TFRagTestMixin: all_model_classes = ((TFRagModel, TFRagTokenForGeneration, TFRagSequenceForGeneration) if is_tf_available() and is_datasets_available() and is_faiss_available() else ()) all_generative_model_classes = ( (TFRagTokenForGeneration, TFRagSequenceForGeneration) if is_tf_available() and is_datasets_available() and is_faiss_available() else ()) retrieval_vector_size = 32 n_docs = 3 max_combined_length = 16 def setUp(self): self.tmpdirname = tempfile.mkdtemp() # DPR tok vocab_tokens = [ "[UNK]", "[CLS]", "[SEP]", "[PAD]", "[MASK]", "want", "##want", "##ed", "wa", "un", "runn", "##ing", ",", "low", "lowest", ] dpr_tokenizer_path = os.path.join(self.tmpdirname, "dpr_tokenizer") os.makedirs(dpr_tokenizer_path, exist_ok=True) self.vocab_file = os.path.join(dpr_tokenizer_path, DPR_VOCAB_FILES_NAMES["vocab_file"]) with open(self.vocab_file, "w", encoding="utf-8") as vocab_writer: vocab_writer.write("".join([x + "\n" for x in vocab_tokens])) # BART tok vocab = [ "l", "o", "w", "e", "r", "s", "t", "i", "d", "n", "\u0120", "\u0120l", "\u0120n", "\u0120lo", "\u0120low", "er", "\u0120lowest", "\u0120newer", "\u0120wider", "<unk>", ] vocab_tokens = dict(zip(vocab, range(len(vocab)))) merges = [ "#version: 0.2", "\u0120 l", "\u0120l o", "\u0120lo w", "e r", "" ] self.special_tokens_map = {"unk_token": "<unk>"} bart_tokenizer_path = os.path.join(self.tmpdirname, "bart_tokenizer") os.makedirs(bart_tokenizer_path, exist_ok=True) self.vocab_file = os.path.join(bart_tokenizer_path, BART_VOCAB_FILES_NAMES["vocab_file"]) self.merges_file = os.path.join(bart_tokenizer_path, BART_VOCAB_FILES_NAMES["merges_file"]) with open(self.vocab_file, "w", encoding="utf-8") as fp: fp.write(json.dumps(vocab_tokens) + "\n") with open(self.merges_file, "w", encoding="utf-8") as fp: fp.write("\n".join(merges)) @cached_property def dpr_tokenizer(self) -> DPRQuestionEncoderTokenizer: return DPRQuestionEncoderTokenizer.from_pretrained( os.path.join(self.tmpdirname, "dpr_tokenizer")) @cached_property def bart_tokenizer(self) -> BartTokenizer: return BartTokenizer.from_pretrained( os.path.join(self.tmpdirname, "bart_tokenizer")) def tearDown(self): shutil.rmtree(self.tmpdirname) def get_retriever(self, config): dataset = Dataset.from_dict({ "id": ["0", "1", "3"], "text": ["foo", "bar", "qux"], "title": ["Foo", "Bar", "Qux"], "embeddings": [ np.ones(self.retrieval_vector_size), 2 * np.ones(self.retrieval_vector_size), 3 * np.ones(self.retrieval_vector_size), ], }) dataset.add_faiss_index("embeddings", string_factory="Flat", metric_type=faiss.METRIC_INNER_PRODUCT) tokenizer = self.bart_tokenizer with patch("transformers.models.rag.retrieval_rag.load_dataset" ) as mock_load_dataset: mock_load_dataset.return_value = dataset retriever = RagRetriever( config, question_encoder_tokenizer=self.dpr_tokenizer, generator_tokenizer=tokenizer, ) return retriever def check_model_with_retriever(self, config, input_ids, attention_mask, decoder_input_ids, decoder_attention_mask, **kwargs): self.assertIsNotNone(config.question_encoder) self.assertIsNotNone(config.generator) for model_class in self.all_model_classes: model = model_class(config, retriever=self.get_retriever(config)) self.assertTrue(model.config.is_encoder_decoder) outputs = model( input_ids=input_ids, attention_mask=attention_mask, decoder_input_ids=decoder_input_ids, decoder_attention_mask=decoder_attention_mask, ) # logits self.assertEqual( outputs.logits.shape, (self.n_docs * decoder_input_ids.shape[0], decoder_input_ids.shape[1], config.generator.vocab_size), ) # 
generator encoder last hidden states self.assertEqual( outputs.generator_enc_last_hidden_state.shape, (self.n_docs * decoder_input_ids.shape[0], self.max_combined_length, config.generator.hidden_size), ) # doc scores self.assertEqual(outputs.doc_scores.shape, (input_ids.shape[0], self.n_docs)) def check_model_generate_from_context_input_ids(self, config, input_ids, attention_mask, decoder_input_ids, decoder_attention_mask, **kwargs): self.assertIsNotNone(config.question_encoder) self.assertIsNotNone(config.generator) retriever = self.get_retriever(config) for i, model_class in enumerate(self.all_generative_model_classes): model = model_class(config) self.assertTrue(model.config.is_encoder_decoder) question_hidden_states = model.question_encoder( input_ids, attention_mask=attention_mask)[0] out = retriever( input_ids, question_hidden_states.numpy(), prefix=config.generator.prefix, return_tensors="tf", ) context_input_ids, context_attention_mask, retrieved_doc_embeds = ( out["context_input_ids"], out["context_attention_mask"], out["retrieved_doc_embeds"], ) retrieved_doc_embeds = tf.cast(retrieved_doc_embeds, tf.float32) # compute doc_scores doc_scores = tf.squeeze( tf.matmul(tf.expand_dims(question_hidden_states, axis=[1]), retrieved_doc_embeds, transpose_b=True), axis=[1], ) outputs = model.generate( context_input_ids=context_input_ids, context_attention_mask=context_attention_mask, doc_scores=doc_scores, ) self.assertIsNotNone(outputs) def check_model_generate(self, config, input_ids, attention_mask, decoder_input_ids, decoder_attention_mask, **kwargs): self.assertIsNotNone(config.question_encoder) self.assertIsNotNone(config.generator) for model_class in self.all_generative_model_classes: model = model_class(config, retriever=self.get_retriever(config)) self.assertTrue(model.config.is_encoder_decoder) input_ids = tf.cast(input_ids, tf.int32) outputs = model.generate( input_ids=input_ids, num_beams=2, num_return_sequences=2, decoder_start_token_id=config.generator.eos_token_id, ) self.assertIsNotNone(outputs) def check_model_without_retriever(self, config, input_ids, attention_mask, decoder_input_ids, decoder_attention_mask, **kwargs): self.assertIsNotNone(config.question_encoder) self.assertIsNotNone(config.generator) retriever = self.get_retriever(config) for model_class in self.all_model_classes: model = model_class(config) self.assertTrue(model.config.is_encoder_decoder) question_hidden_states = model.question_encoder( input_ids, attention_mask=attention_mask)[0] out = retriever( input_ids, question_hidden_states.numpy(), prefix=config.generator.prefix, return_tensors="tf", ) context_input_ids, context_attention_mask, retrieved_doc_embeds = ( out["context_input_ids"], out["context_attention_mask"], out["retrieved_doc_embeds"], ) retrieved_doc_embeds = tf.cast(retrieved_doc_embeds, tf.float32) # compute doc_scores doc_scores = tf.squeeze( tf.matmul(tf.expand_dims(question_hidden_states, axis=[1]), retrieved_doc_embeds, transpose_b=True), axis=[1], ) outputs = model( input_ids=None, context_input_ids=context_input_ids, context_attention_mask=context_attention_mask, doc_scores=doc_scores, decoder_input_ids=decoder_input_ids, decoder_attention_mask=decoder_attention_mask, ) # logits self.assertEqual( outputs.logits.shape, (self.n_docs * decoder_input_ids.shape[0], decoder_input_ids.shape[1], config.generator.vocab_size), ) # generator encoder last hidden states self.assertEqual( outputs.generator_enc_last_hidden_state.shape, (self.n_docs * decoder_input_ids.shape[0], 
self.max_combined_length, config.generator.hidden_size), ) # doc scores self.assertEqual(outputs.doc_scores.shape, (input_ids.shape[0], self.n_docs)) def check_model_custom_n_docs(self, config, input_ids, attention_mask, decoder_input_ids, decoder_attention_mask, n_docs, **kwargs): self.assertIsNotNone(config.question_encoder) self.assertIsNotNone(config.generator) retriever = self.get_retriever(config) for model_class in self.all_model_classes: model = model_class(config) self.assertTrue(model.config.is_encoder_decoder) question_hidden_states = model.question_encoder( input_ids, attention_mask=attention_mask)[0] out = retriever( input_ids, question_hidden_states.numpy(), prefix=config.generator.prefix, return_tensors="tf", n_docs=n_docs, ) context_input_ids, context_attention_mask, retrieved_doc_embeds = ( out["context_input_ids"], out["context_attention_mask"], out["retrieved_doc_embeds"], ) retrieved_doc_embeds = tf.cast(retrieved_doc_embeds, tf.float32) # compute doc_scores doc_scores = tf.squeeze( tf.matmul(tf.expand_dims(question_hidden_states, axis=[1]), retrieved_doc_embeds, transpose_b=True), axis=[1], ) outputs = model( input_ids=None, context_input_ids=context_input_ids, context_attention_mask=context_attention_mask, doc_scores=doc_scores, decoder_input_ids=decoder_input_ids, decoder_attention_mask=decoder_attention_mask, n_docs=n_docs, ) # logits self.assertEqual( outputs.logits.shape, (n_docs * decoder_input_ids.shape[0], decoder_input_ids.shape[1], config.generator.vocab_size), ) # generator encoder last hidden states self.assertEqual( outputs.generator_enc_last_hidden_state.shape, (n_docs * decoder_input_ids.shape[0], self.max_combined_length, config.generator.hidden_size), ) # doc scores self.assertEqual(outputs.doc_scores.shape, (input_ids.shape[0], n_docs)) def check_model_with_mismatch_n_docs_value(self, config, input_ids, attention_mask, decoder_input_ids, decoder_attention_mask, retriever_n_docs, generator_n_docs, **kwargs): self.assertIsNotNone(config.question_encoder) self.assertIsNotNone(config.generator) retriever = self.get_retriever(config) for model_class in self.all_model_classes: model = model_class(config) self.assertTrue(model.config.is_encoder_decoder) question_hidden_states = model.question_encoder( input_ids, attention_mask=attention_mask)[0] out = retriever( input_ids, question_hidden_states.numpy(), prefix=config.generator.prefix, return_tensors="tf", n_docs=retriever_n_docs, ) context_input_ids, context_attention_mask, retrieved_doc_embeds = ( out["context_input_ids"], out["context_attention_mask"], out["retrieved_doc_embeds"], ) retrieved_doc_embeds = tf.cast(retrieved_doc_embeds, tf.float32) # compute doc_scores doc_scores = tf.squeeze( tf.matmul(tf.expand_dims(question_hidden_states, axis=[1]), retrieved_doc_embeds, transpose_b=True), axis=[1], ) self.assertRaises( AssertionError, model.__call__, input_ids=None, context_input_ids=context_input_ids, context_attention_mask=context_attention_mask, doc_scores=doc_scores, decoder_input_ids=decoder_input_ids, decoder_attention_mask=decoder_attention_mask, n_docs=generator_n_docs, ) def check_model_with_encoder_outputs(self, config, input_ids, attention_mask, decoder_input_ids, decoder_attention_mask, **kwargs): self.assertIsNotNone(config.question_encoder) self.assertIsNotNone(config.generator) for model_class in self.all_model_classes: model = model_class(config, retriever=self.get_retriever(config)) self.assertTrue(model.config.is_encoder_decoder) outputs = model( input_ids=input_ids, 
attention_mask=attention_mask, decoder_input_ids=decoder_input_ids, decoder_attention_mask=decoder_attention_mask, ) encoder_outputs = TFBaseModelOutput( outputs.generator_enc_last_hidden_state) # run only generator outputs = model( input_ids=None, encoder_outputs=encoder_outputs, doc_scores=outputs.doc_scores, decoder_input_ids=decoder_input_ids, decoder_attention_mask=decoder_attention_mask, ) # logits self.assertEqual( outputs.logits.shape, (self.n_docs * decoder_input_ids.shape[0], decoder_input_ids.shape[1], config.generator.vocab_size), ) # generator encoder last hidden states self.assertEqual( outputs.generator_enc_last_hidden_state.shape, (self.n_docs * decoder_input_ids.shape[0], self.max_combined_length, config.generator.hidden_size), ) # doc scores self.assertEqual(outputs.doc_scores.shape, (input_ids.shape[0], self.n_docs)) def test_model_with_retriever(self): inputs_dict = self.config_and_inputs self.check_model_with_retriever(**inputs_dict) def test_model_without_retriever(self): inputs_dict = self.config_and_inputs self.check_model_without_retriever(**inputs_dict) def test_model_generate_from_context_input_ids(self): inputs_dict = self.config_and_inputs self.check_model_generate_from_context_input_ids(**inputs_dict) def test_model_with_encoder_outputs(self): inputs_dict = self.config_and_inputs self.check_model_with_encoder_outputs(**inputs_dict) def test_model_generate(self): inputs_dict = self.config_and_inputs self.check_model_generate(**inputs_dict) def test_model_with_custom_n_docs(self): inputs_dict = self.config_and_inputs inputs_dict["n_docs"] = 1 self.check_model_custom_n_docs(**inputs_dict) def test_model_with_mismatch_n_docs_value(self): inputs_dict = self.config_and_inputs inputs_dict["retriever_n_docs"] = 3 inputs_dict["generator_n_docs"] = 2 self.check_model_with_mismatch_n_docs_value(**inputs_dict)
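A minimal, self-contained sketch of the doc_scores computation that the RAG checks above repeat: each question embedding is scored against its retrieved document embeddings with an inner product. The shapes below are illustrative, not taken from the test configuration.

import tensorflow as tf

question_hidden_states = tf.random.normal((2, 32))   # (batch, hidden)
retrieved_doc_embeds = tf.random.normal((2, 3, 32))   # (batch, n_docs, hidden)

doc_scores = tf.squeeze(
    tf.matmul(tf.expand_dims(question_hidden_states, axis=1),
              retrieved_doc_embeds, transpose_b=True),
    axis=1,
)
print(doc_scores.shape)  # (2, 3): one score per (question, retrieved document) pair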
# limitations under the License. """ Testing suite for the TensorFlow ViT model. """ import inspect import os import tempfile import unittest from transformers import ViTConfig from transformers.file_utils import cached_property, is_tf_available, is_vision_available from transformers.testing_utils import require_tf, require_vision, slow, tooslow from .test_configuration_common import ConfigTester from .test_modeling_tf_common import TFModelTesterMixin, floats_tensor, ids_tensor if is_tf_available(): import tensorflow as tf from transformers import TFViTForImageClassification, TFViTModel from transformers.models.vit.modeling_tf_vit import to_2tuple if is_vision_available(): from PIL import Image from transformers import ViTFeatureExtractor class TFViTModelTester: def __init__( self, parent,
class TFData2VecVisionModelTest(TFModelTesterMixin, unittest.TestCase): """ Here we also overwrite some of the tests of test_modeling_common.py, as Data2VecVision does not use input_ids, inputs_embeds, attention_mask and seq_length. """ all_model_classes = ((TFData2VecVisionModel, TFData2VecVisionForImageClassification, TFData2VecVisionForSemanticSegmentation) if is_tf_available() else ()) test_pruning = False test_onnx = False test_resize_embeddings = False test_head_masking = False def setUp(self): self.model_tester = TFData2VecVisionModelTester(self) self.config_tester = ConfigTester(self, config_class=Data2VecVisionConfig, has_text_modality=False, hidden_size=37) def test_config(self): self.config_tester.run_common_tests() @unittest.skip(reason="Data2VecVision does not use inputs_embeds") def test_inputs_embeds(self): # Data2VecVision does not use inputs_embeds pass def test_model_common_attributes(self): config, _ = self.model_tester.prepare_config_and_inputs_for_common() for model_class in self.all_model_classes: model = model_class(config) self.assertIsInstance(model.get_input_embeddings(), (tf.keras.layers.Layer)) x = model.get_output_embeddings() self.assertTrue(x is None or isinstance(x, tf.keras.layers.Layer)) def test_forward_signature(self): config, _ = self.model_tester.prepare_config_and_inputs_for_common() for model_class in self.all_model_classes: model = model_class(config) signature = inspect.signature(model.call) # signature.parameters is an OrderedDict => so arg_names order is deterministic arg_names = [*signature.parameters.keys()] expected_arg_names = ["pixel_values"] self.assertListEqual(arg_names[:1], expected_arg_names) def test_model(self): config_and_inputs = self.model_tester.prepare_config_and_inputs() self.model_tester.create_and_check_model(*config_and_inputs) def test_for_image_segmentation(self): config_and_inputs = self.model_tester.prepare_config_and_inputs() self.model_tester.create_and_check_for_image_segmentation( *config_and_inputs) @unittest.skip("Test was written for TF 1.x and isn't really relevant here" ) def test_compile_tf_model(self): pass def test_attention_outputs(self): config, inputs_dict = self.model_tester.prepare_config_and_inputs_for_common( ) config.return_dict = True # in Data2VecVision, the seq_len equals the number of patches + 1 (we add 1 for the [CLS] token) image_size = (self.model_tester.image_size if isinstance( self.model_tester.image_size, collections.abc.Iterable) else (self.model_tester.image_size, self.model_tester.image_size)) patch_size = (self.model_tester.patch_size if isinstance( self.model_tester.patch_size, collections.abc.Iterable) else (self.model_tester.patch_size, self.model_tester.patch_size)) num_patches = (image_size[1] // patch_size[1]) * (image_size[0] // patch_size[0]) seq_len = num_patches + 1 encoder_seq_length = getattr(self.model_tester, "encoder_seq_length", seq_len) encoder_key_length = getattr(self.model_tester, "key_length", encoder_seq_length) chunk_length = getattr(self.model_tester, "chunk_length", None) if chunk_length is not None and hasattr(self.model_tester, "num_hashes"): encoder_seq_length = encoder_seq_length * self.model_tester.num_hashes for model_class in self.all_model_classes: inputs_dict["output_attentions"] = True inputs_dict["output_hidden_states"] = False config.return_dict = True model = model_class(config) outputs = model(**self._prepare_for_class(inputs_dict, model_class), training=False) attentions = outputs.encoder_attentions if config.is_encoder_decoder else 
outputs.attentions self.assertEqual(len(attentions), self.model_tester.num_hidden_layers) # check that output_attentions also work using config del inputs_dict["output_attentions"] config.output_attentions = True model = model_class(config) outputs = model(**self._prepare_for_class(inputs_dict, model_class), training=False) attentions = outputs.encoder_attentions if config.is_encoder_decoder else outputs.attentions self.assertEqual(len(attentions), self.model_tester.num_hidden_layers) self.assertListEqual( list(attentions[0].shape[-3:]), [ self.model_tester.num_attention_heads, encoder_seq_length, encoder_key_length ], ) out_len = len(outputs) # Check attention is always last and order is fine inputs_dict["output_attentions"] = True inputs_dict["output_hidden_states"] = True model = model_class(config) outputs = model(**self._prepare_for_class(inputs_dict, model_class), training=False) self.assertEqual(out_len + 1, len(outputs)) self_attentions = outputs.attentions self.assertEqual(len(self_attentions), self.model_tester.num_hidden_layers) self.assertListEqual( list(self_attentions[0].shape[-3:]), [ self.model_tester.num_attention_heads, encoder_seq_length, encoder_key_length ], ) def test_hidden_states_output(self): def check_hidden_states_output(inputs_dict, config, model_class): model = model_class(config) outputs = model( **self._prepare_for_class(inputs_dict, model_class)) hidden_states = outputs.encoder_hidden_states if config.is_encoder_decoder else outputs.hidden_states expected_num_layers = getattr( self.model_tester, "expected_num_hidden_layers", self.model_tester.num_hidden_layers + 1) self.assertEqual(len(hidden_states), expected_num_layers) # Data2VecVision has a different seq_length image_size = (self.model_tester.image_size if isinstance( self.model_tester.image_size, collections.abc.Iterable) else (self.model_tester.image_size, self.model_tester.image_size)) patch_size = (self.model_tester.patch_size if isinstance( self.model_tester.patch_size, collections.abc.Iterable) else (self.model_tester.patch_size, self.model_tester.patch_size)) num_patches = (image_size[1] // patch_size[1]) * (image_size[0] // patch_size[0]) seq_length = num_patches + 1 self.assertListEqual( list(hidden_states[0].shape[-2:]), [seq_length, self.model_tester.hidden_size], ) config, inputs_dict = self.model_tester.prepare_config_and_inputs_for_common( ) for model_class in self.all_model_classes: inputs_dict["output_hidden_states"] = True check_hidden_states_output(inputs_dict, config, model_class) # check that output_hidden_states also work using config del inputs_dict["output_hidden_states"] config.output_hidden_states = True check_hidden_states_output(inputs_dict, config, model_class) # Overriding this method since the base method won't be compatible with Data2VecVision. def test_keras_fit(self): config, _ = self.model_tester.prepare_config_and_inputs_for_common() for model_class in self.all_model_classes: # Since `TFData2VecVisionModel` cannot operate with the default `fit()` method. 
if model_class.__name__ != "TFData2VecVisionModel": model = model_class(config) if getattr(model, "hf_compute_loss", None): # Test that model correctly compute the loss with kwargs _, prepared_for_class = self.model_tester.prepare_config_and_inputs_for_keras_fit( ) label_names = {"labels"} self.assertGreater(len(label_names), 0, msg="No matching label names found!") labels = { key: val for key, val in prepared_for_class.items() if key in label_names } inputs_minus_labels = { key: val for key, val in prepared_for_class.items() if key not in label_names } self.assertGreater(len(inputs_minus_labels), 0) model.compile(optimizer=tf.keras.optimizers.SGD(0.0), run_eagerly=True) # Make sure the model fits without crashing regardless of where we pass the labels history1 = model.fit( prepared_for_class, validation_data=prepared_for_class, steps_per_epoch=1, validation_steps=1, shuffle=False, ) val_loss1 = history1.history["val_loss"][0] history2 = model.fit( inputs_minus_labels, labels, validation_data=(inputs_minus_labels, labels), steps_per_epoch=1, validation_steps=1, shuffle=False, ) val_loss2 = history2.history["val_loss"][0] self.assertTrue( np.allclose(val_loss1, val_loss2, atol=1e-2, rtol=1e-3)) def check_pt_tf_outputs(self, tf_outputs, pt_outputs, model_class, tol=2e-4, name="outputs", attributes=None): # We override with a slightly higher tol value, as semseg models tend to diverge a bit more super().check_pt_tf_outputs(tf_outputs, pt_outputs, model_class, tol, name, attributes) # Overriding this method since the base method won't be compatible with Data2VecVision. def test_loss_computation(self): config, inputs_dict = self.model_tester.prepare_config_and_inputs_for_common( ) for model_class in self.all_model_classes: # Since `TFData2VecVisionModel` won't have labels against which we # could compute loss. 
if model_class.__name__ != "TFData2VecVisionModel": model = model_class(config) if getattr(model, "hf_compute_loss", None): # The number of elements in the loss should be the same as the number of elements in the label _, prepared_for_class = self.model_tester.prepare_config_and_inputs_for_keras_fit( ) added_label = prepared_for_class[sorted( list(prepared_for_class.keys() - inputs_dict.keys()), reverse=True)[0]] loss_size = tf.size(added_label) # Test that model correctly compute the loss with kwargs possible_input_names = { "input_ids", "pixel_values", "input_features" } input_name = possible_input_names.intersection( set(prepared_for_class)).pop() model_input = prepared_for_class.pop(input_name) loss = model(model_input, **prepared_for_class)[0] self.assertEqual(loss.shape, [loss_size]) # Test that model correctly compute the loss with a dict _, prepared_for_class = self.model_tester.prepare_config_and_inputs_for_keras_fit( ) loss = model(**prepared_for_class)[0] self.assertEqual(loss.shape, [loss_size]) # Test that model correctly compute the loss with a tuple label_keys = prepared_for_class.keys() - inputs_dict.keys() signature = inspect.signature(model.call).parameters signature_names = list(signature.keys()) # Create a dictionary holding the location of the tensors in the tuple tuple_index_mapping = {0: input_name} for label_key in label_keys: label_key_index = signature_names.index(label_key) tuple_index_mapping[label_key_index] = label_key sorted_tuple_index_mapping = sorted( tuple_index_mapping.items()) # Initialize a list with their default values, update the values and convert to a tuple list_input = [] for name in signature_names: if name != "kwargs": list_input.append(signature[name].default) for index, value in sorted_tuple_index_mapping: list_input[index] = prepared_for_class[value] tuple_input = tuple(list_input) # Send to model loss = model(tuple_input[:-1])[0] self.assertEqual(loss.shape, [loss_size]) def test_for_image_classification(self): config_and_inputs = self.model_tester.prepare_config_and_inputs() self.model_tester.create_and_check_for_image_classification( *config_and_inputs) @slow def test_model_from_pretrained(self): for model_name in TF_DATA2VEC_VISION_PRETRAINED_MODEL_ARCHIVE_LIST[:1]: model = TFData2VecVisionModel.from_pretrained(model_name) self.assertIsNotNone(model)
def get_features( self, tokenizer, max_length=None, pad_on_left=False, pad_token=0, mask_padding_with_zero=True, return_tensors=None, ): """ Converts the examples into a list of ``InputFeatures`` Args: tokenizer: Instance of a tokenizer that will tokenize the examples max_length: Maximum example length pad_on_left: If set to ``True``, the examples will be padded on the left rather than on the right (default) pad_token: Padding token mask_padding_with_zero: If set to ``True``, the attention mask will be filled by ``1`` for actual values and by ``0`` for padded values. If set to ``False``, inverts it (``1`` for padded values, ``0`` for actual values) return_tensors: If set to ``'tf'`` or ``'pt'``, returns a ``tf.data.Dataset`` or a ``torch.utils.data.TensorDataset`` instead of a plain list of features Returns: If ``return_tensors`` is ``None``, a list of task-specific ``InputFeatures`` which can be fed to the model; if ``'tf'``, a ``tf.data.Dataset``; if ``'pt'``, a ``torch.utils.data.TensorDataset``. """ if max_length is None: max_length = tokenizer.max_len label_map = {label: i for i, label in enumerate(self.labels)} all_input_ids = [] for (ex_index, example) in enumerate(self.examples): if ex_index % 10000 == 0: logger.info("Tokenizing example %d", ex_index) input_ids = tokenizer.encode( example.text_a, add_special_tokens=True, max_length=min(max_length, tokenizer.max_len), ) all_input_ids.append(input_ids) batch_length = max(len(input_ids) for input_ids in all_input_ids) features = [] for (ex_index, (input_ids, example)) in enumerate(zip(all_input_ids, self.examples)): if ex_index % 10000 == 0: logger.info("Writing example %d/%d" % (ex_index, len(self.examples))) # The mask has 1 for real tokens and 0 for padding tokens. Only real # tokens are attended to. attention_mask = [1 if mask_padding_with_zero else 0 ] * len(input_ids) # Zero-pad up to the sequence length.
padding_length = batch_length - len(input_ids) if pad_on_left: input_ids = ([pad_token] * padding_length) + input_ids attention_mask = ([0 if mask_padding_with_zero else 1] * padding_length) + attention_mask else: input_ids = input_ids + ([pad_token] * padding_length) attention_mask = attention_mask + ( [0 if mask_padding_with_zero else 1] * padding_length) assert len( input_ids ) == batch_length, "Error with input length {} vs {}".format( len(input_ids), batch_length) assert len( attention_mask ) == batch_length, "Error with input length {} vs {}".format( len(attention_mask), batch_length) if self.mode == "classification": label = label_map[example.label] elif self.mode == "regression": label = float(example.label) else: raise ValueError(self.mode) if ex_index < 5 and self.verbose: logger.info("*** Example ***") logger.info("guid: %s" % (example.guid)) logger.info("input_ids: %s" % " ".join([str(x) for x in input_ids])) logger.info("attention_mask: %s" % " ".join([str(x) for x in attention_mask])) logger.info("label: %s (id = %d)" % (example.label, label)) features.append( InputFeatures(input_ids=input_ids, attention_mask=attention_mask, label=label)) if return_tensors is None: return features elif return_tensors == "tf": if not is_tf_available(): raise RuntimeError( "return_tensors set to 'tf' but TensorFlow 2.0 can't be imported" ) import tensorflow as tf def gen(): for ex in features: yield ({ "input_ids": ex.input_ids, "attention_mask": ex.attention_mask }, ex.label) dataset = tf.data.Dataset.from_generator( gen, ({ "input_ids": tf.int32, "attention_mask": tf.int32 }, tf.int64), ({ "input_ids": tf.TensorShape([None]), "attention_mask": tf.TensorShape([None]) }, tf.TensorShape([])), ) return dataset elif return_tensors == "pt": if not is_torch_available(): raise RuntimeError( "return_tensors set to 'pt' but PyTorch can't be imported") import torch from torch.utils.data import TensorDataset all_input_ids = torch.tensor([f.input_ids for f in features], dtype=torch.long) all_attention_mask = torch.tensor( [f.attention_mask for f in features], dtype=torch.long) if self.mode == "classification": all_labels = torch.tensor([f.label for f in features], dtype=torch.long) elif self.mode == "regression": all_labels = torch.tensor([f.label for f in features], dtype=torch.float) dataset = TensorDataset(all_input_ids, all_attention_mask, all_labels) return dataset else: raise ValueError("return_tensors should be one of 'tf' or 'pt'")
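A hypothetical invocation of the method above, assuming `processor` is an instance of the surrounding class with `examples`, `labels`, `mode`, and `verbose` already populated; the tokenizer checkpoint is a placeholder, not a value from the original code.

from transformers import AutoTokenizer

tokenizer = AutoTokenizer.from_pretrained("bert-base-uncased")  # placeholder checkpoint
train_dataset = processor.get_features(tokenizer, max_length=128, return_tensors="pt")
features_only = processor.get_features(tokenizer, max_length=128)  # plain list of InputFeatures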
def get_features( self, max_seq_length, tokenizer, return_tensors, cls_token_at_end=False, cls_token="[CLS]", cls_token_segment_id=0, sep_token="[SEP]", sep_token_extra=False, pad_on_left=False, pad_token=0, pad_token_segment_id=0, pad_token_label_id=-100, sequence_a_segment_id=0, mask_padding_with_zero=True, ): """ Loads a data file into a list of ``InputFeatures`` `cls_token_at_end` defines the location of the CLS token: - False (Default, BERT/XLM pattern): [CLS] + A + [SEP] + B + [SEP] - True (XLNet/GPT pattern): A + [SEP] + B + [SEP] + [CLS] `cls_token_segment_id` defines the segment id associated to the CLS token (0 for BERT, 2 for XLNet) """ label_map = {label: i for i, label in enumerate(self.labels)} features = [] # [[input_ids, input_mask, segment_ids, label_id]] for (ex_index, example) in enumerate(self.examples): if ex_index % 10000 == 0: logger.info("Writing example %d of %d", ex_index, len(self.examples)) tokens = [] label_ids = [] for word, label in zip(example.text_a, example.label): word_tokens = tokenizer.tokenize(word) tokens.extend(word_tokens) # Use the real label id for the first token of the word, and padding ids for the remaining tokens label_ids.extend( [label_map[label]] + [pad_token_label_id] * (len(word_tokens) - 1) ) # in some languages, like German, a single word may be tokenized into several subwords # Account for [CLS] and [SEP] with "- 2" and with "- 3" for RoBERTa. special_tokens_count = 3 if sep_token_extra else 2 if len(tokens) > max_seq_length - special_tokens_count: tokens = tokens[:(max_seq_length - special_tokens_count)] label_ids = label_ids[:(max_seq_length - special_tokens_count)] tokens += [sep_token] label_ids += [pad_token_label_id] if sep_token_extra: # roberta uses an extra separator b/w pairs of sentences tokens += [sep_token] label_ids += [pad_token_label_id] segment_ids = [pad_token_segment_id] * len(tokens) if cls_token_at_end: tokens += [cls_token] label_ids += [pad_token_label_id] segment_ids += [cls_token_segment_id] else: tokens = [cls_token] + tokens label_ids = [pad_token_label_id] + label_ids segment_ids = [cls_token_segment_id] + segment_ids input_ids = tokenizer.convert_tokens_to_ids( tokens) # the input is a list of tokens and the return value is also a list (of ids) # The mask has 1 for real tokens and 0 for padding tokens. Only real # tokens are attended to. input_mask = [1 if mask_padding_with_zero else 0] * len(input_ids) # Zero-pad up to the sequence length.
padding_length = max_seq_length - len(input_ids) if pad_on_left: input_ids = ([pad_token] * padding_length) + input_ids input_mask = ([0 if mask_padding_with_zero else 1] * padding_length) + input_mask segment_ids = ([pad_token_segment_id] * padding_length) + segment_ids label_ids = ([pad_token_label_id] * padding_length) + label_ids else: input_ids += [pad_token] * padding_length input_mask += [0 if mask_padding_with_zero else 1 ] * padding_length segment_ids += [pad_token_segment_id] * padding_length label_ids += [pad_token_label_id] * padding_length # make sure input_ids, input_mask, segment_ids, and label_ids all have the same length assert len(input_ids) == max_seq_length assert len(input_mask) == max_seq_length assert len(segment_ids) == max_seq_length assert len(label_ids) == max_seq_length if ex_index < 5: logger.info("*** Example ***") logger.info("guid: %s", example.guid) logger.info("tokens: %s", " ".join([str(x) for x in tokens])) logger.info("input_ids: %s", " ".join([str(x) for x in input_ids])) logger.info("input_mask: %s", " ".join([str(x) for x in input_mask])) logger.info("segment_ids: %s", " ".join([str(x) for x in segment_ids])) logger.info("label_ids: %s", " ".join([str(x) for x in label_ids])) features.append( InputFeatures(input_ids=input_ids, attention_mask=input_mask, token_type_ids=segment_ids, label=label_ids) ) # note: segment_ids has the [CLS] segment id in front, but we don't use it right now # features to dataset if return_tensors is None: return features elif return_tensors == "tf": if not is_tf_available(): raise RuntimeError( "return_tensors set to 'tf' but TensorFlow 2.0 can't be imported" ) import tensorflow as tf def gen(): for ex in features: yield ({ "input_ids": ex.input_ids, "attention_mask": ex.attention_mask }, ex.label) dataset = tf.data.Dataset.from_generator( gen, ({ "input_ids": tf.int32, "attention_mask": tf.int32 }, tf.int64), ({ "input_ids": tf.TensorShape([None]), "attention_mask": tf.TensorShape([None]) }, tf.TensorShape([None])), ) return dataset elif return_tensors == "pt": if not is_torch_available(): raise RuntimeError( "return_tensors set to 'pt' but PyTorch can't be imported") import torch from torch.utils.data import TensorDataset all_input_ids = torch.tensor([f.input_ids for f in features], dtype=torch.long) all_attention_mask = torch.tensor( [f.attention_mask for f in features], dtype=torch.long) all_token_type_ids = torch.tensor( [f.token_type_ids for f in features], dtype=torch.long) if self.mode == "classification": all_labels = torch.tensor([f.label for f in features], dtype=torch.long) elif self.mode == "regression": all_labels = torch.tensor([f.label for f in features], dtype=torch.float) dataset = TensorDataset(all_input_ids, all_attention_mask, all_token_type_ids, all_labels) return dataset else: raise ValueError("return_tensors should be one of 'tf' or 'pt'")
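A worked example of the right-padding branch above: with `max_seq_length=8` and five real tokens, three positions are padded and masked out, and their labels get `pad_token_label_id`. The token and label ids are illustrative, not taken from a real tokenizer.

pad_token, pad_token_label_id, max_seq_length = 0, -100, 8
input_ids = [101, 7592, 2088, 999, 102]   # illustrative ids for "[CLS] hello world ! [SEP]"
input_mask = [1, 1, 1, 1, 1]
label_ids = [-100, 3, 7, 1, -100]          # illustrative per-token label ids

padding_length = max_seq_length - len(input_ids)     # 3
input_ids += [pad_token] * padding_length             # [..., 0, 0, 0]
input_mask += [0] * padding_length                     # padded positions are not attended to
label_ids += [pad_token_label_id] * padding_length    # padded positions are ignored by the loss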