Beispiel #1
0
    def __new__(mcs, name, bases, dct):
        """Create the test class, adding one generated test per model / fast tokenizer pair.

        The ``model_mapping`` entry of ``dct`` yields tests named ``test_pt_*`` and
        the ``tf_model_mapping`` entry yields tests named ``test_tf_*``. For each
        ``configuration -> model architecture`` item, one test is added for every
        tokenizer class registered for that configuration in ``TOKENIZER_MAPPING``
        whose class name ends with ``"Fast"``.

        Args:
            mcs: The metaclass being instantiated.
            name: Name of the class being created.
            bases: Base classes of the class being created.
            dct: Class namespace; updated in place with the generated tests.

        Returns:
            The class built by ``type.__new__`` from the updated namespace.
        """

        def limit_tokenizer_length(tokenizer, config):
            # Not every configuration defines `max_position_embeddings`; leave the
            # tokenizer untouched in that case instead of raising AttributeError.
            if hasattr(config, "max_position_embeddings"):
                tokenizer.model_max_length = config.max_position_embeddings

        def gen_test(ModelClass, checkpoint, tiny_config, tokenizer_class):
            # `tokenizer_class` only distinguishes the generated test names; the
            # tokenizer itself is always built from `checkpoint`.
            @skipIf(tiny_config is None, "TinyConfig does not exist")
            @skipIf(checkpoint is None, "checkpoint does not exist")
            def test(self):
                model = ModelClass(tiny_config)
                if hasattr(model, "eval"):
                    model = model.eval()
                try:
                    tokenizer = get_tiny_tokenizer_from_checkpoint(checkpoint)
                    limit_tokenizer_length(tokenizer, model.config)
                except (KeyboardInterrupt, SystemExit):
                    # A user interrupt or interpreter exit must never be turned
                    # into a silent tokenizer fallback.
                    raise
                # Rust panic exceptions are NOT Exception subclasses, hence BaseException.
                # Some test tokenizers contain broken vocabs or a custom PreTokenizer, so we
                # provide a default tokenizer and hope for the best.
                except BaseException:
                    logger.warning(
                        f"Tokenizer cannot be created from checkpoint {checkpoint}"
                    )
                    tokenizer = get_tiny_tokenizer_from_checkpoint("gpt2")
                    limit_tokenizer_length(tokenizer, model.config)
                self.run_pipeline_test(model, tokenizer)

            return test

        # Both frameworks are registered the same way; only the namespace key and
        # the test-name prefix differ.
        for prefix, key in (("pt", "model_mapping"), ("tf", "tf_model_mapping")):
            mapping = dct.get(key) or {}
            for configuration, model_architectures in mapping.items():
                # A configuration may map to a single architecture or a tuple of them.
                if not isinstance(model_architectures, tuple):
                    model_architectures = (model_architectures,)

                for model_architecture in model_architectures:
                    checkpoint = get_checkpoint_from_architecture(model_architecture)
                    tiny_config = get_tiny_config_from_class(configuration)
                    tokenizer_classes = TOKENIZER_MAPPING.get(configuration, [])
                    for tokenizer_class in tokenizer_classes:
                        # Only tokenizer classes whose name ends with "Fast" get a test.
                        if tokenizer_class is None:
                            continue
                        if not tokenizer_class.__name__.endswith("Fast"):
                            continue
                        test_name = f"test_{prefix}_{configuration.__name__}_{model_architecture.__name__}_{tokenizer_class.__name__}"
                        dct[test_name] = gen_test(
                            model_architecture,
                            checkpoint,
                            tiny_config,
                            tokenizer_class,
                        )

        return type.__new__(mcs, name, bases, dct)
Beispiel #2
0
    def __new__(mcs, name, bases, dct):
        """Create the test class, adding one generated test per model / fast tokenizer pair.

        The ``model_mapping`` entry of ``dct`` yields tests named ``test_pt_*`` and
        the ``tf_model_mapping`` entry yields tests named ``test_tf_*``. Each
        configuration may map to one model architecture or a tuple of them. One
        test is added per architecture and per tokenizer class registered for the
        configuration in ``TOKENIZER_MAPPING`` whose class name ends with ``"Fast"``.

        Args:
            mcs: The metaclass being instantiated.
            name: Name of the class being created.
            bases: Base classes of the class being created.
            dct: Class namespace; updated in place with the generated tests.

        Returns:
            The class built by ``type.__new__`` from the updated namespace.
        """

        def gen_test(ModelClass, checkpoint, tiny_config, tokenizer_class):
            # `tokenizer_class` only distinguishes the generated test names; the
            # tokenizer itself is always built from `checkpoint`.
            @skipIf(tiny_config is None, "TinyConfig does not exist")
            @skipIf(checkpoint is None, "checkpoint does not exist")
            def test(self):
                model = ModelClass(tiny_config)
                if hasattr(model, "eval"):
                    model = model.eval()
                try:
                    tokenizer = get_tiny_tokenizer_from_checkpoint(checkpoint)
                    if hasattr(model.config, "max_position_embeddings"):
                        tokenizer.model_max_length = model.config.max_position_embeddings
                except (KeyboardInterrupt, SystemExit):
                    # A user interrupt or interpreter exit must not be reported
                    # as a skipped test.
                    raise
                # Rust panic exceptions are NOT Exception subclasses, hence BaseException.
                # Some test tokenizers contain broken vocabs or a custom PreTokenizer;
                # those models are skipped.
                except BaseException:
                    self.skipTest(
                        f"Ignoring {ModelClass}, cannot create a simple tokenizer"
                    )
                self.run_pipeline_test(model, tokenizer)

            return test

        for prefix, key in (("pt", "model_mapping"), ("tf", "tf_model_mapping")):
            mapping = dct.get(key) or {}
            for configuration, model_architectures in mapping.items():
                if not isinstance(model_architectures, tuple):
                    model_architectures = (model_architectures,)

                for model_architecture in model_architectures:
                    checkpoint = get_checkpoint_from_architecture(model_architecture)
                    tiny_config = get_tiny_config_from_class(configuration)
                    tokenizer_classes = TOKENIZER_MAPPING.get(configuration, [])
                    for tokenizer_class in tokenizer_classes:
                        # Only tokenizer classes whose name ends with "Fast" get a test.
                        if tokenizer_class is None:
                            continue
                        if not tokenizer_class.__name__.endswith("Fast"):
                            continue
                        test_name = f"test_{prefix}_{configuration.__name__}_{model_architecture.__name__}_{tokenizer_class.__name__}"
                        dct[test_name] = gen_test(
                            model_architecture,
                            checkpoint,
                            tiny_config,
                            tokenizer_class,
                        )

        return type.__new__(mcs, name, bases, dct)
Beispiel #3
0
    def __new__(mcs, name, bases, dct):
        """Create the test class, adding generated pipeline tests to its namespace.

        The ``model_mapping`` entry of ``dct`` yields tests named ``test_pt_*`` and
        the ``tf_model_mapping`` entry yields tests named ``test_tf_*``. For every
        model architecture of every configuration, one test is added per tokenizer
        class found in ``TOKENIZER_MAPPING`` (or a single ``None`` placeholder when
        there is none), provided a tokenizer class or a feature extractor class
        exists for the configuration.

        ``test_small_model_pt`` and ``test_small_model_tf`` are also forced to
        exist: when the namespace does not define them, a placeholder raising
        ``NotImplementedError`` is installed.

        Args:
            mcs: The metaclass being instantiated.
            name: Name of the class being created.
            bases: Base classes of the class being created.
            dct: Class namespace; updated in place with the generated tests.

        Returns:
            The class built by ``type.__new__`` from the updated namespace.
        """

        def gen_test(ModelClass, checkpoint, tiny_config, tokenizer_class,
                     feature_extractor_class):
            # `feature_extractor_class` is not read here; the feature extractor is
            # built from `checkpoint` and `tiny_config` inside the test.
            @skipIf(tiny_config is None, "TinyConfig does not exist")
            @skipIf(checkpoint is None, "checkpoint does not exist")
            def test(self):
                if ModelClass.__name__.endswith("ForCausalLM"):
                    tiny_config.is_encoder_decoder = False
                    if hasattr(tiny_config, "encoder_no_repeat_ngram_size"):
                        # Specific to blenderbot, which supports both decoder-only
                        # and encoder/decoder, but whose test config only reflects
                        # the encoder/decoder arch.
                        tiny_config.encoder_no_repeat_ngram_size = 0
                if ModelClass.__name__.endswith("WithLMHead"):
                    tiny_config.is_decoder = True
                try:
                    model = ModelClass(tiny_config)
                except ImportError as e:
                    self.skipTest(
                        f"Cannot run with {tiny_config} as the model requires a library that isn't installed: {e}"
                    )
                if hasattr(model, "eval"):
                    model = model.eval()
                if tokenizer_class is not None:
                    try:
                        tokenizer = get_tiny_tokenizer_from_checkpoint(
                            checkpoint)
                        if isinstance(model.config,
                                      (RobertaConfig, IBertConfig)):
                            tokenizer.model_max_length = model.config.max_position_embeddings - 2
                        # XLNet actually defines it as -1, hence the `> 0` check.
                        elif (hasattr(model.config, "max_position_embeddings")
                              and model.config.max_position_embeddings > 0):
                            tokenizer.model_max_length = model.config.max_position_embeddings
                    except (KeyboardInterrupt, SystemExit):
                        # A user interrupt or interpreter exit must not be
                        # reported as a skipped test.
                        raise
                    # Rust panic exceptions are NOT Exception subclasses, hence BaseException.
                    # Some test tokenizers contain broken vocabs or a custom PreTokenizer;
                    # those models are skipped.
                    except BaseException:
                        self.skipTest(
                            f"Ignoring {ModelClass}, cannot create a simple tokenizer"
                        )
                else:
                    tokenizer = None
                feature_extractor = get_tiny_feature_extractor_from_checkpoint(
                    checkpoint, tiny_config)
                pipeline, examples = self.get_test_pipeline(
                    model, tokenizer, feature_extractor)
                if pipeline is None:
                    # The test can disable itself, but it should be very marginal
                    # Concerns: Wav2Vec2ForCTC without tokenizer test (FastTokenizer don't exist)
                    return
                self.run_pipeline_test(pipeline, examples)

                def run_batch_test(pipeline, examples):
                    if pipeline.tokenizer is not None and pipeline.tokenizer.pad_token_id is None:
                        return  # No batching for this and it's OK

                    # 10 examples with batch size 4 means there needs to be an unfinished
                    # batch, which is important for the unbatcher.
                    # Each example is deep-copied because `Conversation` objects are stateful.
                    dataset = [
                        copy.deepcopy(random.choice(examples))
                        for _ in range(10)
                    ]

                    # Consume every output; only completing the iteration matters.
                    for _ in pipeline(dataset, batch_size=4):
                        pass

                run_batch_test(pipeline, examples)

            return test

        for prefix, key in (("pt", "model_mapping"), ("tf", "tf_model_mapping")):
            mapping = dct.get(key) or {}
            for configuration, model_architectures in mapping.items():
                if not isinstance(model_architectures, tuple):
                    model_architectures = (model_architectures,)

                for model_architecture in model_architectures:
                    checkpoint = get_checkpoint_from_architecture(model_architecture)
                    tiny_config = get_tiny_config_from_class(configuration)
                    tokenizer_classes = TOKENIZER_MAPPING.get(configuration, [])
                    feature_extractor_class = FEATURE_EXTRACTOR_MAPPING.get(
                        configuration, None)
                    feature_extractor_name = (
                        feature_extractor_class.__name__
                        if feature_extractor_class else "nofeature_extractor")
                    if not tokenizer_classes:
                        # We need to test even if there are no tokenizers.
                        tokenizer_classes = [None]
                    for tokenizer_class in tokenizer_classes:
                        if tokenizer_class is None and feature_extractor_class is None:
                            # Neither a tokenizer nor a feature extractor: no test.
                            continue

                        if tokenizer_class is not None:
                            tokenizer_name = tokenizer_class.__name__
                        else:
                            tokenizer_name = "notokenizer"

                        test_name = f"test_{prefix}_{configuration.__name__}_{model_architecture.__name__}_{tokenizer_name}_{feature_extractor_name}"
                        dct[test_name] = gen_test(
                            model_architecture,
                            checkpoint,
                            tiny_config,
                            tokenizer_class,
                            feature_extractor_class,
                        )

        @abstractmethod
        def inner(self):
            raise NotImplementedError("Not implemented test")

        # Force these 2 methods to exist; definitions already in the namespace win.
        dct.setdefault("test_small_model_pt", inner)
        dct.setdefault("test_small_model_tf", inner)

        return type.__new__(mcs, name, bases, dct)
    def __new__(mcs, name, bases, dct):
        """Create the test class, adding generated pipeline tests to its namespace.

        The ``model_mapping`` entry of ``dct`` yields tests named ``test_pt_*`` and
        the ``tf_model_mapping`` entry yields tests named ``test_tf_*``. For every
        model architecture of every configuration, one test is added per tokenizer
        class found in ``TOKENIZER_MAPPING`` (or a single ``None`` placeholder when
        there is none), provided a tokenizer class or a feature extractor class
        exists for the configuration.

        ``test_small_model_pt`` and ``test_small_model_tf`` are also forced to
        exist: when the namespace does not define them, a placeholder raising
        ``NotImplementedError`` is installed.

        Args:
            mcs: The metaclass being instantiated.
            name: Name of the class being created.
            bases: Base classes of the class being created.
            dct: Class namespace; updated in place with the generated tests.

        Returns:
            The class built by ``type.__new__`` from the updated namespace.
        """

        def gen_test(ModelClass, checkpoint, tiny_config, tokenizer_class,
                     feature_extractor_class):
            # `feature_extractor_class` is not read here; the feature extractor is
            # built from `checkpoint` and `tiny_config` inside the test.
            @skipIf(tiny_config is None, "TinyConfig does not exist")
            @skipIf(checkpoint is None, "checkpoint does not exist")
            def test(self):
                if ModelClass.__name__.endswith("ForCausalLM"):
                    tiny_config.is_encoder_decoder = False
                if ModelClass.__name__.endswith("WithLMHead"):
                    tiny_config.is_decoder = True
                model = ModelClass(tiny_config)
                if hasattr(model, "eval"):
                    model = model.eval()
                if tokenizer_class is not None:
                    try:
                        tokenizer = get_tiny_tokenizer_from_checkpoint(
                            checkpoint)
                        # XLNet actually defines it as -1, hence the `> 0` check.
                        if (hasattr(model.config, "max_position_embeddings")
                                and model.config.max_position_embeddings > 0):
                            tokenizer.model_max_length = model.config.max_position_embeddings
                    except (KeyboardInterrupt, SystemExit):
                        # A user interrupt or interpreter exit must not be
                        # reported as a skipped test.
                        raise
                    # Rust panic exceptions are NOT Exception subclasses, hence BaseException.
                    # Some test tokenizers contain broken vocabs or a custom PreTokenizer;
                    # those models are skipped.
                    except BaseException:
                        self.skipTest(
                            f"Ignoring {ModelClass}, cannot create a simple tokenizer"
                        )
                else:
                    tokenizer = None
                feature_extractor = get_tiny_feature_extractor_from_checkpoint(
                    checkpoint, tiny_config)
                self.run_pipeline_test(model, tokenizer, feature_extractor)

            return test

        for prefix, key in (("pt", "model_mapping"), ("tf", "tf_model_mapping")):
            mapping = dct.get(key) or {}
            for configuration, model_architectures in mapping.items():
                if not isinstance(model_architectures, tuple):
                    model_architectures = (model_architectures,)

                for model_architecture in model_architectures:
                    checkpoint = get_checkpoint_from_architecture(model_architecture)
                    tiny_config = get_tiny_config_from_class(configuration)
                    tokenizer_classes = TOKENIZER_MAPPING.get(configuration, [])
                    feature_extractor_class = FEATURE_EXTRACTOR_MAPPING.get(
                        configuration, None)
                    feature_extractor_name = (
                        feature_extractor_class.__name__
                        if feature_extractor_class else "nofeature_extractor")
                    if not tokenizer_classes:
                        # We need to test even if there are no tokenizers.
                        tokenizer_classes = [None]
                    for tokenizer_class in tokenizer_classes:
                        if tokenizer_class is None and feature_extractor_class is None:
                            # Neither a tokenizer nor a feature extractor: no test.
                            continue

                        if tokenizer_class is not None:
                            tokenizer_name = tokenizer_class.__name__
                        else:
                            tokenizer_name = "notokenizer"

                        test_name = f"test_{prefix}_{configuration.__name__}_{model_architecture.__name__}_{tokenizer_name}_{feature_extractor_name}"
                        dct[test_name] = gen_test(
                            model_architecture,
                            checkpoint,
                            tiny_config,
                            tokenizer_class,
                            feature_extractor_class,
                        )

        @abstractmethod
        def inner(self):
            raise NotImplementedError("Not implemented test")

        # Force these 2 methods to exist; definitions already in the namespace win.
        dct.setdefault("test_small_model_pt", inner)
        dct.setdefault("test_small_model_tf", inner)

        return type.__new__(mcs, name, bases, dct)