Example #1
    def test_model_card_from_and_save_pretrained(self):
        model_card_first = ModelCard.from_dict(self.inputs_dict)

        with tempfile.TemporaryDirectory() as tmpdirname:
            model_card_first.save_pretrained(tmpdirname)
            model_card_second = ModelCard.from_pretrained(tmpdirname)

        self.assertEqual(model_card_second.to_dict(), model_card_first.to_dict())
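The test above exercises the full save/load round trip: save_pretrained is expected to write a modelcard.json file into the directory (the same filename Example #2 uses explicitly), and from_pretrained reads it back. A minimal standalone sketch of the same round trip; the field values below are placeholders, not real model metadata:

import tempfile

from transformers import ModelCard

# Illustrative model card fields; any JSON-serializable dict accepted by ModelCard works here.
inputs_dict = {"model_details": {"name": "demo-model"}, "intended_use": {"primary_uses": "demo"}}

card = ModelCard.from_dict(inputs_dict)
with tempfile.TemporaryDirectory() as tmpdir:
    card.save_pretrained(tmpdir)          # expected to write <tmpdir>/modelcard.json
    reloaded = ModelCard.from_pretrained(tmpdir)

assert reloaded.to_dict() == card.to_dict()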
Example #2
    def test_model_card_to_json_file(self):
        model_card_first = ModelCard.from_dict(self.inputs_dict)

        with tempfile.TemporaryDirectory() as tmpdirname:
            filename = os.path.join(tmpdirname, "modelcard.json")
            model_card_first.to_json_file(filename)
            model_card_second = ModelCard.from_json_file(filename)

        self.assertEqual(model_card_second.to_dict(), model_card_first.to_dict())
Example #3
    def test_model_card_common_properties(self):
        modelcard = ModelCard.from_dict(self.inputs_dict)
        self.assertTrue(hasattr(modelcard, "model_details"))
        self.assertTrue(hasattr(modelcard, "intended_use"))
        self.assertTrue(hasattr(modelcard, "factors"))
        self.assertTrue(hasattr(modelcard, "metrics"))
        self.assertTrue(hasattr(modelcard, "evaluation_data"))
        self.assertTrue(hasattr(modelcard, "training_data"))
        self.assertTrue(hasattr(modelcard, "quantitative_analyses"))
        self.assertTrue(hasattr(modelcard, "ethical_considerations"))
        self.assertTrue(hasattr(modelcard, "caveats_and_recommendations"))
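The nine attributes checked here are the standard model card sections. A hedged sketch of an inputs dict that covers every section the test asserts on; the keys come from the assertions above, while the values are placeholders:

from transformers import ModelCard

# Keys taken from the hasattr checks above; values are placeholders only.
inputs_dict = {
    "model_details": {"name": "demo-model", "version": "0.1"},
    "intended_use": {"primary_uses": "demonstration"},
    "factors": {"relevant_factors": []},
    "metrics": {"model_performance_measures": []},
    "evaluation_data": {"dataset": "n/a"},
    "training_data": {"dataset": "n/a"},
    "quantitative_analyses": {},
    "ethical_considerations": "n/a",
    "caveats_and_recommendations": "n/a",
}

modelcard = ModelCard.from_dict(inputs_dict)
assert all(hasattr(modelcard, section) for section in inputs_dict)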
Example #4
    def test_model_card_common_properties(self):
        modelcard = ModelCard.from_dict(self.inputs_dict)
        self.assertTrue(hasattr(modelcard, 'model_details'))
        self.assertTrue(hasattr(modelcard, 'intended_use'))
        self.assertTrue(hasattr(modelcard, 'factors'))
        self.assertTrue(hasattr(modelcard, 'metrics'))
        self.assertTrue(hasattr(modelcard, 'evaluation_data'))
        self.assertTrue(hasattr(modelcard, 'training_data'))
        self.assertTrue(hasattr(modelcard, 'quantitative_analyses'))
        self.assertTrue(hasattr(modelcard, 'ethical_considerations'))
        self.assertTrue(hasattr(modelcard, 'caveats_and_recommendations'))
Example #5
def pipeline(task: str,
             model: Optional = None,
             config: Optional[Union[str, PretrainedConfig]] = None,
             tokenizer: Optional[Union[str, PreTrainedTokenizer]] = None,
             modelcard: Optional[Union[str, ModelCard]] = None,
             device=torch.device("cpu"),
             **kwargs) -> Pipeline:
    """
    Utility factory method to build a pipeline.
    Pipeline are made of:
        A Tokenizer instance in charge of mapping raw textual input to token
        A Model instance
        Some (optional) post processing for enhancing model's output
    Examples:
        pipeline('sentiment-analysis')
    """
    # Register all the supported task here
    SUPPORTED_TASKS = {
        "sentiment-analysis": {
            "impl": TextClassificationPipelineMod,
            "pt":
            AutoModelForSequenceClassification,  # if is_torch_available() else None,
            "default": {
                "model": {
                    "pt": "distilbert-base-uncased-finetuned-sst-2-english",
                },
                "config": "distilbert-base-uncased-finetuned-sst-2-english",
                "tokenizer": "distilbert-base-uncased",
            },
        },
    }

    # Retrieve the task
    if task not in SUPPORTED_TASKS:
        raise KeyError("Unknown task {}, available tasks are {}".format(
            task, list(SUPPORTED_TASKS.keys())))

    framework = "pt"  #get_framework(model)

    targeted_task = SUPPORTED_TASKS[task]
    task, model_class = targeted_task["impl"], targeted_task[framework]

    # Use default model/config/tokenizer for the task if no model is provided
    if model is None:
        models, config, tokenizer = tuple(targeted_task["default"].values())
        model = models[framework]

    # Try to infer tokenizer from model or config name (if provided as str)
    if tokenizer is None:
        if isinstance(model,
                      str) and model in ALL_PRETRAINED_CONFIG_ARCHIVE_MAP:
            tokenizer = model
        elif isinstance(config,
                        str) and config in ALL_PRETRAINED_CONFIG_ARCHIVE_MAP:
            tokenizer = config
        else:
            # Impossible to guess the right tokenizer here
            raise Exception(
                "Impossible to guess which tokenizer to use. "
                "Please provide a PreTrainedTokenizer class or a path/url/shortcut name to a pretrained tokenizer."
            )

    # Try to infer modelcard from model or config name (if provided as str)
    if modelcard is None:
        # Try to fallback on one of the provided string for model or config (will replace the suffix)
        if isinstance(model, str):
            modelcard = model
        elif isinstance(config, str):
            modelcard = config

    # Instantiate tokenizer if needed
    if isinstance(tokenizer, str):
        tokenizer = AutoTokenizer.from_pretrained(tokenizer)

    # Instantiate config if needed
    if isinstance(config, str):
        config = AutoConfig.from_pretrained(config)

    # Instantiate modelcard if needed
    if isinstance(modelcard, str):
        modelcard = ModelCard.from_pretrained(modelcard)

    # Instantiate model if needed
    if isinstance(model, str):
        # Handle transparent TF/PT model conversion
        model_kwargs = {}
        if framework == "pt" and model.endswith(".h5"):
            model_kwargs["from_tf"] = True
            logger.warning(
                "Model might be a TensorFlow model (ending with `.h5`) but TensorFlow is not available. "
                "Trying to load the model with PyTorch.")

        model = model_class.from_pretrained(model,
                                            config=config,
                                            **model_kwargs)
        model = model.to(device)
    model.device = device
    return task(model=model,
                tokenizer=tokenizer,
                modelcard=modelcard,
                framework=framework,
                **kwargs)
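As a usage sketch of the factory above: calling it with just the task name falls back to the default distilbert checkpoint registered in SUPPORTED_TASKS and runs on CPU. The output shape below is an assumption, since TextClassificationPipelineMod is not shown in this example; standard text-classification pipelines return a list of label/score dicts.

import torch

# Build the default sentiment-analysis pipeline (the only task registered above).
nlp = pipeline("sentiment-analysis", device=torch.device("cpu"))

# Assumed output shape: a list of {"label": ..., "score": ...} dicts, as in the
# standard text-classification pipeline; TextClassificationPipelineMod may differ.
print(nlp("This library is easy to use."))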
Example #6
    def test_model_card_to_json_string(self):
        modelcard = ModelCard.from_dict(self.inputs_dict)
        obj = json.loads(modelcard.to_json_string())
        for key, value in self.inputs_dict.items():
            self.assertEqual(obj[key], value)
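Outside a test harness, the same serialization can be sketched as follows; to_json_string is assumed to emit a JSON object whose top-level keys include whatever was passed to from_dict:

import json

from transformers import ModelCard

# Placeholder field; to_json_string should round-trip it through json.loads.
card = ModelCard.from_dict({"model_details": {"name": "demo-model"}})
parsed = json.loads(card.to_json_string())
assert parsed["model_details"] == {"name": "demo-model"}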
Example #7
def pipeline(task: str,
             model: Optional = None,
             config: Optional[Union[str, PretrainedConfig]] = None,
             tokenizer: Optional[Union[str, PreTrainedTokenizer]] = None,
             framework: Optional[str] = None,
             **kwargs) -> Pipeline:
    """
    Utility factory method to build a :class:`~transformers.Pipeline`.

    Pipelines are made of:

        - A :doc:`tokenizer <tokenizer>` in charge of mapping raw textual input to token.
        - A :doc:`model <model>` to make predictions from the inputs.
        - Some (optional) post processing for enhancing model's output.

    Args:
        task (:obj:`str`):
            The task defining which pipeline will be returned. Currently accepted tasks are:

            - :obj:`"feature-extraction"`: will return a :class:`~transformers.FeatureExtractionPipeline`.
            - :obj:`"sentiment-analysis"`: will return a :class:`~transformers.TextClassificationPipeline`.
            - :obj:`"ner"`: will return a :class:`~transformers.TokenClassificationPipeline`.
            - :obj:`"question-answering"`: will return a :class:`~transformers.QuestionAnsweringPipeline`.
            - :obj:`"fill-mask"`: will return a :class:`~transformers.FillMaskPipeline`.
            - :obj:`"summarization"`: will return a :class:`~transformers.SummarizationPipeline`.
            - :obj:`"translation_xx_to_yy"`: will return a :class:`~transformers.TranslationPipeline`.
            - :obj:`"text-generation"`: will return a :class:`~transformers.TextGenerationPipeline`.
            - :obj:`"conversation"`: will return a :class:`~transformers.ConversationalPipeline`.
        model (:obj:`str` or :obj:`~transformers.PreTrainedModel` or :obj:`~transformers.TFPreTrainedModel`, `optional`):
            The model that will be used by the pipeline to make predictions. This can be a model identifier or an
            actual instance of a pretrained model inheriting from :class:`~transformers.PreTrainedModel` (for PyTorch)
            or :class:`~transformers.TFPreTrainedModel` (for TensorFlow).

            If not provided, the default for the :obj:`task` will be loaded.
        config (:obj:`str` or :obj:`~transformers.PretrainedConfig`, `optional`):
            The configuration that will be used by the pipeline to instantiate the model. This can be a model
            identifier or an actual pretrained model configuration inheriting from
            :class:`~transformers.PretrainedConfig`.

            If not provided, the default for the :obj:`task` will be loaded.
        tokenizer (:obj:`str` or :obj:`~transformers.PreTrainedTokenizer`, `optional`):
            The tokenizer that will be used by the pipeline to encode data for the model. This can be a model
            identifier or an actual pretrained tokenizer inheriting from
            :class:`~transformers.PreTrainedTokenizer`.

            If not provided, the default for the :obj:`task` will be loaded.
        framework (:obj:`str`, `optional`):
            The framework to use, either :obj:`"pt"` for PyTorch or :obj:`"tf"` for TensorFlow. The specified framework
            must be installed.

            If no framework is specified, will default to the one currently installed. If no framework is specified
            and both frameworks are installed, will default to the framework of the :obj:`model`, or to PyTorch if no
            model is provided.
        kwargs:
            Additional keyword arguments passed along to the specific pipeline init (see the documentation for the
            corresponding pipeline class for possible values).

    Returns:
        :class:`~transformers.Pipeline`: A suitable pipeline for the task.

    Examples::

        >>> from transformers import pipeline, AutoModelForTokenClassification, AutoTokenizer

        >>> # Sentiment analysis pipeline
        >>> pipeline('sentiment-analysis')

        >>> # Question answering pipeline, specifying the checkpoint identifier
        >>> pipeline('question-answering', model='distilbert-base-cased-distilled-squad', tokenizer='bert-base-cased')

        >>> # Named entity recognition pipeline, passing in a specific model and tokenizer
        >>> model = AutoModelForTokenClassification.from_pretrained("dbmdz/bert-large-cased-finetuned-conll03-english")
        >>> tokenizer = AutoTokenizer.from_pretrained("bert-base-cased")
        >>> pipeline('ner', model=model, tokenizer=tokenizer)
    """

    framework = 'pt'

    task_class, model_class = QuestionAnsweringPipeline, AutoModelForQuestionAnswering

    # Use default model/config/tokenizer for the task if no model is provided
    if model is None:
        model = 'distilbert-base-cased-distilled-squad'

    # Try to infer tokenizer from model or config name (if provided as str)
    if tokenizer is None:
        if isinstance(model, str):
            tokenizer = model
        elif isinstance(config, str):
            tokenizer = config
        else:
            # Impossible to guess the right tokenizer here
            raise Exception(
                "Impossible to guess which tokenizer to use. "
                "Please provide a PreTrainedTokenizer class or a path/identifier to a pretrained tokenizer."
            )

    modelcard = None
    # Try to infer modelcard from model or config name (if provided as str)
    if isinstance(model, str):
        modelcard = model
    elif isinstance(config, str):
        modelcard = config

    # Instantiate tokenizer if needed
    if isinstance(tokenizer, (str, tuple)):
        if isinstance(tokenizer, tuple):
            # For tuple we have (tokenizer name, {kwargs})
            tokenizer = AutoTokenizer.from_pretrained(tokenizer[0],
                                                      **tokenizer[1])
        else:
            tokenizer = AutoTokenizer.from_pretrained(tokenizer)

    # Instantiate config if needed
    if isinstance(config, str):
        config = AutoConfig.from_pretrained(config)

    # Instantiate modelcard if needed
    if isinstance(modelcard, str):
        modelcard = ModelCard.from_pretrained(modelcard)

    # Instantiate model if needed
    if isinstance(model, str):
        # Handle transparent TF/PT model conversion
        model_kwargs = {}
        if framework == "pt" and model.endswith(".h5"):
            model_kwargs["from_tf"] = True
            logger.warning(
                "Model might be a TensorFlow model (ending with `.h5`) but TensorFlow is not available. "
                "Trying to load the model with PyTorch.")
        elif framework == "tf" and model.endswith(".bin"):
            model_kwargs["from_pt"] = True
            logger.warning(
                "Model might be a PyTorch model (ending with `.bin`) but PyTorch is not available. "
                "Trying to load the model with Tensorflow.")
        model = model_class.from_pretrained(model,
                                            config=config,
                                            **model_kwargs)

    return task_class(model=model,
                      tokenizer=tokenizer,
                      modelcard=modelcard,
                      framework=framework,
                      task=task,
                      **kwargs)
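Note that this variant hardcodes framework = 'pt' and always builds a QuestionAnsweringPipeline with AutoModelForQuestionAnswering, regardless of the task string. A hedged usage sketch, relying on the default 'distilbert-base-cased-distilled-squad' checkpoint set above; the question and context strings are illustrative:

# The task string is only forwarded to the pipeline; model and tokenizer default
# to the distilbert SQuAD checkpoint when none are provided.
qa = pipeline("question-answering")

# QuestionAnsweringPipeline accepts question/context keyword arguments and
# returns a dict containing (at least) "answer" and "score".
result = qa(question="What does the pipeline return?",
            context="The pipeline factory returns a QuestionAnsweringPipeline instance.")
print(result["answer"], result["score"])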