def model_prepare(args: Config):
    config = AutoConfig.from_pretrained(args.base_model_id,
                                        num_labels=args.num_labels)
    model = AutoModelForSequenceClassification.from_pretrained(
        args.base_model_id, config=config)
    tokenizer = AutoTokenizer.from_pretrained(args.base_model_id)
    return model, tokenizer
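A quick, hypothetical usage sketch for the snippet above; the `Config` stand-in dataclass and the field values are assumptions inferred from the function body, not part of the original example.

# Hypothetical driver for model_prepare; this Config is a stand-in,
# not the project's own Config class.
from dataclasses import dataclass

@dataclass
class Config:
    base_model_id: str = "distilbert-base-uncased"
    num_labels: int = 2

model, tokenizer = model_prepare(Config())
inputs = tokenizer("an example sentence", return_tensors="pt")
logits = model(**inputs).logits  # shape: (1, num_labels)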
Example 2
    def __init__(self, config, *args, **kwargs):

        tokenizer_config = config.tokenizer_config
        self._tokenizer = AutoTokenizer.from_pretrained(
            tokenizer_config.type, **tokenizer_config.params)

        self._max_seq_length = config.max_seq_length
        self._probability = getattr(config, "mask_probability", 0.15)
Example 3
def main():
    parser = ArgumentParser("Hugging Face ONNX Exporter tool")
    parser.add_argument("-m",
                        "--model",
                        type=str,
                        required=True,
                        help="Model's name of path on disk to load.")
    parser.add_argument(
        "--features",
        choices=["default"],
        default="default",
        help="Export the model with some additional features.",
    )
    parser.add_argument(
        "--opset",
        type=int,
        default=12,
        help="ONNX opset version to export the model with (default 12).")
    parser.add_argument(
        "--atol",
        type=float,
        default=1e-4,
        help="Absolute difference tolerence when validating the model.")
    parser.add_argument(
        "output",
        type=Path,
        help="Path indicating where to store generated ONNX model.")

    # Retrieve CLI arguments
    args = parser.parse_args()
    if not args.output.is_file():
        args.output = args.output.joinpath("model.onnx")

    if not args.output.parent.exists():
        args.output.parent.mkdir(parents=True)

    # Allocate the model
    tokenizer = AutoTokenizer.from_pretrained(args.model)
    model = get_model_from_features(args.features, args.model)
    model_kind, model_onnx_config = check_supported_model_or_raise(
        model, features=args.features)
    onnx_config = model_onnx_config(model.config)

    # Ensure the requested opset is sufficient
    if args.opset < onnx_config.default_onnx_opset:
        raise ValueError(
            f"Opset {args.opset} is not sufficient to export {model_kind}. "
            f"At least  {onnx_config.default_onnx_opset} is required.")

    onnx_inputs, onnx_outputs = export(tokenizer, model, onnx_config,
                                       args.opset, args.output)

    validate_model_outputs(onnx_config, tokenizer, model, args.output,
                           onnx_outputs, args.atol)
    logger.info(f"All good, model saved at: {args.output.as_posix()}")
Example 4
    def __init__(self, config, *args, **kwargs):
        # https://huggingface.co/transformers/model_doc/xlmroberta.html
        # roberta is with different tokenization of above default (bert)
        tokenizer_config = config.tokenizer_config
        self._tokenizer = AutoTokenizer.from_pretrained(
            tokenizer_config.type, **tokenizer_config.params)

        self._CLS_TOKEN = self._tokenizer.bos_token  # <s>
        self._SEP_TOKEN = self._tokenizer.sep_token  # </s>
        self._MASK_TOKEN = self._tokenizer.mask_token  # <mask>
        self._PAD_TOKEN_ID = self._tokenizer.pad_token_id  # 1

        self._max_seq_length = config.max_seq_length
        self._probability = getattr(config, "mask_probability", 0.15)
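The methods that consume these attributes are not shown; below is a minimal sketch, assuming a standard BERT-style masked-language-modelling step, of how such a `mask_probability` is typically applied. The helper name and signature are hypothetical, not the original class code.

import torch

def mask_tokens(input_ids: torch.Tensor, mask_token_id: int,
                pad_token_id: int, probability: float = 0.15):
    """Replace a random subset of non-padding tokens with the mask token
    and keep the original ids as labels for the MLM loss."""
    labels = input_ids.clone()
    # Sample masking candidates, never masking padding positions.
    candidates = torch.rand(input_ids.shape) < probability
    candidates &= input_ids != pad_token_id
    labels[~candidates] = -100  # positions ignored by the loss
    masked = input_ids.clone()
    masked[candidates] = mask_token_id
    return masked, labels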
Example 5
def run_predict_with_pipeline(args: Config):
    """"""
    from transformers import (pipeline, AutoConfig, AutoTokenizer,
                              AutoModelForSequenceClassification,
                              TextClassificationPipeline)

    config = AutoConfig.from_pretrained(args.output_dir,
                                        num_labels=args.num_labels,
                                        local_files_only=True)
    model = AutoModelForSequenceClassification.from_pretrained(
        args.output_dir, config=config, local_files_only=True)
    tokenizer = AutoTokenizer.from_pretrained(args.output_dir,
                                              local_files_only=True)
    classifier = pipeline('text-classification',
                          model=model,
                          tokenizer=tokenizer,
                          return_all_scores=True)
    # classifier = TextClassificationPipeline(model=model, tokenizer=tokenizer, return_all_scores=True)
    ret = classifier('my_test_sentence_1')
    print(ret)
Example 6
    def __init__(
        self,
        model_name_or_path: Optional[str] = None,
        num_layers: Optional[int] = None,
        all_layers: bool = False,
        model: Optional[torch.nn.Module] = None,
        user_tokenizer: Optional[Any] = None,
        user_forward_fn: Optional[Callable[[torch.nn.Module, Dict[str, torch.Tensor]], torch.Tensor]] = None,
        verbose: bool = False,
        idf: bool = False,
        device: Optional[Union[str, torch.device]] = None,
        max_length: int = 512,
        batch_size: int = 64,
        num_threads: int = 4,
        return_hash: bool = False,
        lang: str = "en",
        rescale_with_baseline: bool = False,
        baseline_path: Optional[str] = None,
        baseline_url: Optional[str] = None,
        compute_on_step: Optional[bool] = None,
        **kwargs: Dict[str, Any],
    ):
        super().__init__(compute_on_step=compute_on_step, **kwargs)
        self.model_name_or_path = model_name_or_path or _DEFAULT_MODEL
        self.num_layers = num_layers
        self.all_layers = all_layers
        self.model = model
        self.user_forward_fn = user_forward_fn
        self.verbose = verbose
        self.idf = idf
        self.embedding_device = device
        self.max_length = max_length
        self.batch_size = batch_size
        self.num_threads = num_threads
        self.return_hash = return_hash
        self.lang = lang
        self.rescale_with_baseline = rescale_with_baseline
        self.baseline_path = baseline_path
        self.baseline_url = baseline_url
        self.preds: Dict[str, List[torch.Tensor]] = {"input_ids": [], "attention_mask": []}
        self.target: Dict[str, List[torch.Tensor]] = {"input_ids": [], "attention_mask": []}

        if user_tokenizer:
            self.tokenizer = user_tokenizer
            self.user_tokenizer = True
        else:
            if not _TRANSFORMERS_AUTO_AVAILABLE:
                raise ModuleNotFoundError(
                    "`BERTScore` metric with default tokenizers requires `transformers` package be installed."
                    " Either install with `pip install transformers>=4.0` or `pip install torchmetrics[text]`."
                )
            if model_name_or_path is None:
                warn(
                    "The argument `model_name_or_path` was not specified while it is required when the default"
                    " `transformers` model is used."
                    f" It will use the default recommended model - {_DEFAULT_MODEL!r}."
                )
            self.tokenizer = AutoTokenizer.from_pretrained(self.model_name_or_path)
            self.user_tokenizer = False

        self.add_state("preds_input_ids", [], dist_reduce_fx="cat")
        self.add_state("preds_attention_mask", [], dist_reduce_fx="cat")
        self.add_state("target_input_ids", [], dist_reduce_fx="cat")
        self.add_state("target_attention_mask", [], dist_reduce_fx="cat")
Example 7
def main():
    parser = ArgumentParser("Hugging Face Transformers ONNX exporter")
    parser.add_argument(
        "-m",
        "--model",
        type=str,
        required=True,
        help="Model ID on huggingface.co or path on disk to load model from.")
    parser.add_argument(
        "--feature",
        choices=list(FeaturesManager.AVAILABLE_FEATURES),
        default="default",
        help="The type of features to export the model with.",
    )
    parser.add_argument("--opset",
                        type=int,
                        default=None,
                        help="ONNX opset version to export the model with.")
    parser.add_argument(
        "--atol",
        type=float,
        default=None,
        help="Absolute difference tolerence when validating the model.")
    parser.add_argument(
        "output",
        type=Path,
        help="Path indicating where to store generated ONNX model.")

    # Retrieve CLI arguments
    args = parser.parse_args()
    if not args.output.is_file():
        args.output = args.output.joinpath("model.onnx")

    if not args.output.parent.exists():
        args.output.parent.mkdir(parents=True)

    # Allocate the model
    tokenizer = AutoTokenizer.from_pretrained(args.model)
    model = FeaturesManager.get_model_from_feature(args.feature, args.model)
    model_kind, model_onnx_config = FeaturesManager.check_supported_model_or_raise(
        model, feature=args.feature)
    onnx_config = model_onnx_config(model.config)

    # Ensure the requested opset is sufficient
    if args.opset is None:
        args.opset = onnx_config.default_onnx_opset

    if args.opset < onnx_config.default_onnx_opset:
        raise ValueError(
            f"Opset {args.opset} is not sufficient to export {model_kind}. "
            f"At least  {onnx_config.default_onnx_opset} is required.")

    onnx_inputs, onnx_outputs = export(tokenizer, model, onnx_config,
                                       args.opset, args.output)

    if args.atol is None:
        args.atol = onnx_config.atol_for_validation

    validate_model_outputs(onnx_config, tokenizer, model, args.output,
                           onnx_outputs, args.atol)
    logger.info(f"All good, model saved at: {args.output.as_posix()}")
Example 8
def bert_score(
    preds: Union[List[str], Dict[str, Tensor]],
    target: Union[List[str], Dict[str, Tensor]],
    model_name_or_path: Optional[str] = None,
    num_layers: Optional[int] = None,
    all_layers: bool = False,
    model: Optional[torch.nn.Module] = None,
    user_tokenizer: Any = None,
    user_forward_fn: Optional[Callable[[torch.nn.Module, Dict[str, Tensor]], Tensor]] = None,
    verbose: bool = False,
    idf: bool = False,
    device: Optional[Union[str, torch.device]] = None,
    max_length: int = 512,
    batch_size: int = 64,
    num_threads: int = 4,
    return_hash: bool = False,
    lang: str = "en",
    rescale_with_baseline: bool = False,
    baseline_path: Optional[str] = None,
    baseline_url: Optional[str] = None,
) -> Dict[str, Union[List[float], str]]:
    """`Bert_score Evaluating Text Generation`_ leverages the pre-trained contextual embeddings from BERT and
    matches words in candidate and reference sentences by cosine similarity.

    It has been shown to correlate with human judgment on sentence-level and system-level evaluation.
    Moreover, BERTScore computes precision, recall, and F1 measure, which can be useful for evaluating different
    language generation tasks.

    This implementation follows the original implementation from `BERT_score`_.

    Args:
        preds: Either an iterable of predicted sentences or a ``Dict[input_ids, attention_mask]``.
        target: Either an iterable of target sentences or a  ``Dict[input_ids, attention_mask]``.
        model_name_or_path: A name or a model path used to load ``transformers`` pretrained model.
        num_layers: A layer of representation to use.
        all_layers:
            An indication of whether the representations from all of the model's layers should be used.
            If ``all_layers = True``, the argument ``num_layers`` is ignored.
        model: A user's own model.
        user_tokenizer:
            A user's own tokenizer used with the own model. This must be an instance with the ``__call__`` method.
            This method must take an iterable of sentences (``List[str]``) and must return a python dictionary
            containing ``"input_ids"`` and ``"attention_mask"`` represented by ``torch.Tensor``.
            It is up to the user's model whether ``"input_ids"`` is a ``torch.Tensor`` of input ids or embedding
            vectors. This tokenizer must prepend an equivalent of the ``[CLS]`` token and append an equivalent of the
            ``[SEP]`` token, as a `transformers` tokenizer does.
        user_forward_fn:
            A user's own forward function used in a combination with ``user_model``.
            This function must take ``user_model`` and a python dictionary containing ``"input_ids"``
            and ``"attention_mask"`` represented by ``torch.Tensor`` as an input and return the model's output
            represented by a single ``torch.Tensor``.
        verbose: An indication of whether a progress bar should be displayed during the embeddings' calculation.
        idf: An indication of whether normalization using inverse document frequencies should be used.
        device: A device to be used for calculation.
        max_length: A maximum length of input sequences. Sequences longer than ``max_length`` are to be trimmed.
        batch_size: A batch size used for model processing.
        num_threads: A number of threads to use for a dataloader.
        return_hash: An indication of whether the corresponding ``hash_code`` should be returned.
        lang: A language of input sentences. It is used when the scores are rescaled with a baseline.
        rescale_with_baseline:
            An indication of whether bertscore should be rescaled with a pre-computed baseline.
            When a pretrained model from ``transformers`` model is used, the corresponding baseline is downloaded
            from the original ``bert-score`` package from `BERT_score`_ if available.
            In other cases, please specify a path to the baseline csv/tsv file, which must follow the formatting
            of the files from `BERT_score`_.
        baseline_path: A path to the user's own local csv/tsv file with the baseline scale.
        baseline_url: A url path to the user's own csv/tsv file with the baseline scale.

    Returns:
        Python dictionary containing the keys ``precision``, ``recall`` and ``f1`` with corresponding values.

    Raises:
        ValueError:
            If ``len(preds) != len(target)``.
        ModuleNotFoundError:
            If `tqdm` package is required and not installed.
        ModuleNotFoundError:
            If ``transformers`` package is required and not installed.
        ValueError:
            If ``num_layers`` is larger than the number of the model layers.
        ValueError:
            If invalid input is provided.

    Example:
        >>> from torchmetrics.functional.text.bert import bert_score
        >>> preds = ["hello there", "general kenobi"]
        >>> target = ["hello there", "master kenobi"]
        >>> score = bert_score(preds, target)
        >>> from pprint import pprint
        >>> rounded_score = {k: [round(v, 3) for v in vv] for k, vv in score.items()}
        >>> pprint(rounded_score)
        {'f1': [1.0, 0.996], 'precision': [1.0, 0.996], 'recall': [1.0, 0.996]}
    """
    if len(preds) != len(target):
        raise ValueError(
            "Number of predicted and reference sententes must be the same!")

    if verbose and (not _TQDM_AVAILABLE):
        raise ModuleNotFoundError(
            "An argument `verbose = True` requires `tqdm` package be installed. Install with `pip install tqdm`."
        )

    if model is None:
        if not _TRANSFORMERS_AUTO_AVAILABLE:
            raise ModuleNotFoundError(
                "`bert_score` metric with default models requires `transformers` package be installed."
                " Either install with `pip install transformers>=4.0` or `pip install torchmetrics[text]`."
            )
        if model_name_or_path is None:
            warn(
                "The argument `model_name_or_path` was not specified while it is required when default"
                " `transformers` model are used."
                f"It is, therefore, used the default recommended model - {_DEFAULT_MODEL}."
            )
        tokenizer = AutoTokenizer.from_pretrained(model_name_or_path
                                                  or _DEFAULT_MODEL)
        model = AutoModel.from_pretrained(model_name_or_path or _DEFAULT_MODEL)
    else:
        tokenizer = user_tokenizer
    model.eval()
    model.to(device)

    try:
        if num_layers and num_layers > model.config.num_hidden_layers:  # type: ignore
            raise ValueError(
                f"num_layers={num_layers} is forbidden for {model_name_or_path}. "  # type: ignore
                f"Please use num_layers <= {model.config.num_hidden_layers}"  # type: ignore
            )
    except AttributeError:
        warn(
            "It was not possible to retrieve the parameter `num_layers` from the model specification."
        )

    _are_empty_lists = all(
        isinstance(text, list) and len(text) == 0 for text in (preds, target))
    _are_valid_lists = all(
        isinstance(text, list) and len(text) > 0 and isinstance(text[0], str)
        for text in (preds, target))
    _are_valid_tensors = all(
        isinstance(text, dict) and isinstance(text["input_ids"], Tensor)
        for text in (preds, target))
    if _are_empty_lists:
        warn("Predictions and references are empty.")
        output_dict: Dict[str, Union[List[float], str]] = {
            "precision": [0.0],
            "recall": [0.0],
            "f1": [0.0],
        }
        if return_hash:
            output_dict.update(
                {"hash": _get_hash(model_name_or_path, num_layers, idf)})
        return output_dict

    # Load baselines if needed
    baseline = _load_baseline(lang, model_name_or_path, baseline_path,
                              baseline_url) if rescale_with_baseline else None

    # We ignore mypy typing below as the proper typing is ensured by conditions above, only mypy cannot infer that.
    if _are_valid_lists:
        target_dataset = TextDataset(target, tokenizer, max_length,
                                     idf=idf)  # type: ignore
        preds_dataset = TextDataset(
            preds,  # type: ignore
            tokenizer,
            max_length,
            idf=idf,
            tokens_idf=target_dataset.tokens_idf,
        )
    elif _are_valid_tensors:
        target_dataset = TokenizedDataset(**target, idf=idf)  # type: ignore
        preds_dataset = TokenizedDataset(
            **preds, idf=idf,
            tokens_idf=target_dataset.tokens_idf)  # type: ignore
    else:
        raise ValueError("Invalid input provided.")

    target_loader = DataLoader(target_dataset,
                               batch_size=batch_size,
                               num_workers=num_threads)
    preds_loader = DataLoader(preds_dataset,
                              batch_size=batch_size,
                              num_workers=num_threads)

    target_embeddings, target_idf_scale = _get_embeddings_and_idf_scale(
        target_loader, target_dataset.max_length, model, device, num_layers,
        all_layers, idf, verbose, user_forward_fn)
    preds_embeddings, preds_idf_scale = _get_embeddings_and_idf_scale(
        preds_loader, preds_dataset.max_length, model, device, num_layers,
        all_layers, idf, verbose, user_forward_fn)

    precision, recall, f1_score = _get_precision_recall_f1(
        preds_embeddings, target_embeddings, preds_idf_scale, target_idf_scale)

    if baseline is not None:
        precision, recall, f1_score = _rescale_metrics_with_baseline(
            precision, recall, f1_score, baseline, num_layers, all_layers)

    output_dict = {
        "precision": precision.tolist(),
        "recall": recall.tolist(),
        "f1": f1_score.tolist(),
    }
    if return_hash:
        output_dict.update(
            {"hash": _get_hash(model_name_or_path, num_layers, idf)})
    return output_dict
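To illustrate the ``user_tokenizer`` / ``user_forward_fn`` protocol described in the docstring above, here is a minimal sketch; the wrapped ``roberta-base`` model and the function names are assumptions for illustration, not part of the original code.

from typing import Dict, List

import torch
from transformers import AutoModel, AutoTokenizer

hf_tokenizer = AutoTokenizer.from_pretrained("roberta-base")
hf_model = AutoModel.from_pretrained("roberta-base")

def my_tokenizer(sentences: List[str]) -> Dict[str, torch.Tensor]:
    # Must return "input_ids" and "attention_mask" tensors and add the
    # [CLS]/[SEP]-equivalent special tokens, as the docstring requires.
    return hf_tokenizer(sentences, padding=True, truncation=True,
                        max_length=512, return_tensors="pt")

def my_forward_fn(model: torch.nn.Module,
                  batch: Dict[str, torch.Tensor]) -> torch.Tensor:
    # Must return a single tensor of token embeddings.
    return model(batch["input_ids"], batch["attention_mask"]).last_hidden_state

score = bert_score(["hello there"], ["hello there"], model=hf_model,
                   user_tokenizer=my_tokenizer, user_forward_fn=my_forward_fn)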