from transformers import (AutoConfig, AutoModelForSequenceClassification,
                          AutoTokenizer)


def model_prepare(args: Config):
    # `Config` comes from the surrounding project (not shown here).
    config = AutoConfig.from_pretrained(args.base_model_id,
                                        num_labels=args.num_labels)
    model = AutoModelForSequenceClassification.from_pretrained(
        args.base_model_id, config=config)
    tokenizer = AutoTokenizer.from_pretrained(args.base_model_id)
    return model, tokenizer
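# A minimal usage sketch (not from the original source): `Config` is assumed
# to expose `base_model_id` and `num_labels`, so a SimpleNamespace stands in;
# the checkpoint name is illustrative.
from types import SimpleNamespace

example_args = SimpleNamespace(base_model_id="bert-base-uncased", num_labels=2)
model, tokenizer = model_prepare(example_args)
inputs = tokenizer("This movie was great!", return_tensors="pt")
logits = model(**inputs).logits  # shape: (1, num_labels)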
def __init__(self, config, *args, **kwargs):
    tokenizer_config = config.tokenizer_config
    self._tokenizer = AutoTokenizer.from_pretrained(
        tokenizer_config.type, **tokenizer_config.params)
    self._max_seq_length = config.max_seq_length
    self._probability = getattr(config, "mask_probability", 0.15)
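# A hedged sketch (not part of the original class) of how a masking
# probability like `_probability` above is typically applied when building
# masked-LM inputs; `mask_tokens` is a hypothetical helper.
import random

def mask_tokens(tokens, mask_token, probability=0.15):
    # Independently replace each token with the mask token with the given probability.
    return [mask_token if random.random() < probability else tok for tok in tokens]

# Example: mask_tokens(["hello", "world", "!"], "[MASK]", probability=0.15)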
def main(): parser = ArgumentParser("Hugging Face ONNX Exporter tool") parser.add_argument("-m", "--model", type=str, required=True, help="Model's name of path on disk to load.") parser.add_argument( "--features", choices=["default"], default="default", help="Export the model with some additional features.", ) parser.add_argument( "--opset", type=int, default=12, help="ONNX opset version to export the model with (default 12).") parser.add_argument( "--atol", type=float, default=1e-4, help="Absolute difference tolerence when validating the model.") parser.add_argument( "output", type=Path, help="Path indicating where to store generated ONNX model.") # Retrieve CLI arguments args = parser.parse_args() args.output = args.output if args.output.is_file( ) else args.output.joinpath("model.onnx") if not args.output.parent.exists(): args.output.parent.mkdir(parents=True) # Allocate the model tokenizer = AutoTokenizer.from_pretrained(args.model) model = get_model_from_features(args.features, args.model) model_kind, model_onnx_config = check_supported_model_or_raise( model, features=args.features) onnx_config = model_onnx_config(model.config) # Ensure the requested opset is sufficient if args.opset < onnx_config.default_onnx_opset: raise ValueError( f"Opset {args.opset} is not sufficient to export {model_kind}. " f"At least {onnx_config.default_onnx_opset} is required.") onnx_inputs, onnx_outputs = export(tokenizer, model, onnx_config, args.opset, args.output) validate_model_outputs(onnx_config, tokenizer, model, args.output, onnx_outputs, args.atol) logger.info(f"All good, model saved at: {args.output.as_posix()}")
def __init__(self, config, *args, **kwargs):
    # https://huggingface.co/transformers/model_doc/xlmroberta.html
    # RoBERTa-style models use different special tokens than the default (BERT) above.
    tokenizer_config = config.tokenizer_config
    self._tokenizer = AutoTokenizer.from_pretrained(
        tokenizer_config.type, **tokenizer_config.params)
    self._CLS_TOKEN = self._tokenizer.bos_token        # <s>
    self._SEP_TOKEN = self._tokenizer.sep_token        # </s>
    self._MASK_TOKEN = self._tokenizer.mask_token      # <mask>
    self._PAD_TOKEN_ID = self._tokenizer.pad_token_id  # 1
    self._max_seq_length = config.max_seq_length
    self._probability = getattr(config, "mask_probability", 0.15)
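# A hedged check of the special-token comments above against a real
# XLM-RoBERTa checkpoint (the model name is illustrative):
from transformers import AutoTokenizer

tok = AutoTokenizer.from_pretrained("xlm-roberta-base")
print(tok.bos_token, tok.sep_token, tok.mask_token, tok.pad_token_id)
# Expected: <s> </s> <mask> 1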
def run_predict_with_pipeline(args: Config):
    """Run text-classification inference with a Hugging Face pipeline."""
    from transformers import (pipeline, AutoConfig, AutoTokenizer,
                              AutoModelForSequenceClassification,
                              TextClassificationPipeline)
    config = AutoConfig.from_pretrained(args.output_dir,
                                        num_labels=args.num_labels,
                                        local_files_only=True)
    model = AutoModelForSequenceClassification.from_pretrained(
        args.output_dir, config=config, local_files_only=True)
    tokenizer = AutoTokenizer.from_pretrained(args.output_dir,
                                              local_files_only=True)
    classifier = pipeline('text-classification',
                          model=model,
                          tokenizer=tokenizer,
                          return_all_scores=True)
    # classifier = TextClassificationPipeline(model=model, tokenizer=tokenizer,
    #                                         return_all_scores=True)
    ret = classifier('my_test_sentence_1')
    print(ret)
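# Note: with `return_all_scores=True`, the pipeline returns one list of
# {'label': ..., 'score': ...} dicts per input, e.g. (illustrative values):
#   [[{'label': 'LABEL_0', 'score': 0.98}, {'label': 'LABEL_1', 'score': 0.02}]]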
def __init__(
    self,
    model_name_or_path: Optional[str] = None,
    num_layers: Optional[int] = None,
    all_layers: bool = False,
    model: Optional[torch.nn.Module] = None,
    user_tokenizer: Optional[Any] = None,
    user_forward_fn: Optional[Callable[[torch.nn.Module, Dict[str, torch.Tensor]], torch.Tensor]] = None,
    verbose: bool = False,
    idf: bool = False,
    device: Optional[Union[str, torch.device]] = None,
    max_length: int = 512,
    batch_size: int = 64,
    num_threads: int = 4,
    return_hash: bool = False,
    lang: str = "en",
    rescale_with_baseline: bool = False,
    baseline_path: Optional[str] = None,
    baseline_url: Optional[str] = None,
    compute_on_step: Optional[bool] = None,
    **kwargs: Dict[str, Any],
):
    super().__init__(compute_on_step=compute_on_step, **kwargs)
    self.model_name_or_path = model_name_or_path or _DEFAULT_MODEL
    self.num_layers = num_layers
    self.all_layers = all_layers
    self.model = model
    self.user_forward_fn = user_forward_fn
    self.verbose = verbose
    self.idf = idf
    self.embedding_device = device
    self.max_length = max_length
    self.batch_size = batch_size
    self.num_threads = num_threads
    self.return_hash = return_hash
    self.lang = lang
    self.rescale_with_baseline = rescale_with_baseline
    self.baseline_path = baseline_path
    self.baseline_url = baseline_url
    self.preds: Dict[str, List[torch.Tensor]] = {"input_ids": [], "attention_mask": []}
    self.target: Dict[str, List[torch.Tensor]] = {"input_ids": [], "attention_mask": []}

    if user_tokenizer:
        self.tokenizer = user_tokenizer
        self.user_tokenizer = True
    else:
        if not _TRANSFORMERS_AUTO_AVAILABLE:
            raise ModuleNotFoundError(
                "`BERTScore` metric with default tokenizers requires `transformers` package be installed."
                " Either install with `pip install transformers>=4.0` or `pip install torchmetrics[text]`."
            )
        if model_name_or_path is None:
            warn(
                "The argument `model_name_or_path` was not specified while it is required when the default"
                " `transformers` model is used."
                f" It will use the default recommended model - {_DEFAULT_MODEL!r}."
            )
        self.tokenizer = AutoTokenizer.from_pretrained(self.model_name_or_path)
        self.user_tokenizer = False

    self.add_state("preds_input_ids", [], dist_reduce_fx="cat")
    self.add_state("preds_attention_mask", [], dist_reduce_fx="cat")
    self.add_state("target_input_ids", [], dist_reduce_fx="cat")
    self.add_state("target_attention_mask", [], dist_reduce_fx="cat")
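# A minimal usage sketch of the module interface this `__init__` belongs to
# (torchmetrics' `BERTScore`); the checkpoint name is illustrative.
from torchmetrics.text.bert import BERTScore

bertscore = BERTScore(model_name_or_path="roberta-large")
preds = ["hello there", "general kenobi"]
target = ["hello there", "master kenobi"]
score = bertscore(preds, target)  # dict with 'precision', 'recall' and 'f1'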
def main(): parser = ArgumentParser("Hugging Face Transformers ONNX exporter") parser.add_argument( "-m", "--model", type=str, required=True, help="Model ID on huggingface.co or path on disk to load model from.") parser.add_argument( "--feature", choices=list(FeaturesManager.AVAILABLE_FEATURES), default="default", help="The type of features to export the model with.", ) parser.add_argument("--opset", type=int, default=None, help="ONNX opset version to export the model with.") parser.add_argument( "--atol", type=float, default=None, help="Absolute difference tolerence when validating the model.") parser.add_argument( "output", type=Path, help="Path indicating where to store generated ONNX model.") # Retrieve CLI arguments args = parser.parse_args() args.output = args.output if args.output.is_file( ) else args.output.joinpath("model.onnx") if not args.output.parent.exists(): args.output.parent.mkdir(parents=True) # Allocate the model tokenizer = AutoTokenizer.from_pretrained(args.model) model = FeaturesManager.get_model_from_feature(args.feature, args.model) model_kind, model_onnx_config = FeaturesManager.check_supported_model_or_raise( model, feature=args.feature) onnx_config = model_onnx_config(model.config) # Ensure the requested opset is sufficient if args.opset is None: args.opset = onnx_config.default_onnx_opset if args.opset < onnx_config.default_onnx_opset: raise ValueError( f"Opset {args.opset} is not sufficient to export {model_kind}. " f"At least {onnx_config.default_onnx_opset} is required.") onnx_inputs, onnx_outputs = export(tokenizer, model, onnx_config, args.opset, args.output) if args.atol is None: args.atol = onnx_config.atol_for_validation validate_model_outputs(onnx_config, tokenizer, model, args.output, onnx_outputs, args.atol) logger.info(f"All good, model saved at: {args.output.as_posix()}")
def bert_score(
    preds: Union[List[str], Dict[str, Tensor]],
    target: Union[List[str], Dict[str, Tensor]],
    model_name_or_path: Optional[str] = None,
    num_layers: Optional[int] = None,
    all_layers: bool = False,
    model: Optional[torch.nn.Module] = None,
    user_tokenizer: Any = None,
    user_forward_fn: Optional[Callable[[torch.nn.Module, Dict[str, Tensor]], Tensor]] = None,
    verbose: bool = False,
    idf: bool = False,
    device: Optional[Union[str, torch.device]] = None,
    max_length: int = 512,
    batch_size: int = 64,
    num_threads: int = 4,
    return_hash: bool = False,
    lang: str = "en",
    rescale_with_baseline: bool = False,
    baseline_path: Optional[str] = None,
    baseline_url: Optional[str] = None,
) -> Dict[str, Union[List[float], str]]:
    """`Bert_score Evaluating Text Generation`_ leverages the pre-trained contextual embeddings from BERT and
    matches words in candidate and reference sentences by cosine similarity. It has been shown to correlate with
    human judgment on sentence-level and system-level evaluation. Moreover, BERTScore computes precision, recall,
    and F1 measure, which can be useful for evaluating different language generation tasks. This implementation
    follows the original implementation from `BERT_score`_.

    Args:
        preds: Either an iterable of predicted sentences or a ``Dict[input_ids, attention_mask]``.
        target: Either an iterable of target sentences or a ``Dict[input_ids, attention_mask]``.
        model_name_or_path: A name or a model path used to load ``transformers`` pretrained model.
        num_layers: A layer of representation to use.
        all_layers: An indication of whether the representation from all model's layers should be used.
            If ``all_layers = True``, the argument ``num_layers`` is ignored.
        model: A user's own model.
        user_tokenizer: A user's own tokenizer used with the user's own model. This must be an instance with the
            ``__call__`` method. This method must take an iterable of sentences (``List[str]``) and must return
            a python dictionary containing ``"input_ids"`` and ``"attention_mask"`` represented by ``torch.Tensor``.
            It is up to the user's model whether ``"input_ids"`` is a ``torch.Tensor`` of input ids or embedding
            vectors. This tokenizer must prepend an equivalent of ``[CLS]`` token and append an equivalent of
            ``[SEP]`` token as ``transformers`` tokenizer does.
        user_forward_fn: A user's own forward function used in combination with ``user_model``. This function
            must take ``user_model`` and a python dictionary containing ``"input_ids"`` and ``"attention_mask"``
            represented by ``torch.Tensor`` as an input and return the model's output represented by a single
            ``torch.Tensor``.
        verbose: An indication of whether a progress bar is to be displayed during the embeddings' calculation.
        idf: An indication of whether normalization using inverse document frequencies should be used.
        device: A device to be used for calculation.
        max_length: A maximum length of input sequences. Sequences longer than ``max_length`` are to be trimmed.
        batch_size: A batch size used for model processing.
        num_threads: A number of threads to use for a dataloader.
        return_hash: An indication of whether the corresponding ``hash_code`` should be returned.
        lang: A language of input sentences. It is used when the scores are rescaled with a baseline.
        rescale_with_baseline: An indication of whether bertscore should be rescaled with a pre-computed baseline.
            When a pretrained model from ``transformers`` is used, the corresponding baseline is downloaded from
            the original ``bert-score`` package from `BERT_score`_ if available. In other cases, please specify
            a path to the baseline csv/tsv file, which must follow the formatting of the files from `BERT_score`_.
        baseline_path: A path to the user's own local csv/tsv file with the baseline scale.
        baseline_url: A url path to the user's own csv/tsv file with the baseline scale.

    Returns:
        Python dictionary containing the keys ``precision``, ``recall`` and ``f1`` with corresponding values.

    Raises:
        ValueError:
            If ``len(preds) != len(target)``.
        ModuleNotFoundError:
            If ``tqdm`` package is required and not installed.
        ModuleNotFoundError:
            If ``transformers`` package is required and not installed.
        ValueError:
            If ``num_layers`` is larger than the number of the model layers.
        ValueError:
            If invalid input is provided.

    Example:
        >>> from torchmetrics.functional.text.bert import bert_score
        >>> preds = ["hello there", "general kenobi"]
        >>> target = ["hello there", "master kenobi"]
        >>> score = bert_score(preds, target)
        >>> from pprint import pprint
        >>> rounded_score = {k: [round(v, 3) for v in vv] for k, vv in score.items()}
        >>> pprint(rounded_score)
        {'f1': [1.0, 0.996], 'precision': [1.0, 0.996], 'recall': [1.0, 0.996]}
    """
    if len(preds) != len(target):
        raise ValueError("Number of predicted and reference sentences must be the same!")

    if verbose and (not _TQDM_AVAILABLE):
        raise ModuleNotFoundError(
            "An argument `verbose = True` requires `tqdm` package be installed. Install with `pip install tqdm`."
        )

    if model is None:
        if not _TRANSFORMERS_AUTO_AVAILABLE:
            raise ModuleNotFoundError(
                "`bert_score` metric with default models requires `transformers` package be installed."
                " Either install with `pip install transformers>=4.0` or `pip install torchmetrics[text]`."
            )
        if model_name_or_path is None:
            warn(
                "The argument `model_name_or_path` was not specified while it is required when the default"
                " `transformers` model is used."
                f" It will use the default recommended model - {_DEFAULT_MODEL!r}."
            )
        tokenizer = AutoTokenizer.from_pretrained(model_name_or_path or _DEFAULT_MODEL)
        model = AutoModel.from_pretrained(model_name_or_path or _DEFAULT_MODEL)
    else:
        tokenizer = user_tokenizer
    model.eval()
    model.to(device)

    try:
        if num_layers and num_layers > model.config.num_hidden_layers:  # type: ignore
            raise ValueError(
                f"num_layers={num_layers} is forbidden for {model_name_or_path}. "  # type: ignore
                f"Please use num_layers <= {model.config.num_hidden_layers}"  # type: ignore
            )
    except AttributeError:
        warn("It was not possible to retrieve the parameter `num_layers` from the model specification.")

    _are_empty_lists = all(
        isinstance(text, list) and len(text) == 0 for text in (preds, target))
    _are_valid_lists = all(
        isinstance(text, list) and len(text) > 0 and isinstance(text[0], str)
        for text in (preds, target))
    _are_valid_tensors = all(
        isinstance(text, dict) and isinstance(text["input_ids"], Tensor)
        for text in (preds, target))

    if _are_empty_lists:
        warn("Predictions and references are empty.")
        output_dict: Dict[str, Union[List[float], str]] = {
            "precision": [0.0],
            "recall": [0.0],
            "f1": [0.0],
        }
        if return_hash:
            output_dict.update(
                {"hash": _get_hash(model_name_or_path, num_layers, idf)})
        return output_dict

    # Load baselines if needed
    baseline = _load_baseline(lang, model_name_or_path, baseline_path,
                              baseline_url) if rescale_with_baseline else None

    # We ignore mypy typing below as the proper typing is ensured by conditions above, only mypy cannot infer that.
    if _are_valid_lists:
        target_dataset = TextDataset(target, tokenizer, max_length, idf=idf)  # type: ignore
        preds_dataset = TextDataset(
            preds,  # type: ignore
            tokenizer,
            max_length,
            idf=idf,
            tokens_idf=target_dataset.tokens_idf,
        )
    elif _are_valid_tensors:
        target_dataset = TokenizedDataset(**target, idf=idf)  # type: ignore
        preds_dataset = TokenizedDataset(
            **preds, idf=idf, tokens_idf=target_dataset.tokens_idf)  # type: ignore
    else:
        raise ValueError("Invalid input provided.")

    target_loader = DataLoader(target_dataset,
                               batch_size=batch_size,
                               num_workers=num_threads)
    preds_loader = DataLoader(preds_dataset,
                              batch_size=batch_size,
                              num_workers=num_threads)

    target_embeddings, target_idf_scale = _get_embeddings_and_idf_scale(
        target_loader, target_dataset.max_length, model, device, num_layers,
        all_layers, idf, verbose, user_forward_fn)
    preds_embeddings, preds_idf_scale = _get_embeddings_and_idf_scale(
        preds_loader, preds_dataset.max_length, model, device, num_layers,
        all_layers, idf, verbose, user_forward_fn)

    precision, recall, f1_score = _get_precision_recall_f1(
        preds_embeddings, target_embeddings, preds_idf_scale, target_idf_scale)

    if baseline is not None:
        precision, recall, f1_score = _rescale_metrics_with_baseline(
            precision, recall, f1_score, baseline, num_layers, all_layers)

    output_dict = {
        "precision": precision.tolist(),
        "recall": recall.tolist(),
        "f1": f1_score.tolist(),
    }
    if return_hash:
        output_dict.update(
            {"hash": _get_hash(model_name_or_path, num_layers, idf)})
    return output_dict
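# A hedged sketch of the pre-tokenized input form accepted above, i.e. a
# ``Dict[input_ids, attention_mask]`` instead of raw sentences; the
# tokenizer/checkpoint names are illustrative.
from transformers import AutoTokenizer

tok = AutoTokenizer.from_pretrained("roberta-large")
preds_enc = dict(tok(["hello there", "general kenobi"], padding=True, return_tensors="pt"))
target_enc = dict(tok(["hello there", "master kenobi"], padding=True, return_tensors="pt"))
score = bert_score(preds_enc, target_enc, model_name_or_path="roberta-large")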