def test_new_tokenizer_fast_registration(self):
    # Registers a custom slow/fast tokenizer pair for CustomConfig and checks
    # that AutoTokenizer resolves to the right class after a save/load cycle.
    try:
        AutoConfig.register("custom", CustomConfig)

        # Two-step registration: the slow class first, the fast class afterwards.
        AutoTokenizer.register(CustomConfig, slow_tokenizer_class=CustomTokenizer)
        self.assertEqual(TOKENIZER_MAPPING[CustomConfig], (CustomTokenizer, None))
        AutoTokenizer.register(CustomConfig, fast_tokenizer_class=CustomTokenizerFast)
        both_classes = (CustomTokenizer, CustomTokenizerFast)
        self.assertEqual(TOKENIZER_MAPPING[CustomConfig], both_classes)

        # Drop the entry so the single-call registration starts from scratch.
        del TOKENIZER_MAPPING._extra_content[CustomConfig]

        # One-step registration: both classes in a single call.
        AutoTokenizer.register(
            CustomConfig,
            slow_tokenizer_class=CustomTokenizer,
            fast_tokenizer_class=CustomTokenizerFast,
        )
        self.assertEqual(TOKENIZER_MAPPING[CustomConfig], both_classes)

        # Registering a config that already exists in the Transformers
        # library must raise an error.
        with self.assertRaises(ValueError):
            AutoTokenizer.register(BertConfig, fast_tokenizer_class=BertTokenizerFast)

        # Go through a fast BERT tokenizer: there is no slow-to-fast converter
        # for the new tokenizer and that model does not ship a tokenizer.json.
        with tempfile.TemporaryDirectory() as seed_dir:
            seed_tokenizer = BertTokenizerFast.from_pretrained(SMALL_MODEL_IDENTIFIER)
            seed_tokenizer.save_pretrained(seed_dir)
            custom_fast = CustomTokenizerFast.from_pretrained(seed_dir)

        with tempfile.TemporaryDirectory() as export_dir:
            custom_fast.save_pretrained(export_dir)

            reloaded = AutoTokenizer.from_pretrained(export_dir)
            self.assertIsInstance(reloaded, CustomTokenizerFast)

            reloaded = AutoTokenizer.from_pretrained(export_dir, use_fast=False)
            self.assertIsInstance(reloaded, CustomTokenizer)
    finally:
        # Undo the registrations so other tests see pristine mappings.
        for registry, key in ((CONFIG_MAPPING, "custom"), (TOKENIZER_MAPPING, CustomConfig)):
            if key in registry._extra_content:
                del registry._extra_content[key]
def test_configuration_not_found(self):
    # A repo without a config.json must surface an EnvironmentError whose
    # message names the missing file.
    repo_id = "hf-internal-testing/no-config-test-repo"
    expected_message = (
        "hf-internal-testing/no-config-test-repo does not appear to have a file named config.json."
    )
    with self.assertRaisesRegex(EnvironmentError, expected_message):
        AutoConfig.from_pretrained(repo_id)
def test_tokenizer_from_tokenizer_class(self):
    # The checkpoint's config is a RoBERTa config while its tokenizer is a
    # BERT one, i.e. tokenizer type differs from model type.
    checkpoint = DUMMY_DIFF_TOKENIZER_IDENTIFIER

    loaded_config = AutoConfig.from_pretrained(checkpoint)
    self.assertIsInstance(loaded_config, RobertaConfig)

    loaded_tokenizer = AutoTokenizer.from_pretrained(checkpoint, config=loaded_config)
    accepted_classes = (BertTokenizer, BertTokenizerFast)
    self.assertIsInstance(loaded_tokenizer, accepted_classes)
    self.assertEqual(loaded_tokenizer.vocab_size, 12)
def test_revision_not_found(self):
    # An unknown revision must be reported as an invalid git identifier.
    expected_pattern = r"aaaaaa is not a valid git identifier \(branch name, tag name or commit id\)"
    with self.assertRaisesRegex(EnvironmentError, expected_pattern):
        AutoConfig.from_pretrained(DUMMY_UNKNOWN_IDENTIFIER, revision="aaaaaa")
def test_pattern_matching_fallback(self):
    # With an empty config.json the config class is inferred from the folder
    # name alone.
    with tempfile.TemporaryDirectory() as scratch_dir:
        # This name contains both "bert" and "roberta"; roberta is expected
        # to be the one picked.
        model_dir = os.path.join(scratch_dir, "fake-roberta")
        os.makedirs(model_dir, exist_ok=True)

        config_path = os.path.join(model_dir, "config.json")
        with open(config_path, "w") as handle:
            json.dump({}, handle)

        resolved = AutoConfig.from_pretrained(model_dir)
        self.assertEqual(type(resolved), RobertaConfig)
def test_new_tokenizer_registration(self):
    # Registers a new slow tokenizer for NewConfig and checks AutoTokenizer
    # returns it after a save/load cycle.
    try:
        AutoConfig.register("new-model", NewConfig)
        AutoTokenizer.register(NewConfig, slow_tokenizer_class=NewTokenizer)

        # Registering a config that already exists in the Transformers
        # library must raise an error.
        with self.assertRaises(ValueError):
            AutoTokenizer.register(BertConfig, slow_tokenizer_class=BertTokenizer)

        source_tokenizer = NewTokenizer.from_pretrained(SMALL_MODEL_IDENTIFIER)
        with tempfile.TemporaryDirectory() as export_dir:
            source_tokenizer.save_pretrained(export_dir)
            reloaded = AutoTokenizer.from_pretrained(export_dir)
            self.assertIsInstance(reloaded, NewTokenizer)
    finally:
        # Undo the registrations so other tests see pristine mappings.
        for registry, key in ((CONFIG_MAPPING, "new-model"), (TOKENIZER_MAPPING, NewConfig)):
            if key in registry._extra_content:
                del registry._extra_content[key]
def test_new_config_registration(self):
    # Registers CustomConfig under "custom" and checks that AutoConfig can
    # load it back from disk.
    try:
        AutoConfig.register("custom", CustomConfig)

        rejected_registrations = (
            # Wrong model type will raise an error
            ("model", CustomConfig),
            # Something already existing in the Transformers library will raise an error
            ("bert", BertConfig),
        )
        for model_type, config_class in rejected_registrations:
            with self.assertRaises(ValueError):
                AutoConfig.register(model_type, config_class)

        # Once registered, the config works with the auto-API like any other.
        original = CustomConfig()
        with tempfile.TemporaryDirectory() as export_dir:
            original.save_pretrained(export_dir)
            reloaded = AutoConfig.from_pretrained(export_dir)
            self.assertIsInstance(reloaded, CustomConfig)
    finally:
        # Undo the registration so other tests see a pristine mapping.
        extra_configs = CONFIG_MAPPING._extra_content
        if "custom" in extra_configs:
            del extra_configs["custom"]
def test_repo_not_found(self):
    # An identifier that is neither a local folder nor a known model must be
    # rejected with an EnvironmentError.
    expected_message = "bert-base is not a local folder and is not a valid model identifier"
    with self.assertRaisesRegex(EnvironmentError, expected_message):
        AutoConfig.from_pretrained("bert-base")
def test_config_for_model_str(self):
    # `for_model` with the "roberta" model type must yield a RobertaConfig.
    roberta_config = AutoConfig.for_model("roberta")
    self.assertIsInstance(roberta_config, RobertaConfig)
def test_config_model_type_from_model_identifier(self):
    # Loading from the dummy model identifier must resolve to a RobertaConfig.
    resolved = AutoConfig.from_pretrained(DUMMY_UNKNOWN_IDENTIFIER)
    self.assertIsInstance(resolved, RobertaConfig)
def test_config_model_type_from_local_file(self):
    # Loading from the sample local config file must resolve to a RobertaConfig.
    resolved = AutoConfig.from_pretrained(SAMPLE_ROBERTA_CONFIG)
    self.assertIsInstance(resolved, RobertaConfig)
def test_config_from_model_shortcut(self):
    # The "bert-base-uncased" shortcut name must resolve to a BertConfig.
    shortcut_name = "bert-base-uncased"
    resolved = AutoConfig.from_pretrained(shortcut_name)
    self.assertIsInstance(resolved, BertConfig)
configuration = config.Configuration( model_parameters=model_config, model=args.model, save_path=args.output_dir, sequence_max_len=args.seq_len, batch_size=args.batch_size, epochs=args.epochs, device=torch.device(args.device), tokenizer=tokenizer, ) valid_data_loader = SmartParaphraseDataloader.build_batches( valid_dataset, 16, mode="sequence", config=configuration) autoconfig = AutoConfig.from_pretrained( args.pretrained_model_path, output_attentions=True, ) autoconfig.num_labels = len(LABELS_TO_ID) model = AutoModelForSequenceClassification.from_pretrained( args.pretrained_model_path, config=autoconfig) """ model = TransformerWrapper.load_pretrained( args.pretrained_model_path, params=configuration, pooler = BertPoolingStrategy(configuration), loss = SoftmaxLoss(configuration)) model_config = config.ModelParameters( model_name = args.config_name, hidden_size = args.embed_dim,
def pipeline(
    task: str,
    model: Optional = None,
    config: Optional[Union[str, PretrainedConfig]] = None,
    tokenizer: Optional[Union[str, PreTrainedTokenizer]] = None,
    framework: Optional[str] = None,
    revision: Optional[str] = None,
    onnx_model_dir: Optional[str] = None,
    use_fast: bool = True,
    model_kwargs: Optional[Dict[str, Any]] = None,
    onnx: bool = True,
    optimization_level: str = 'all',
    **kwargs
) -> Pipeline:
    """
    Utility factory method to build a :class:`~transformers.Pipeline`.

    Pipelines are made of:

        - A :doc:`tokenizer <tokenizer>` in charge of mapping raw textual input to token.
        - A :doc:`model <model>` to make predictions from the inputs.
        - Some (optional) post processing for enhancing model's output.

    Args:
        task (:obj:`str`):
            The task defining which pipeline will be returned. Currently accepted tasks are:

            - :obj:`"feature-extraction"`: will return a :class:`~transformers.FeatureExtractionPipeline`.
            - :obj:`"sentiment-analysis"`: will return a :class:`~transformers.TextClassificationPipeline`.
            - :obj:`"ner"`: will return a :class:`~transformers.TokenClassificationPipeline`.
            - :obj:`"question-answering"`: will return a :class:`~transformers.QuestionAnsweringPipeline`.
            - :obj:`"fill-mask"`: will return a :class:`~transformers.FillMaskPipeline`.
            - :obj:`"summarization"`: will return a :class:`~transformers.SummarizationPipeline`.
            - :obj:`"translation_xx_to_yy"`: will return a :class:`~transformers.TranslationPipeline`.
            - :obj:`"text2text-generation"`: will return a :class:`~transformers.Text2TextGenerationPipeline`.
            - :obj:`"text-generation"`: will return a :class:`~transformers.TextGenerationPipeline`.
            - :obj:`"zero-shot-classification"`: will return a :class:`~transformers.ZeroShotClassificationPipeline`.
            - :obj:`"conversational"`: will return a :class:`~transformers.ConversationalPipeline`.
        model (:obj:`str` or :obj:`~transformers.PreTrainedModel` or :obj:`~transformers.TFPreTrainedModel`, `optional`):
            The model that will be used by the pipeline to make predictions. This can be a model identifier or an
            actual instance of a pretrained model inheriting from :class:`~transformers.PreTrainedModel` (for PyTorch)
            or :class:`~transformers.TFPreTrainedModel` (for TensorFlow).

            If not provided, the default for the :obj:`task` will be loaded.
        config (:obj:`str` or :obj:`~transformers.PretrainedConfig`, `optional`):
            The configuration that will be used by the pipeline to instantiate the model. This can be a model
            identifier or an actual pretrained model configuration inheriting from
            :class:`~transformers.PretrainedConfig`.

            If not provided, the default configuration file for the requested model will be used. That means that if
            :obj:`model` is given, its default configuration will be used. However, if :obj:`model` is not supplied,
            this :obj:`task`'s default model's config is used instead.
        tokenizer (:obj:`str` or :obj:`~transformers.PreTrainedTokenizer`, `optional`):
            The tokenizer that will be used by the pipeline to encode data for the model. This can be a model
            identifier or an actual pretrained tokenizer inheriting from :class:`~transformers.PreTrainedTokenizer`.

            If not provided, the default tokenizer for the given :obj:`model` will be loaded (if it is a string). If
            :obj:`model` is not specified or not a string, then the default tokenizer for :obj:`config` is loaded (if
            it is a string). However, if :obj:`config` is also not given or not a string, then the default tokenizer
            for the given :obj:`task` will be loaded.
        framework (:obj:`str`, `optional`):
            The framework to use, either :obj:`"pt"` for PyTorch or :obj:`"tf"` for TensorFlow. The specified framework
            must be installed.

            If no framework is specified, will default to the one currently installed. If no framework is specified
            and both frameworks are installed, will default to the framework of the :obj:`model`, or to PyTorch if no
            model is provided.
        revision (:obj:`str`, `optional`, defaults to :obj:`"main"`):
            When passing a task name or a string model identifier: The specific model version to use. It can be a
            branch name, a tag name, or a commit id, since we use a git-based system for storing models and other
            artifacts on huggingface.co, so ``revision`` can be any identifier allowed by git.
        onnx_model_dir (:obj:`str`, `optional`):
            Directory under which the ONNX graph path is built. If not provided, a ``.onnx`` folder located in the
            parent of the directory containing this module is used.
        use_fast (:obj:`bool`, `optional`, defaults to :obj:`True`):
            Whether or not to use a Fast tokenizer if possible (a :class:`~transformers.PreTrainedTokenizerFast`).
        model_kwargs (:obj:`Dict[str, Any]`, `optional`):
            Additional dictionary of keyword arguments passed along to the model's :obj:`from_pretrained(...,
            **model_kwargs)` function. The dictionary is copied, so the caller's object is never modified.
        onnx (:obj:`bool`, `optional`, defaults to :obj:`True`):
            Forwarded to the framework inference helper and to the pipeline class. When :obj:`True` and the graph
            file already exists at the computed graph path, the model is not instantiated by this function.
        optimization_level (:obj:`str`, `optional`, defaults to :obj:`'all'`):
            Forwarded unchanged to the pipeline class.
        kwargs:
            Additional keyword arguments passed along to the specific pipeline init (see the documentation for the
            corresponding pipeline class for possible values).

    Returns:
        :class:`~transformers.Pipeline`: A suitable pipeline for the task.

    Raises:
        Exception: If no tokenizer is given and neither :obj:`model` nor :obj:`config` is a string.
        ValueError: If the selected framework is not supported by the pipeline of the requested task.

    Examples::

        >>> from transformers import pipeline, AutoModelForTokenClassification, AutoTokenizer

        >>> # Sentiment analysis pipeline
        >>> pipeline('sentiment-analysis')

        >>> # Question answering pipeline, specifying the checkpoint identifier
        >>> pipeline('question-answering', model='distilbert-base-cased-distilled-squad', tokenizer='bert-base-cased')

        >>> # Named entity recognition pipeline, passing in a specific model and tokenizer
        >>> model = AutoModelForTokenClassification.from_pretrained("dbmdz/bert-large-cased-finetuned-conll03-english")
        >>> tokenizer = AutoTokenizer.from_pretrained("bert-base-cased")
        >>> pipeline('ner', model=model, tokenizer=tokenizer)
    """
    # Work on a private copy: this function adds `from_tf` / `from_pt` below,
    # which must not leak into the caller's dict or into later calls.
    model_kwargs = dict(model_kwargs) if model_kwargs else {}

    # Retrieve the task
    targeted_task, task_options = check_task(task)

    # Use default model/config/tokenizer for the task if no model is provided
    if model is None:
        # At that point framework might still be undetermined
        model = get_default_model(targeted_task, framework, task_options)

    # Try to infer tokenizer from model or config name (if provided as str)
    if tokenizer is None:
        if isinstance(model, str):
            tokenizer = model
        elif isinstance(config, str):
            tokenizer = config
        else:
            # Impossible to guess what is the right tokenizer here
            raise Exception(
                "Impossible to guess which tokenizer to use. "
                "Please provided a PreTrainedTokenizer class or a path/identifier to a pretrained tokenizer."
            )

    modelcard = None
    # Try to infer modelcard from model or config name (if provided as str)
    if isinstance(model, str):
        modelcard = model
    elif isinstance(config, str):
        modelcard = config

    # Instantiate config
    if isinstance(config, str):
        config = AutoConfig.from_pretrained(config, revision=revision)
    elif config is None:
        config = AutoConfig.from_pretrained(model, revision=revision)

    # Location of the ONNX graph for this (model, task) pair.
    if onnx_model_dir:
        ONNX_CACHE_DIR = Path(onnx_model_dir)
    else:
        ONNX_CACHE_DIR = Path(os.path.dirname(__file__)).parent.joinpath(".onnx")
    # NOTE(review): both calls below need `model` to be a string/path; a model
    # instance passed by the caller would fail here - confirm intended usage.
    graph_name = f"{os.path.basename(model)}_{task}.onnx"
    graph_path = ONNX_CACHE_DIR.joinpath(model, graph_name)

    # Infer the framework from the model
    if framework is None:
        framework, model = infer_framework_from_model(
            model, targeted_task, revision=revision, task=task, onnx=onnx, graph_path=graph_path
        )

    task_class, model_class = targeted_task["impl"], targeted_task[framework]

    # Instantiate tokenizer if needed
    if isinstance(tokenizer, (str, tuple)):
        if isinstance(tokenizer, tuple):
            # For tuple we have (tokenizer name, {kwargs})
            use_fast = tokenizer[1].pop("use_fast", use_fast)
            tokenizer = AutoTokenizer.from_pretrained(
                tokenizer[0], use_fast=use_fast, revision=revision, **tokenizer[1]
            )
        else:
            tokenizer = AutoTokenizer.from_pretrained(
                tokenizer, revision=revision, use_fast=use_fast
            )

    # Instantiate model if needed: always without ONNX, and with ONNX only
    # when no graph file exists yet at `graph_path`.
    if not onnx or not os.path.exists(graph_path):
        if isinstance(model, str):
            # Handle transparent TF/PT model conversion
            if framework == "pt" and model.endswith(".h5"):
                model_kwargs["from_tf"] = True
                logger.warning(
                    "Model might be a TensorFlow model (ending with `.h5`) but TensorFlow is not available. "
                    "Trying to load the model with PyTorch."
                )
            elif framework == "tf" and model.endswith(".bin"):
                model_kwargs["from_pt"] = True
                logger.warning(
                    "Model might be a PyTorch model (ending with `.bin`) but PyTorch is not available. "
                    "Trying to load the model with Tensorflow."
                )

            if model_class is None:
                raise ValueError(
                    f"Pipeline using {framework} framework, but this framework is not supported by this pipeline."
                )

            model = model_class.from_pretrained(
                model, config=config, revision=revision, **model_kwargs
            )
            # A bare "translation" task is replaced by the first
            # "translation*" entry found in the model's task-specific params.
            if task == "translation" and model.config.task_specific_params:
                for key in model.config.task_specific_params:
                    if key.startswith("translation"):
                        task = key
                        warnings.warn(
                            f'"translation" task was used, instead of "translation_XX_to_YY", defaulting to "{task}"',
                            UserWarning,
                        )
                        break

    return task_class(
        model=model,
        tokenizer=tokenizer,
        config=config,
        modelcard=modelcard,
        framework=framework,
        task=task,
        onnx=onnx,
        graph_path=graph_path,
        optimization_level=optimization_level,
        **kwargs
    )
def load(
    tag: t.Union[str, Tag],
    from_tf: bool = False,
    from_flax: bool = False,
    *,
    return_config: bool = False,
    model_store: "ModelStore" = Provide[BentoMLContainer.model_store],
    **kwargs: t.Any,
) -> t.Union[
    "ext.TransformersPipeline",
    t.Tuple[
        "ext.PretrainedConfig",
        "ext.TransformersModelType",
        t.Union["ext.TransformersTokenizerType", "ext.PreTrainedFeatureExtractor"],
    ],
    t.Tuple[
        None,
        "ext.TransformersModelType",
        t.Union["ext.TransformersTokenizerType", "ext.PreTrainedFeatureExtractor"],
    ],
]:
    """
    Load a model from BentoML local modelstore with given name.

    Args:
        tag (:code:`Union[str, Tag]`):
            Tag of a saved model in BentoML local modelstore.
        model_store (:mod:`~bentoml._internal.models.store.ModelStore`, default to :mod:`BentoMLContainer.model_store`):
            BentoML modelstore, provided by DI Container.
        from_tf (:code:`bool`, `optional`, defaults to :code:`False`):
            Load the model weights from a TensorFlow checkpoint save file.
        from_flax (:code:`bool`, `optional`, defaults to :code:`False`):
            Load the model weights from a Flax checkpoint save file
        return_config (:code:`bool`, `optional`, default to :code:`False`):
            Whether or not to return configuration of the Transformers model.
        config_kwargs (:code:`Dict[str, Any]`, `optional`):
            Kwargs to pass into :code:`Config` object.
        model_kwargs (:code:`Dict[str, Any]`, `optional`):
            Kwargs to pass into :code:`Model` object.
        tokenizer_kwargs (:code:`Dict[str, Any]`, `optional`):
            Kwargs to pass into :code:`Tokenizer` object.
        feature_extractor_kwargs (:code:`Dict[str, Any]`, `optional`):
            Kwargs to pass into :code:`FeatureExtractor` object.
        kwargs (:code:`Dict[str, Any]`, `optional`):
            Other kwargs that can be parsed to transformers that is neither configs, model, tokenizer, and feature extractor.

    .. warning::
        Make sure to add the corresponding kwargs for your Transformers :code:`Model`, :code:`Tokenizer`, :code:`Config`, :code:`FeatureExtractor` to the correct kwargs dict.

    .. warning::
        Currently :code:`kwargs` accepts all kwargs for corresponding Pipeline.

    Returns:
        :obj:`Union[Pipeline, Tuple[Optional[PretrainedConfig], Union[PreTrainedModel, TFPreTrainedModel, FlaxPreTrainedModel], Optional[Union[PreTrainedTokenizer, PreTrainedTokenizerFast, PreTrainedFeatureExtractor]]]]`:
            either a pipeline, or a 3-tuple of the :obj:`PretrainedConfig` (:obj:`None` unless :code:`return_config` is set), the :obj:`Model` class object defined by :obj:`transformers`, and the :obj:`Tokenizer` (or, when no tokenizer was saved, the :obj:`FeatureExtractor`) for the given model saved in BentoML modelstore.

    Raises:
        BentoMLException: If the stored model was saved by a different module.

    Examples:

    .. code-block:: python

        import bentoml
        _, model, tokenizer = bentoml.transformers.load('custom_gpt2')

    If you want to returns an config object:

    .. code-block:: python

        import bentoml
        config, model, tokenizer = bentoml.transformers.load('custom_gpt2', return_config=True, tokenizer_kwargs={"use_fast":True})

    If the pipeline is saved with :code:`bentoml.transformers.save()`, then :code:`load()` will return pipeline objects:

    .. code-block:: python

        import bentoml
        pipeline = bentoml.transformers.load("roberta_text_classification", return_all_scores=True)
    """  # noqa
    check_flax_supported()  # pragma: no cover

    model = model_store.get(tag)
    if model.info.module not in (MODULE_NAME, __name__):
        raise BentoMLException(
            f"Model {tag} was saved with module {model.info.module}, failed loading with {MODULE_NAME}."
        )

    # Saved pipelines are rebuilt directly; every remaining kwarg goes to the pipeline.
    if model.info.context["pipeline"]:
        _tasks = model.info.context["task"]
        return transformers.pipeline(_tasks, model.path, **kwargs)

    config_kwargs = kwargs.pop("config_kwargs", {})
    config: "ext.PretrainedConfig" = AutoConfig.from_pretrained(
        model.path, **config_kwargs
    )
    model_kwargs = kwargs.pop("model_kwargs", {})

    # Class names (or False when the component was not saved) recorded at save time.
    _model, _tokenizer = (
        model.info.options["model"],
        model.info.options["tokenizer"],
    )
    _feature_extractor = model.info.options["feature_extractor"]

    if _tokenizer is False:
        tokenizer: t.Optional["ext.TransformersTokenizerType"] = None
    else:
        tokenizer_kwargs = kwargs.pop("tokenizer_kwargs", {})
        # NOTE(review): `from_tf` / `from_flax` are documented as model-weight
        # options but are forwarded to the tokenizer here - confirm intended.
        tokenizer = getattr(import_module("transformers"), _tokenizer).from_pretrained(
            model.path, from_tf=from_tf, from_flax=from_flax, **tokenizer_kwargs
        )

    if _feature_extractor is False:
        feature_extractor: t.Optional["ext.PreTrainedFeatureExtractor"] = None
    else:
        # The key previously carried a trailing space, so user-supplied
        # feature_extractor_kwargs were never picked up.
        feature_extractor_kwargs = kwargs.pop("feature_extractor_kwargs", {})
        feature_extractor = getattr(
            import_module("transformers"), _feature_extractor
        ).from_pretrained(model.path, **feature_extractor_kwargs)

    # The tokenizer takes precedence; the feature extractor is the fallback.
    tfe = tokenizer if tokenizer is not None else feature_extractor

    tmodel: "ext.TransformersModelType" = getattr(
        import_module("transformers"), _model
    ).from_pretrained(  # type: ignore[reportUnknownMemberType]
        model.path,
        config=config,
        **model_kwargs,
    )

    if return_config:
        return config, tmodel, tfe  # type: ignore
    return None, tmodel, tfe  # type: ignore
def test_from_pretrained_dynamic_config(self):
    # With trust_remote_code enabled, the loaded config must be an instance
    # of a class named "NewModelConfig".
    repo_id = "hf-internal-testing/test_dynamic_model"
    dynamic_config = AutoConfig.from_pretrained(repo_id, trust_remote_code=True)
    self.assertEqual(type(dynamic_config).__name__, "NewModelConfig")
def _build_encoder_config(self, config: Config):
    """Build the encoder config from ``config.bert_model_name``.

    Every entry of ``config`` (converted to a plain container via
    ``OmegaConf.to_container``) is forwarded as a keyword argument to
    ``AutoConfig.from_pretrained``.
    """
    pretrained_name = config.bert_model_name
    overrides = OmegaConf.to_container(config)
    return AutoConfig.from_pretrained(pretrained_name, **overrides)
def main(args):
    """Fine-tune a sequence-classification model on one GLUE task.

    Reads from ``args``: ``task_name``, ``output_dir``, ``wsc_trick``,
    ``model_name_or_path``, ``max_seq_length``, ``double_unordered``,
    ``num_workers``, ``do_train``, ``weight_decay``, ``device``,
    ``learning_rate``, ``layer_lr_decay``, ``adam_bias_correction``,
    ``schedule`` and ``seed``.

    Datasets are cached under ``<output_dir>/datasets`` and a
    ``<output_dir>/checkpoints`` folder is created. When ``args.do_train`` is
    set, the learner is fitted and saved, and its recorded metrics are written
    to ``<output_dir>/eval_results_<task>.txt``.
    """
    task = GLUE_TASKS[args.task_name]

    # prepare output dirs
    # `exist_ok=True` keeps creation safe if another process makes the
    # directory between the check and the call.
    if not os.path.isdir(args.output_dir):
        print('Prepare output dir "{}"'.format(args.output_dir))
        os.makedirs(args.output_dir, exist_ok=True)

    # the datasets will be stored in <args_output_dir>/datasets/
    datasets_cache_dir = os.path.join(args.output_dir, "datasets")
    os.makedirs(datasets_cache_dir, exist_ok=True)

    # the checkpoints will be stored in <args_output_dir>/checkpoints/
    checkpoints_dir = os.path.join(args.output_dir, "checkpoints")
    os.makedirs(checkpoints_dir, exist_ok=True)

    # task params
    tasks_param = setup_tasks(args.wsc_trick)

    # tokenizer & model
    print('Loading weights and tokenizer from "{}"'.format(args.model_name_or_path))
    config = AutoConfig.from_pretrained(args.model_name_or_path)
    tokenizer = AutoTokenizer.from_pretrained(args.model_name_or_path)
    model = AutoModelForSequenceClassification.from_pretrained(args.model_name_or_path)
    print("Model config: ", config)

    # datasets setup
    dataloaders = store_datasets(
        task,
        tasks_param,
        hf_tokenizer=tokenizer,
        config=config,
        cache_dir=datasets_cache_dir,
        max_length=args.max_seq_length,
        double_unordered=args.double_unordered,
        num_workers=args.num_workers,
    )

    # finetune
    if args.do_train:
        learn, fit_fc = get_glue_learner(
            task,
            tasks_param,
            config,
            model,
            dataloaders,
            args.weight_decay,
            tokenizer,
            args.wsc_trick,
            args.device,
            args.learning_rate,
            args.layer_lr_decay,
            args.adam_bias_correction,
            args.schedule,
            seed=args.seed,
            run_name=args.output_dir,
            inference=True,
        )
        fit_fc()
        learn.save(f"{task}_{args.seed}")

        # save measures
        # `learn` is only bound when training ran, so the results are written
        # inside this branch.
        measures = [
            (measure, str(learn.recorder.log[i]))
            for i, measure in enumerate(learn.recorder.metric_names)
        ]
        print('Saving in "{}"'.format(args.output_dir))
        results_path = os.path.join(args.output_dir, "eval_results_{}.txt".format(task))
        with open(results_path, "w") as f:
            f.write("\n".join(" = ".join(m) for m in measures))