def from_pretrained(cls, pretrained_model_name_or_path, **kwargs): r""" Instantiate a :class:`~pytorch_transformers.PretrainedConfig` (or a derived class) from a pre-trained model configuration. Parameters: pretrained_model_name_or_path: either: - a string with the `shortcut name` of a pre-trained model configuration to load from cache or download, e.g.: ``bert-base-uncased``. - a path to a `directory` containing a configuration file saved using the :func:`~pytorch_transformers.PretrainedConfig.save_pretrained` method, e.g.: ``./my_model_directory/``. - a path or url to a saved configuration JSON `file`, e.g.: ``./my_model_directory/configuration.json``. cache_dir: (`optional`) string: Path to a directory in which a downloaded pre-trained model configuration should be cached if the standard cache should not be used. kwargs: (`optional`) dict: key/value pairs with which to update the configuration object after loading. - The values in kwargs of any keys which are configuration attributes will be used to override the loaded values. - Behavior concerning key/value pairs whose keys are *not* configuration attributes is controlled by the `return_unused_kwargs` keyword parameter. force_download: (`optional`) boolean, default False: Force to (re-)download the model weights and configuration files and override the cached versions if they exists. proxies: (`optional`) dict, default None: A dictionary of proxy servers to use by protocol or endpoint, e.g.: {'http': 'foo.bar:3128', 'http://hostname': 'foo.bar:4012'}. The proxies are used on each request. return_unused_kwargs: (`optional`) bool: - If False, then this function returns just the final configuration object. - If True, then this functions returns a tuple `(config, unused_kwargs)` where `unused_kwargs` is a dictionary consisting of the key/value pairs whose keys are not configuration attributes: ie the part of kwargs which has not been used to update `config` and is otherwise ignored. Examples:: # We can't instantiate directly the base class `PretrainedConfig` so let's show the examples on a # derived class: BertConfig config = BertConfig.from_pretrained('bert-base-uncased') # Download configuration from S3 and cache. config = BertConfig.from_pretrained('./test/saved_model/') # E.g. config (or model) was saved using `save_pretrained('./test/saved_model/')` config = BertConfig.from_pretrained('./test/saved_model/my_configuration.json') config = BertConfig.from_pretrained('bert-base-uncased', output_attention=True, foo=False) assert config.output_attention == True config, unused_kwargs = BertConfig.from_pretrained('bert-base-uncased', output_attention=True, foo=False, return_unused_kwargs=True) assert config.output_attention == True assert unused_kwargs == {'foo': False} """ cache_dir = kwargs.pop('cache_dir', None) force_download = kwargs.pop('force_download', False) proxies = kwargs.pop('proxies', None) return_unused_kwargs = kwargs.pop('return_unused_kwargs', False) if pretrained_model_name_or_path in cls.pretrained_config_archive_map: config_file = cls.pretrained_config_archive_map[ pretrained_model_name_or_path] elif os.path.isdir(pretrained_model_name_or_path): config_file = os.path.join(pretrained_model_name_or_path, CONFIG_NAME) else: config_file = pretrained_model_name_or_path # redirect to the cache, if necessary try: resolved_config_file = cached_path(config_file, cache_dir=cache_dir, force_download=force_download, proxies=proxies) except EnvironmentError as e: if pretrained_model_name_or_path in cls.pretrained_config_archive_map: logger.error( "Couldn't reach server at '{}' to download pretrained model configuration file." .format(config_file)) else: logger.error( "Model name '{}' was not found in model name list ({}). " "We assumed '{}' was a path or url but couldn't find any file " "associated to this path or url.".format( pretrained_model_name_or_path, ', '.join(cls.pretrained_config_archive_map.keys()), config_file)) raise e if resolved_config_file == config_file: logger.info("loading configuration file {}".format(config_file)) else: logger.info( "loading configuration file {} from cache at {}".format( config_file, resolved_config_file)) # Load config config = cls.from_json_file(resolved_config_file) # Update config with kwargs if needed to_remove = [] for key, value in kwargs.items(): if hasattr(config, key): setattr(config, key, value) to_remove.append(key) for key in to_remove: kwargs.pop(key, None) logger.info("Model config %s", config) if return_unused_kwargs: return config, kwargs else: return config
def from_pretrained(cls, pretrained_model_name_or_path, *model_args, **kwargs): r"""Instantiate a pretrained pytorch model from a pre-trained model configuration. The model is set in evaluation mode by default using ``model.eval()`` (Dropout modules are deactivated) To train the model, you should first set it back in training mode with ``model.train()`` The warning ``Weights from XXX not initialized from pretrained model`` means that the weights of XXX do not come pre-trained with the rest of the model. It is up to you to train those weights with a downstream fine-tuning task. The warning ``Weights from XXX not used in YYY`` means that the layer XXX is not used by YYY, therefore those weights are discarded. Parameters: pretrained_model_name_or_path: either: - a string with the `shortcut name` of a pre-trained model to load from cache or download, e.g.: ``bert-base-uncased``. - a path to a `directory` containing model weights saved using :func:`~pytorch_transformers.PreTrainedModel.save_pretrained`, e.g.: ``./my_model_directory/``. - a path or url to a `tensorflow index checkpoint file` (e.g. `./tf_model/model.ckpt.index`). In this case, ``from_tf`` should be set to True and a configuration object should be provided as ``config`` argument. This loading path is slower than converting the TensorFlow checkpoint in a PyTorch model using the provided conversion scripts and loading the PyTorch model afterwards. model_args: (`optional`) Sequence of positional arguments: All remaning positional arguments will be passed to the underlying model's ``__init__`` method config: (`optional`) instance of a class derived from :class:`~pytorch_transformers.PretrainedConfig`: Configuration for the model to use instead of an automatically loaded configuation. Configuration can be automatically loaded when: - the model is a model provided by the library (loaded with the ``shortcut-name`` string of a pretrained model), or - the model was saved using :func:`~pytorch_transformers.PreTrainedModel.save_pretrained` and is reloaded by suppling the save directory. - the model is loaded by suppling a local directory as ``pretrained_model_name_or_path`` and a configuration JSON file named `config.json` is found in the directory. state_dict: (`optional`) dict: an optional state dictionnary for the model to use instead of a state dictionary loaded from saved weights file. This option can be used if you want to create a model from a pretrained configuration but load your own weights. In this case though, you should check if using :func:`~pytorch_transformers.PreTrainedModel.save_pretrained` and :func:`~pytorch_transformers.PreTrainedModel.from_pretrained` is not a simpler option. cache_dir: (`optional`) string: Path to a directory in which a downloaded pre-trained model configuration should be cached if the standard cache should not be used. force_download: (`optional`) boolean, default False: Force to (re-)download the model weights and configuration files and override the cached versions if they exists. proxies: (`optional`) dict, default None: A dictionary of proxy servers to use by protocol or endpoint, e.g.: {'http': 'foo.bar:3128', 'http://hostname': 'foo.bar:4012'}. The proxies are used on each request. output_loading_info: (`optional`) boolean: Set to ``True`` to also return a dictionnary containing missing keys, unexpected keys and error messages. kwargs: (`optional`) Remaining dictionary of keyword arguments: Can be used to update the configuration object (after it being loaded) and initiate the model. (e.g. ``output_attention=True``). Behave differently depending on whether a `config` is provided or automatically loaded: - If a configuration is provided with ``config``, ``**kwargs`` will be directly passed to the underlying model's ``__init__`` method (we assume all relevant updates to the configuration have already been done) - If a configuration is not provided, ``kwargs`` will be first passed to the configuration class initialization function (:func:`~pytorch_transformers.PretrainedConfig.from_pretrained`). Each key of ``kwargs`` that corresponds to a configuration attribute will be used to override said attribute with the supplied ``kwargs`` value. Remaining keys that do not correspond to any configuration attribute will be passed to the underlying model's ``__init__`` function. Examples:: model = BertModel.from_pretrained('bert-base-uncased') # Download model and configuration from S3 and cache. model = BertModel.from_pretrained('./test/saved_model/') # E.g. model was saved using `save_pretrained('./test/saved_model/')` model = BertModel.from_pretrained('bert-base-uncased', output_attention=True) # Update configuration during loading assert model.config.output_attention == True # Loading from a TF checkpoint file instead of a PyTorch model (slower) config = BertConfig.from_json_file('./tf_model/my_tf_model_config.json') model = BertModel.from_pretrained('./tf_model/my_tf_checkpoint.ckpt.index', from_tf=True, config=config) """ config = kwargs.pop('config', None) state_dict = kwargs.pop('state_dict', None) cache_dir = kwargs.pop('cache_dir', None) from_tf = kwargs.pop('from_tf', False) force_download = kwargs.pop('force_download', False) proxies = kwargs.pop('proxies', None) output_loading_info = kwargs.pop('output_loading_info', False) # Load config if config is None: config, model_kwargs = cls.config_class.from_pretrained( pretrained_model_name_or_path, *model_args, cache_dir=cache_dir, return_unused_kwargs=True, force_download=force_download, **kwargs) else: model_kwargs = kwargs # Load model if pretrained_model_name_or_path in cls.pretrained_model_archive_map: archive_file = cls.pretrained_model_archive_map[ pretrained_model_name_or_path] elif os.path.isdir(pretrained_model_name_or_path): if from_tf: # Directly load from a TensorFlow checkpoint archive_file = os.path.join(pretrained_model_name_or_path, TF_WEIGHTS_NAME + ".index") else: archive_file = os.path.join(pretrained_model_name_or_path, WEIGHTS_NAME) else: if from_tf: # Directly load from a TensorFlow checkpoint archive_file = pretrained_model_name_or_path + ".index" else: archive_file = pretrained_model_name_or_path # redirect to the cache, if necessary try: resolved_archive_file = cached_path(archive_file, cache_dir=cache_dir, force_download=force_download, proxies=proxies) except EnvironmentError as e: if pretrained_model_name_or_path in cls.pretrained_model_archive_map: logger.error( "Couldn't reach server at '{}' to download pretrained weights." .format(archive_file)) else: logger.error( "Model name '{}' was not found in model name list ({}). " "We assumed '{}' was a path or url but couldn't find any file " "associated to this path or url.".format( pretrained_model_name_or_path, ', '.join(cls.pretrained_model_archive_map.keys()), archive_file)) raise e if resolved_archive_file == archive_file: logger.info("loading weights file {}".format(archive_file)) else: logger.info("loading weights file {} from cache at {}".format( archive_file, resolved_archive_file)) # Instantiate model. model = cls(config, *model_args, **model_kwargs) if state_dict is None and not from_tf: state_dict = torch.load(resolved_archive_file, map_location='cpu') if from_tf: # Directly load from a TensorFlow checkpoint return cls.load_tf_weights( model, config, resolved_archive_file[:-6]) # Remove the '.index' # Convert old format to new format if needed from a PyTorch state_dict old_keys = [] new_keys = [] for key in state_dict.keys(): new_key = None if 'gamma' in key: new_key = key.replace('gamma', 'weight') if 'beta' in key: new_key = key.replace('beta', 'bias') if new_key: old_keys.append(key) new_keys.append(new_key) for old_key, new_key in zip(old_keys, new_keys): state_dict[new_key] = state_dict.pop(old_key) # Load from a PyTorch state_dict missing_keys = [] unexpected_keys = [] error_msgs = [] # copy state_dict so _load_from_state_dict can modify it metadata = getattr(state_dict, '_metadata', None) state_dict = state_dict.copy() if metadata is not None: state_dict._metadata = metadata def load(module, prefix=''): local_metadata = {} if metadata is None else metadata.get( prefix[:-1], {}) module._load_from_state_dict(state_dict, prefix, local_metadata, True, missing_keys, unexpected_keys, error_msgs) for name, child in module._modules.items(): if child is not None: load(child, prefix + name + '.') # Make sure we are able to load base models as well as derived models (with heads) start_prefix = '' model_to_load = model if not hasattr(model, cls.base_model_prefix) and any( s.startswith(cls.base_model_prefix) for s in state_dict.keys()): start_prefix = cls.base_model_prefix + '.' if hasattr(model, cls.base_model_prefix) and not any( s.startswith(cls.base_model_prefix) for s in state_dict.keys()): model_to_load = getattr(model, cls.base_model_prefix) load(model_to_load, prefix=start_prefix) if len(missing_keys) > 0: logger.info( "Weights of {} not initialized from pretrained model: {}". format(model.__class__.__name__, missing_keys)) if len(unexpected_keys) > 0: logger.info( "Weights from pretrained model not used in {}: {}".format( model.__class__.__name__, unexpected_keys)) if len(error_msgs) > 0: raise RuntimeError( 'Error(s) in loading state_dict for {}:\n\t{}'.format( model.__class__.__name__, "\n\t".join(error_msgs))) if hasattr(model, 'tie_weights'): model.tie_weights( ) # make sure word embedding weights are still tied # Set model in evaluation mode to desactivate DropOut modules by default model.eval() if output_loading_info: loading_info = { "missing_keys": missing_keys, "unexpected_keys": unexpected_keys, "error_msgs": error_msgs } return model, loading_info return model
def from_pretrained(cls, pretrained_model_name_or_path, *inputs, **kwargs): """ Instantiate a BertPreTrainedModel from a pre-trained model file or a pytorch state dict. Download and cache the pre-trained model file if needed. Params: pretrained_model_name_or_path: either: - a str with the name of a pre-trained model to load selected in the list of: . `bert-base-uncased` . `bert-large-uncased` . `bert-base-cased` . `bert-large-cased` . `bert-base-multilingual-uncased` . `bert-base-multilingual-cased` . `bert-base-chinese` - a path or url to a pretrained model archive containing: . `bert_config.json` a configuration file for the model . `pytorch_model.bin` a PyTorch dump of a BertForPreTraining instance - a path or url to a pretrained model archive containing: . `bert_config.json` a configuration file for the model . `model.chkpt` a TensorFlow checkpoint from_tf: should we load the weights from a locally saved TensorFlow checkpoint cache_dir: an optional path to a folder in which the pre-trained models will be cached. state_dict: an optional state dictionnary (collections.OrderedDict object) to use instead of Google pre-trained models *inputs, **kwargs: additional input for the specific Bert class (ex: num_labels for BertForSequenceClassification) """ state_dict = kwargs.get('state_dict', None) kwargs.pop('state_dict', None) cache_dir = kwargs.get('cache_dir', None) kwargs.pop('cache_dir', None) from_tf = kwargs.get('from_tf', False) kwargs.pop('from_tf', None) if pretrained_model_name_or_path in PRETRAINED_MODEL_ARCHIVE_MAP: archive_file = PRETRAINED_MODEL_ARCHIVE_MAP[pretrained_model_name_or_path] else: archive_file = pretrained_model_name_or_path # redirect to the cache, if necessary try: resolved_archive_file = cached_path(archive_file, cache_dir=cache_dir) except EnvironmentError: logger.error( "Model name '{}' was not found in model name list ({}). " "We assumed '{}' was a path or url but couldn't find any file " "associated to this path or url.".format( pretrained_model_name_or_path, ', '.join(PRETRAINED_MODEL_ARCHIVE_MAP.keys()), archive_file)) return None if resolved_archive_file == archive_file: logger.info("loading archive file {}".format(archive_file)) else: logger.info("loading archive file {} from cache at {}".format( archive_file, resolved_archive_file)) tempdir = None if os.path.isdir(resolved_archive_file) or from_tf: serialization_dir = resolved_archive_file else: # Extract archive to temp dir tempdir = tempfile.mkdtemp() logger.info("extracting archive file {} to temp dir {}".format( resolved_archive_file, tempdir)) with tarfile.open(resolved_archive_file, 'r:gz') as archive: archive.extractall(tempdir) serialization_dir = tempdir # Load config config_file = os.path.join(serialization_dir, CONFIG_NAME) if not os.path.exists(config_file): # Backward compatibility with old naming format config_file = os.path.join(serialization_dir, BERT_CONFIG_NAME) config = BertConfig.from_json_file(config_file) logger.info("Model config {}".format(config)) # Instantiate model. model = cls(config, *inputs, **kwargs) if state_dict is None and not from_tf: weights_path = os.path.join(serialization_dir, WEIGHTS_NAME) state_dict = torch.load(weights_path, map_location='cpu') if tempdir: # Clean up temp dir shutil.rmtree(tempdir) if from_tf: # Directly load from a TensorFlow checkpoint weights_path = os.path.join(serialization_dir, TF_WEIGHTS_NAME) return load_tf_weights_in_bert(model, weights_path) # Load from a PyTorch state_dict old_keys = [] new_keys = [] for key in state_dict.keys(): new_key = None if 'gamma' in key: new_key = key.replace('gamma', 'weight') if 'beta' in key: new_key = key.replace('beta', 'bias') if new_key: old_keys.append(key) new_keys.append(new_key) for old_key, new_key in zip(old_keys, new_keys): state_dict[new_key] = state_dict.pop(old_key) missing_keys = [] unexpected_keys = [] error_msgs = [] # copy state_dict so _load_from_state_dict can modify it metadata = getattr(state_dict, '_metadata', None) state_dict = state_dict.copy() if metadata is not None: state_dict._metadata = metadata def load(module, prefix=''): local_metadata = {} if metadata is None else metadata.get(prefix[:-1], {}) module._load_from_state_dict( state_dict, prefix, local_metadata, False, missing_keys, unexpected_keys, error_msgs) for name, child in module._modules.items(): if child is not None: load(child, prefix + name + '.') start_prefix = '' if not hasattr(model, 'bert') and any(s.startswith('bert.') for s in state_dict.keys()): start_prefix = 'bert.' load(model, prefix=start_prefix) if len(missing_keys) > 0: logger.info("Weights of {} not initialized from pretrained model: {}".format( model.__class__.__name__, missing_keys)) if len(unexpected_keys) > 0: logger.info("Weights from pretrained model not used in {}: {}".format( model.__class__.__name__, unexpected_keys)) if len(error_msgs) > 0: raise RuntimeError('Error(s) in loading state_dict for {}:\n\t{}'.format( model.__class__.__name__, "\n\t".join(error_msgs))) return model
from pytorch_pretrained_bert.file_utils import cached_path import os import shutil files = [ 'https://raw.githubusercontent.com/glample/tagger/master/dataset/eng.testa', 'https://raw.githubusercontent.com/glample/tagger/master/dataset/eng.testb', 'https://raw.githubusercontent.com/glample/tagger/master/dataset/eng.train', ] target_dir = 'data/conll2003' os.makedirs(target_dir, exist_ok=True) for fname in files: name = os.path.basename(fname) shutil.copyfile(cached_path(fname), os.path.join(target_dir, name)) print(fname, '=>', name)
def from_pretrained(cls, pretrained_model_name, cache_dir=None, *inputs, **kwargs): """ Instantiate a PreTrainedBertModel from a pre-trained model file. Download and cache the pre-trained model file if needed. Params: pretrained_model_name: either: - a str with the name of a pre-trained model to load selected in the list of: . `bert-base-uncased` . `bert-large-uncased` . `bert-base-cased` . `bert-base-multilingual` . `bert-base-chinese` - a path or url to a pretrained model archive containing: . `bert_config.json` a configuration file for the model . `pytorch_model.bin` a PyTorch dump of a BertForPreTraining instance *inputs, **kwargs: additional input for the specific Bert class (ex: num_labels for BertForSequenceClassification) """ if pretrained_model_name in PRETRAINED_MODEL_ARCHIVE_MAP: archive_file = PRETRAINED_MODEL_ARCHIVE_MAP[pretrained_model_name] else: archive_file = pretrained_model_name # redirect to the cache, if necessary try: resolved_archive_file = cached_path(archive_file, cache_dir=cache_dir) except FileNotFoundError: logger.error( "Model name '{}' was not found in model name list ({}). " "We assumed '{}' was a path or url but couldn't find any file " "associated to this path or url.".format( pretrained_model_name, ', '.join(PRETRAINED_MODEL_ARCHIVE_MAP.keys()), pretrained_model_name)) return None if resolved_archive_file == archive_file: logger.info("loading archive file {}".format(archive_file)) else: logger.info("loading archive file {} from cache at {}".format( archive_file, resolved_archive_file)) tempdir = None if os.path.isdir(resolved_archive_file): serialization_dir = resolved_archive_file else: # Extract archive to temp dir tempdir = tempfile.mkdtemp() logger.info("extracting archive file {} to temp dir {}".format( resolved_archive_file, tempdir)) with tarfile.open(resolved_archive_file, 'r:gz') as archive: archive.extractall(tempdir) serialization_dir = tempdir # Load config config_file = os.path.join(serialization_dir, CONFIG_NAME) config = BertConfig.from_json_file(config_file) logger.info("Model config {}".format(config)) # Instantiate model. model = cls(config, *inputs, **kwargs) weights_path = os.path.join(serialization_dir, WEIGHTS_NAME) state_dict = torch.load(weights_path) missing_keys = [] unexpected_keys = [] error_msgs = [] # copy state_dict so _load_from_state_dict can modify it metadata = getattr(state_dict, '_metadata', None) state_dict = state_dict.copy() if metadata is not None: state_dict._metadata = metadata def load(module, prefix=''): local_metadata = {} if metadata is None else metadata.get( prefix[:-1], {}) module._load_from_state_dict(state_dict, prefix, local_metadata, True, missing_keys, unexpected_keys, error_msgs) for name, child in module._modules.items(): if child is not None: load(child, prefix + name + '.') load(model, prefix='' if hasattr(model, 'bert') else 'bert.') if len(missing_keys) > 0: logger.info( "Weights of {} not initialized from pretrained model: {}". format(model.__class__.__name__, missing_keys)) if len(unexpected_keys) > 0: logger.info( "Weights from pretrained model not used in {}: {}".format( model.__class__.__name__, unexpected_keys)) if tempdir: # Clean up temp dir shutil.rmtree(tempdir) return model
def from_pretrained( cls, pretrained_model_name_or_path, num_special_tokens=None, state_dict=None, cache_dir=None, from_tf=False, *inputs, **kwargs ): """ Instantiate a OpenAIGPTPreTrainedModel from a pre-trained model file or a pytorch state dict. Download and cache the pre-trained model file if needed. Params: pretrained_model_name_or_path: either: - a str with the name of a pre-trained model to load selected in the list of: . `openai-gpt` - a path or url to a pretrained model archive containing: . `openai_gpt_config.json` a configuration file for the model . `pytorch_model.bin` a PyTorch dump of a OpenAIGPTModel instance - a path or url to a pretrained model archive containing: . `bert_config.json` a configuration file for the model . a series of NumPy files containing OpenAI TensorFlow trained weights from_tf: should we load the weights from a locally saved TensorFlow checkpoint cache_dir: an optional path to a folder in which the pre-trained models will be cached. state_dict: an optional state dictionnary (collections.OrderedDict object) to use instead of pre-trained models *inputs, **kwargs: additional input for the specific Bert class (ex: num_labels for BertForSequenceClassification) """ if pretrained_model_name_or_path in PRETRAINED_MODEL_ARCHIVE_MAP: archive_file = PRETRAINED_MODEL_ARCHIVE_MAP[pretrained_model_name_or_path] config_file = PRETRAINED_CONFIG_ARCHIVE_MAP[pretrained_model_name_or_path] else: archive_file = os.path.join(pretrained_model_name_or_path, WEIGHTS_NAME) config_file = os.path.join(pretrained_model_name_or_path, CONFIG_NAME) # redirect to the cache, if necessary try: resolved_archive_file = cached_path(archive_file, cache_dir=cache_dir) resolved_config_file = cached_path(config_file, cache_dir=cache_dir) except EnvironmentError: logger.error( "Model name '{}' was not found in model name list ({}). " "We assumed '{}' was a path or url but couldn't find files {} and {} " "at this path or url.".format( pretrained_model_name_or_path, ", ".join(PRETRAINED_MODEL_ARCHIVE_MAP.keys()), pretrained_model_name_or_path, archive_file, config_file ) ) return None if resolved_archive_file == archive_file and resolved_config_file == config_file: logger.info("loading weights file {}".format(archive_file)) logger.info("loading configuration file {}".format(config_file)) else: logger.info("loading weights file {} from cache at {}".format( archive_file, resolved_archive_file)) logger.info("loading configuration file {} from cache at {}".format( config_file, resolved_config_file)) # Load config config = OpenAIGPTConfig.from_json_file(resolved_config_file) logger.info("Model config {}".format(config)) # Instantiate model. model = cls(config, *inputs, **kwargs) if state_dict is None and not from_tf: state_dict = torch.load(resolved_archive_file, map_location='cpu' if not torch.cuda.is_available() else None) if from_tf: # Directly load from a TensorFlow checkpoint (stored as NumPy array) return load_tf_weights_in_openai_gpt(model, resolved_archive_file) old_keys = [] new_keys = [] for key in state_dict.keys(): new_key = None if key.endswith(".g"): new_key = key[:-2] + ".weight" elif key.endswith(".b"): new_key = key[:-2] + ".bias" elif key.endswith(".w"): new_key = key[:-2] + ".weight" if new_key: old_keys.append(key) new_keys.append(new_key) for old_key, new_key in zip(old_keys, new_keys): state_dict[new_key] = state_dict.pop(old_key) missing_keys = [] unexpected_keys = [] error_msgs = [] # copy state_dict so _load_from_state_dict can modify it metadata = getattr(state_dict, "_metadata", None) state_dict = state_dict.copy() if metadata is not None: state_dict._metadata = metadata def load(module, prefix=""): local_metadata = {} if metadata is None else metadata.get(prefix[:-1], {}) module._load_from_state_dict( state_dict, prefix, local_metadata, True, missing_keys, unexpected_keys, error_msgs ) for name, child in module._modules.items(): if child is not None: load(child, prefix + name + ".") start_model = model if hasattr(model, "transformer") and all(not s.startswith('transformer.') for s in state_dict.keys()): start_model = model.transformer load(start_model, prefix="") if len(missing_keys) > 0: logger.info( "Weights of {} not initialized from pretrained model: {}".format(model.__class__.__name__, missing_keys) ) if len(unexpected_keys) > 0: logger.info( "Weights from pretrained model not used in {}: {}".format(model.__class__.__name__, unexpected_keys) ) if len(error_msgs) > 0: raise RuntimeError( "Error(s) in loading state_dict for {}:\n\t{}".format(model.__class__.__name__, "\n\t".join(error_msgs)) ) # Add additional embeddings for special tokens if needed # This step also make sure we are still sharing the output and input embeddings after loading weights model.set_num_special_tokens(num_special_tokens if num_special_tokens is not None else config.n_special) return model
def from_pretrained(cls, pretrained_model_name_or_path, cache_dir=None, *inputs, **kwargs): """ Instantiate a PreTrainedBertModel from a pre-trained model file. Download and cache the pre-trained model file if needed. """ if pretrained_model_name_or_path in PRETRAINED_VOCAB_ARCHIVE_MAP: vocab_file = PRETRAINED_VOCAB_ARCHIVE_MAP[ pretrained_model_name_or_path] merges_file = PRETRAINED_MERGES_ARCHIVE_MAP[ pretrained_model_name_or_path] special_tokens_file = None else: vocab_file = os.path.join(pretrained_model_name_or_path, VOCAB_NAME) merges_file = os.path.join(pretrained_model_name_or_path, MERGES_NAME) special_tokens_file = os.path.join(pretrained_model_name_or_path, SPECIAL_TOKENS_NAME) if not os.path.exists(special_tokens_file): special_tokens_file = None else: logger.info("loading special tokens file {}".format( special_tokens_file)) # redirect to the cache, if necessary try: resolved_vocab_file = cached_path(vocab_file, cache_dir=cache_dir) resolved_merges_file = cached_path(merges_file, cache_dir=cache_dir) except EnvironmentError: if pretrained_model_name_or_path in PRETRAINED_VOCAB_ARCHIVE_MAP: logger.error( "Couldn't reach server at '{}' to download vocabulary.". format(vocab_file)) else: logger.error( "Model name '{}' was not found in model name list ({}). " "We assumed '{}' was a path or url but couldn't find files {} and {} " "at this path or url.".format( pretrained_model_name_or_path, ', '.join(PRETRAINED_VOCAB_ARCHIVE_MAP.keys()), pretrained_model_name_or_path, vocab_file, merges_file)) return None if resolved_vocab_file == vocab_file and resolved_merges_file == merges_file: logger.info("loading vocabulary file {}".format(vocab_file)) logger.info("loading merges file {}".format(merges_file)) else: logger.info("loading vocabulary file {} from cache at {}".format( vocab_file, resolved_vocab_file)) logger.info("loading merges file {} from cache at {}".format( merges_file, resolved_merges_file)) if pretrained_model_name_or_path in PRETRAINED_VOCAB_POSITIONAL_EMBEDDINGS_SIZE_MAP: # if we're using a pretrained model, ensure the tokenizer wont index sequences longer # than the number of positional embeddings max_len = PRETRAINED_VOCAB_POSITIONAL_EMBEDDINGS_SIZE_MAP[ pretrained_model_name_or_path] kwargs['max_len'] = min(kwargs.get('max_len', int(1e12)), max_len) # Instantiate tokenizer. if special_tokens_file and 'special_tokens' not in kwargs: special_tokens = open(special_tokens_file, encoding='utf-8').read().split('\n')[:-1] else: special_tokens = kwargs.pop('special_tokens', []) tokenizer = cls(resolved_vocab_file, resolved_merges_file, special_tokens=special_tokens, *inputs, **kwargs) return tokenizer