def convert_tokens_to_ids(self, tokens):
    """Converts a sequence of tokens into ids using the vocab."""
    ids = []
    for token in tokens:
        ids.append(self.vocab[token])
    if len(ids) > self.max_len:
        logger.warning(
            "Token indices sequence length is longer than the specified maximum"
            " sequence length for this BERT model ({} > {}). Running this"
            " sequence through BERT will result in indexing errors".format(len(ids), self.max_len))
    return ids
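
# Self-contained sketch of the lookup-and-warn behaviour in convert_tokens_to_ids,
# using a toy vocab and max_len; the real tokenizer gets both from the pre-trained
# model files, so the values below are only illustrative.
_toy_vocab = {"[CLS]": 101, "hello": 7592, "world": 2088, "[SEP]": 102}
_toy_max_len = 3

_toy_tokens = ["[CLS]", "hello", "world", "[SEP]"]
_toy_ids = [_toy_vocab[t] for t in _toy_tokens]   # straight vocab lookup, one id per token
assert _toy_ids == [101, 7592, 2088, 102]
# len(_toy_ids) > _toy_max_len only triggers the warning above; the over-long
# sequence is still returned in full, and the indexing error surfaces inside BERT.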
def save_vocabulary(self, vocab_path):
    """Save the tokenizer vocabulary to a directory or file."""
    index = 0
    if os.path.isdir(vocab_path):
        vocab_file = os.path.join(vocab_path, VOCAB_NAME)
    else:
        vocab_file = vocab_path
    with open(vocab_file, "w", encoding="utf-8") as writer:
        for token, token_index in sorted(self.vocab.items(), key=lambda kv: kv[1]):
            if index != token_index:
                logger.warning("Saving vocabulary to {}: vocabulary indices are not consecutive."
                               " Please check that the vocabulary is not corrupted!".format(vocab_file))
                index = token_index
            writer.write(token + u'\n')
            index += 1
    return vocab_file
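
# Round-trip sketch for save_vocabulary (illustrative): tokens are written one per
# line in index order, so a token's line number recovers its id. The file name
# "vocab.txt" mirrors the usual VOCAB_NAME constant, which is an assumption here.
import os
import tempfile

_demo_vocab = {"[PAD]": 0, "[UNK]": 1, "hello": 2, "world": 3}
with tempfile.TemporaryDirectory() as _tmp_dir:
    _vocab_file = os.path.join(_tmp_dir, "vocab.txt")
    with open(_vocab_file, "w", encoding="utf-8") as _writer:
        for _token, _ in sorted(_demo_vocab.items(), key=lambda kv: kv[1]):
            _writer.write(_token + "\n")
    with open(_vocab_file, encoding="utf-8") as _reader:
        _reloaded = {line.rstrip("\n"): idx for idx, line in enumerate(_reader)}
    assert _reloaded == _demo_vocab      # line order round-trips the token ids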
def __init__(self, vocab: Vocabulary, model_dir_or_name: str = 'en-base-uncased', layers: str = '-1',
             pool_method: str = 'first', word_dropout=0, dropout=0, include_cls_sep: bool = False,
             pooled_cls=True, requires_grad: bool = True, auto_truncate: bool = False, layer_num=12):
    """
    :param ~fastNLP.Vocabulary vocab: the vocabulary.
    :param str model_dir_or_name: the directory containing the model, or the name of a pre-trained model. When a
        directory is passed, it should contain a vocabulary file (with a .txt suffix), a weights file (with a
        .bin suffix) and a config file (with a .json suffix).
    :param str layers: which layers the output embedding is taken from; the results of the different layers are
        concatenated along the last dimension in the order given in ``layers``. Layer indices are separated by
        ',', start at 0, and negative numbers index layers from the end.
    :param str pool_method: BERT represents each word as several word pieces; this controls how a word's
        representation is computed from its word pieces. Supports ``last``, ``first``, ``avg``, ``max``.
    :param float word_dropout: probability of replacing a word with unk. This both trains the unk token and acts
        as a form of regularization.
    :param float dropout: dropout probability applied to the embedding output; 0.1 means 10% of the values are
        randomly set to 0.
    :param bool include_cls_sep: BERT prepends [CLS] and appends [SEP] when computing a sentence representation;
        whether to keep these two tokens in the result. If kept, the word embedding output is two tokens longer
        than the input, so when this is True the length may not match other embedding types inside
        :class:`StackEmbedding`.
    :param bool pooled_cls: whether the returned [CLS] is mapped through the pre-trained BertPool; only effective
        when include_cls_sep is True. If the downstream task only uses [CLS] for prediction, this is usually True.
    :param bool requires_grad: whether gradients are required, i.e. whether BERT's weights are updated.
    :param bool auto_truncate: when a sentence's words are split into more word pieces than BERT's maximum length
        (usually 512), automatically truncate everything after the first 510 word pieces and set the 512th word
        piece to [SEP]; the encoder output for the truncated part is set entirely to zero. Usually only tasks
        that use [CLS] alone for classification set auto_truncate to True.
    """
    super(BertEmbedding, self).__init__(vocab, word_dropout=word_dropout, dropout=dropout)

    if model_dir_or_name.lower() in PRETRAINED_BERT_MODEL_DIR:
        if 'cn' in model_dir_or_name.lower() and pool_method not in ('first', 'last'):
            logger.warning("For Chinese BERT, pool_method should be chosen from 'first' or 'last' in order"
                           " to achieve faster speed.")
            warnings.warn("For Chinese BERT, pool_method should be chosen from 'first' or 'last' in order"
                          " to achieve faster speed.")

    self._word_sep_index = None
    if '[SEP]' in vocab:
        self._word_sep_index = vocab['[SEP]']

    self.model = _WordBertModel(model_dir_or_name=model_dir_or_name, vocab=vocab, layers=layers,
                                pool_method=pool_method, include_cls_sep=include_cls_sep,
                                pooled_cls=pooled_cls, auto_truncate=auto_truncate, min_freq=2,
                                layer_num=layer_num)

    self.requires_grad = requires_grad
    self._embed_size = len(self.model.layers) * self.model.encoder.hidden_size
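
# Construction sketch for this embedding (illustrative): 'en-base-uncased' is assumed
# to be a supported model name, and building the embedding loads pre-trained weights,
# so this only runs when the model files are available (e.g. cached locally).
if __name__ == '__main__':
    from fastNLP import Vocabulary

    demo_vocab = Vocabulary()
    demo_vocab.add_word_lst("this is an example sentence".split())
    demo_embed = BertEmbedding(demo_vocab,
                               model_dir_or_name='en-base-uncased',
                               layers='-1,-2',        # concatenate the last two layers
                               pool_method='first',   # one vector per word: its first word piece
                               layer_num=12)
    # embed_size should equal len(layers) * hidden_size, e.g. 2 * 768 = 1536 here
    print(demo_embed.embed_size)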
def from_pretrained(cls, model_dir_or_name, layer_num=12, *inputs, **kwargs):
    if layer_num > 12:
        return None

    state_dict = kwargs.get('state_dict', None)
    kwargs.pop('state_dict', None)
    kwargs.pop('cache_dir', None)
    kwargs.pop('from_tf', None)

    # Get the model directory from a name or a path.
    pretrained_model_dir = _get_bert_dir(model_dir_or_name)

    # Load the config.
    config_file = _get_file_name_base_on_postfix(pretrained_model_dir, '.json')
    config = BertConfig.from_json_file(config_file)

    if state_dict is None:
        weights_path = _get_file_name_base_on_postfix(pretrained_model_dir, '.bin')
        state_dict = torch.load(weights_path, map_location='cpu')
    else:
        logger.error('Cannot load parameters through `state_dict` variable.')
        raise RuntimeError('Cannot load parameters through `state_dict` variable.')

    model_type = 'BERT'
    old_keys = []
    new_keys = []
    for key in state_dict.keys():
        new_key = None
        for key_name in BERT_KEY_RENAME_MAP_1:
            if key_name in key:
                new_key = key.replace(key_name, BERT_KEY_RENAME_MAP_1[key_name])
                if 'distilbert' in key:
                    model_type = 'DistilBert'
                break
        if new_key:
            old_keys.append(key)
            new_keys.append(new_key)
    for old_key, new_key in zip(old_keys, new_keys):
        state_dict[new_key] = state_dict.pop(old_key)

    old_keys = []
    new_keys = []
    for key in state_dict.keys():
        new_key = None
        for key_name in BERT_KEY_RENAME_MAP_2:
            if key_name in key:
                new_key = key.replace(key_name, BERT_KEY_RENAME_MAP_2[key_name])
                break
        if new_key:
            old_keys.append(key)
            new_keys.append(new_key)
    for old_key, new_key in zip(old_keys, new_keys):
        state_dict[new_key] = state_dict.pop(old_key)

    ## The following renaming is required for fastHan processing.
    old_keys = []
    new_keys = []
    for key in state_dict.keys():
        new_key = None
        for key_name in ['embed.model.encoder']:
            if key_name in key:
                new_key = key.replace(key_name, 'bert')
                break
        if new_key:
            old_keys.append(key)
            new_keys.append(new_key)
    for old_key, new_key in zip(old_keys, new_keys):
        state_dict[new_key] = state_dict.pop(old_key)

    # Instantiate the model.
    config.num_hidden_layers = layer_num
    model = cls(config, model_type=model_type, *inputs, **kwargs)

    missing_keys = []
    unexpected_keys = []
    error_msgs = []

    # Copy state_dict so _load_from_state_dict can modify it.
    metadata = getattr(state_dict, '_metadata', None)
    state_dict = state_dict.copy()
    if metadata is not None:
        state_dict._metadata = metadata

    def load(module, prefix=''):
        local_metadata = {} if metadata is None else metadata.get(prefix[:-1], {})
        module._load_from_state_dict(state_dict, prefix, local_metadata, True,
                                     missing_keys, unexpected_keys, error_msgs)
        for name, child in module._modules.items():
            if child is not None:
                load(child, prefix + name + '.')

    load(model, prefix='' if hasattr(model, 'bert') else 'bert.')

    if len(missing_keys) > 0:
        logger.warning("Weights of {} not initialized from pretrained model: {}".format(
            model.__class__.__name__, missing_keys))
    # if len(unexpected_keys) > 0:
    #     logger.warning("Weights from pretrained model not used in {}: {}".format(
    #         model.__class__.__name__, unexpected_keys))

    logger.info(f"Loaded pre-trained {model_type} parameters from file {weights_path}.")
    return model
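
# Hypothetical call site for from_pretrained; the BertModel class name and the
# 'en-base-uncased' model name are assumptions about this repo, and loading
# requires the .json config and .bin weights to be available locally or cached.
if __name__ == '__main__':
    # layer_num truncates the encoder via config.num_hidden_layers (here: bottom 4 layers).
    model = BertModel.from_pretrained('en-base-uncased', layer_num=4)
    assert model is not None     # layer_num > 12 would have returned None instead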