def __init__(self, config, *inputs, **kwargs):
    super(BertModel, self).__init__()
    if not isinstance(config, BertConfig):
        raise ValueError(
            "Parameter config in `{}(config)` should be an instance of class `BertConfig`. "
            "To create a model from a Google pretrained model use "
            "`model = {}.from_pretrained(PRETRAINED_MODEL_NAME)`".format(
                self.__class__.__name__, self.__class__.__name__))
    self.config = config
    self.hidden_size = self.config.hidden_size
    self.model_type = 'bert'
    if hasattr(config, 'sinusoidal_pos_embds'):
        self.model_type = 'distilbert'
    elif 'model_type' in kwargs:
        self.model_type = kwargs['model_type'].lower()
    if self.model_type == 'distilbert':
        self.embeddings = DistilBertEmbeddings(config)
    else:
        self.embeddings = BertEmbeddings(config)
    self.encoder = BertEncoder(config)
    if self.model_type != 'distilbert':
        self.pooler = BertPooler(config)
    else:
        logger.info('DistilBert has no pooler; the hidden state of the [CLS] token '
                    'will be used as the pooled output.')
    self.apply(self.init_bert_weights)
def from_pretrained(cls, model_dir_or_name, *inputs, **kwargs):
    """
    Given a model name or path, read the vocabulary directly.
    """
    model_dir = _get_bert_dir(model_dir_or_name)
    pretrained_model_name_or_path = _get_file_name_base_on_postfix(model_dir, '.txt')
    logger.info("loading vocabulary file {}".format(pretrained_model_name_or_path))
    max_len = 512
    kwargs['max_len'] = min(kwargs.get('max_position_embeddings', int(1e12)), max_len)
    # Instantiate the tokenizer.
    tokenizer = cls(pretrained_model_name_or_path, *inputs, **kwargs)
    return tokenizer
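# Usage sketch for the classmethod above (hedged: the directory name is illustrative and is
# assumed to contain exactly one vocab file ending in '.txt'; `BertTokenizer` is assumed to be
# the class defining this `from_pretrained` and to expose the usual tokenize/convert methods).
tokenizer = BertTokenizer.from_pretrained('./bert-base-chinese')
tokens = tokenizer.tokenize('今天天气不错')            # word pieces
token_ids = tokenizer.convert_tokens_to_ids(tokens)    # ids into the loaded vocab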
def save_pretrained(self, save_directory):
    """ Save a model and its configuration file to a directory, so that it can be re-loaded
    using the :func:`~transformers.PreTrainedModel.from_pretrained` class method.
    """
    assert os.path.isdir(save_directory), \
        "Saving path should be a directory where the model and configuration can be saved"

    # Only save the model itself if we are using distributed training
    model_to_save = self.module if hasattr(self, "module") else self

    # Save the configuration file
    model_to_save.config.save_pretrained(save_directory)

    # If we save using the predefined names, we can load using `from_pretrained`
    output_model_file = os.path.join(save_directory, "pytorch_model.bin")
    torch.save(model_to_save.state_dict(), output_model_file)
    logger.info("Model weights saved in {}".format(output_model_file))
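# Round-trip sketch for save_pretrained (hedged: assumes `model` came from `from_pretrained`
# and that the target directory exists, as the assertion above requires; the path is illustrative).
import os

save_directory = './saved_bert'
os.makedirs(save_directory, exist_ok=True)                  # save_pretrained asserts the directory exists
model.save_pretrained(save_directory)                       # writes config + pytorch_model.bin
reloaded = model.__class__.from_pretrained(save_directory)  # reload using the predefined file names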
def __init__(self, vocab: Vocabulary, model_dir_or_name: str = 'en', embedding_dim=-1, requires_grad: bool = True,
             init_method=None, lower=False, dropout=0, word_dropout=0, normalize=False, min_freq=1, **kwargs):
    """
    :param vocab: Vocabulary. If None, every word in the embedding file is loaded.
    :param model_dir_or_name: A pretrained static embedding can be specified in two ways: either pass a folder
        (which should contain exactly one file with a .txt suffix) or a file path; or pass the name of a
        pretrained embedding, in which case the cache is checked first and the model is downloaded if absent.
        If None, an embedding of dimension ``embedding_dim`` is randomly initialized.
    :param int embedding_dim: Dimension of the randomly initialized embedding. When this value is greater
        than 0, ``model_dir_or_name`` is ignored.
    :param bool requires_grad: Whether gradients are required. Default: True.
    :param callable init_method: How to initialize words that were not found. Any method from ``torch.nn.init``
        can be used; it should accept a tensor and modify its values in place.
    :param bool lower: Whether to lowercase the words in the vocab before matching them against the pretrained
        vocabulary. If your vocabulary contains uppercase words, or uppercase words need their own vectors,
        set ``lower`` to False.
    :param float dropout: Probability of applying dropout to the embedding representation. 0.1 means 10% of
        the values are randomly set to 0.
    :param float word_dropout: Probability of replacing a word with unk. This both trains the unk vector and
        acts as a form of regularization.
    :param bool normalize: Whether to normalize each vector so that its norm is 1.
    :param int min_freq: Words whose frequency in the Vocabulary is below this value are mapped to unk.
    :param dict kwargs: ``only_train_min_freq``: apply the min_freq filter only to words in the training set;
        ``only_norm_found_vector``: normalize only the words that were found in the pretrained embedding.
    """
    super(StaticEmbedding, self).__init__(vocab, word_dropout=word_dropout, dropout=dropout)
    if embedding_dim > 0:
        model_dir_or_name = None

    # Resolve the cache path
    if model_dir_or_name is None:
        assert embedding_dim >= 1, "The dimension of embedding should be at least 1."
        embedding_dim = int(embedding_dim)
        model_path = None
    elif model_dir_or_name.lower() in PRETRAIN_STATIC_FILES:
        model_url = _get_embedding_url('static', model_dir_or_name.lower())
        model_path = cached_path(model_url, name='embedding')
        # check whether the path exists
    elif os.path.isfile(os.path.abspath(os.path.expanduser(model_dir_or_name))):
        model_path = os.path.abspath(os.path.expanduser(model_dir_or_name))
    elif os.path.isdir(os.path.abspath(os.path.expanduser(model_dir_or_name))):
        model_path = _get_file_name_base_on_postfix(os.path.abspath(os.path.expanduser(model_dir_or_name)), '.txt')
    else:
        raise ValueError(f"Cannot recognize {model_dir_or_name}.")

    # Shrink the vocab according to min_freq
    truncate_vocab = (vocab.min_freq is None and min_freq > 1) or (vocab.min_freq and vocab.min_freq < min_freq)
    if truncate_vocab:
        truncated_vocab = deepcopy(vocab)
        truncated_vocab.min_freq = min_freq
        truncated_vocab.word2idx = None
        if lower:  # when lower is set, frequencies of both cases must be counted together
            lowered_word_count = defaultdict(int)
            for word, count in truncated_vocab.word_count.items():
                lowered_word_count[word.lower()] += count
            for word in truncated_vocab.word_count.keys():
                word_count = truncated_vocab.word_count[word]
                if lowered_word_count[word.lower()] >= min_freq and word_count < min_freq:
                    truncated_vocab.add_word_lst([word] * (min_freq - word_count),
                                                 no_create_entry=truncated_vocab._is_word_no_create_entry(word))

        # Apply the min_freq filter only to words that occur in the training set
        if kwargs.get('only_train_min_freq', False) and model_dir_or_name is not None:
            for word in truncated_vocab.word_count.keys():
                if truncated_vocab._is_word_no_create_entry(word) and truncated_vocab.word_count[word] < min_freq:
                    truncated_vocab.add_word_lst([word] * (min_freq - truncated_vocab.word_count[word]),
                                                 no_create_entry=True)
        truncated_vocab.build_vocab()
        truncated_words_to_words = torch.arange(len(vocab)).long()
        for word, index in vocab:
            truncated_words_to_words[index] = truncated_vocab.to_index(word)
        logger.info(
            f"{len(vocab) - len(truncated_vocab)} out of {len(vocab)} words have frequency less than {min_freq}.")
        vocab = truncated_vocab

    self.only_norm_found_vector = kwargs.get('only_norm_found_vector', False)
    # Load the embedding
    if lower:
        lowered_vocab = Vocabulary(padding=vocab.padding, unknown=vocab.unknown)
        for word, index in vocab:
            if vocab._is_word_no_create_entry(word):
                lowered_vocab.add_word(word.lower(), no_create_entry=True)
            else:
                lowered_vocab.add_word(word.lower())  # add the words that need their own entry first
        logger.info(f"All word in the vocab have been lowered. There are {len(vocab)} words, {len(lowered_vocab)} "
                    f"unique lowered words.")
        if model_path:
            embedding = self._load_with_vocab(model_path, vocab=lowered_vocab, init_method=init_method)
        else:
            embedding = self._randomly_init_embed(len(vocab), embedding_dim, init_method)
            self.register_buffer('words_to_words', torch.arange(len(vocab)).long())
        if lowered_vocab.unknown:
            unknown_idx = lowered_vocab.unknown_idx
        else:
            unknown_idx = embedding.size(0) - 1  # otherwise the last row serves as unknown
            self.register_buffer('words_to_words', torch.arange(len(vocab)).long())
        words_to_words = torch.full((len(vocab),), fill_value=unknown_idx).long()
        for word, index in vocab:
            if word not in lowered_vocab:
                word = word.lower()
                if word not in lowered_vocab and lowered_vocab._is_word_no_create_entry(word):
                    continue  # no entry needs to be created; it already defaults to unknown
            words_to_words[index] = self.words_to_words[lowered_vocab.to_index(word)]
        self.register_buffer('words_to_words', words_to_words)
        self._word_unk_index = lowered_vocab.unknown_idx  # replace the unknown index
    else:
        if model_path:
            embedding = self._load_with_vocab(model_path, vocab=vocab, init_method=init_method)
        else:
            embedding = self._randomly_init_embed(len(vocab), embedding_dim, init_method)
            self.register_buffer('words_to_words', torch.arange(len(vocab)).long())
    if not self.only_norm_found_vector and normalize:
        embedding /= (torch.norm(embedding, dim=1, keepdim=True) + 1e-12)

    if truncate_vocab:
        for i in range(len(truncated_words_to_words)):
            index_in_truncated_vocab = truncated_words_to_words[i]
            truncated_words_to_words[i] = self.words_to_words[index_in_truncated_vocab]
        del self.words_to_words
        self.register_buffer('words_to_words', truncated_words_to_words)

    self.embedding = nn.Embedding(num_embeddings=embedding.shape[0], embedding_dim=embedding.shape[1],
                                  padding_idx=vocab.padding_idx, max_norm=None, norm_type=2,
                                  scale_grad_by_freq=False, sparse=False, _weight=embedding)
    self._embed_size = self.embedding.weight.size(1)
    self.requires_grad = requires_grad
    self.dropout = MyDropout(dropout)
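# Usage sketch for StaticEmbedding (hedged: assumes a fastNLP-style Vocabulary built from training
# tokens and a local GloVe-format .txt file; the path and words are illustrative).
import torch

vocab = Vocabulary()
vocab.add_word_lst("the quick brown fox jumps over the lazy dog".split())
vocab.build_vocab()

embed = StaticEmbedding(vocab, model_dir_or_name='./glove.6B.100d.txt',  # hypothetical file
                        lower=True, dropout=0.1, min_freq=1)

words = torch.LongTensor([[vocab.to_index(w) for w in ['the', 'fox']]])
vectors = embed(words)  # expected shape: (1, 2, embed_size)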
def _load_with_vocab(self, embed_filepath, vocab, dtype=np.float32, padding='<pad>', unknown='<unk>',
                     error='ignore', init_method=None):
    """
    Extract from the pretrained word vectors at ``embed_filepath`` the embeddings of the words in ``vocab``.
    EmbedLoader automatically detects whether ``embed_filepath`` is in word2vec format (the first line holds
    only two elements) or GloVe format.

    :param str embed_filepath: Path of the pretrained embedding.
    :param vocab: A :class:`~fastNLP.Vocabulary`; the embeddings of the words that appear in ``vocab`` are read.
        Words that do not appear in the file are sampled from a normal distribution fitted to the found
        embeddings, so that the whole embedding shares the same distribution.
    :param dtype: dtype of the loaded embedding.
    :param str padding: The padding token of the vocabulary.
    :param str unknown: The unknown token of the vocabulary.
    :param str error: ``ignore`` or ``strict``. With ``ignore``, errors are skipped automatically; with
        ``strict``, errors are raised. Errors typically come from blank lines or inconsistent dimensions
        in the embedding file.
    :param init_method: How to initialize words that were not found. Any method from ``torch.nn.init`` can be
        used. Defaults to ``torch.nn.init.zeros_``.
    :return torch.tensor: Of shape [len(vocab), dimension], where the dimension is determined by the
        pretrained embedding.
    """
    assert isinstance(vocab, Vocabulary), "Only fastNLP.Vocabulary is supported."
    if not os.path.exists(embed_filepath):
        raise FileNotFoundError("`{}` does not exist.".format(embed_filepath))
    with open(embed_filepath, 'r', encoding='utf-8') as f:
        line = f.readline().strip()
        parts = line.split()
        start_idx = 0
        if len(parts) == 2:
            dim = int(parts[1])
            start_idx += 1
        else:
            dim = len(parts) - 1
            f.seek(0)
        matrix = {}
        if vocab.padding:
            matrix[vocab.padding_idx] = torch.zeros(dim)
        if vocab.unknown:
            matrix[vocab.unknown_idx] = torch.zeros(dim)
        found_count = 0
        found_unknown = False
        for idx, line in enumerate(f, start_idx):
            try:
                parts = line.strip().split()
                word = ''.join(parts[:-dim])
                nums = parts[-dim:]
                # align unk and pad with the vocabulary
                if word == padding and vocab.padding is not None:
                    word = vocab.padding
                elif word == unknown and vocab.unknown is not None:
                    word = vocab.unknown
                    found_unknown = True
                if word in vocab:
                    index = vocab.to_index(word)
                    matrix[index] = torch.from_numpy(np.fromstring(' '.join(nums), sep=' ', dtype=dtype, count=dim))
                    if self.only_norm_found_vector:
                        matrix[index] = matrix[index] / np.linalg.norm(matrix[index])
                    found_count += 1
            except Exception as e:
                if error == 'ignore':
                    warnings.warn("Error occurred at the {} line.".format(idx))
                else:
                    logger.error("Error occurred at the {} line.".format(idx))
                    raise e
        logger.info("Found {} out of {} words in the pre-training embedding.".format(found_count, len(vocab)))
        for word, index in vocab:
            if index not in matrix and not vocab._is_word_no_create_entry(word):
                if found_unknown:  # if an unknown vector was found, initialize with it
                    matrix[index] = matrix[vocab.unknown_idx]
                else:
                    matrix[index] = None  # None entries in matrix mark words that need their own entry
        vectors = self._randomly_init_embed(len(matrix), dim, init_method)

        if vocab.unknown is None:  # create a dedicated unknown entry
            unknown_idx = len(matrix)
            vectors = torch.cat((vectors, torch.zeros(1, dim)), dim=0).contiguous()
        else:
            unknown_idx = vocab.unknown_idx
        self.register_buffer('words_to_words', torch.full((len(vocab),), fill_value=unknown_idx).long())
        for index, (index_in_vocab, vec) in enumerate(matrix.items()):
            if vec is not None:
                vectors[index] = vec
            self.words_to_words[index_in_vocab] = index
        return vectors
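# To make the format auto-detection in _load_with_vocab concrete, the two text layouts it
# distinguishes are sketched below; file names and values are made up.

# GloVe style: every line is "<word> <v1> ... <v_dim>", no header line.
with open('toy_glove.txt', 'w', encoding='utf-8') as f:
    f.write('the 0.1 0.2 0.3\n')
    f.write('fox 0.4 0.5 0.6\n')

# word2vec style: the first line holds exactly two fields, "<vocab_size> <dim>", which is why
# the loader checks `len(parts) == 2` and skips that header before reading vectors.
with open('toy_word2vec.txt', 'w', encoding='utf-8') as f:
    f.write('2 3\n')
    f.write('the 0.1 0.2 0.3\n')
    f.write('fox 0.4 0.5 0.6\n')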
def from_pretrained(cls, model_dir_or_name, *model_args, **kwargs):
    r"""Instantiate a pretrained pytorch model from a pre-trained model configuration.

    The model is set in evaluation mode by default using ``model.eval()`` (Dropout modules are deactivated).
    To train the model, you should first set it back in training mode with ``model.train()``.

    The warning ``Weights from XXX not initialized from pretrained model`` means that the weights of XXX do
    not come pre-trained with the rest of the model. It is up to you to train those weights with a downstream
    fine-tuning task.

    The warning ``Weights from XXX not used in YYY`` means that the layer XXX is not used by YYY, therefore
    those weights are discarded.

    Parameters:
        model_dir_or_name: either:
            - a string with the `shortcut name` of a pre-trained model to load from cache or download, e.g.:
              ``bert-base-uncased``.
            - a string with the `identifier name` of a pre-trained model that was user-uploaded to our S3,
              e.g.: ``dbmdz/bert-base-german-cased``.
            - a path to a `directory` containing model weights saved using
              :func:`~transformers.PreTrainedModel.save_pretrained`, e.g.: ``./my_model_directory/``.
            - a path or url to a `tensorflow index checkpoint file` (e.g. ``./tf_model/model.ckpt.index``).
              In this case, ``from_tf`` should be set to True and a configuration object should be provided
              as the ``config`` argument. This loading path is slower than converting the TensorFlow
              checkpoint in a PyTorch model using the provided conversion scripts and loading the PyTorch
              model afterwards.
            - None if you are both providing the configuration and state dictionary (resp. with keyword
              arguments ``config`` and ``state_dict``).

    Examples::

        model = BertModel.from_pretrained('bert-base-uncased')    # Download model and configuration from S3 and cache.
        model = BertModel.from_pretrained('./test/saved_model/')  # E.g. model was saved using `save_pretrained('./test/saved_model/')`
        model = BertModel.from_pretrained('bert-base-uncased', output_attention=True)  # Update configuration during loading
        assert model.config.output_attention == True
        # Loading from a TF checkpoint file instead of a PyTorch model (slower)
        config = BertConfig.from_json_file('./tf_model/my_tf_model_config.json')
        model = BertModel.from_pretrained('./tf_model/my_tf_checkpoint.ckpt.index', from_tf=True, config=config)

    """
    config = kwargs.pop("config", None)
    state_dict = kwargs.pop("state_dict", None)

    model_dir = _get_gpt2_dir(model_dir_or_name)

    # Load the config if a configuration was not provided
    model_kwargs = {}
    if not isinstance(config, GPT2Config):
        config = cls.config_class.from_pretrained(model_dir, *model_args, **kwargs)
    else:
        model_kwargs = kwargs

    # Instantiate the model.
    model = cls(config, *model_args, **model_kwargs)

    model_path = _get_file_name_base_on_postfix(model_dir, 'model.bin')
    state_dict = torch.load(model_path, map_location="cpu")

    missing_keys = []
    unexpected_keys = []
    error_msgs = []

    # Convert old format to new format if needed from a PyTorch state_dict
    old_keys = []
    new_keys = []
    for key in state_dict.keys():
        new_key = None
        if "gamma" in key:
            new_key = key.replace("gamma", "weight")
        if "beta" in key:
            new_key = key.replace("beta", "bias")
        if new_key:
            old_keys.append(key)
            new_keys.append(new_key)
    for old_key, new_key in zip(old_keys, new_keys):
        state_dict[new_key] = state_dict.pop(old_key)

    # copy state_dict so _load_from_state_dict can modify it
    metadata = getattr(state_dict, "_metadata", None)
    state_dict = state_dict.copy()
    if metadata is not None:
        state_dict._metadata = metadata

    # PyTorch's `_load_from_state_dict` does not copy parameters in a module's descendants,
    # so we need to apply the function recursively.
    def load(module, prefix=""):
        local_metadata = {} if metadata is None else metadata.get(prefix[:-1], {})
        module._load_from_state_dict(state_dict, prefix, local_metadata, True,
                                     missing_keys, unexpected_keys, error_msgs)
        for name, child in module._modules.items():
            if child is not None:
                load(child, prefix + name + ".")

    # Make sure we are able to load base models as well as derived models (with heads)
    start_prefix = ""
    model_to_load = model
    if not hasattr(model, cls.base_model_prefix) and any(
            s.startswith(cls.base_model_prefix) for s in state_dict.keys()):
        start_prefix = cls.base_model_prefix + "."
    if hasattr(model, cls.base_model_prefix) and not any(
            s.startswith(cls.base_model_prefix) for s in state_dict.keys()):
        model_to_load = getattr(model, cls.base_model_prefix)

    load(model_to_load, prefix=start_prefix)

    if len(missing_keys) > 0:
        logger.info("Weights of {} not initialized from pretrained model: {}".format(
            model.__class__.__name__, missing_keys))
    if len(unexpected_keys) > 0:
        logger.info("Weights from pretrained model not used in {}: {}".format(
            model.__class__.__name__, unexpected_keys))
    if len(error_msgs) > 0:
        raise RuntimeError("Error(s) in loading state_dict for {}:\n\t{}".format(
            model.__class__.__name__, "\n\t".join(error_msgs)))

    model.tie_weights()  # make sure word embedding weights are still tied if needed

    # Set the model in evaluation mode to deactivate Dropout modules by default
    model.eval()

    return model
def from_pretrained(cls, model_dir_or_name, layer_num=12, *inputs, **kwargs):
    if layer_num > 12:
        return None
    state_dict = kwargs.get('state_dict', None)
    kwargs.pop('state_dict', None)
    kwargs.pop('cache_dir', None)
    kwargs.pop('from_tf', None)

    # Resolve the model directory from the name or path
    pretrained_model_dir = _get_bert_dir(model_dir_or_name)

    # Load the config
    config_file = _get_file_name_base_on_postfix(pretrained_model_dir, '.json')
    config = BertConfig.from_json_file(config_file)

    if state_dict is None:
        weights_path = _get_file_name_base_on_postfix(pretrained_model_dir, '.bin')
        state_dict = torch.load(weights_path, map_location='cpu')
    else:
        logger.error('Cannot load parameters through `state_dict` variable.')
        raise RuntimeError('Cannot load parameters through `state_dict` variable.')

    model_type = 'BERT'
    old_keys = []
    new_keys = []
    for key in state_dict.keys():
        new_key = None
        for key_name in BERT_KEY_RENAME_MAP_1:
            if key_name in key:
                new_key = key.replace(key_name, BERT_KEY_RENAME_MAP_1[key_name])
                if 'distilbert' in key:
                    model_type = 'DistilBert'
                break
        if new_key:
            old_keys.append(key)
            new_keys.append(new_key)
    for old_key, new_key in zip(old_keys, new_keys):
        state_dict[new_key] = state_dict.pop(old_key)

    old_keys = []
    new_keys = []
    for key in state_dict.keys():
        new_key = None
        for key_name in BERT_KEY_RENAME_MAP_2:
            if key_name in key:
                new_key = key.replace(key_name, BERT_KEY_RENAME_MAP_2[key_name])
                break
        if new_key:
            old_keys.append(key)
            new_keys.append(new_key)
    for old_key, new_key in zip(old_keys, new_keys):
        state_dict[new_key] = state_dict.pop(old_key)

    # The following renaming is required for fastHan
    old_keys = []
    new_keys = []
    for key in state_dict.keys():
        new_key = None
        for key_name in ['embed.model.encoder']:
            if key_name in key:
                new_key = key.replace(key_name, 'bert')
                break
        if new_key:
            old_keys.append(key)
            new_keys.append(new_key)
    for old_key, new_key in zip(old_keys, new_keys):
        state_dict[new_key] = state_dict.pop(old_key)

    # Instantiate the model.
    config.num_hidden_layers = layer_num
    model = cls(config, model_type=model_type, *inputs, **kwargs)

    missing_keys = []
    unexpected_keys = []
    error_msgs = []

    # copy state_dict so _load_from_state_dict can modify it
    metadata = getattr(state_dict, '_metadata', None)
    state_dict = state_dict.copy()
    if metadata is not None:
        state_dict._metadata = metadata

    def load(module, prefix=''):
        local_metadata = {} if metadata is None else metadata.get(prefix[:-1], {})
        module._load_from_state_dict(state_dict, prefix, local_metadata, True,
                                     missing_keys, unexpected_keys, error_msgs)
        for name, child in module._modules.items():
            if child is not None:
                load(child, prefix + name + '.')

    load(model, prefix='' if hasattr(model, 'bert') else 'bert.')

    if len(missing_keys) > 0:
        logger.warning("Weights of {} not initialized from pretrained model: {}".format(
            model.__class__.__name__, missing_keys))
    # if len(unexpected_keys) > 0:
    #     logger.warning("Weights from pretrained model not used in {}: {}".format(
    #         model.__class__.__name__, unexpected_keys))

    logger.info(f"Load pre-trained {model_type} parameters from file {weights_path}.")
    return model
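# Usage sketch for the truncated loader above (hedged: the directory name is illustrative; note that
# passing layer_num > 12 makes the classmethod return None).
model = BertModel.from_pretrained('./bert-base-chinese', layer_num=4)  # keep only 4 encoder layers
assert model is not None and len(model.encoder.layer) == 4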
def __init__(self, model_dir_or_name: str, vocab: Vocabulary, layers: str = '-1', pool_method: str = 'first',
             include_cls_sep: bool = False, pooled_cls: bool = False, auto_truncate: bool = False, min_freq=2,
             layer_num=12):
    super().__init__()

    self.tokenzier = BertTokenizer.from_pretrained(model_dir_or_name)
    self.encoder = BertModel.from_pretrained(model_dir_or_name, layer_num=layer_num)
    self._max_position_embeddings = self.encoder.config.max_position_embeddings
    # Check that the requested layers are within the encoder's depth
    encoder_layer_number = len(self.encoder.encoder.layer)
    self.layers = list(map(int, layers.split(',')))
    for layer in self.layers:
        if layer < 0:
            assert -layer <= encoder_layer_number, f"The layer index:{layer} is out of scope for " \
                                                   f"a bert model with {encoder_layer_number} layers."
        else:
            assert layer < encoder_layer_number, f"The layer index:{layer} is out of scope for " \
                                                 f"a bert model with {encoder_layer_number} layers."

    assert pool_method in ('avg', 'max', 'first', 'last')
    self.pool_method = pool_method
    self.include_cls_sep = include_cls_sep
    self.pooled_cls = pooled_cls
    self.auto_truncate = auto_truncate

    # Compute the word pieces for every word in the vocab; [CLS] and [SEP] need extra handling
    logger.info("Start to generate word pieces for word.")
    # Step 1: collect the required word pieces, then build the new embed and word-piece vocab, then fill in values
    word_piece_dict = {'[CLS]': 1, '[SEP]': 1}  # word pieces in use, plus newly added ones
    found_count = 0
    self._has_sep_in_vocab = '[SEP]' in vocab  # used to decide whether token_ids need to be generated for the input
    if '[sep]' in vocab:
        warnings.warn("Lower cased [sep] detected, it cannot be correctly recognized as [SEP] by BertEmbedding.")
    if "[CLS]" in vocab:
        warnings.warn("[CLS] detected in your vocabulary. BertEmbedding will add [CLS] and [SEP] to the begin "
                      "and end of the input automatically, make sure you don't add [CLS] and [SEP] at the begin"
                      " and end.")
    for word, index in vocab:
        if index == vocab.padding_idx:  # pad is a special symbol
            word = '[PAD]'
        elif index == vocab.unknown_idx:
            word = '[UNK]'
        word_pieces = self.tokenzier.wordpiece_tokenizer.tokenize(word)
        if len(word_pieces) == 1:
            if not vocab._is_word_no_create_entry(word):  # the word is in train but was not found
                if index != vocab.unknown_idx and word_pieces[0] == '[UNK]':  # the word is not in the original vocab
                    if vocab.word_count[word] >= min_freq and not vocab._is_word_no_create_entry(word):
                        # only add the word if it occurs at least min_freq times
                        word_piece_dict[word] = 1  # add a new entry
                    continue
        for word_piece in word_pieces:
            word_piece_dict[word_piece] = 1
        found_count += 1

    original_embed = self.encoder.embeddings.word_embeddings.weight.data
    # Special tokens need special handling
    embed = nn.Embedding(len(word_piece_dict), original_embed.size(1))  # the new embed
    new_word_piece_vocab = collections.OrderedDict()
    for index, token in enumerate(['[PAD]', '[UNK]']):
        word_piece_dict.pop(token, None)
        embed.weight.data[index] = original_embed[self.tokenzier.vocab[token]]
        new_word_piece_vocab[token] = index
    for token in word_piece_dict.keys():
        if token in self.tokenzier.vocab:
            embed.weight.data[len(new_word_piece_vocab)] = original_embed[self.tokenzier.vocab[token]]
        else:
            embed.weight.data[len(new_word_piece_vocab)] = original_embed[self.tokenzier.vocab['[UNK]']]
        new_word_piece_vocab[token] = len(new_word_piece_vocab)
    self.tokenzier._reinit_on_new_vocab(new_word_piece_vocab)
    self.encoder.embeddings.word_embeddings = embed

    word_to_wordpieces = []
    word_pieces_lengths = []
    for word, index in vocab:
        if index == vocab.padding_idx:  # pad is a special symbol
            word = '[PAD]'
        elif index == vocab.unknown_idx:
            word = '[UNK]'
        word_pieces = self.tokenzier.wordpiece_tokenizer.tokenize(word)
        word_pieces = self.tokenzier.convert_tokens_to_ids(word_pieces)
        word_to_wordpieces.append(word_pieces)
        word_pieces_lengths.append(len(word_pieces))
    self._cls_index = self.tokenzier.vocab['[CLS]']
    self._sep_index = self.tokenzier.vocab['[SEP]']
    self._word_pad_index = vocab.padding_idx
    self._wordpiece_pad_index = self.tokenzier.vocab['[PAD]']  # needed when generating word pieces
    logger.info("Found (or segmented into word pieces) {} words out of {}.".format(found_count, len(vocab)))
    self.word_to_wordpieces = np.array(word_to_wordpieces)
    self.register_buffer('word_pieces_lengths', torch.LongTensor(word_pieces_lengths))
    logger.debug("Successfully generated word pieces.")
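# Construction sketch for the word-level BERT wrapper whose __init__ starts above (hedged:
# `BertWordModel` is a placeholder name for the owning class, and the directory is illustrative;
# negative layer indices count back from the top of the truncated encoder).
vocab = Vocabulary()
vocab.add_word_lst(['今天', '天气', '不错'])
vocab.build_vocab()

embedder = BertWordModel(model_dir_or_name='./bert-base-chinese', vocab=vocab,
                         layers='-1,-2', pool_method='first', layer_num=8)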
def __init__(self, model_dir_or_name: str, vocab: Vocabulary, layers: str = '-1', pool_method: str = 'first',
             include_cls_sep: bool = False, pooled_cls: bool = False, auto_truncate: bool = True, min_freq=1,
             only_use_pretrain_bpe=False, truncate_embed=True):
    super().__init__()

    self.tokenizer = CamembertTokenizer.from_pretrained(model_dir_or_name)
    self.encoder = CamembertModel.from_pretrained(model_dir_or_name)
    self.encoder.resize_token_embeddings(len(self.tokenizer))
    self._max_position_embeddings = self.encoder.config.max_position_embeddings - 2
    encoder_layer_number = len(self.encoder.encoder.layer)
    if isinstance(layers, list):
        self.layers = [int(l) for l in layers]
    elif isinstance(layers, str):
        self.layers = list(map(int, layers.split(',')))
    else:
        raise TypeError("`layers` only supports str or list[int]")
    for layer in self.layers:
        if layer < 0:
            assert -layer <= encoder_layer_number, f"The layer index:{layer} is out of scope for " \
                                                   f"a bert model with {encoder_layer_number} layers."
        else:
            assert layer <= encoder_layer_number, f"The layer index:{layer} is out of scope for " \
                                                  f"a bert model with {encoder_layer_number} layers."

    assert pool_method in ('avg', 'max', 'first', 'last')
    self.pool_method = pool_method
    self.include_cls_sep = include_cls_sep
    self.pooled_cls = pooled_cls
    self.auto_truncate = auto_truncate

    logger.info("Start to generate word pieces for word.")
    word_piece_dict = {'<s>': 1, '</s>': 1}
    found_count = 0
    new_add_to_bpe_vocab = 0
    unsegment_count = 0
    if "<s>" in vocab:
        warnings.warn("<s> detected in your vocabulary. RobertaEmbedding will add <s> and </s> to the begin "
                      "and end of the input automatically, make sure you don't add <s> and </s> at the begin"
                      " and end.")
    unique = []
    for word, index in vocab:
        word_pieces = []
        word_pieces.extend(self.tokenizer.tokenize(word))  # , add_prefix_space=True))
        word_token_ids = self.tokenizer.convert_tokens_to_ids(word_pieces)
        if 3 in word_token_ids:
            if word_pieces[word_token_ids.index(3)] not in unique:
                unique.append(word_pieces[word_token_ids.index(3)])
                unsegment_count += 1
        if not vocab._is_word_no_create_entry(word):
            if index != vocab.unknown_idx and word_pieces[0] == '<unk>':
                if vocab.word_count[word] >= min_freq and not vocab._is_word_no_create_entry(word) \
                        and not only_use_pretrain_bpe:
                    word_piece_dict[word] = 1
                    new_add_to_bpe_vocab += 1
                unsegment_count += 1
                continue
            found_count += 1
        for word_piece in word_pieces:
            word_piece_dict[word_piece] = 1
    if unsegment_count > 0:
        logger.info(f"{unsegment_count} words are unsegmented.")

    word_to_wordpieces = []
    word_pieces_lengths = []
    for word, index in vocab:
        if index == vocab.padding_idx:
            word = '<pad>'
        elif index == vocab.unknown_idx:
            word = '<unk>'
        word_pieces = self.tokenizer.tokenize(word)
        word_pieces = self.tokenizer.convert_tokens_to_ids(word_pieces)
        word_to_wordpieces.append(word_pieces)
        word_pieces_lengths.append(len(word_pieces))
    self._cls_index = self.tokenizer.convert_tokens_to_ids('<s>')
    self._sep_index = self.tokenizer.convert_tokens_to_ids('</s>')
    self._word_pad_index = vocab.padding_idx
    self._wordpiece_pad_index = self.tokenizer.convert_tokens_to_ids('<pad>')
    self.word_to_wordpieces = np.array(word_to_wordpieces)
    self.register_buffer('word_pieces_lengths', torch.LongTensor(word_pieces_lengths))
    self.encoder.resize_token_embeddings(len(self.tokenizer))
    logger.debug("Successfully generate word pieces.")