def __init__(self, path_to_text_file: str, tokenizer_in: Tokenizer,
             tokenizer_out: Tokenizer, max_sequence_length: int, sep: str,
             **kwargs):
    logger.info("Processing file: {}".format(path_to_text_file))
    self.pad_token_in = tokenizer_in.get_vocab()['<PAD>']
    self.pad_token_out = tokenizer_out.get_vocab()['<PAD>']
    self.max_sequence_length = max_sequence_length

    with open(path_to_text_file, "r") as file:
        texts = file.readlines()
    # Each line holds a source/target pair separated by `sep`; keep only the first two fields.
    texts = list(map(lambda x: x.split(sep), texts))
    texts = list(map(lambda x: x[0:2], texts))

    self.texts_in = []
    self.texts_in_length = []
    self.texts_out = []
    for i in tqdm(range(len(texts)), desc="Tokenization...."):
        text_in_ids = tokenizer_in.encode(texts[i][0]).ids
        texts_out_ids = tokenizer_out.encode(texts[i][1]).ids
        # Keep the pair only if both the source and the target fit within the maximum length.
        if len(text_in_ids) <= max_sequence_length and len(texts_out_ids) <= max_sequence_length:
            self.texts_in.append(text_in_ids)
            self.texts_in_length.append(len(text_in_ids))
            self.texts_out.append(texts_out_ids)
    logger.info("# Texts: {}".format(len(self.texts_in)))
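# Minimal usage sketch for the constructor above. The surrounding class name
# (`Seq2SeqTextDataset`) and the file paths are assumptions, not part of the
# original code; the constructor only requires that both tokenizers contain a
# '<PAD>' token and that each line of the text file holds "source<sep>target".
from tokenizers import Tokenizer

tokenizer_in = Tokenizer.from_file("tokenizer_in.json")    # assumed path
tokenizer_out = Tokenizer.from_file("tokenizer_out.json")  # assumed path

dataset = Seq2SeqTextDataset(                              # hypothetical class name
    path_to_text_file="train_pairs.txt",                   # assumed path
    tokenizer_in=tokenizer_in,
    tokenizer_out=tokenizer_out,
    max_sequence_length=128,
    sep="\t",
)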
def test_get_vocab(self):
    tokenizer = Tokenizer(BPE())
    tokenizer.add_tokens(["my", "name", "is", "john", "pair"])

    # Can retrieve vocab with added tokens
    vocab = tokenizer.get_vocab(with_added_tokens=True)
    assert vocab == {"is": 2, "john": 3, "my": 0, "name": 1, "pair": 4}

    # Can retrieve vocab without added tokens
    vocab = tokenizer.get_vocab(with_added_tokens=False)
    assert vocab == {}
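# Companion sketch (not part of the original test): in the test above the BPE
# model itself is untrained, so get_vocab(with_added_tokens=False) is empty.
# After training, the learned vocabulary is returned even without added tokens.
# The corpus and vocab_size below are illustrative assumptions.
from tokenizers import Tokenizer
from tokenizers.models import BPE
from tokenizers.pre_tokenizers import Whitespace
from tokenizers.trainers import BpeTrainer

tokenizer = Tokenizer(BPE(unk_token="[UNK]"))
tokenizer.pre_tokenizer = Whitespace()
trainer = BpeTrainer(vocab_size=100, special_tokens=["[UNK]"])
tokenizer.train_from_iterator(["my name is john", "my name is pair"], trainer=trainer)

print(len(tokenizer.get_vocab(with_added_tokens=False)))  # non-empty after training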
class HuggingFaceWordLevelTokenizer(TokenizerBase):
    def __init__(self, **kwargs):
        super().__init__(**kwargs)
        from tokenizers import Tokenizer, models, normalizers, pre_tokenizers

        self.tokenizer = Tokenizer(
            models.WordLevel(unk_token=self.unknown_token))
        self.tokenizer.pre_tokenizer = pre_tokenizers.Whitespace()
        if self.lower:
            self.tokenizer.normalizer = normalizers.Lowercase()

    def fit(self, *, texts=None, text_batch_iter=None, max_tokens=None):
        from tokenizers import trainers

        trainer = trainers.WordLevelTrainer(
            vocab_size=self.max_vocab_size,
            special_tokens=list(self.special_tokens))
        self.tokenizer.train_from_iterator(text_batch_iter, trainer=trainer)

        self.token_to_id = self.tokenizer.get_vocab()
        self.id_to_token = {
            token_id: token for token, token_id in self.token_to_id.items()
        }

    def encode(self, texts):
        id_seqs = self.tokenizer.encode_batch(texts)
        id_seqs = [id_seq.ids for id_seq in id_seqs]
        return self._post_process(
            id_seqs,
            pad_id=self.token_to_id[self.pad_token] if self.pad_token else None,
            sos_id=self.token_to_id[self.sos_token] if self.sos_token else None,
            eos_id=self.token_to_id[self.eos_token] if self.eos_token else None,
        )

    def decode(self, id_seqs):
        return self.tokenizer.decode_batch(id_seqs)
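# Hedged usage sketch. `TokenizerBase` is not shown, so the constructor keyword
# names below (unknown_token, lower, max_vocab_size, special_tokens, pad_token,
# sos_token, eos_token) are assumptions inferred from the attributes this class
# reads; the real base class may use different names or defaults, and the shape
# returned by encode() depends on its _post_process().
corpus = ["Hello world", "hello tokenizers"]

tok = HuggingFaceWordLevelTokenizer(
    unknown_token="<unk>",
    pad_token="<pad>",
    sos_token=None,
    eos_token=None,
    lower=True,
    max_vocab_size=1000,
    special_tokens=["<unk>", "<pad>"],
)
# fit() feeds an iterator of text batches to train_from_iterator().
tok.fit(text_batch_iter=[corpus])
id_seqs = tok.encode(["hello world"])
print(tok.decode(id_seqs))  # assumes _post_process returns lists of token ids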
import json

from tokenizers import Tokenizer
from tokenizers.models import BPE
from tokenizers.pre_tokenizers import Whitespace
from tokenizers.trainers import BpeTrainer


def build_new_vocab():
    tokenizer = Tokenizer(BPE(unk_token="[UNK]"))
    trainer = BpeTrainer(
        special_tokens=["[UNK]", "[CLS]", "[SEP]", "[PAD]", "[MASK]"])
    tokenizer.pre_tokenizer = Whitespace()

    # files = [f"/daintlab/home/moo/NLU/biobert-pytorch/datasets/QA/BioASQ/BioASQ-{split}-factoid-7b.json" for split in ["train_split", "dev"]]
    files = "/daintlab/home/moo/NLU/biobert-pytorch/datasets/QA/BioASQ/BioASQ-train-factoid-7b.json"
    with open(files) as f:
        file = json.load(f)

    # Collect every paragraph context from the SQuAD-style BioASQ file.
    contexts = []
    for question in file['data']:
        for paragraph in question['paragraphs']:
            contexts.append(paragraph['context'])

    tokenizer.train_from_iterator(contexts, trainer)
    additional_vocab = [k for k, v in tokenizer.get_vocab().items()]
    tokenizer.save("tokenizer/tokenizer-bioasq.json")
    return additional_vocab
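# Hedged follow-up sketch: one plausible way to use the vocabulary returned by
# build_new_vocab() is to add it to a pretrained BioBERT tokenizer and resize
# the model embeddings. This downstream step is an assumption, not part of the
# original function, and the checkpoint name is only illustrative.
from transformers import AutoModelForQuestionAnswering, AutoTokenizer

additional_vocab = build_new_vocab()

hf_tokenizer = AutoTokenizer.from_pretrained("dmis-lab/biobert-base-cased-v1.1")
num_added = hf_tokenizer.add_tokens(additional_vocab)

model = AutoModelForQuestionAnswering.from_pretrained("dmis-lab/biobert-base-cased-v1.1")
model.resize_token_embeddings(len(hf_tokenizer))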
from typing import Any, Dict, List, Optional, Sequence, Tuple

import torch
from tokenizers import Tokenizer, decoders, pre_tokenizers, trainers
from tokenizers.models import BPE
from tokenizers.normalizers import BertNormalizer


class SentencePieceBPETokenizer:
    """Custom SentencePiece tokenizer"""

    unk_token = '<unk>'
    pad_token = '<pad>'

    def __init__(self,
                 vocab: Dict[str, int] = None,
                 merges: List[Tuple[str, str]] = None,
                 dropout: float = None,
                 max_length: Optional[int] = 64) -> None:
        """Constructor

        Args:
            vocab (Dict[str, int]): A dictionary of string keys and their ids.
            merges (List[Tuple[str, str]]): A list of pairs of tokens.
            dropout (float): BPE dropout
            max_length (int, optional): The max length at which to truncate.
                Defaults to `64`.
        """
        self.tokenizer = Tokenizer(
            BPE(vocab, merges, dropout=dropout, unk_token=self.unk_token))
        self.tokenizer.normalizer = BertNormalizer()  # noqa
        self.tokenizer.pre_tokenizer = pre_tokenizers.Metaspace()  # noqa
        self.tokenizer.decoder = decoders.Metaspace()  # noqa
        self.tokenizer.add_special_tokens([self.pad_token, self.unk_token])
        self.tokenizer.enable_padding(pad_token=self.pad_token)
        self.tokenizer.enable_truncation(max_length)

    @classmethod
    def train(cls,
              dataset: Sequence[str],
              vocab_size: int = 1000,
              min_frequency: int = 2,
              dropout: float = 0.0,
              max_length: Optional[int] = 64) -> 'SentencePieceBPETokenizer':
        instance = cls(dropout=dropout, max_length=max_length)
        trainer = trainers.BpeTrainer(
            vocab_size=vocab_size,
            min_frequency=min_frequency,
            special_tokens=[cls.pad_token, cls.unk_token])
        instance.tokenizer.train_from_iterator(dataset, trainer=trainer)
        # Disable BPE dropout after training so encoding is deterministic.
        instance.tokenizer.model.dropout = None
        return instance

    @property
    def vocab_size(self):
        return len(self.tokenizer.get_vocab())

    def serialize(self):
        return self.tokenizer.to_str()

    @classmethod
    def deserialize(cls, s: str) -> 'SentencePieceBPETokenizer':
        tokenizer = cls()
        tokenizer.tokenizer = Tokenizer.from_str(s)
        return tokenizer

    def encode(self, text: str) -> Dict[str, Any]:
        encoding = self.tokenizer.encode(text)
        outputs = {
            'ids': torch.tensor(encoding.ids),
            'mask': torch.tensor(encoding.attention_mask),
            'spans': encoding.offsets,
        }
        return outputs

    def encode_batch(self, batch: List[str]):
        encodings = self.tokenizer.encode_batch(batch)
        outputs = {
            'ids': torch.tensor([e.ids for e in encodings]),
            'mask': torch.tensor([e.attention_mask for e in encodings]),
            'spans': [e.offsets for e in encodings],
        }
        return outputs
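# Minimal usage sketch for the class above; the corpus is illustrative and
# dropout is left unset so training uses plain BPE.
corpus = ["the quick brown fox", "jumps over the lazy dog"]

sp_tok = SentencePieceBPETokenizer.train(corpus, vocab_size=100, dropout=None, max_length=16)
batch = sp_tok.encode_batch(["the quick fox", "a lazy dog"])
print(batch['ids'].shape, batch['mask'].shape)

# Round-trip through serialize/deserialize.
restored = SentencePieceBPETokenizer.deserialize(sp_tok.serialize())
print(restored.vocab_size == sp_tok.vocab_size)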