Example #1
    def __init__(self, path_to_text_file: str, tokenizer_in: Tokenizer,
                 tokenizer_out: Tokenizer, max_sequence_length: int, sep: str,
                 **kwargs):

        logger.info("Processing file: {}".format(path_to_text_file))

        self.pad_token_in = tokenizer_in.get_vocab()['<PAD>']
        self.pad_token_out = tokenizer_out.get_vocab()['<PAD>']
        self.max_sequence_length = max_sequence_length

        with open(path_to_text_file, "r") as file:
            texts = file.readlines()

        texts = list(map(lambda x: x.split(sep), texts))
        texts = list(map(lambda x: x[0:2], texts))

        self.texts_in = []
        self.texts_in_length = []
        self.texts_out = []

        for i in tqdm(range(len(texts)), desc="Tokenization...."):
            text_in_ids = tokenizer_in.encode(texts[i][0]).ids
            texts_out_ids = tokenizer_out.encode(texts[i][1]).ids

            # Keep only pairs where both sequences fit within the length limit.
            if (len(text_in_ids) <= max_sequence_length
                    and len(texts_out_ids) <= max_sequence_length):
                self.texts_in.append(text_in_ids)
                self.texts_in_length.append(len(text_in_ids))
                self.texts_out.append(texts_out_ids)

        logger.info("# Texts: {}".format(len(self.texts_in)))
Example #2
    def test_get_vocab(self):
        tokenizer = Tokenizer(BPE())
        tokenizer.add_tokens(["my", "name", "is", "john", "pair"])

        # Can retrieve vocab with added tokens
        vocab = tokenizer.get_vocab(with_added_tokens=True)
        assert vocab == {"is": 2, "john": 3, "my": 0, "name": 1, "pair": 4}

        # Can retrieve vocab without added tokens
        vocab = tokenizer.get_vocab(with_added_tokens=False)
        assert vocab == {}
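
For contrast, a short sketch of the same calls after training the model itself, using a tiny in-memory corpus that is purely illustrative:

from tokenizers import Tokenizer
from tokenizers.models import BPE
from tokenizers.pre_tokenizers import Whitespace
from tokenizers.trainers import BpeTrainer

tokenizer = Tokenizer(BPE(unk_token="[UNK]"))
tokenizer.pre_tokenizer = Whitespace()
trainer = BpeTrainer(special_tokens=["[UNK]"])
tokenizer.train_from_iterator(["my name is john"], trainer=trainer)

# Once the BPE model is trained, its own vocabulary is no longer empty,
# with or without the added tokens.
assert tokenizer.get_vocab(with_added_tokens=False) != {}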
Example #3
class HuggingFaceWordLevelTokenizer(TokenizerBase):
    def __init__(self, **kwargs):
        super().__init__(**kwargs)
        from tokenizers import Tokenizer, models, normalizers, pre_tokenizers

        self.tokenizer = Tokenizer(
            models.WordLevel(unk_token=self.unknown_token))
        self.tokenizer.pre_tokenizer = pre_tokenizers.Whitespace()
        if self.lower:
            self.tokenizer.normalizer = normalizers.Lowercase()

    def fit(self, *, texts=None, text_batch_iter=None, max_tokens=None):
        from tokenizers import trainers

        trainer = trainers.WordLevelTrainer(vocab_size=self.max_vocab_size,
                                            special_tokens=list(
                                                self.special_tokens))
        self.tokenizer.train_from_iterator(text_batch_iter, trainer=trainer)
        self.token_to_id = self.tokenizer.get_vocab()
        self.id_to_token = {
            token_id: token
            for token, token_id in self.token_to_id.items()
        }

    def encode(self, texts):
        id_seqs = self.tokenizer.encode_batch(texts)
        id_seqs = [id_seq.ids for id_seq in id_seqs]
        return self._post_process(
            id_seqs,
            pad_id=self.token_to_id[self.pad_token]
            if self.pad_token else None,
            sos_id=self.token_to_id[self.sos_token]
            if self.sos_token else None,
            eos_id=self.token_to_id[self.eos_token]
            if self.eos_token else None,
        )

    def decode(self, id_seqs):
        return self.tokenizer.decode_batch(id_seqs)
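
The wrapper above delegates to the tokenizers WordLevel model; stripped of the TokenizerBase plumbing, the underlying calls look roughly like this (special-token names and the vocabulary size are illustrative):

from tokenizers import Tokenizer, models, normalizers, pre_tokenizers, trainers

tok = Tokenizer(models.WordLevel(unk_token="<UNK>"))
tok.pre_tokenizer = pre_tokenizers.Whitespace()
tok.normalizer = normalizers.Lowercase()

trainer = trainers.WordLevelTrainer(vocab_size=10000,
                                    special_tokens=["<PAD>", "<UNK>"])
tok.train_from_iterator(["one small batch of texts"], trainer=trainer)

id_seqs = [enc.ids for enc in tok.encode_batch(["one small batch"])]
texts = tok.decode_batch(id_seqs)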
Example #4
import json

from tokenizers import Tokenizer
from tokenizers.models import BPE
from tokenizers.pre_tokenizers import Whitespace
from tokenizers.trainers import BpeTrainer


def build_new_vocab():

    tokenizer = Tokenizer(BPE(unk_token="[UNK]"))

    trainer = BpeTrainer(
        special_tokens=["[UNK]", "[CLS]", "[SEP]", "[PAD]", "[MASK]"])
    tokenizer.pre_tokenizer = Whitespace()

    # files = [f"/daintlab/home/moo/NLU/biobert-pytorch/datasets/QA/BioASQ/BioASQ-{split}-factoid-7b.json" for split in ["train_split", "dev"]]
    files = "/daintlab/home/moo/NLU/biobert-pytorch/datasets/QA/BioASQ/BioASQ-train-factoid-7b.json"

    with open(files) as f:
        file = json.load(f)
    contexts = []
    for question in file['data']:
        for paragraph in question['paragraphs']:
            contexts.append(paragraph['context'])

    tokenizer.train_from_iterator(contexts, trainer)
    additional_vocab = list(tokenizer.get_vocab().keys())

    tokenizer.save("tokenizer/tokenizer-bioasq.json")
    return additional_vocab
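
The saved file can be reloaded later with Tokenizer.from_file; a brief sketch follows (the path matches the save call above, the question string is illustrative):

from tokenizers import Tokenizer

tokenizer = Tokenizer.from_file("tokenizer/tokenizer-bioasq.json")
encoding = tokenizer.encode("Which gene is mutated in cystic fibrosis?")
print(encoding.tokens)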
Example #5
from typing import Any, Dict, List, Optional, Sequence, Tuple

import torch
from tokenizers import Tokenizer, decoders, pre_tokenizers, trainers
from tokenizers.models import BPE
from tokenizers.normalizers import BertNormalizer


class SentencePieceBPETokenizer:
    """Custom SentencePiece tokenizer"""
    unk_token = '<unk>'
    pad_token = '<pad>'

    def __init__(self,
                 vocab: Optional[Dict[str, int]] = None,
                 merges: Optional[List[Tuple[str, str]]] = None,
                 dropout: Optional[float] = None,
                 max_length: Optional[int] = 64) -> None:
        """Constructor

        Args:
            vocab (Dict[str, int]): A dictionary of string keys and their ids.
            merges (List[Tuple[str, str]]): A list of pairs of tokens.
            dropout (float): BPE dropout
            max_length (int, optional): The max length at which to truncate.
                Defaults to `64`.
        """
        self.tokenizer = Tokenizer(
            BPE(vocab, merges, dropout=dropout, unk_token=self.unk_token))
        self.tokenizer.normalizer = BertNormalizer()  # noqa
        self.tokenizer.pre_tokenizer = pre_tokenizers.Metaspace()  # noqa
        self.tokenizer.decoder = decoders.Metaspace()  # noqa
        self.tokenizer.add_special_tokens([self.pad_token, self.unk_token])

        self.tokenizer.enable_padding(pad_token=self.pad_token)
        self.tokenizer.enable_truncation(max_length)

    @classmethod
    def train(cls,
              dataset: Sequence[str],
              vocab_size: int = 1000,
              min_frequency: int = 2,
              dropout: float = 0.0,
              max_length: Optional[int] = 64) -> 'SentencePieceBPETokenizer':
        instance = cls(dropout=dropout, max_length=max_length)
        trainer = trainers.BpeTrainer(
            vocab_size=vocab_size,
            min_frequency=min_frequency,
            special_tokens=[cls.pad_token, cls.unk_token])
        instance.tokenizer.train_from_iterator(dataset, trainer=trainer)
        instance.tokenizer.model.dropout = None
        return instance

    @property
    def vocab_size(self):
        return len(self.tokenizer.get_vocab())

    def serialize(self):
        return self.tokenizer.to_str()

    @classmethod
    def deserialize(cls, s: str) -> 'SentencePieceBPETokenizer':
        tokenizer = cls()
        tokenizer.tokenizer = Tokenizer.from_str(s)
        return tokenizer

    def encode(self, text: str) -> Dict[str, Any]:
        encoding = self.tokenizer.encode(text)
        outputs = {
            'ids': torch.tensor(encoding.ids),
            'mask': torch.tensor(encoding.attention_mask),
            'spans': encoding.offsets,
        }
        return outputs

    def encode_batch(self, batch: List[str]):
        encodings = self.tokenizer.encode_batch(batch)
        outputs = {
            'ids': torch.tensor([e.ids for e in encodings]),
            'mask': torch.tensor([e.attention_mask for e in encodings]),
            'spans': [e.offsets for e in encodings],
        }
        return outputs
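
A short usage sketch of the class above, assuming torch and the tokenizers imports are available; the training sentences below are placeholders:

corpus = ["hello world", "hello there world", "general kenobi"]
tok = SentencePieceBPETokenizer.train(corpus, vocab_size=100, dropout=None)  # no BPE dropout

batch = tok.encode_batch(["hello world", "general kenobi"])
print(batch['ids'].shape)  # padded to the longest sequence in the batch

# Round-trip through the string serialization.
restored = SentencePieceBPETokenizer.deserialize(tok.serialize())
print(restored.encode("hello world")['ids'])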