Ejemplo n.º 1
0
 def test_instantiate(self, roberta_files):
     """Check that WordLevel instances are both ``Model`` and ``WordLevel``.

     Covers construction without arguments and construction from the
     ``vocab`` entry of the ``roberta_files`` fixture.
     """
     for expected_type in (Model, WordLevel):
         assert isinstance(WordLevel(), expected_type)

     # WordLevel reads a vocab.json in the same format roberta uses, so the
     # roberta vocabulary file is enough to exercise file loading.
     vocab_path = roberta_files["vocab"]
     for expected_type in (Model, WordLevel):
         assert isinstance(WordLevel(vocab_path), expected_type)
Ejemplo n.º 2
0
def get_recurrent_tokenizer(vocab,
                            max_context_tokens,
                            unk_token,
                            pad_token,
                            device="cpu"):
    """
    Build the question/context tokenizer pair used with recurrent-based models.

    Both tokenizers are WordLevel tokenizers over ``vocab`` with the same
    normalization, pre-tokenization and right-side padding. Only the context
    tokenizer additionally truncates to ``max_context_tokens``. The pair is
    wrapped in a ``RecurrentSquadTokenizer`` created with ``device``.
    """

    def _new_padded_tokenizer():
        # One fully configured tokenizer; called once per role so the two
        # instances stay independent of each other.
        tok = Tokenizer(WordLevel(vocab, unk_token=unk_token))
        tok.normalizer = Sequence([StripAccents(), Lowercase(), Strip()])
        tok.pre_tokenizer = PreSequence([Whitespace(), Punctuation()])
        tok.enable_padding(
            direction="right",
            pad_id=vocab[pad_token],
            pad_type_id=1,
            pad_token=pad_token,
        )
        return tok

    question_tokenizer = _new_padded_tokenizer()

    context_tokenizer = _new_padded_tokenizer()
    context_tokenizer.enable_truncation(max_context_tokens)

    return RecurrentSquadTokenizer(question_tokenizer,
                                   context_tokenizer,
                                   device=device)
Ejemplo n.º 3
0
    def test_can_modify(self):
        """The ``unk_token`` attribute of a WordLevel model is readable and writable."""
        initial_unk, replacement_unk = "<oov>", "<unk>"

        model = WordLevel(unk_token=initial_unk)
        assert model.unk_token == initial_unk

        # Reassign the attribute and read it back
        model.unk_token = replacement_unk
        assert model.unk_token == replacement_unk
Ejemplo n.º 4
0
    def __init__(
        self,
        vocab_file: Optional[str] = None,
        unk_token: Union[str, AddedToken] = "[UNK]",
        pad_token: Union[str, AddedToken] = "[PAD]",
        mask_token: Union[str, AddedToken] = "[MASK]",
        lowercase: bool = False,
        unicode_normalizer: Optional[str] = None,
    ):
        """Assemble a whitespace-split WordLevel tokenizer.

        Args:
            vocab_file: Vocabulary handed to ``WordLevel``; when ``None`` the
                model is created without a vocabulary.
            unk_token: Unknown token passed to the model.
            pad_token: Padding token, registered as special if in the vocab.
            mask_token: Mask token, registered as special if in the vocab.
            lowercase: Append a ``Lowercase`` normalizer when true.
            unicode_normalizer: Name resolved through
                ``unicode_normalizer_from_str``; applied before lowercasing.
        """
        if vocab_file is None:
            model = WordLevel(unk_token=unk_token)
        else:
            logging.info(f"Initiating tokenizer at {vocab_file}")
            model = WordLevel(vocab=vocab_file, unk_token=unk_token)
        tokenizer = Tokenizer(model)

        # Only tokens already present in the vocabulary are flagged as special.
        for special in (unk_token, pad_token, mask_token):
            special_text = str(special)
            if tokenizer.token_to_id(special_text) is not None:
                tokenizer.add_special_tokens([special_text])

        # Unicode normalization goes first, lowercasing second.
        steps = []
        if unicode_normalizer:
            steps.append(unicode_normalizer_from_str(unicode_normalizer))
        if lowercase:
            steps.append(Lowercase())

        # A single step is installed directly; several are wrapped in a Sequence.
        if len(steps) == 1:
            tokenizer.normalizer = steps[0]
        elif steps:
            tokenizer.normalizer = Sequence(steps)

        tokenizer.pre_tokenizer = pre_tokenizers.WhitespaceSplit()

        super().__init__(
            tokenizer,
            {
                "model": "WordLevel",
                "unk_token": unk_token,
                "pad_token": pad_token,
                "mask_token": mask_token,
                "lowercase": lowercase,
                "unicode_normalizer": unicode_normalizer,
            },
        )
Ejemplo n.º 5
0
    def __init__(
        self,
        vocab_file,
        delimiter,
        lowercase,
        unk_token,
        eos_token,
        add_eos=False,
        add_double_eos=False,
        normalization: Optional[str] = None,
    ):
        """Build a WordLevel tokenizer from ``vocab_file``.

        Args:
            vocab_file: Vocabulary handed to ``WordLevel``.
            delimiter: Character to split on; when falsy the input is split
                with ``WhitespaceSplit`` instead.
            lowercase: Add a ``Lowercase`` normalizer when truthy.
            unk_token: Unknown token passed to the model.
            eos_token: Token placed on both sides of the ``BertProcessing``
                post-processor when ``add_double_eos`` is set.
            add_eos: Only recorded in the parameters given to the parent class.
            add_double_eos: Install the ``BertProcessing`` post-processor.
            normalization: Unicode normalizer name, resolved through
                ``unicode_normalizer_from_str``.

        Raises:
            ValueError: If the model cannot be built from ``vocab_file``; the
                underlying error is attached as ``__cause__``.
        """
        try:
            tokenizer = Tokenizer(WordLevel(vocab_file, unk_token=unk_token))
        except Exception as err:
            # Chain the original error so the real parsing failure stays visible.
            raise ValueError(
                "Unable to parse file {}. Unknown format. "
                "If you tried to load a model saved through TransfoXLTokenizer, "
                "please note they are not compatible.".format(vocab_file)
            ) from err

        # Normalization order: unicode normalization, lowercasing, then strip.
        normalizer = []
        if normalization:
            normalizer.append(unicode_normalizer_from_str(normalization))
        if lowercase:
            normalizer.append(Lowercase())
        normalizer.append(Strip(left=True, right=True))

        # Strip is always present, so the list is never empty here.
        tokenizer.normalizer = Sequence(normalizer) if len(normalizer) > 1 else normalizer[0]

        # Setup the splitter
        tokenizer.pre_tokenizer = CharDelimiterSplit(delimiter) if delimiter else WhitespaceSplit()

        if add_double_eos:
            eos_pair = (eos_token, tokenizer.token_to_id(eos_token))
            tokenizer.post_processor = BertProcessing(eos_pair, eos_pair)

        parameters = {
            "model": "TransfoXLModel",
            "add_eos": add_eos,
            "add_double_eos": add_double_eos,
            "unk_token": unk_token,
            "eos_token": eos_token,
            "delimiter": delimiter,
            "lowercase": lowercase,
        }

        super().__init__(tokenizer, parameters)
Ejemplo n.º 6
0
    def __create_tokenizer(self, files):
        """Train a whitespace-split WordLevel tokenizer on ``files`` and return it."""
        print("Preparing tokenizer...")

        special_tokens = ["[UNK]", "[CLS]", "[SEP]", "[PAD]", "[MASK]"]

        word_level_tokenizer = Tokenizer(WordLevel(unk_token="[UNK]"))
        word_level_tokenizer.pre_tokenizer = WhitespaceSplit()
        word_level_tokenizer.train(
            files=files,
            trainer=WordLevelTrainer(special_tokens=special_tokens),
        )
        return word_level_tokenizer
Ejemplo n.º 7
0
def get_model(name: str):
    """Return a new tokenizer model for the given short model name.

    Recognized names are "wordpiece", "bpe", "unigram" and "word". Every
    model except ``Unigram`` is created with ``UNK_TOKEN`` as its unknown
    token.

    Raises:
        AssertionError: If ``name`` is not one of the recognized names.
    """
    if name == "wordpiece":
        return WordPiece(unk_token=UNK_TOKEN)
    if name == "bpe":
        return BPE(unk_token=UNK_TOKEN)
    if name == "unigram":
        return Unigram()
    if name == "word":
        return WordLevel(unk_token=UNK_TOKEN)
    raise AssertionError(f"{name} type model is not granted.")
Ejemplo n.º 8
0
 def test_works_in_simple_pipeline(self):
     """The default pre-tokenizer feeds a WordLevel model; unknown words map to id 0."""
     pretok = self.dict.pre_tokenizer()
     # Ids follow list position: "[UNK]" -> 0, "京都" -> 1, "に" -> 2, "行く" -> 3.
     vocab = {
         word: idx
         for idx, word in enumerate(["[UNK]", "京都", "に", "行く"])
     }
     tok = tokenizers.Tokenizer(WordLevel(vocab, unk_token="[UNK]"))
     tok.pre_tokenizer = pretok
     encoding = tok.encode("京都へ行く")
     self.assertEqual(encoding.ids, [1, 0, 3])
Ejemplo n.º 9
0
 def test_with_handler(self):
     """A custom handler can replace the pre-tokenizer output with its own strings."""

     def _handler(index, sentence: tokenizers.NormalizedString, ml: MorphemeList):
         # Emit two pieces: the first part-of-speech entry of the first
         # morpheme, and the number of morphemes as text.
         first_pos = ml[0].part_of_speech()[0]
         morpheme_count = str(len(ml))
         return [
             tokenizers.NormalizedString(first_pos),
             tokenizers.NormalizedString(morpheme_count),
         ]

     pretok = self.dict.pre_tokenizer(sudachipy.SplitMode.A, handler=_handler)
     vocab = {"[UNK]": 0, "名詞": 6, "4": 7}
     tok = tokenizers.Tokenizer(WordLevel(vocab, unk_token="[UNK]"))
     tok.pre_tokenizer = pretok
     encoding = tok.encode("外国人参政権")
     self.assertEqual(encoding.ids, [6, 7])
Ejemplo n.º 10
0
 def test_works_with_different_split_mode(self):
     """With SplitMode.A the compound is encoded as four separate vocabulary ids."""
     text = "外国人参政権"
     pretok = self.dict.pre_tokenizer(sudachipy.SplitMode.A)
     # The whole compound is in the vocabulary too (id 4), but the expected
     # ids below contain only the ids of the shorter entries.
     vocab = {
         "[UNK]": 0,
         "外国": 1,
         "参政": 2,
         "権": 3,
         "人": 5,
         text: 4,
     }
     tok = tokenizers.Tokenizer(WordLevel(vocab, unk_token="[UNK]"))
     tok.pre_tokenizer = pretok
     encoding = tok.encode(text)
     self.assertEqual(encoding.ids, [1, 5, 2, 3])
Ejemplo n.º 11
0
    def __init__(
        self,
        vocab_file,
        delimiter,
        lowercase,
        unk_token,
        eos_token,
        add_eos=False,
        add_double_eos=False,
        normalization: Optional[str] = None,
    ):
        """Build a WordLevel tokenizer via ``WordLevel.from_files``.

        Args:
            vocab_file: Vocabulary file passed to ``WordLevel.from_files``.
            delimiter: Character to split on; when falsy the input is split
                with ``WhitespaceSplit`` instead.
            lowercase: Add a ``Lowercase`` normalizer when truthy.
            unk_token: Unknown token passed to the model.
            eos_token: Token used on both sides of the ``BertProcessing``
                post-processor when ``add_double_eos`` is set.
            add_eos: Only recorded in the parameters given to the parent class.
            add_double_eos: Install the ``BertProcessing`` post-processor.
            normalization: Unicode normalizer name, resolved through
                ``unicode_normalizer_from_str``.
        """
        tokenizer = Tokenizer(WordLevel.from_files(vocab_file, unk_token=unk_token))

        # Unicode normalization first, case normalization second.
        steps = []
        if normalization:
            steps.append(unicode_normalizer_from_str(normalization))
        if lowercase:
            steps.append(Lowercase())

        # No steps: leave the normalizer unset. One step: use it directly.
        if len(steps) == 1:
            tokenizer.normalizer = steps[0]
        elif steps:
            tokenizer.normalizer = Sequence(steps)

        # Setup the splitter
        if delimiter:
            tokenizer.pre_tokenizer = CharDelimiterSplit(delimiter)
        else:
            tokenizer.pre_tokenizer = WhitespaceSplit()

        if add_double_eos:
            eos_pair = (eos_token, tokenizer.token_to_id(eos_token))
            tokenizer.post_processor = BertProcessing(eos_pair, eos_pair)

        super().__init__(
            tokenizer,
            {
                "model": "TransfoXLModel",
                "add_eos": add_eos,
                "add_double_eos": add_double_eos,
                "unk_token": unk_token,
                "eos_token": eos_token,
                "delimiter": delimiter,
                "lowercase": lowercase,
            },
        )
Ejemplo n.º 12
0
    def test_instantiate(self, roberta_files):
        """WordLevel can be built empty, from a dict, via ``from_file``, or from a path.

        Passing the vocabulary path straight to the constructor is expected
        to emit a deprecation warning.
        """
        expected_types = (Model, WordLevel)

        for expected in expected_types:
            assert isinstance(WordLevel(), expected)

        vocab = {"a": 0, "b": 1, "ab": 2}
        for expected in expected_types:
            assert isinstance(WordLevel(vocab), expected)
        assert isinstance(WordLevel.from_file(roberta_files["vocab"]),
                          WordLevel)

        # WordLevel reads a vocab.json in the same format roberta uses, so the
        # roberta vocabulary file is enough to exercise path-based loading.
        for expected in expected_types:
            with pytest.deprecated_call():
                assert isinstance(WordLevel(roberta_files["vocab"]), expected)
def main(args):
    """Train a WordLevel tokenizer over a fixed list of IPA symbols and save it.

    The training data is the symbol list joined by spaces, once in its
    original order and three more times in shuffled order. The trained
    tokenizer is written to ``args.outdir + '/ipa_tokenizer.json'``.

    Args:
        args: Object exposing an ``outdir`` attribute (a string that the
            output filename is appended to).
    """
    # copy from https://github.com/xinjli/allosaurus
    ipa0 = [
        'I', 'a', 'aː', 'ã', 'ă', 'b', 'bʲ', 'bʲj', 'bʷ', 'bʼ', 'bː', 'b̞',
        'b̤', 'b̥', 'c', 'd', 'dʒ', 'dʲ', 'dː', 'd̚', 'd̥', 'd̪', 'd̯', 'd͡z',
        'd͡ʑ', 'd͡ʒ', 'd͡ʒː', 'd͡ʒ̤', 'e', 'eː', 'e̞', 'f', 'fʲ', 'fʷ', 'fː',
        'g', 'gʲ', 'gʲj', 'gʷ', 'gː', 'h', 'hʷ', 'i', 'ij', 'iː', 'i̞', 'i̥',
        'i̯', 'j', 'k', 'kx', 'kʰ', 'kʲ', 'kʲj', 'kʷ', 'kʷʼ', 'kʼ', 'kː',
        'k̟ʲ', 'k̟̚', 'k͡p̚', 'l', 'lʲ', 'lː', 'l̪', 'm', 'mʲ', 'mʲj', 'mʷ',
        'mː', 'n', 'nj', 'nʲ', 'nː', 'n̪', 'n̺', 'o', 'oː', 'o̞', 'o̥', 'p',
        'pf', 'pʰ', 'pʲ', 'pʲj', 'pʷ', 'pʷʼ', 'pʼ', 'pː', 'p̚', 'q', 'r', 'rː',
        's', 'sʲ', 'sʼ', 'sː', 's̪', 't', 'ts', 'tsʰ', 'tɕ', 'tɕʰ', 'tʂ',
        'tʂʰ', 'tʃ', 'tʰ', 'tʲ', 'tʷʼ', 'tʼ', 'tː', 't̚', 't̪', 't̪ʰ', 't̪̚',
        't͡s', 't͡sʼ', 't͡ɕ', 't͡ɬ', 't͡ʃ', 't͡ʃʲ', 't͡ʃʼ', 't͡ʃː', 'u', 'uə',
        'uː', 'u͡w', 'v', 'vʲ', 'vʷ', 'vː', 'v̞', 'v̞ʲ', 'w', 'x', 'x̟ʲ', 'y',
        'z', 'zj', 'zʲ', 'z̪', 'ä', 'æ', 'ç', 'çj', 'ð', 'ø', 'ŋ', 'ŋ̟', 'ŋ͡m',
        'œ', 'œ̃', 'ɐ', 'ɐ̞', 'ɑ', 'ɑ̱', 'ɒ', 'ɓ', 'ɔ', 'ɔ̃', 'ɕ', 'ɕː', 'ɖ̤',
        'ɗ', 'ə', 'ɛ', 'ɛ̃', 'ɟ', 'ɡ', 'ɡʲ', 'ɡ̤', 'ɡ̥', 'ɣ', 'ɣj', 'ɤ', 'ɤɐ̞',
        'ɤ̆', 'ɥ', 'ɦ', 'ɨ', 'ɪ', 'ɫ', 'ɯ', 'ɯ̟', 'ɯ̥', 'ɰ', 'ɱ', 'ɲ', 'ɳ',
        'ɴ', 'ɵ', 'ɸ', 'ɹ', 'ɹ̩', 'ɻ', 'ɻ̩', 'ɽ', 'ɾ', 'ɾj', 'ɾʲ', 'ɾ̠', 'ʀ',
        'ʁ', 'ʁ̝', 'ʂ', 'ʃ', 'ʃʲː', 'ʃ͡ɣ', 'ʈ', 'ʉ̞', 'ʊ', 'ʋ', 'ʋʲ', 'ʌ', 'ʎ',
        'ʏ', 'ʐ', 'ʑ', 'ʒ', 'ʒ͡ɣ', 'ʔ', 'ʝ', 'ː', 'β', 'β̞', 'θ', 'χ', 'ә', 'ḁ'
    ]
    # Three independent copies, each shuffled in place; ipa0 keeps its order.
    ipa1, ipa2, ipa3 = ipa0.copy(), ipa0.copy(), ipa0.copy()
    random.shuffle(ipa1)
    random.shuffle(ipa2)
    random.shuffle(ipa3)
    # Each passage is one space-separated string containing every symbol once.
    passage0 = ' '.join(ipa0)
    passage1 = ' '.join(ipa1)
    passage2 = ' '.join(ipa2)
    passage3 = ' '.join(ipa3)
    data = [passage0, passage1, passage2, passage3]
    # setup: the model's unknown token is also listed among the trainer's special tokens
    tokenizer = Tokenizer(WordLevel(unk_token="<unk>"))
    # trainer = WordLevelTrainer(vocab_size=300, special_tokens=["[UNK]", "[CLS]", "[SEP]", "[PAD]", "[MASK]"])
    trainer = WordLevelTrainer(
        vocab_size=300,
        special_tokens=["<s>", "<pad>", "</s>", "<unk>", "<mask>"])
    tokenizer.pre_tokenizer = Whitespace()
    # train the tokenizer
    tokenizer.train_from_iterator(data, trainer=trainer)
    # The output path is built by plain string concatenation with args.outdir.
    tokenizer.save(args.outdir + '/ipa_tokenizer.json')
Ejemplo n.º 14
0
    def __init__(
        self,
        vocab_file,
        sep_token="<sep>",
        cls_token="<cls>",
        pad_token="<pad>",
        mask_token="<mask>",
        lowercase: bool = True,
        unk_token="<unk>",
    ):
        """Build a space-delimited WordLevel tokenizer from ``vocab_file``.

        Args:
            vocab_file: Vocabulary handed to ``WordLevel``.
            sep_token: Separator token, registered as special if in the vocab.
            cls_token: Classifier token, registered as special if in the vocab.
            pad_token: Padding token, registered as special if in the vocab.
            mask_token: Mask token, registered as special if in the vocab.
            lowercase: Only recorded in the parameters given to the parent
                class; no lowercasing normalizer is installed here.
            unk_token: Unknown token passed to the model. Added as the last
                parameter so existing positional callers are unaffected; the
                body previously referenced this name without defining it.
        """
        tokenizer = Tokenizer(WordLevel(vocab_file, unk_token=unk_token))
        tokenizer.normalizer = Strip()
        tokenizer.pre_tokenizer = CharDelimiterSplit(" ")

        # NOTE(review): the post-processor uses the literal "</s>" / "<s>"
        # tokens rather than sep_token / cls_token - confirm this is intended.
        tokenizer.post_processor = BertProcessing(
            ("</s>", tokenizer.token_to_id("</s>")),
            ("<s>", tokenizer.token_to_id("<s>")),
        )
        tokenizer.enable_truncation(max_length=512)

        # Let the tokenizer know about special tokens if they are part of the vocab
        for special in (unk_token, sep_token, cls_token, pad_token, mask_token):
            special_text = str(special)
            if tokenizer.token_to_id(special_text) is not None:
                tokenizer.add_special_tokens([special_text])

        parameters = {
            "model": "WordLevel",
            "unk_token": unk_token,
            "sep_token": sep_token,
            "cls_token": cls_token,
            "pad_token": pad_token,
            "mask_token": mask_token,
            "lowercase": lowercase,
        }

        super().__init__(tokenizer, parameters)
Ejemplo n.º 15
0
from tokenizers import Tokenizer
from tokenizers.pre_tokenizers import Whitespace
from tokenizers.models import WordLevel

# Build a WordLevel tokenizer from a newline-separated vocabulary file and
# save it as a tokenizer JSON file.
VOCAB_FILE = "data/tx1_vocab.txt"

with open(VOCAB_FILE, "r") as f:
    # De-duplicate, drop the two reserved tokens so they cannot overwrite
    # ids 0 and 1 below, and sort: iteration order of a set of strings is not
    # stable between interpreter runs, which would otherwise give every run
    # a different token-to-id mapping.
    words = sorted(set(f.read().strip().split("\n")) - {"<pad>", "<unk>"})

# "<pad>" gets id 0 and "<unk>" id 1; vocabulary words follow from id 2.
vocab = {word: i for i, word in enumerate(["<pad>", "<unk>"] + words)}

tokenizer = Tokenizer(WordLevel(vocab, unk_token="<unk>"))
tokenizer.enable_padding(pad_token="<pad>")
tokenizer.pre_tokenizer = Whitespace()

tokenizer.save("data/tokenizer-LakhNES-tx1.json")
Ejemplo n.º 16
0
import json

# Locations of the dataset, its vocabulary and the tokenizer output directories.
data_path = Path('/workspace/poetry2021.gt/data/pan_tadeusz5')
dataset_path = data_path.joinpath('dataset')
vocab_path = data_path.joinpath('vocab.json')
tokenizer_tmp_path = data_path.joinpath('tokenizer_tmp')
tokenizer_path = data_path.joinpath('tokenizer')

text_tokenizer = TextTokenizer(dataset_path)
text_tokenizer.load_vocab(vocab_path)

# Append the end-of-text token with the next free id.
vocab = text_tokenizer.vocab
vocab_count = len(vocab)
vocab['<|endoftext|>'] = vocab_count

tokenizer_tmp = Tokenizer(WordLevel(text_tokenizer.vocab))
tokenizer_tmp.pre_tokenizer = CharDelimiterSplit(' ')

# The same (token, id) pair is used for both BertProcessing arguments.
endoftext_pair = ("<|endoftext|>", tokenizer_tmp.token_to_id("<|endoftext|>"))
tokenizer_tmp.post_processor = BertProcessing(endoftext_pair, endoftext_pair)

tokenizer_tmp_path.mkdir(parents=True, exist_ok=True)
tokenizer_tmp.save(str(tokenizer_tmp_path / "tokenizer.json"))

# Re-create as GPT2 compatible tokenizer


class GPT2CompatibleTokenizer(PreTrainedTokenizerFast):
    def save_vocabulary(self,
Ejemplo n.º 17
0
    "CHEF_CHECK": 6,
    "CHEF_DO": 7,
    "MOVE_CONTENTS": 8,
}
# Next id to hand out, continuing after the entries already in output_vocab.
k = len(output_vocab)
# Add every key of res2idx.json; the ids stored in the file (i) are not used.
with open("../data/res2idx.json", 'r') as f:
    for w, i in json.load(f).items():
        output_vocab[w] = k
        k += 1
# Same for arg2idx.json, with '-' replaced by '_' in each key.
with open("../data/arg2idx.json", 'r') as f:
    for w, i in json.load(f).items():
        output_vocab[w.replace('-', '_')] = k
        k += 1

# Renumber all keys by iteration order. k advances even when a key was already
# present, so the ids assigned above are not guaranteed to be consecutive.
output_vocab = {w: i for i, w in enumerate(output_vocab)}
# NOTE(review): no unk_token is passed to WordLevel here - confirm that inputs
# never contain words outside output_vocab.
output_tokenizer = Tokenizer(WordLevel(output_vocab, ))
output_tokenizer.pre_tokenizer = Whitespace()

# Quick manual check of the output tokenizer on two sample command sequences.
t = output_tokenizer.encode_batch(
    ["SERVE MOVE_CONTENTS", "SERVE MOVE_CONTENTS PUT"])
# print (t)

csv_file = '../data/seq2seq_4335716.csv'
# The input side uses BERT's tokenizer, with its CLS/SEP tokens also assigned
# as bos/eos tokens.
input_tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')
input_tokenizer.bos_token = input_tokenizer.cls_token
input_tokenizer.eos_token = input_tokenizer.sep_token

# Split the single 'train' split of the CSV: last 10% for validation, first 90% for training.
val_data = load_dataset('csv', data_files=csv_file, split='train[90%:]')
train_data = load_dataset('csv', data_files=csv_file, split='train[:90%]')
# print(val_data)
# print(train_data)
from tokenizers import Tokenizer, normalizers
from tokenizers.models import WordLevel
from tokenizers.pre_tokenizers import Whitespace
from tokenizers.trainers import WordLevelTrainer
from tokenizers.processors import TemplateProcessing

# Whitespace-split WordLevel tokenizer trained on a single TSV file.
t = Tokenizer(WordLevel(unk_token="[UNK]"))
t.pre_tokenizer = Whitespace()

trainer = WordLevelTrainer(
    special_tokens=["[UNK]", "[PAD]", "[CLS]", "[SEP]"],
)

# Single-sequence template only; no pair template is configured.
# The ids 2 and 3 are hard-coded and presumably rely on the position of
# "[CLS]" and "[SEP]" in the trainer's special-token list - verify after training.
t.post_processor = TemplateProcessing(
    single="[CLS] $A [SEP]",
    special_tokens=[("[CLS]", 2), ("[SEP]", 3)],
)

files = ['tok-train-shuf-tgt.tsv']
t.train(files, trainer)

t.save("code_tokenizer.json")
Ejemplo n.º 19
0
def train_tokenizer_vocab(dataset, style='BPE', force_retrain=True):
    """
    Load or train the tokenizer for ``dataset`` / ``style``.

    Args:
        dataset: One of ``VALID_DATASETS``.
        style: One of ``VALID_TOKENIZATIONS`` ('BPE' or 'WordLevel' are the
            styles handled here).
        force_retrain: If true, always retrain and overwrite the stored
            tokenizer file. Otherwise load the stored tokenizer when its file
            exists, and train only when it does not.

    Returns:
        ``(tokenizer, vocab)``; ``vocab`` is currently always ``None``.
    """
    assert dataset in VALID_DATASETS
    assert style in VALID_TOKENIZATIONS

    tpath_expected = default_tpath(dataset, style)
    tokenizer_exists = os.path.isfile(tpath_expected)

    if tokenizer_exists and not force_retrain:
        tokenizer = Tokenizer.from_file(tpath_expected)
    else:
        # Report the actual reason for training; a forced retrain of an
        # existing file used to be reported as "does not exist".
        if tokenizer_exists:
            print('%s tokenizer file exists but force_retrain is set; '
                  'retraining tokenizer' % tpath_expected)
        else:
            print('%s tokenizer file does not exist; training new tokenizer' %
                  tpath_expected)

        # load data associated with one of the valid datasets (from /data/ directory)
        datafiles = load_dataset(dataset)

        # Steps for each algo (e.g. BPE):
        # - init Tokenizer using algo
        # - specify algo specific trainer
        # - specify any pre-processing of text (will affect decoding)
        #   see: https://huggingface.co/docs/tokenizers/python/latest/components.html#decoders
        # - different training calls if its the arxiv dataset or wikitext
        #   see https://blog.einstein.ai/the-wikitext-long-term-dependency-language-modeling-dataset/
        special_tokens = ["[UNK]", "[CLS]", "[SEP]", "[PAD]", "[MASK]"]

        if style == 'BPE':
            tokenizer = Tokenizer(BPE(unk_token="[UNK]"))
            trainer = BpeTrainer(special_tokens=special_tokens)
            tokenizer.pre_tokenizer = ByteLevel()
            make_decoder = decoders.ByteLevel
        else:
            assert style == 'WordLevel'
            tokenizer = Tokenizer(WordLevel(unk_token="[UNK]"))
            trainer = WordLevelTrainer(special_tokens=special_tokens)
            tokenizer.pre_tokenizer = Whitespace()
            # WordPiece seems to work (adds back spaces)
            make_decoder = decoders.WordPiece

        # 'arxiv' data is trained from an iterator; other datasets go through train().
        if dataset == 'arxiv':
            tokenizer.train_from_iterator(datafiles, trainer=trainer)
        else:
            tokenizer.train(datafiles, trainer=trainer)
        tokenizer.decoder = make_decoder()

        # Save to tokenizers directory
        tokenizer.save(tpath_expected)

    # Generate vocab object based on tokenizer.decoder() method
    # ... TODO implement the same vocabulary functionality, or ensure it is present in Tokenizer and then code it elsewhere...
    # Features we need to match:
    #   from torchtext.legacy.vocab import Vocab as RetiredVocab
    #   ntokens = len(vocab.stoi) ---> ntokens = tokenizer.(...)
    #   data = [torch.tensor([vocab[token] for token in tokenizer(item)],
    #                         dtype=torch.long) for item in raw_text_iter]
    #   tokenized_text_ints = torch.tensor([vocab[token] for token in tokenized_text], dtype=torch.long)
    #   running_context_string = ' '.join([vocab.itos[src[k]] for k in range(src.shape[0])])
    #   unk_index = vocab.unk_index
    vocab = None

    return tokenizer, vocab
Ejemplo n.º 20
0
    def __init__(
        self,
        vocab_file: Optional[str] = None,
        unk_token: Union[str, AddedToken] = "<unk>",
        sep_token: Union[str, AddedToken] = "<sep>",
        cls_token: Union[str, AddedToken] = "<cls>",
        pad_token: Union[str, AddedToken] = "<pad>",
        mask_token: Union[str, AddedToken] = "<mask>",
        lowercase: bool = False,
        unicode_normalizer: Optional[str] = None,
    ):
        """Assemble a whitespace-split WordLevel tokenizer with BERT-style post-processing.

        Args:
            vocab_file: Vocabulary handed to ``WordLevel``; when ``None`` the
                model is created without a vocabulary and no post-processor
                is installed.
            unk_token: Unknown token; passed to the model so that the model
                uses the same unknown token as this wrapper.
            sep_token: Separator token used by the post-processor.
            cls_token: Classifier token used by the post-processor.
            pad_token: Padding token, registered as special if in the vocab.
            mask_token: Mask token, registered as special if in the vocab.
            lowercase: Append a ``Lowercase`` normalizer when true.
            unicode_normalizer: Name resolved through
                ``unicode_normalizer_from_str``; applied before lowercasing.

        Raises:
            TypeError: If ``vocab_file`` is given but ``sep_token`` or
                ``cls_token`` is not in the vocabulary.
        """
        # The model previously ignored unk_token and always used its own default.
        if vocab_file is not None:
            tokenizer = Tokenizer(WordLevel(vocab_file, unk_token=str(unk_token)))
        else:
            tokenizer = Tokenizer(WordLevel(unk_token=str(unk_token)))

        # Let the tokenizer know about special tokens if they are part of the vocab
        for special in (unk_token, sep_token, cls_token, pad_token, mask_token):
            special_text = str(special)
            if tokenizer.token_to_id(special_text) is not None:
                tokenizer.add_special_tokens([special_text])

        # Check for Unicode normalization first (before everything else)
        normalizers = []
        if unicode_normalizer:
            normalizers.append(unicode_normalizer_from_str(unicode_normalizer))
        if lowercase:
            normalizers.append(Lowercase())

        # A single normalizer is installed directly; several are wrapped in a Sequence.
        if len(normalizers) == 1:
            tokenizer.normalizer = normalizers[0]
        elif normalizers:
            tokenizer.normalizer = Sequence(normalizers)

        tokenizer.pre_tokenizer = pre_tokenizers.WhitespaceSplit()

        if vocab_file is not None:
            sep_token_id = tokenizer.token_to_id(str(sep_token))
            if sep_token_id is None:
                raise TypeError("sep_token not found in the vocabulary")
            cls_token_id = tokenizer.token_to_id(str(cls_token))
            if cls_token_id is None:
                raise TypeError("cls_token not found in the vocabulary")

            tokenizer.post_processor = processors.BertProcessing(
                (str(sep_token), sep_token_id), (str(cls_token), cls_token_id))

        parameters = {
            "model": "WordLevel",
            "unk_token": unk_token,
            "sep_token": sep_token,
            "cls_token": cls_token,
            "pad_token": pad_token,
            "mask_token": mask_token,
            "lowercase": lowercase,
            "unicode_normalizer": unicode_normalizer,
        }

        super().__init__(tokenizer, parameters)
Ejemplo n.º 21
0
def train_custom_tokenizer(dataset,
                           token_model,
                           tknzr_file,
                           vocab_size,
                           vocab=None,
                           pretrain_fast=False,
                           max_input_chars_per_word=None,
                           eos_token=None,
                           bos_token=None,
                           pad_token=None,
                           mask_token=None,
                           unk_token=None):
    """
    Building a Tokenizer using HuggingFace library. The pipeline seems to be:

        - Model           : algorithm that tokenizes, it is a mandatory
                            component. There are only 4 models implemented
                            (BPE, Unigram, WordLevel, WordPiece)
        - Normalizer      : some preprocessing that could happen before, but
                            doesn't necessarily have to
        - Pre-Tokenizer   : splitting the input according to some rules
        - Post-Processing : needing to add some tokens/input after (mostly seems
                            to be eos, bos tokens)
        - Decoder         : certain previous pipeline steps need to be reversed
                            for proper decoding
        - Trainer         : The corresponding training algorithm for the model

    Note : Some pre-processing might need to happen beforehand in previous
            functions (might be easier using pandas before)

    Input
        dataset                  : indexable object with a length; items
                                    dataset[0] .. dataset[len(dataset) - 1]
                                    are fed to the trainer
        token_model (str)        : algorithm to use for tokenization ('BPE',
                                    'Unigram', 'WordLevel' or 'WordPiece')
        tknzr_file (str)         : file the trained tokenizer is saved to.
                                    Will overwrite a previously saved file.
        vocab_size (int or None) : size of the vocabulary to use; when None
                                    the trainer is created without it
        vocab                    : models other than BPE can use non-mandatory
                                    vocab as input
        pretrain_fast (bool)     : reload the saved file as a
                                    PreTrainedTokenizerFast when true,
                                    otherwise as a PreTrainedTokenizer
        max_input_chars_per_word : used for WordPiece
        eos_token, bos_token     : strings placed around every single sequence
                                    by the post-processor (they are
                                    concatenated into its template)
        pad_token, mask_token    : set on the returned tokenizer
        unk_token                : unknown token passed to the BPE, WordLevel
                                    and WordPiece models

    Output
        tokenizer                : the tokenizer reloaded from tknzr_file,
                                    with pad_token and mask_token set

    Raises
        SystemExit               : if token_model is not a supported algorithm
    """
    # Positions in this list are also used as the bos/eos ids of the
    # post-processing template below.
    special_token_lst = [
        pad_token, bos_token, eos_token, mask_token, unk_token
    ]

    # NFKC
    normalizer_lst = []
    pre_tokenizer_lst = [Whitespace, ByteLevel]
    decoder_lst = []

    bos_idx = special_token_lst.index(bos_token)
    eos_idx = special_token_lst.index(eos_token)

    if token_model == 'BPE':
        model = BPE(unk_token=unk_token)
        Trainer = BpeTrainer
    elif token_model == 'Unigram':
        model = Unigram(vocab=vocab)
        Trainer = UnigramTrainer
    elif token_model == 'WordLevel':
        model = WordLevel(unk_token=unk_token, vocab=vocab)
        Trainer = WordLevelTrainer
    elif token_model == 'WordPiece':
        model = WordPiece(unk_token=unk_token,
                          vocab=vocab,
                          max_input_chars_per_word=max_input_chars_per_word)
        Trainer = WordPieceTrainer
    else:
        # A single f-string: the previous "%" formatting would itself fail
        # when VALID_TOKENIZATIONS is a tuple with several entries.
        error_msg = (f'Error: token_model ({token_model}) not an algorithm in '
                     f'{VALID_TOKENIZATIONS}')
        raise SystemExit(error_msg)

    # instantiation
    tokenizer = Tokenizer(model)

    # Select a tokenization trainer
    if vocab_size is None:
        trainer = Trainer(show_progress=True, special_tokens=special_token_lst)
    else:
        trainer = Trainer(vocab_size=vocab_size,
                          show_progress=True,
                          special_tokens=special_token_lst)

    # Set the normalizer
    tokenizer.normalizer = normalizers.Sequence(
        [fcn() for fcn in normalizer_lst])

    # Set the pre-tokenizer
    tokenizer.pre_tokenizer = pre_tokenizers.Sequence(
        [fcn() for fcn in pre_tokenizer_lst])

    # Set the post-processing
    tokenizer.post_processor = processors.TemplateProcessing(
        single=bos_token + " $A " + eos_token,
        special_tokens=[(bos_token, bos_idx), (eos_token, eos_idx)],
        #  pair=bos_token+" $A "+eos_token" $B:1 "+eos_token+":1",
    )

    # Set the decoder; later matches override earlier ones.
    if ByteLevel in pre_tokenizer_lst:
        tokenizer.decoder = decoders.ByteLevel()
    if Metaspace in pre_tokenizer_lst:
        tokenizer.decoder = decoders.Metaspace()
    if token_model == 'WordPiece':
        tokenizer.decoder = decoders.WordPiece()

    # creating iterator
    def batch_iterator():
        for i in np.arange(0, len(dataset)):
            yield dataset[i]

    # train call
    tokenizer.train_from_iterator(trainer=trainer,
                                  iterator=batch_iterator(),
                                  length=len(dataset))

    if Path(tknzr_file).exists():
        # Built from adjacent literals so the message no longer contains the
        # indentation of a backslash-continued string.
        print("Warning : overwriting previously saved tokenizer with "
              f"same filename ( {tknzr_file} ).")
    tokenizer.save(tknzr_file)

    if pretrain_fast:
        tokenizer = PreTrainedTokenizerFast(tokenizer_file=tknzr_file)
    else:
        tokenizer = PreTrainedTokenizer(tokenizer_file=tknzr_file)
    tokenizer.pad_token = pad_token
    tokenizer.mask_token = mask_token

    return tokenizer
Ejemplo n.º 22
0
    pre_tokenizer = Whitespace()
    tokenized_texts = [[w for w, _ in pre_tokenizer.pre_tokenize_str(t)]
                       for t in texts]

    c = Counter()
    for text in tokenized_texts:
        c.update(text)

    token2id = {
        word: i + 1
        for i, (word, count) in enumerate(c.most_common(max_vocab_size))
    }
    # usually, UNK is assigned index 0 or 1
    token2id[unk_token] = 0

    tokenizer = tokenizers.Tokenizer(WordLevel(token2id, unk_token))
    tokenizer.pre_tokenizer = pre_tokenizer
    return tokenizer


def accuracy(probs, targets):
    """Computes accuracy given predicted probabilities and expected labels.

    Args:
        probs: torch.FloatTensor[batch_size, 1], probabilities of a positive class
        targets: torch.LongTensor[batch_size, 1], true classes

    Returns:
        0 <= float <= 1, proportion of correct predictions
    """
    predictions = (probs >= 0.5).flatten()
from tokenizers import Tokenizer
from tokenizers.models import BPE, WordLevel
from tokenizers.processors import BertProcessing, TemplateProcessing
from tokenizers import trainers
from transformers import BertForMaskedLM
from transformers import BertTokenizerFast
from transformers import BertConfig

import ipdb
import os
# Set CUDA_VISIBLE_DEVICES to "2" for this process.
os.environ["CUDA_VISIBLE_DEVICES"] = "2"

uid_task_id_sequence_path = 'data/feature_sequence/uid_task_id.txt'
# All .txt files under data/feature_sequence; not used in the lines below.
paths = [str(x) for x in Path(".").glob('data/feature_sequence/*.txt')]

tokenizer = Tokenizer(WordLevel())
tokenizer.pre_tokenizer = Whitespace()
# NOTE(review): the model is WordLevel but the trainer is a WordPieceTrainer,
# and train() is called as (trainer, files) - both depend on the installed
# tokenizers version; confirm against the version this script targets.
# trainer = trainers.BpeTrainer(
trainer = trainers.WordPieceTrainer(
    special_tokens=["[UNK]", "[CLS]", "[SEP]", "[PAD]", "[MASK]"])
tokenizer.train(trainer, [uid_task_id_sequence_path])
# BERT-style template; the special-token ids are looked up after training.
tokenizer.post_processor = TemplateProcessing(
    single="[CLS] $A [SEP]",
    pair="[CLS] $A [SEP] $B:1 [SEP]:1",
    special_tokens=[
        ("[CLS]", tokenizer.token_to_id("[CLS]")),
        ("[SEP]", tokenizer.token_to_id("[SEP]")),
    ],
)

# tokenizer.save_model("tmp")
Ejemplo n.º 24
0
    def __init__(
        self,
        target_vocab,
    ):
        """Build a WordLevel tokenizer over ``target_vocab`` and save it.

        Id 0 is the padding token, followed by the unknown, classifier,
        separator and mask tokens, followed by the words of ``target_vocab``
        in iteration order. The tokenizer is saved to
        ``PRETRAINED_TOKENIZER_FILE`` after the parent class is initialised.

        Args:
            target_vocab: Iterable of words to add after the special tokens.
        """
        special_tokens = {
            "pad_token": "[PAD]",
            "unk_token": "[UNK]",
            "sep_token": "[SEP]",
            "cls_token": "[CLS]",
            "mask_token": "[MASK]",
        }

        vocab = {special_tokens["pad_token"]: 0}
        next_id = 1
        unused_count = 0

        # Optional BERT-style layout (disabled): pad ids 1..99 and the range
        # after the special tokens up to 999 with "[unused*]" entries, so that
        # 0 is padding, 1xx are the other special tokens and four-digit ids
        # are actual payload.
        fill_tokens = False

        if fill_tokens:
            for _ in range(next_id, 100):
                vocab[f"[unused{unused_count}]"] = next_id
                next_id += 1
                unused_count += 1

        for key in ("unk_token", "cls_token", "sep_token", "mask_token"):
            vocab[special_tokens[key]] = next_id
            next_id += 1

        if fill_tokens:
            for _ in range(next_id, 1000):
                vocab[f"[unused{unused_count}]"] = next_id
                next_id += 1
                unused_count += 1

        # Every word consumes one id, counted from the first free id.
        for word_id, word in enumerate(target_vocab, start=next_id):
            vocab[word] = word_id

        tokenizer = Tokenizer(WordLevel(vocab=vocab, unk_token=special_tokens["unk_token"]))
        tokenizer.add_special_tokens(list(special_tokens.values()))
        tokenizer.pre_tokenizer = pre_tokenizers.WhitespaceSplit()

        sep_text = special_tokens["sep_token"]
        cls_text = special_tokens["cls_token"]
        sep_token_id = tokenizer.token_to_id(sep_text)
        cls_token_id = tokenizer.token_to_id(cls_text)
        tokenizer.post_processor = processors.BertProcessing(
            (sep_text, sep_token_id), (cls_text, cls_token_id)
        )

        # The special-token dict itself is reused as the parameter dict.
        parameters = special_tokens
        parameters["model"] = "WordLevel"

        super().__init__(tokenizer, parameters)

        tokenizer.save(PRETRAINED_TOKENIZER_FILE)
Ejemplo n.º 25
0
    train_csv_df.to_csv(config['train_csv'], index=False, header=True)

    # Labelled test CSV file
    print("Save labelled csv for inference ", config['test_csv'])
    test_csv_df.to_csv(config['test_csv'], index=False, header=True)

print("Setup tokenizers...")

unknown_word = 'unknown_word'
# Union of the count vectorizer's vocabulary keys and the keys of word_list.
full_set = set(list(count_vector.vocabulary_.keys()) + list(word_list.keys()))
#full_set = set(list(count_vector.vocabulary_.keys()))

# +2 accounts for the two extra entries (unknown_word, 'dumb_token') added below.
print("Number of words : (This has to be in config)", len(full_set) + 2)

# unknown_word gets id 0 and 'dumb_token' id 1; the remaining words follow.
# NOTE(review): ids of the remaining words follow the iteration order of a
# set of strings, which can differ between interpreter runs - confirm that
# ids are only ever read back from the saved tokenizer file.
vocab = {
    w: i
    for i, w in enumerate([unknown_word, 'dumb_token'] + list(full_set))
}
tokenizer = tokenizers.Tokenizer(WordLevel(vocab, unknown_word))
tokenizer.pre_tokenizer = Whitespace()

print("Use padding length ", config['padding_length'])
tokenizer.enable_padding(length=int(config['padding_length']))

# Save tokenizer
# With recompute False the in-memory tokenizer built above is used as is;
# nothing is written to or read from config['token_config'].
recompute = False
if recompute:
    print("Save tokenizer ", config['token_config'])
    tokenizer.save(config['token_config'])
    tokenizer = tokenizers.Tokenizer.from_file(config['token_config'])