Example #1
def train_tokenizer(args):
    """[summary]

    Arguments:
        args {[dictionary]} -- [arguments객체]
    """

    # Tokenizer train
    morpheme_func = None

    if args.tokenizer.pretokenizer_type == "khaiii":
        api = KhaiiiApi()
        morpheme_func = api.analyze
    elif args.tokenizer.pretokenizer_type == "mecab":
        mecab = Mecab()
        morpheme_func = mecab.morphs

    # tokenizer-type", type=str, choices=["bbpe", "cbpe", "wp"], default="bbpe"
    if args.tokenizer.tokenizer_type == "bbpe":
        # tokenizer = BytelevelBPETokenizer()
        tokenizer = Tokenizer(BPE())
        # tokenizer.pre_tokenizer = BertPreTokenizer()
        trainer = BpeTrainer(
            special_tokens=omegalist_to_list(args.tokenizer.special_tokens),
            vocab_size=args.tokenizer.vocab_size,
            min_frequency=args.tokenizer.min_frequency,
        )
    elif args.tokenizer.tokenizer_type == "cbpe":
        tokenizer = Tokenizer(BPE())
        tokenizer.pre_tokenizer = CharDelimiterSplit(" ")  # must be instantiated with a delimiter; " " assumed here
        trainer = BpeTrainer(
            special_tokens=omegalist_to_list(args.tokenizer.special_tokens),
            vocab_size=args.tokenizer.vocab_size,
            min_frequency=args.tokenizer.min_frequency,
        )
    elif args.tokenizer.tokenizer_type == "wp":
        tokenizer = Tokenizer(WordPiece())
        # tokenizer.pre_tokenizer = Whitespace
        trainer = WordPieceTrainer(
            special_tokens=omegalist_to_list(args.tokenizer.special_tokens),
            vocab_size=args.tokenizer.vocab_size,
            min_frequency=args.tokenizer.min_frequency,
        )

    tokenizer.train_from_iterator(get_pretokenize_generator(morpheme_func), trainer=trainer)

    tokenizer.save(f"../vocab/{args.tokenizer.tokenizer_type}.vocab")
    test_string = "안녕하세요 이것은 테스트입니다. 구름은 하늘에 떠 있고 우리는 여기있어"
    output = tokenizer.encode(test_string)
    print(f"output:{output}")
    print(f"tokens:{output.tokens}")
    print(f"ids   :{output.ids}")
    print(f"offset:{output.offsets}")
    print(f"decode:{tokenizer.decode(output.ids)}")

    datasets = get_datasets(args.tokenizer.data_path)

    for line in datasets:
        print(line)
        break
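A hypothetical invocation of the example above. The `omegalist_to_list` helper suggests `args` is an OmegaConf config; the field values, the data path, and the helper functions (`get_pretokenize_generator`, `get_datasets`) are assumptions, not part of the original.

from omegaconf import OmegaConf

# assumed config layout, mirroring the attributes accessed in train_tokenizer
args = OmegaConf.create({
    "tokenizer": {
        "tokenizer_type": "bbpe",        # one of "bbpe", "cbpe", "wp"
        "pretokenizer_type": "mecab",    # "khaiii" or "mecab"
        "special_tokens": ["[UNK]", "[CLS]", "[SEP]", "[PAD]", "[MASK]"],
        "vocab_size": 32000,
        "min_frequency": 2,
        "data_path": "../data/corpus.txt",   # placeholder path
    }
})
train_tokenizer(args)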
Example #2
def train_tokenizer(input_dir: str,
                    save_path: str,
                    tokenizer_type: str = "BPE",
                    vocab_size: int = 52000):
    """
    Trains a tokenizer on all the jsonl files in `input_dir` and saves it to `save_path`

    :param input_dir: input directory containing jsonl files
    :param save_path: path to save tokenizer to
    :param tokenizer_type: type of tokenizer to train (only "BPE" is currently implemented)
    :param vocab_size: int, size of tokenizer's vocab
    :return:
    """

    if tokenizer_type == "BPE":
        model = models.BPE()
    else:
        raise NotImplementedError(
            f'Tokenizer type {tokenizer_type} not implemented')
    tokenizer = Tokenizer(model)

    # Customize pre-tokenization and decoding
    tokenizer.pre_tokenizer = pre_tokenizers.ByteLevel(add_prefix_space=True)
    tokenizer.decoder = decoders.ByteLevel()
    tokenizer.post_processor = processors.ByteLevel(trim_offsets=True)
    tokenizer.normalizer = NFKC()

    # And then train
    trainer = trainers.BpeTrainer(
        vocab_size=vocab_size, special_tokens=["<|endoftext|>", "<|padding|>"])
    tokenizer.train_from_iterator(json_iterator(input_dir), trainer)

    # And Save it
    tokenizer.save(save_path, pretty=True)
    print(f'Tokenizer saved at {save_path}')
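A minimal usage sketch, assuming a directory of jsonl files that the project's `json_iterator` can read; the paths below are placeholders.

from tokenizers import Tokenizer

train_tokenizer("data/jsonl/", "tokenizer.json", tokenizer_type="BPE", vocab_size=52000)

# reload the serialized tokenizer and round-trip a string
tok = Tokenizer.from_file("tokenizer.json")
enc = tok.encode("hello world")
print(enc.tokens, enc.ids)
print(tok.decode(enc.ids))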
Example #3
def train():
    """Source: https://huggingface.co/docs/tokenizers/pipeline"""

    base = os.environ['DATA_ROOT']
    corpus_path = base + 'MimicIII/Encounters/Text/'

    bert_tokenizer = Tokenizer(WordPiece(unk_token="[UNK]"))

    # input to tokenizer.encode() goes through this pipeline:
    # normalization, pre-tokenization, model, post-processing
    bert_tokenizer.normalizer = normalizers.Sequence(
        [NFD(), Lowercase(), StripAccents()])
    bert_tokenizer.pre_tokenizer = Whitespace()
    bert_tokenizer.post_processor = TemplateProcessing(
        single="[CLS] $A [SEP]",
        pair="[CLS] $A [SEP] $B:1 [SEP]:1",
        special_tokens=[("[CLS]", 1), ("[SEP]", 2)])

    files = [str(file) for file in Path(corpus_path).glob('*.txt')]
    trainer = WordPieceTrainer(
        vocab_size=30522,
        show_progress=True,
        special_tokens=["[UNK]", "[CLS]", "[SEP]", "[PAD]", "[MASK]"])
    bert_tokenizer.train(files, trainer)

    os.makedirs('./Tokenizer', exist_ok=True)  # avoid failing if the directory already exists
    bert_tokenizer.save("Tokenizer/tokenizer.json")
Example #4
def get_daily_dialog_tokenizer(tokenizer_location=None):
    '''
    Get the daily dialog tokenizer. Trains a new one if no location is provided.
    :param tokenizer_location: path to a serialized tokenizer JSON file; if given, the tokenizer is loaded from it.
    :return: a Tokenizer with padding enabled
    '''
    if tokenizer_location:
        tokenizer = Tokenizer.from_file(tokenizer_location, )
        tokenizer.enable_padding()
        return tokenizer
    else:
        dataset_train = datasets.load_dataset("daily_dialog", split="train", )
        utterances = [special_tokens["sep_token"].join(dialogue["dialog"]) for dialogue in dataset_train]

        trainer = WordPieceTrainer(
            vocab_size = 2048, 
            special_tokens = list(token_utils.special_tokens.values())
        )

        custom_tokenizer = Tokenizer(WordPiece(unk_token=special_tokens["unk_token"], ))
        custom_tokenizer.normalizer = normalizers.Sequence([NFD(), Lowercase(), StripAccents()])
        custom_tokenizer.pre_tokenizer = Whitespace()
        custom_tokenizer.train_from_iterator(utterances, trainer, )
        custom_tokenizer.enable_padding()

        # Write every dialogue to file
        location = './daily_dialog/'
        custom_tokenizer.save(location + "tokenizer.json")

        return custom_tokenizer
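A usage sketch for both branches of the function, assuming the surrounding module's `special_tokens` / `token_utils` are importable; the sample utterances are placeholders.

# first call trains on daily_dialog and writes ./daily_dialog/tokenizer.json
tokenizer = get_daily_dialog_tokenizer()

# later calls can reload the serialized file instead of retraining
tokenizer = get_daily_dialog_tokenizer(tokenizer_location="./daily_dialog/tokenizer.json")

batch = tokenizer.encode_batch(["hello , how are you ?", "fine , thanks ."])
print([e.tokens for e in batch])   # padding is enabled, so lengths match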
Example #5
def generate_tokenizer(equations, output, vocab_size):
    from tokenizers import Tokenizer, pre_tokenizers
    from tokenizers.models import BPE
    from tokenizers.trainers import BpeTrainer
    tokenizer = Tokenizer(BPE())
    tokenizer.pre_tokenizer = pre_tokenizers.ByteLevel(add_prefix_space=False)
    trainer = BpeTrainer(special_tokens=["[PAD]", "[BOS]", "[EOS]"], vocab_size=vocab_size, show_progress=True)
    tokenizer.train(equations, trainer=trainer)
    tokenizer.save(path=output, pretty=False)
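A hypothetical call; `equations` is assumed to be a list of text files with one formula per line, and the file names are placeholders.

generate_tokenizer(["equations.txt"], output="eq_tokenizer.json", vocab_size=8000)

from tokenizers import Tokenizer
tok = Tokenizer.from_file("eq_tokenizer.json")
print(tok.encode(r"\frac{a}{b} + c^2").tokens)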
Example #6
def tokenize(dt, df):
    from tokenizers import Tokenizer
    from tokenizers.models import WordPiece
    from tokenizers.pre_tokenizers import Whitespace
    from tokenizers import normalizers
    from tokenizers.normalizers import NFD, StripAccents
    from tokenizers.processors import TemplateProcessing
    from tokenizers.trainers import WordPieceTrainer

    #print(df.head())
    #print(df.query_text.head())
    #print(df.query_text.to_list())
    #exit(0)
    data_source = get_data_source(dt)
    token_file = Path(data_dir, data_source, 'tokenizer.json')
    vocab_file = Path(data_dir, data_source, 'vocab.txt')
    corpus_file = Path(data_dir, data_source, 'corpus.txt')
    if vocab_file.is_file() and corpus_file.is_file():
        print("corpus and token files already generated")
        return 0

    bert_tokenizer = Tokenizer(WordPiece(unk_token="[UNK]"))
    bert_tokenizer.normalizer = normalizers.Sequence([NFD(), StripAccents()])
    bert_tokenizer.pre_tokenizer = Whitespace()
    bert_tokenizer.post_processor = TemplateProcessing(
        single="[CLS] $A [SEP]",
        pair="[CLS] $A [SEP] $B:1 [SEP]:1",
        special_tokens=[
            ("[CLS]", 1),
            ("[SEP]", 2),
        ],
    )
    trainer = WordPieceTrainer(
        vocab_size=25000,
        min_frequency=3,
        special_tokens=["[UNK]", "[CLS]", "[SEP]", "[PAD]", "[MASK]"])
    #print(df.query_text.to_list())
    bert_tokenizer.train_from_iterator(df.query_text.to_list(), trainer)
    bert_tokenizer.save(str(token_file))
    #bert_tokenizer.save_model(directory=data_dir,name='tokenizer')
    df['range_idx'] = range(0, df.shape[0])
    df['mean_rank_group'] = df.groupby(
        ['session_id'], sort=False)['range_idx'].transform(np.mean)
    df['separate_column'] = df['range_idx'] < df['mean_rank_group']
    df = df.groupby(['session_id', 'separate_column'],
                    as_index=False,
                    sort=False)['query_text'].agg(
                        ' '.join).drop(columns='separate_column')
    #df = df.groupby('session_id').agg({'query_text':' '.join}).reset_index()
    df.query_text.to_csv(corpus_file, header=False, index=False)
    with open(token_file) as token_f:
        jdata = json.load(token_f)
        with open(vocab_file, "w") as fd:
            for k in jdata['model']['vocab'].keys():
                print(k, file=fd)
Example #7
    def test_continuing_prefix_trainer_mistmatch(self):
        UNK = "[UNK]"
        special_tokens = [UNK]
        tokenizer = Tokenizer(models.BPE(unk_token=UNK, continuing_subword_prefix="##"))
        trainer = trainers.BpeTrainer(special_tokens=special_tokens)
        tokenizer.pre_tokenizer = pre_tokenizers.Sequence(
            [pre_tokenizers.Whitespace(), pre_tokenizers.Digits(individual_digits=True)]
        )
        tokenizer.train(files=["data/big.txt"], trainer=trainer)

        tokenizer.save("data/tokenizer.json")

        tokenizer.from_file("data/tokenizer.json")
Example #8
def main(args):
    # from tokenizers import BertWordPieceTokenizer
    from tokenizers import Tokenizer
    from tokenizers.models import WordPiece

    bert_tokenizer = Tokenizer(WordPiece())
    # bert_tokenizer = Tokenizer(MBartTokenizer())

    from tokenizers import normalizers

    from tokenizers.normalizers import Lowercase, NFD, StripAccents

    bert_tokenizer.normalizer = normalizers.Sequence(
        [NFD(), Lowercase(), StripAccents()])

    from tokenizers.pre_tokenizers import Whitespace

    bert_tokenizer.pre_tokenizer = Whitespace()

    # from tokenizers.processors import TemplateProcessing
    #
    # bert_tokenizer.post_processor = TemplateProcessing(
    #     single="[CLS] $A [SEP]",
    #     pair="[CLS] $A [SEP] $B:1 [SEP]:1",
    #     special_tokens=[
    #         ("[CLS]", 1),
    #         ("[SEP]", 2),
    #     ],
    # )

    from tokenizers.trainers import WordPieceTrainer

    trainer = WordPieceTrainer(
        vocab_size=10000,
        special_tokens=["[UNK]", "[CLS]", "[PAD]",
                        "[MASK]"]  # "[SEP]", "[PAD]", "[MASK]"]
    )
    files = glob.glob(args.text_raw_files_pattern)
    bert_tokenizer.train(files, trainer=trainer)

    os.makedirs(args.output_dir, exist_ok=True)
    model_files = bert_tokenizer.model.save(args.output_dir,
                                            "bert-tokenizer-kr")
    bert_tokenizer.model = WordPiece.from_file(*model_files, unk_token="[UNK]")

    bert_tokenizer.save(os.path.join(args.output_dir,
                                     "bert-tokenizer-kr.json"))
def main(args):
    # copy from https://github.com/xinjli/allosaurus
    ipa0 = [
        'I', 'a', 'aː', 'ã', 'ă', 'b', 'bʲ', 'bʲj', 'bʷ', 'bʼ', 'bː', 'b̞',
        'b̤', 'b̥', 'c', 'd', 'dʒ', 'dʲ', 'dː', 'd̚', 'd̥', 'd̪', 'd̯', 'd͡z',
        'd͡ʑ', 'd͡ʒ', 'd͡ʒː', 'd͡ʒ̤', 'e', 'eː', 'e̞', 'f', 'fʲ', 'fʷ', 'fː',
        'g', 'gʲ', 'gʲj', 'gʷ', 'gː', 'h', 'hʷ', 'i', 'ij', 'iː', 'i̞', 'i̥',
        'i̯', 'j', 'k', 'kx', 'kʰ', 'kʲ', 'kʲj', 'kʷ', 'kʷʼ', 'kʼ', 'kː',
        'k̟ʲ', 'k̟̚', 'k͡p̚', 'l', 'lʲ', 'lː', 'l̪', 'm', 'mʲ', 'mʲj', 'mʷ',
        'mː', 'n', 'nj', 'nʲ', 'nː', 'n̪', 'n̺', 'o', 'oː', 'o̞', 'o̥', 'p',
        'pf', 'pʰ', 'pʲ', 'pʲj', 'pʷ', 'pʷʼ', 'pʼ', 'pː', 'p̚', 'q', 'r', 'rː',
        's', 'sʲ', 'sʼ', 'sː', 's̪', 't', 'ts', 'tsʰ', 'tɕ', 'tɕʰ', 'tʂ',
        'tʂʰ', 'tʃ', 'tʰ', 'tʲ', 'tʷʼ', 'tʼ', 'tː', 't̚', 't̪', 't̪ʰ', 't̪̚',
        't͡s', 't͡sʼ', 't͡ɕ', 't͡ɬ', 't͡ʃ', 't͡ʃʲ', 't͡ʃʼ', 't͡ʃː', 'u', 'uə',
        'uː', 'u͡w', 'v', 'vʲ', 'vʷ', 'vː', 'v̞', 'v̞ʲ', 'w', 'x', 'x̟ʲ', 'y',
        'z', 'zj', 'zʲ', 'z̪', 'ä', 'æ', 'ç', 'çj', 'ð', 'ø', 'ŋ', 'ŋ̟', 'ŋ͡m',
        'œ', 'œ̃', 'ɐ', 'ɐ̞', 'ɑ', 'ɑ̱', 'ɒ', 'ɓ', 'ɔ', 'ɔ̃', 'ɕ', 'ɕː', 'ɖ̤',
        'ɗ', 'ə', 'ɛ', 'ɛ̃', 'ɟ', 'ɡ', 'ɡʲ', 'ɡ̤', 'ɡ̥', 'ɣ', 'ɣj', 'ɤ', 'ɤɐ̞',
        'ɤ̆', 'ɥ', 'ɦ', 'ɨ', 'ɪ', 'ɫ', 'ɯ', 'ɯ̟', 'ɯ̥', 'ɰ', 'ɱ', 'ɲ', 'ɳ',
        'ɴ', 'ɵ', 'ɸ', 'ɹ', 'ɹ̩', 'ɻ', 'ɻ̩', 'ɽ', 'ɾ', 'ɾj', 'ɾʲ', 'ɾ̠', 'ʀ',
        'ʁ', 'ʁ̝', 'ʂ', 'ʃ', 'ʃʲː', 'ʃ͡ɣ', 'ʈ', 'ʉ̞', 'ʊ', 'ʋ', 'ʋʲ', 'ʌ', 'ʎ',
        'ʏ', 'ʐ', 'ʑ', 'ʒ', 'ʒ͡ɣ', 'ʔ', 'ʝ', 'ː', 'β', 'β̞', 'θ', 'χ', 'ә', 'ḁ'
    ]
    ipa1, ipa2, ipa3 = ipa0.copy(), ipa0.copy(), ipa0.copy()
    random.shuffle(ipa1)
    random.shuffle(ipa2)
    random.shuffle(ipa3)
    # randomly joined to form training data
    passage0 = ' '.join(ipa0)
    passage1 = ' '.join(ipa1)
    passage2 = ' '.join(ipa2)
    passage3 = ' '.join(ipa3)
    data = [passage0, passage1, passage2, passage3]
    # setup
    tokenizer = Tokenizer(WordLevel(unk_token="<unk>"))
    # trainer = WordLevelTrainer(vocab_size=300, special_tokens=["[UNK]", "[CLS]", "[SEP]", "[PAD]", "[MASK]"])
    trainer = WordLevelTrainer(
        vocab_size=300,
        special_tokens=["<s>", "<pad>", "</s>", "<unk>", "<mask>"])
    tokenizer.pre_tokenizer = Whitespace()
    # train the tokenizer
    tokenizer.train_from_iterator(data, trainer=trainer)
    tokenizer.save(args.outdir + '/ipa_tokenizer.json')
def create_train_bpe_tokenizer(
        bpe_vocab_size,
        asr_text_filepath='asr.txt',
        ttx_text_filepath='ttx.txt',
        save_tokenizer=True,
        tokenizer_filename=".\\data\\tokenizer-test.json"):
    tokenizer = Tokenizer(BPE(unk_token="[UNK]"))
    trainer = BpeTrainer(
        special_tokens=["[UNK]", "[CLS]", "[SEP]", "[PAD]", "[MASK]"],
        vocab_size=bpe_vocab_size)
    tokenizer.pre_tokenizer = Whitespace()
    files = [asr_text_filepath, ttx_text_filepath]
    files = [file for file in files if file]  # Get rid of None's
    tokenizer.train(files, trainer)

    if save_tokenizer:
        tokenizer.save(tokenizer_filename)

    return tokenizer
Example #11
def train_tokenizer(sentences: List[str], serialize_path: str = "", vocab_size: int = 8000) -> Tokenizer:
    bert_tokenizer = Tokenizer(WordPiece(unk_token="[UNK]"))
    bert_tokenizer.normalizer = normalizers.Sequence([NFD(), Lowercase(), StripAccents()])
    bert_tokenizer.pre_tokenizer = Whitespace()
    bert_tokenizer.post_processor = TemplateProcessing(
        single="[CLS] $A [SEP]",
        pair="[CLS] $A [SEP] $B:1 [SEP]:1",
        special_tokens=[
            ("[CLS]", 1),
            ("[SEP]", 2),
        ],
    )
    trainer = WordPieceTrainer(
        vocab_size=vocab_size,
        special_tokens=["[UNK]", "[CLS]", "[SEP]", "[PAD]", "[MASK]"]
    )
    bert_tokenizer.train_from_iterator(sentences, trainer=trainer)
    if serialize_path:
        bert_tokenizer.save(serialize_path)
    return bert_tokenizer
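A small usage sketch with a placeholder corpus, mainly to show what the TemplateProcessing post-processor does to sentence pairs.

sentences = ["the cat sat on the mat", "dogs bark at night"]   # placeholder corpus
tok = train_tokenizer(sentences, serialize_path="bert_like.json", vocab_size=100)

enc = tok.encode("the cat", "dogs bark")
print(enc.tokens)      # [CLS] ... [SEP] ... [SEP]
print(enc.type_ids)    # tokens from the second segment carry type id 1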
Example #12
def train_wordpiece_bert():
    """
    Sample code from: https://huggingface.co/docs/tokenizers/python/latest/pipeline.html
    """
    from tokenizers.models import WordPiece
    bert_tokenizer = Tokenizer(WordPiece(unk_token="[UNK]"))

    from tokenizers import normalizers
    from tokenizers.normalizers import Lowercase, NFD, StripAccents
    bert_tokenizer.normalizer = normalizers.Sequence(
        [NFD(), Lowercase(), StripAccents()])

    from tokenizers.pre_tokenizers import Whitespace
    bert_tokenizer.pre_tokenizer = Whitespace()

    from tokenizers.processors import TemplateProcessing
    bert_tokenizer.post_processor = TemplateProcessing(
        single="[CLS] $A [SEP]",
        pair="[CLS] $A [SEP] $B:1 [SEP]:1",
        special_tokens=[
            ("[CLS]", 1),
            ("[SEP]", 2),
        ],
    )

    from tokenizers import decoders
    bert_tokenizer.decoder = decoders.WordPiece()

    from tokenizers.trainers import WordPieceTrainer
    trainer = WordPieceTrainer(
        vocab_size=30522,
        special_tokens=["[UNK]", "[CLS]", "[SEP]", "[PAD]", "[MASK]"])
    files = [
        DIR_DATA + os.sep + 'wikitext-103' + os.sep + 'wiki.%s.raw' % a
        for a in ["test", "train", "valid"]
    ]
    bert_tokenizer.train(files, trainer)
    bert_tokenizer.save(DIR_TOKENIZERS + os.sep + 'bert_wiki.json')

    return bert_tokenizer
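A round-trip sketch for the tokenizer above; it assumes the wikitext-103 raw files exist under DIR_DATA, and the sample sentence is a placeholder.

tok = train_wordpiece_bert()
enc = tok.encode("Tokenization with WordPiece splits rare words.")
print(enc.tokens)          # sub-word pieces carry the ## continuation prefix
print(tok.decode(enc.ids)) # the WordPiece decoder re-joins them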
Example #13
def build_new_vocab():

    tokenizer = Tokenizer(BPE(unk_token="[UNK]"))

    trainer = BpeTrainer(
        special_tokens=["[UNK]", "[CLS]", "[SEP]", "[PAD]", "[MASK]"])
    tokenizer.pre_tokenizer = Whitespace()

    # files = [f"/daintlab/home/moo/NLU/biobert-pytorch/datasets/QA/BioASQ/BioASQ-{split}-factoid-7b.json" for split in ["train_split", "dev"]]
    files = "/daintlab/home/moo/NLU/biobert-pytorch/datasets/QA/BioASQ/BioASQ-train-factoid-7b.json"

    with open(files) as f:
        file = json.load(f)
    contexts = []
    for question in file['data']:
        for paragraph in question['paragraphs']:
            contexts.append(paragraph['context'])

    tokenizer.train_from_iterator(contexts, trainer)
    additional_vocab = [k for k, v in tokenizer.get_vocab().items()]

    tokenizer.save("tokenizer/tokenizer-bioasq.json")
    return additional_vocab
def main(args):
    if args.do_train:
        # Initialize a tokenizer
        files = get_smi_files(args.training_files)
        print("Training BPE tokenizer using the following files:{}".format(
            files))
        tokenizer = Tokenizer(models.BPE(unk_token="<unk>"))
        tokenizer.enable_padding(pad_id=args.vocab_size + 2,
                                 pad_token="<pad>",
                                 length=args.pad_len)
        tokenizer.enable_truncation(max_length=args.pad_len,
                                    strategy='only_first')
        tokenizer.normalizer = Sequence([NFKC()])
        tokenizer.pre_tokenizer = pre_tokenizers.ByteLevel(
            add_prefix_space=False)
        tokenizer.decoder = decoders.ByteLevel()
        tokenizer.post_processor = processors.ByteLevel(trim_offsets=True)
        # Train the tokenizer
        trainer = trainers.BpeTrainer(show_progress=True,
                                      vocab_size=args.vocab_size,
                                      min_frequency=args.min_frequency)
        tokenizer.train(files, trainer=trainer)
        tokenizer.add_tokens(["<start>", "<end>"])
        tokenizer.save(os.path.join('tokenizers', args.tokenizer_name),
                       pretty=True)
        print("Trained vocab size: {}".format(tokenizer.get_vocab_size()))

    if args.do_test:
        # Test the tokenizer
        tokenizer = Tokenizer.from_file(
            os.path.join('tokenizers', args.tokenizer_name))
        print("Testing with SMILES String: {}".format(args.test_string))
        encoding = tokenizer.encode(args.test_string)
        print("Encoded string: {}".format(encoding.tokens))
        print(encoding.ids)
        decoded = tokenizer.decode(encoding.ids)
        print("Decoded string: {}".format(decoded))
Example #15
from tokenizers import Tokenizer
from tokenizers.models import WordPiece
from tokenizers.trainers import WordPieceTrainer
from tokenizers.pre_tokenizers import Whitespace
from tokenizers.normalizers import Sequence, NFD, Lowercase, Strip


def train(dataset_path,
          output_dir='data/tokenizer/',
          vocab_size=30_000,
          min_frequency=3):

    trainer = WordPieceTrainer(vocab_size=vocab_size,
                               min_frequency=min_frequency,
                               special_tokens=['[UNK]', '[CLS]', '[SEP]', '[PAD]', '[MASK]'])
    tokenizer = Tokenizer(WordPiece())
    tokenizer.pre_tokenizer = Whitespace()
    tokenizer.normalizer = Sequence([NFD(), Lowercase(), Strip()])

    files = [dataset_path]
    tokenizer.train(files, trainer=trainer)

    files = tokenizer.model.save(output_dir)
    tokenizer.model = WordPiece.from_file(*files, unk_token='[UNK]')

    tokenizer.save(f'{output_dir}tokenizer.json')


if __name__ == '__main__':
    fire.Fire(train)
Example #16
from tokenizers import Tokenizer
from tokenizers.models import BPE
from tokenizers.trainers import BpeTrainer
#from tokenizers.pre_tokenizers import Whitespace

tokenizer = Tokenizer(BPE(unk_token="[UNK]"))
trainer = BpeTrainer(
    special_tokens=["[UNK]", "[CLS]", "[SEP]", "[PAD]", "[MASK]"])
#tokenizer.pre_tokenizer = Whitespace()
files = ['./processed/processed_wiki_ko.txt']
tokenizer.train(files, trainer)

tokenizer.save("wiki_tokenizer.json")
    def __init__(
        self,
        target_vocab,
    ):
        special_tokens = {
            "pad_token": "[PAD]",
            "unk_token": "[UNK]",
            "sep_token": "[SEP]",
            "cls_token": "[CLS]",
            "mask_token": "[MASK]",
        }

        vocab = {}
        vocab[special_tokens["pad_token"]] = 0

        tkn_idx = 1
        unused_ctr = 0

        # not sure whether that's relevant, but fill 1..99 and 104..999
        # with unused tokens to keep BERT's tokenizer style
        # as a result, one can easily identify special tokens:
        # 0 is padding
        # 1xx are other special tokens
        # any four-digit tokens are actual payload
        fill_tokens = False

        if fill_tokens:
            while tkn_idx < 100:
                vocab[f"[unused{unused_ctr}]"] = tkn_idx
                tkn_idx += 1
                unused_ctr += 1

        for token in ["unk_token", "cls_token", "sep_token", "mask_token"]:
            vocab[special_tokens[token]] = tkn_idx
            tkn_idx += 1

        if fill_tokens:
            while tkn_idx < 1000:
                vocab[f"[unused{unused_ctr}]"] = tkn_idx
                tkn_idx += 1
                unused_ctr += 1

        for word in target_vocab:
            vocab[word] = tkn_idx
            tkn_idx += 1

        tokenizer = Tokenizer(WordLevel(vocab=vocab, unk_token=special_tokens["unk_token"]))
        tokenizer.add_special_tokens(list(special_tokens.values()))
        tokenizer.pre_tokenizer = pre_tokenizers.WhitespaceSplit()

        sep_token_id = tokenizer.token_to_id(special_tokens["sep_token"])
        cls_token_id = tokenizer.token_to_id(special_tokens["cls_token"])

        tokenizer.post_processor = processors.BertProcessing(
            (special_tokens["sep_token"], sep_token_id), (special_tokens["cls_token"], cls_token_id)
        )

        parameters = special_tokens
        parameters["model"] = "WordLevel"

        super().__init__(tokenizer, parameters)

        tokenizer.save(PRETRAINED_TOKENIZER_FILE)
OUTPUT_PATH = 'data/tokenizers/'

# Prepare data
train = pd.read_parquet(TRAIN_DATA_PATH, columns=['item_name'])
item_names = train.item_name.drop_duplicates().tolist()

# WordPiece tokenizer
tokenizer = Tokenizer(WordPiece(unk_token="[UNK]"))
tokenizer.pre_tokenizer = Sequence([Whitespace(), Digits()])
tokenizer.normalizer = Lowercase()

trainer = WordPieceTrainer(
    special_tokens=["[UNK]", "[CLS]", "[SEP]", "[PAD]", "[MASK]"],
    vocab_size=70000)
tokenizer.train_from_iterator(item_names, trainer)
tokenizer.save(os.path.join(OUTPUT_PATH, 'wordpiece_70k.json'))

# BPE tokenizer
tokenizer = Tokenizer(BPE(unk_token="[UNK]"))
tokenizer.pre_tokenizer = Sequence([Whitespace(), Digits()])
tokenizer.normalizer = Lowercase()

trainer = BpeTrainer(
    special_tokens=["[UNK]", "[CLS]", "[SEP]", "[PAD]", "[MASK]"],
    vocab_size=60000)
tokenizer.train_from_iterator(item_names, trainer)
tokenizer.save(os.path.join(OUTPUT_PATH, 'bpe_60k.json'))

# Unigram tokenizer
tokenizer = Tokenizer(Unigram())
tokenizer.pre_tokenizer = Sequence([Whitespace(), Digits()])
        dp = json.loads(line.strip())
        for d in dp:
            if "value" in d:
                if "," in d["value"]:
                    print('Not cleaned up')

# Extract value/types from trees and store in comma separated raw file (all_raw.json)

with open("output/all_new_trees.json") as fin, open("output/all_raw.json",
                                                    "w") as fout:
    for i, line in enumerate(tqdm(fin)):
        dp = json.loads(line)
        token_list = []
        for d in dp:
            if "value" in d:
                token_list.append(d["value"])
            elif "type" in d:
                token_list.append(d["type"])
        raw = ",".join(token_list)
        print(json.dumps(raw), file=fout)

# Train tokenizer on raw file

tokenizer = Tokenizer(WordPiece(unk_token="[UNK]"))
tokenizer.pre_tokenizer = CharDelimiterSplit(delimiter=",")
trainer = WordPieceTrainer(special_tokens=["[UNK]", "[PAD]"])

tokenizer.train(["output/all_raw.json"], trainer)

tokenizer.save("output/tokenizer.json")
 def test_instantiation_from_tokenizers_json_file(self):
     bert_tokenizer = Tokenizer(WordPiece(unk_token="[UNK]"))
     with tempfile.TemporaryDirectory() as tmpdirname:
         bert_tokenizer.save(os.path.join(tmpdirname, "tokenizer.json"))
         PreTrainedTokenizerFast(
             tokenizer_file=os.path.join(tmpdirname, "tokenizer.json"))
from tokenizers import Tokenizer, normalizers
from tokenizers.models import WordLevel
from tokenizers.pre_tokenizers import Whitespace
from tokenizers.trainers import WordLevelTrainer
from tokenizers.processors import TemplateProcessing

t = Tokenizer(WordLevel(unk_token="[UNK]"))
t.pre_tokenizer = Whitespace()

trainer = WordLevelTrainer(special_tokens=["[UNK]", "[PAD]", "[CLS]", "[SEP]"])
t.post_processor = TemplateProcessing(
    single="[CLS] $A [SEP]",
    # ,
    # pair="[CLS] $A [SEP] $B:1 [SEP]:1",
    special_tokens=[
        ("[CLS]", 2),
        ("[SEP]", 3),
    ])

files = ['tok-train-shuf-tgt.tsv']
t.train(files, trainer)

t.save("code_tokenizer.json")
Example #22
text_tokenizer.load_vocab(vocab_path)

vocab = text_tokenizer.vocab
vocab_count = len(vocab.keys())
vocab.update({'<|endoftext|>': vocab_count})

tokenizer_tmp = Tokenizer(WordLevel(text_tokenizer.vocab))
tokenizer_tmp.pre_tokenizer = CharDelimiterSplit(' ')

tokenizer_tmp.post_processor = BertProcessing(
    ("<|endoftext|>", tokenizer_tmp.token_to_id("<|endoftext|>")),
    ("<|endoftext|>", tokenizer_tmp.token_to_id("<|endoftext|>")),
)

tokenizer_tmp_path.mkdir(parents=True, exist_ok=True)
tokenizer_tmp.save(str(tokenizer_tmp_path / "tokenizer.json"))

# Re-create as GPT2 compatible tokenizer


class GPT2CompatibleTokenizer(PreTrainedTokenizerFast):
    def save_vocabulary(self,
                        save_directory: str,
                        filename_prefix: Optional[str] = None) -> Tuple[str]:
        file = str(tokenizer_path / "tokenizer.json")
        tokenizer.backend_tokenizer.save(file)
        files = [file]
        return tuple(files)

    def save_pretrained(
        self,
# Roberta LM colator with stemmed text

# 2. Create a tokenizer
# load our tokenizer
text_tokenizer = TextTokenizer(dataset_path)
text_tokenizer.load_vocab(dataset_path / 'vocab.json')

# Create transformers compatible tokenizer
tokenizer = Tokenizer(WordLevel(text_tokenizer.vocab))
tokenizer.pre_tokenizer = CharDelimiterSplit(' ')
tokenizer.model.unk_token = '<unk>'

tokenizer_path = dataset_path / 'tokenizer1'
tokenizer_path.mkdir(parents=True, exist_ok=True)
tokenizer.save(str(tokenizer_path / "tokenizer.json"))

# Re-create as roberta compatible tokenizer
tokenizer_path = dataset_path / 'tokenizer1'
print(tokenizer_path)

tokenizer2 = PreTrainedTokenizerFast(tokenizer_file=str(tokenizer_path / "tokenizer.json"))
tokenizer2._tokenizer.post_processor = BertProcessing(
    ("</s>", tokenizer2._tokenizer.token_to_id("</s>")),
    ("<s>", tokenizer2._tokenizer.token_to_id("<s>")),
)
tokenizer2._tokenizer.enable_truncation(max_length=128)  # 512
tokenizer2.mask_token = "<mask>"
tokenizer2.pad_token = "<pad>"

# 3. Train a language model
Example #24
                  for s in g:
                      f.write(s)
                      f.write("\n\n")
          elif args.file_type == 'txt':
              shutil.copyfile(str(arch), str(fp))

  data_files = glob(str(out_path / "*.txt"))
  data_files = random.sample(data_files, int(0.2 * len(data_files)))

  assert len(data_files) > 0, 'No data files found'

  # Initialize a tokenizer
  tokenizer = Tokenizer(models.BPE())

  # Customize pre-tokenization and decoding
  tokenizer.pre_tokenizer = pre_tokenizers.ByteLevel(add_prefix_space=True)
  tokenizer.decoder = decoders.ByteLevel()
  tokenizer.post_processor = processors.ByteLevel(trim_offsets=True)
  tokenizer.normalizer = NFKC()

  # And then train
  trainer = trainers.BpeTrainer(vocab_size=args.vocab_size, min_frequency=2, special_tokens=["<|endoftext|>", "<|padding|>"])
  tokenizer.train(data_files, trainer=trainer)

  # And Save it
  tokenizer_path = out_path / "byte-level-bpe.tokenizer.json"
  tokenizer.save(str(tokenizer_path), pretty=True)

  print(f'tokenizer saved at {str(tokenizer_path)}')
  return tokenizer_path
        # splitting our inputs into words
        tokenizer.pre_tokenizer = Whitespace()

        # instantiate trainer
        trainer = BpeTrainer(
            special_tokens=["[UNK]", "[CLS]", "[SEP]", "[PAD]", "[MASK]"],
            min_frequency=2)

        # get files
        files = [os.path.join(args.data_dir, f"{f}-sentences.txt")]

        # train tokenizer
        tokenizer.train(files=files, trainer=trainer)

        # save tokenizer config file
        tokenizer.save(os.path.join(args.save_dir, f"tokenizer-{f}.json"))

    # load trained tokenizers
    for f in ['ewe-fon', "ewe", "fon"]:
        print(f'Using {f} tokenizer : \n')
        try:
            tokenizer = Tokenizer.from_file(
                os.path.join(args.save_dir, f"tokenizer-{f}.json"))
            output = tokenizer.encode(
                "Gbadanu tɛgbɛ ɔ, Noah tuun ɖɔ e nɔ cɛ emi")
            print(output.tokens)
            print(output.ids)
            print(output.offsets[9])
        except Exception as ex:
            print(ex)
Example #26
def train_custom_tokenizer(dataset,
                           token_model,
                           tknzr_file,
                           vocab_size,
                           vocab=None,
                           pretrain_fast=False,
                           max_input_chars_per_word=None,
                           eos_token=None,
                           bos_token=None,
                           pad_token=None,
                           mask_token=None,
                           unk_token=None):
    """
    Building a Tokenizer using HuggingFace library. The pipeline seems to be:

        - Model           : algorithm that tokenizes, it is a mandatory
                            component. There are only 4 models implemented
                            (BPE, Unigram, WordLevel, WordPiece)
        - Normalizer      : some preprocessing that could happen before, but
                            doesn't necessarily have to
        - Pre-Tokenizer   : splitting the input according to some rules
        - Post-Processing : needing to add some tokens/input after (mostly seems
                            to be eos, bos tokens)
        - Decoder         : certain previous pipeline steps need to be reversed
                            for proper decoding
        - Trainer         : The corresponding training algorithm for the model

    Note : Some pre-processing might need to happen beforehand in previous
            functions (might be easier using pandas before)

    Input
        token_model (str)        : algorithm to use for tokenization
        dataset (class)          : a python iterator that goes through the data
                                    to be used for training
        vocab_size (int)         : size of the vocabulary to use
        tknzr_file (str)         : path to save the trained tokenizer to.
                                    Will overwrite a previously saved file.
        vocab (list of str)      : models other than BPE can use non-mandatory
                                    vocab as input
        max_input_chars_per_word : used for WordPiece

    Output
        tokenizer                : huggingFace Tokenizer object, our fully
                                    trained tokenizer

    """
    special_token_lst = [
        pad_token, bos_token, eos_token, mask_token, unk_token
    ]

    # NFKC
    normalizer_lst = []
    pre_tokenizer_lst = [Whitespace, ByteLevel]
    decoder_lst = []

    bos_idx = special_token_lst.index(bos_token)
    eos_idx = special_token_lst.index(eos_token)

    if token_model == 'BPE':
        model = BPE(unk_token=unk_token)
        Trainer = BpeTrainer
    elif token_model == 'Unigram':
        model = Unigram(vocab=vocab)
        Trainer = UnigramTrainer
    elif token_model == 'WordLevel':
        model = WordLevel(unk_token=unk_token, vocab=vocab)
        Trainer = WordLevelTrainer
    elif token_model == 'WordPiece':
        model = WordPiece(unk_token=unk_token,
                          vocab=vocab,
                          max_input_chars_per_word=max_input_chars_per_word)
        Trainer = WordPieceTrainer
    else:
        error_msg = f'Error: token_model ({token_model}) not an algorithm in {VALID_TOKENIZATIONS}'
        raise SystemExit(error_msg)

    # instantiation
    tokenizer = Tokenizer(model)

    # Select a tokenization trainer
    if vocab_size is None:
        trainer = Trainer(show_progress=True, special_tokens=special_token_lst)
    else:
        trainer = Trainer(vocab_size=vocab_size,
                          show_progress=True,
                          special_tokens=special_token_lst)

    # Set the normalizer
    tokenizer.normalizer = normalizers.Sequence(
        [fcn() for fcn in normalizer_lst])

    # Set the pre-tokenizer
    tokenizer.pre_tokenizer = pre_tokenizers.Sequence(
        [fcn() for fcn in pre_tokenizer_lst])

    # Set the post-processing
    tokenizer.post_processor = processors.TemplateProcessing(
        single=bos_token + " $A " + eos_token,
        special_tokens=[(bos_token, bos_idx), (eos_token, eos_idx)],
        #  pair=bos_token + " $A " + eos_token + " $B:1 " + eos_token + ":1",
    )

    # Set the decoder
    if ByteLevel in pre_tokenizer_lst:
        tokenizer.decoder = decoders.ByteLevel()
    if Metaspace in pre_tokenizer_lst:
        tokenizer.decoder = decoders.Metaspace()
    if token_model == 'WordPiece':
        tokenizer.decoder = decoders.WordPiece()

    # creating iterator
    def batch_iterator():
        for i in np.arange(0, len(dataset)):
            yield dataset[i]

    # train call
    tokenizer.train_from_iterator(trainer=trainer,
                                  iterator=batch_iterator(),
                                  length=len(dataset))

    if Path(tknzr_file).exists():
        print(f"Warning : overwriting previously save tokenizer with\
                        same filename ( {tknzr_file} ).")
    tokenizer.save(tknzr_file)

    if pretrain_fast:
        tokenizer = PreTrainedTokenizerFast(tokenizer_file=tknzr_file)
    else:
        # caution: the slow PreTrainedTokenizer base class cannot be built from a
        # tokenizer.json file; this branch needs a fast tokenizer or a concrete subclass
        tokenizer = PreTrainedTokenizer(tokenizer_file=tknzr_file)
    tokenizer.pad_token = pad_token
    tokenizer.mask_token = mask_token

    return tokenizer
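A hypothetical invocation of train_custom_tokenizer, assuming the module-level imports of the original file; the dataset, token strings, and file name are placeholders.

dataset = ["first training sentence", "second training sentence"]
tok = train_custom_tokenizer(
    dataset,
    token_model="BPE",
    tknzr_file="custom_bpe.json",
    vocab_size=1000,
    pretrain_fast=True,
    eos_token="</s>",
    bos_token="<s>",
    pad_token="<pad>",
    mask_token="<mask>",
    unk_token="<unk>",
)
print(tok("first training sentence"))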
Example #27
# TODO: Use an LSTM to train on sequences, then freeze early layers and add
# classification backend, retrain.

# https://github.com/huggingface/tokenizers/tree/master/bindings/python
# https://github.com/huggingface/tokenizers/tree/master/bindings/python#provided-tokenizers
from tokenizers import CharBPETokenizer

tokenizer = CharBPETokenizer(bert_normalizer=False)
tokenizer.train(['./bar'], vocab_size=1000, min_frequency=2)
# tokenizer.encode(seq).tokens

encoded = tokenizer.encode(seq)
a0 = encoded.ids
a1 = encoded.tokens

tokenizer.save_model('.', 'mytoken3')
# ['./mytoken3-vocab.json', './mytoken3-merges.txt']

from tokenizers import Tokenizer, models, pre_tokenizers, decoders, processors

# Load a BPE Model
vocab = './mytoken3-vocab.json'
merges = './mytoken3-merges.txt'
bpe = CharBPETokenizer(vocab, merges, bert_normalizer=False)  # match the training-time setting

# Encode with the re-loaded tokenizer and compare against the original
encoded = bpe.encode(seq)
b0 = encoded.ids
b1 = encoded.tokens

assert a0 == b0
Example #28
    bert_tokenizer.train_from_iterator(sentences, trainer=trainer)
    if serialize_path:
        bert_tokenizer.save(serialize_path)
    return bert_tokenizer



ids = bert_tokenizer.encode(sentences[10]).ids
bert_tokenizer.decode(ids)


from tokenizers import Tokenizer, models, normalizers, pre_tokenizers, decoders, trainers

tokenizer = Tokenizer(models.Unigram())
tokenizer.normalizer = normalizers.NFKC()
tokenizer.pre_tokenizer = pre_tokenizers.ByteLevel()
tokenizer.decoder = decoders.ByteLevel()

trainer = trainers.UnigramTrainer(
    vocab_size=20000,
    initial_alphabet=pre_tokenizers.ByteLevel.alphabet(),
    special_tokens=["<PAD>", "<BOS>", "<EOS>"],
)

tokenizer.train_from_iterator(sentences, trainer=trainer)
tokenizer.encode(sentences[4]).ids
tokenizer.decode(tokenizer.encode(sentences[4]).ids)
tokenizer.save('bert_out/test2')

# note: save_pretrained() is a transformers method; a plain tokenizers.Tokenizer only provides save()
Example #29
def train_tokenizer_vocab(dataset, style='BPE', force_retrain=True):
    """
    if force_retrain: overwrite the stored tokenizer from tokenizers dir (by retraining)
    else: load the tokenizer if it exists
    """
    assert dataset in VALID_DATASETS
    assert style in VALID_TOKENIZATIONS

    tpath_expected = default_tpath(dataset, style)

    train = True
    if not force_retrain and os.path.isfile(tpath_expected):
        tokenizer = Tokenizer.from_file(tpath_expected)
        train = False
    else:
        print('%s tokenizer file does not exist; training new tokenizer' %
              tpath_expected)

    if train:

        # load data associated with one of the valid datasets (from /data/ directory)
        datafiles = load_dataset(dataset)

        # Steps for each algo (e.g. BPE):
        # - init Tokenizer using algo
        # - specify algo specific trainer
        # - specify any pre-processing of text (will affect decoding)
        #   see: https://huggingface.co/docs/tokenizers/python/latest/components.html#decoders
        # - different training calls if its the arxiv dataset or wikitext
        #   see https://blog.einstein.ai/the-wikitext-long-term-dependency-language-modeling-dataset/

        if style == 'BPE':
            tokenizer = Tokenizer(BPE(unk_token="[UNK]"))
            trainer = BpeTrainer(
                special_tokens=["[UNK]", "[CLS]", "[SEP]", "[PAD]", "[MASK]"])
            tokenizer.pre_tokenizer = ByteLevel()
            if dataset == 'arxiv':
                tokenizer.train_from_iterator(datafiles, trainer=trainer)
            else:
                tokenizer.train(datafiles, trainer=trainer)
            tokenizer.decoder = decoders.ByteLevel()

        else:
            assert style == 'WordLevel'
            tokenizer = Tokenizer(WordLevel(unk_token="[UNK]"))
            trainer = WordLevelTrainer(
                special_tokens=["[UNK]", "[CLS]", "[SEP]", "[PAD]", "[MASK]"])
            tokenizer.pre_tokenizer = Whitespace()
            if dataset == 'arxiv':
                tokenizer.train_from_iterator(datafiles, trainer=trainer)
            else:
                tokenizer.train(datafiles, trainer=trainer)
            tokenizer.decoder = decoders.WordPiece()  # WordPiece decoder works here (adds back spaces)

        # Save to tokenizers directory
        tokenizer.save(tpath_expected)

    # Generate vocab object based on tokenizer.decoder() method
    # ... TODO implement the same vocabulary functionality, or ensure it is present in Tokenizer and then code it elsewhere...
    # Features we need to match:
    #   from torchtext.legacy.vocab import Vocab as RetiredVocab
    #   ntokens = len(vocab.stoi) ---> ntokens = tokenizer.(...)
    #   data = [torch.tensor([vocab[token] for token in tokenizer(item)],
    #                         dtype=torch.long) for item in raw_text_iter]
    #   tokenized_text_ints = torch.tensor([vocab[token] for token in tokenized_text], dtype=torch.long)
    #   running_context_string = ' '.join([vocab.itos[src[k]] for k in range(src.shape[0])])
    #   unk_index = vocab.unk_index
    vocab = None

    return tokenizer, vocab
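A hypothetical usage sketch; it assumes 'wikitext' appears in VALID_DATASETS and that the corresponding data files are present.

tokenizer, _ = train_tokenizer_vocab('wikitext', style='BPE', force_retrain=False)

enc = tokenizer.encode("language models need tokenizers")
print(enc.tokens)
print(tokenizer.decode(enc.ids))   # the ByteLevel decoder restores the spacing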
from tokenizers import Tokenizer
from tokenizers.pre_tokenizers import Whitespace
from tokenizers.models import WordLevel

VOCAB_FILE = "data/tx1_vocab.txt"

with open(VOCAB_FILE, "r") as f:
    words = list(set(f.read().strip().split("\n")))

vocab = {}
for i, word in enumerate(["<pad>", "<unk>"] + words):
    vocab[word] = i

tokenizer = Tokenizer(WordLevel(vocab, unk_token="<unk>"))
tokenizer.enable_padding(pad_token="<pad>")
tokenizer.pre_tokenizer = Whitespace()

tokenizer.save("data/tokenizer-LakhNES-tx1.json")