    def converted(self) -> Tokenizer:
        vocab = self.original_tokenizer.vocab
        tokenizer = Tokenizer(
            WordPiece(vocab, unk_token=str(self.original_tokenizer.unk_token)))

        tokenize_chinese_chars = False
        strip_accents = False
        do_lower_case = False
        if hasattr(self.original_tokenizer, "basic_tokenizer"):
            tokenize_chinese_chars = self.original_tokenizer.basic_tokenizer.tokenize_chinese_chars
            strip_accents = self.original_tokenizer.basic_tokenizer.strip_accents
            do_lower_case = self.original_tokenizer.basic_tokenizer.do_lower_case

        tokenizer.normalizer = normalizers.BertNormalizer(
            clean_text=True,
            handle_chinese_chars=tokenize_chinese_chars,
            strip_accents=strip_accents,
            lowercase=do_lower_case,
        )
        tokenizer.pre_tokenizer = pre_tokenizers.BertPreTokenizer()

        cls = str(self.original_tokenizer.cls_token)
        sep = str(self.original_tokenizer.sep_token)
        cls_token_id = self.original_tokenizer.cls_token_id
        sep_token_id = self.original_tokenizer.sep_token_id

        tokenizer.post_processor = processors.TemplateProcessing(
            single=f"{cls}:2 $A:0 {sep}:0",  # token_type_id is 2 for Funnel transformer
            pair=f"{cls}:2 $A:0 {sep}:0 $B:1 {sep}:1",
            special_tokens=[
                (cls, cls_token_id),
                (sep, sep_token_id),
            ],
        )
        tokenizer.decoder = decoders.WordPiece(prefix="##")

        return tokenizer
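
# A minimal, self-contained sketch (toy vocab, not a real Funnel checkpoint)
# showing what the template above produces: [CLS] gets token_type_id 2 while
# the two sequences get 0 and 1.
from tokenizers import Tokenizer, pre_tokenizers, processors
from tokenizers.models import WordPiece

toy_vocab = {"[UNK]": 0, "[CLS]": 1, "[SEP]": 2, "hello": 3, "world": 4}
demo = Tokenizer(WordPiece(toy_vocab, unk_token="[UNK]"))
demo.pre_tokenizer = pre_tokenizers.BertPreTokenizer()
demo.post_processor = processors.TemplateProcessing(
    single="[CLS]:2 $A:0 [SEP]:0",
    pair="[CLS]:2 $A:0 [SEP]:0 $B:1 [SEP]:1",
    special_tokens=[("[CLS]", 1), ("[SEP]", 2)],
)
enc = demo.encode("hello", "world")
print(enc.tokens)    # ['[CLS]', 'hello', '[SEP]', 'world', '[SEP]']
print(enc.type_ids)  # [2, 0, 0, 1, 1]
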
    def converted(self) -> Tokenizer:
        from .models.roformer.tokenization_utils import JiebaPreTokenizer

        vocab = self.original_tokenizer.vocab
        tokenizer = Tokenizer(
            WordPiece(vocab, unk_token=str(self.original_tokenizer.unk_token)))

        strip_accents = False
        do_lower_case = False
        if hasattr(self.original_tokenizer, "basic_tokenizer"):
            strip_accents = self.original_tokenizer.basic_tokenizer.strip_accents
            do_lower_case = self.original_tokenizer.basic_tokenizer.do_lower_case

        tokenizer.normalizer = normalizers.BertNormalizer(
            clean_text=True,
            handle_chinese_chars=False,
            strip_accents=strip_accents,
            lowercase=do_lower_case,
        )
        tokenizer.pre_tokenizer = pre_tokenizers.PreTokenizer.custom(
            JiebaPreTokenizer(vocab))

        cls = str(self.original_tokenizer.cls_token)
        sep = str(self.original_tokenizer.sep_token)
        cls_token_id = self.original_tokenizer.cls_token_id
        sep_token_id = self.original_tokenizer.sep_token_id

        tokenizer.post_processor = processors.TemplateProcessing(
            single=f"{cls}:0 $A:0 {sep}:0",
            pair=f"{cls}:0 $A:0 {sep}:0 $B:1 {sep}:1",
            special_tokens=[
                (cls, cls_token_id),
                (sep, sep_token_id),
            ],
        )
        tokenizer.decoder = decoders.WordPiece(prefix="##")

        return tokenizer
Example #3
    def train_new_from_iterator(self,
                                text_iterator,
                                s_vocab,
                                new_special_tokens=None,
                                special_tokens_map=None,
                                **kw):
        tokenizer_json = json.loads(self._tokenizer.to_str())
        # Remove added tokens for now (uses IDs of tokens)
        added_tokens = tokenizer_json.pop("added_tokens")
        # Remove post processor for now (uses IDs of tokens)
        post_processor = tokenizer_json.pop("post_processor")

        unk = None
        # Remove vocab
        if tokenizer_json["model"]["type"] == "BPE":
            tokenizer_json["model"]["vocab"] = {}
            tokenizer_json["model"]["merges"] = []
        elif tokenizer_json["model"]["type"] == "Unigram":
            if tokenizer_json["model"]["unk_id"] is not None:
                unk_id = tokenizer_json["model"]["unk_id"]
                unk = tokenizer_json["model"]["vocab"][unk_id][0]
                if special_tokens_map is not None and unk in special_tokens_map:
                    unk = special_tokens_map[unk]
                tokenizer_json["model"]["unk_id"] = 0
                tokenizer_json["model"]["vocab"] = [[unk, 0.0]]
        elif tokenizer_json["model"]["type"] in ["WordLevel", "WordPiece"]:
            tokenizer_json["model"]["vocab"] = {}
        else:
            raise ValueError(
                f"This method does not support this type of tokenizer (found {tokenizer_json['model']['type']}) "
                "only BPE, Unigram, WordLevel and WordPiece.")

        if (special_tokens_map is not None and "unk" in tokenizer_json["model"]
                and tokenizer_json["model"]["unk"] in special_tokens_map):
            tokenizer_json["model"]["unk"] = special_tokens_map[
                tokenizer_json["model"]["unk"]]

        tokenizer = TokenizerFast.from_str(json.dumps(tokenizer_json))

        # Get the special tokens from the current tokenizer if none are specified.
        special_tokens = []
        for added_token in added_tokens:
            special = added_token.pop("special", None)
            _ = added_token.pop("id", None)
            if tokenizer_json["model"]["type"] != "Unigram" and not special:
                continue
            if special_tokens_map is not None and added_token[
                    "content"] in special_tokens_map:
                added_token["content"] = special_tokens_map[
                    added_token["content"]]
            special_tokens.append(AddedToken(**added_token))

        if new_special_tokens is not None:
            special_tokens.extend(new_special_tokens)

        # Trainer needs to know the end of word / continuing subword thingies in BPE
        if (tokenizer_json["model"]["type"] == "BPE"
                and "continuing_subword_prefix" not in kw
                and tokenizer_json["model"]["continuing_subword_prefix"]
                is not None):
            kw["continuing_subword_prefix"] = tokenizer_json["model"][
                "continuing_subword_prefix"]
        if (tokenizer_json["model"]["type"] == "BPE"
                and "end_of_word_suffix" not in kw
                and tokenizer_json["model"]["end_of_word_suffix"] is not None):
            kw["end_of_word_suffix"] = tokenizer_json["model"][
                "end_of_word_suffix"]
        if tokenizer_json["model"]["type"] == "Unigram" and unk is not None:
            kw["unk"] = unk

        trainer_class = MODEL_TO_TRAINER_MAPPING[tokenizer_json["model"]
                                                 ["type"]]
        trainer = trainer_class(s_vocab=s_vocab,
                                special_tokens=special_tokens,
                                **kw)
        tokenizer.train_from_iterator(text_iterator, trainer=trainer)

        if post_processor is not None:
            trained_tokenizer_json = json.loads(tokenizer.to_str())
            # Almost done, we just have to adjust the token IDs in the post processor
            if "special_tokens" in post_processor:
                for key in post_processor["special_tokens"]:
                    tokens = post_processor["special_tokens"][key]["tokens"]
                    if special_tokens_map is not None:
                        tokens = [
                            special_tokens_map.get(token, token)
                            for token in tokens
                        ]
                    post_processor["special_tokens"][key]["tokens"] = tokens
                    post_processor["special_tokens"][key]["ids"] = [
                        tokenizer.token_to_id(token) for token in tokens
                    ]

            for special_token in ["cls", "sep"]:
                if special_token in post_processor:
                    token, _ = post_processor[special_token]
                    if special_tokens_map is not None and token in special_tokens_map:
                        token = special_tokens_map[token]
                    token_id = tokenizer.token_to_id(token)
                    post_processor[special_token] = [token, token_id]

            trained_tokenizer_json["post_processor"] = post_processor
            tokenizer = TokenizerFast.from_str(
                json.dumps(trained_tokenizer_json))

        kw = self.init_kw.copy()
        # Map pad/cls/mask token at the Transformers level
        special_tokens_list = SpecialTokensMixin.SPECIAL_TOKENS_ATTRIBUTES.copy(
        )
        special_tokens_list.remove("additional_special_tokens")
        for token in special_tokens_list:
            # Get the private one to avoid unnecessary warnings.
            if getattr(self, f"_{token}") is not None:
                special_token = getattr(self, token)
                if special_tokens_map is not None and special_token in special_tokens_map:
                    special_token = special_tokens_map[special_token]

                special_token_full = getattr(self, f"_{token}")
                if isinstance(special_token_full, AddedToken):
                    # Create an added token with the same parameters except the content
                    kw[token] = AddedToken(
                        special_token,
                        single_word=special_token_full.single_word,
                        lstrip=special_token_full.lstrip,
                        rstrip=special_token_full.rstrip,
                        normalized=special_token_full.normalized,
                    )
                else:
                    kw[token] = special_token

        additional_special_tokens = self.additional_special_tokens
        if new_special_tokens is not None:
            additional_special_tokens.extend(new_special_tokens)
        if len(additional_special_tokens) > 0:
            kw["additional_special_tokens"] = additional_special_tokens

        return self.__class__(tokenizer_object=tokenizer, **kw)

    def test_from_pretrained(self):
        tokenizer = Tokenizer.from_pretrained("bert-base-cased")
        output = tokenizer.encode("Hey there dear friend!", add_special_tokens=False)
        assert output.tokens == ["Hey", "there", "dear", "friend", "!"]
    def test_post_process(self):
        tokenizer = Tokenizer(BPE())
        tokenizer.add_tokens(["my", "name", "is", "john", "pair"])
        tokenizer.enable_truncation(2)
        tokenizer.enable_padding(length=4)

        encoding = tokenizer.encode("my name is john")
        pair_encoding = tokenizer.encode("pair")

        # Can post process a single encoding
        output = tokenizer.post_process(encoding)
        assert output.tokens == ["my", "name", "[PAD]", "[PAD]"]

        # Can post process a pair of encodings
        output = tokenizer.post_process(encoding, pair_encoding)
        assert output.tokens == ["my", "pair", "[PAD]", "[PAD]"]
    def test_truncation(self):
        tokenizer = Tokenizer(BPE())
        tokenizer.add_tokens(["my", "name", "is", "john", "pair"])
        tokenizer.enable_truncation(2)

        # Can truncate single sequences
        output = tokenizer.encode("my name is john")
        assert output.tokens == ["my", "name"]

        # Can truncate pair sequences as well
        output = tokenizer.encode("my name is john", "pair")
        assert output.tokens == ["my", "pair"]

        # Can get the params and give them to enable_truncation
        trunc = tokenizer.truncation
        tokenizer.enable_truncation(**trunc)

        # Left truncation direction
        tokenizer.enable_truncation(2, direction="left")
        output = tokenizer.encode("my name is john")
        assert output.tokens == ["is", "john"]

        output = tokenizer.encode("my name is john", "pair")
        assert output.tokens == ["john", "pair"]
Example #7
concepts = set([c.lower() for i in data for c in i['concept'].split("|")])
print(f"Unique concepts: {len(concepts)}\n")

# name = mention + concept
names = unique([mentions + list(concepts)], verbose=False) - stop_words

# names = unique([mentions + list(concepts)], verbose=False)
print(f"Unique names: {len(names)}\n")

name_words = {n: " ".join(split_to_words(n)) for n in names}

with open(f"{proc_path}/names.txt", "w") as f:
    f.write("\n".join(list(name_words.values())))
    # f.write("\n".join(words))

tokenizer = Tokenizer(BPE())
tokenizer.normalizer = Sequence([
    # NFKC(),
    Lowercase()
])
tokenizer.pre_tokenizer = ByteLevel()
tokenizer.decoder = ByteLevelDecoder()
trainer = BpeTrainer(vocab_size=int(vocab_size), show_progress=True)
tokenizer.train(trainer, [f"{proc_path}/names.txt"])

print("Trained vocab size: {}".format(tokenizer.get_vocab_size()))

tokenizer.model.save(proc_path)

tokenizer.model = BPE.from_file(f'{proc_path}/vocab.json',
                                f'{proc_path}/merges.txt')
Example #8
def fetch_encoder(config: EncoderConfig):
    if config.is_pretrained:
        return GPT2TokenizerFast.from_pretrained(config.location)

    return Tokenizer.from_file(config.location)
Example #9
    def test_full_serialization_albert(self, albert_base):
        # Check we can read this file.
        # This used to fail because of BufReader that would fail because the
        # file exceeds the buffer capacity
        tokenizer = Tokenizer.from_file(albert_base)
    for line in tqdm(fin):
        dp = json.loads(line.strip())
        for d in dp:
            if "value" in d:
                if "," in d["value"]:
                    print('Not cleaned up')

# Extract value/types from trees and store in comma separated raw file (all_raw.json)

with open("output/all_new_trees.json") as fin, open("output/all_raw.json",
                                                    "w") as fout:
    for i, line in enumerate(tqdm(fin)):
        dp = json.loads(line)
        token_list = []
        for d in dp:
            if "value" in d:
                token_list.append(d["value"])
            elif "type" in d:
                token_list.append(d["type"])
        raw = ",".join(token_list)
        print(json.dumps(raw), file=fout)

# Train tokenizer on raw file

tokenizer = Tokenizer(WordPiece(unk_token="[UNK]"))
tokenizer.pre_tokenizer = CharDelimiterSplit(delimiter=",")
trainer = WordPieceTrainer(special_tokens=["[UNK]", "[PAD]"])

tokenizer.train(["output/all_raw.json"], trainer)

tokenizer.save("output/tokenizer.json")
Example #11
import tensorflow as tf
import gpu_check
from preprocessing_data import create_training_data
from tokenizers import Tokenizer
from model_chatbot import seq2seq
from Hyper_parameter import (VOCAB_SIZE,
                             MAXLEN,
                             EPOCHS,
                             SAVE_AT,
                             LEARNING_RATE,
                             BATCH_SIZE,
                             VERBOSE,
                             LOSS)

tokenizer = Tokenizer()

encoder_input_data, decoder_input_data, decoder_output_data = create_training_data()  # parsing the dataset and creating conversation pairs

encoder_input_data, decoder_input_data, decoder_output_data = tokenizer.tokenize_and_pad_training_data(encoder_input_data,
                                                                                                        decoder_input_data,
                                                                                                        decoder_output_data)  # tokenizing and padding those pairs

tokenizer.save_tokenizer(f'tokenizer-vocab_size-{VOCAB_SIZE}')  # saving tokenizer for layer use

Seq2SeqModel = seq2seq()  # creating the seq2seq model

optimizer = tf.keras.optimizers.Adam(learning_rate=LEARNING_RATE, clipnorm=1.0, clipvalue=0.5)
Seq2SeqModel.compile(optimizer=optimizer, loss=LOSS, metrics=['accuracy'])
Seq2SeqModel.summary()

def train(model, encoder_input_data, decoder_input_data, decoder_output_data, epochs, batch_size, verbose, save_at):
Example #12
def main():
    parser = ArgumentParser()
    parser.add_argument('lang', choices=['nld', 'ita'])
    parser.add_argument('models', nargs='+')
    parser.add_argument('--src', default='small', choices=['full', 'small'])
    parser.add_argument('--file', default='full')
    parser.add_argument('-n', default=5, type=int)
    parser.add_argument('-f', '--force', action='store_true')
    args = parser.parse_args()

    base_path = Path(
        'data') / args.lang / 'evaluation' / 'examples' / args.src / args.file

    src_path = base_path / 'gold.txt'
    if not src_path.exists():
        print(f' > gold path {src_path} does not exist')
        exit(1)

    print(' > loading tokenizer')
    os.environ['TOKENIZERS_PARALLELISM'] = 'false'
    if args.lang == 'ita':
        tokenizer = GPT2TokenizerFast.from_pretrained(
            'LorenzoDeMattei/GePpeTto')
    else:
        tokenizer_path = Path(
            'data') / args.lang / 'vocabularies' / 'tokenizer.json'
        tokenizer = Tokenizer.from_file(str(tokenizer_path))
        args.n += 1

    print(f' > loading examples from {src_path}')
    examples = []
    with open(src_path) as f:
        for line in f:
            token_ids = tokenizer.encode(line.strip())
            if type(token_ids) != list:
                token_ids = [0] + token_ids.ids
            examples.append(token_ids[:args.n])
    print(f' > loaded {len(examples)} examples')

    for model_name in args.models:
        tgt_path = base_path / f'{model_name.replace("/", "_")}.txt'
        if not args.force and tgt_path.exists():
            print(f'{tgt_path} already exists. skipping')
            continue

        model_path = Path('data') / args.lang / 'models' / model_name
        if not model_path.exists():
            model_path = model_name

        print(f' > loading model {model_path}')
        model = GPT2LMHeadModel.from_pretrained(model_path).cuda()
        model.eval()

        print(' > generating endings for examples')
        generated = [
            generate(input_ids, model, tokenizer)
            for input_ids in tqdm(examples, ncols=80)
        ]
        with open(tgt_path, 'w') as f:
            f.writelines(generated)

        print(f'\nsaved to {tgt_path}')

import os

import pandas as pd
from tokenizers import Tokenizer
from tokenizers.models import BPE, WordPiece, Unigram
from tokenizers.normalizers import Lowercase
from tokenizers.pre_tokenizers import Whitespace, Digits, Sequence
from tokenizers.trainers import BpeTrainer, WordPieceTrainer, UnigramTrainer

TRAIN_DATA_PATH = 'data/data_fusion_train.parquet'
OUTPUT_PATH = 'data/tokenizers/'

# Prepare data
train = pd.read_parquet(TRAIN_DATA_PATH, columns=['item_name'])
item_names = train.item_name.drop_duplicates().tolist()

# WordPiece tokenizer
tokenizer = Tokenizer(WordPiece(unk_token="[UNK]"))
tokenizer.pre_tokenizer = Sequence([Whitespace(), Digits()])
tokenizer.normalizer = Lowercase()

trainer = WordPieceTrainer(
    special_tokens=["[UNK]", "[CLS]", "[SEP]", "[PAD]", "[MASK]"],
    vocab_size=70000)
tokenizer.train_from_iterator(item_names, trainer)
tokenizer.save(os.path.join(OUTPUT_PATH, 'wordpiece_70k.json'))

# BPE tokenizer
tokenizer = Tokenizer(BPE(unk_token="[UNK]"))
tokenizer.pre_tokenizer = Sequence([Whitespace(), Digits()])
tokenizer.normalizer = Lowercase()

trainer = BpeTrainer(
Example #14
from tokenizers import Tokenizer
import sys
import pickle
import numpy as np
from build_bpe import cleanup
import os

tokenizer = Tokenizer.from_file("bpe-fi.tokenizer.json")

print(tokenizer)
#dfolder = "../../Data/wiki/fi/"
dfolder = "../../Data/finovels/"
files = os.listdir(dfolder)

print("Read files from", dfolder)
print("...")
#s = open(dpath).read().lower()

lines = []

for dpath in files:
    with open(dfolder + dpath) as f:
        print("File:", dpath)

        for line in f:
            clean_line = cleanup(line)
            lines.append(clean_line)

#print("Encode", s[:100], len(s))
print("ENCODE")
encoded_l = tokenizer.encode_batch(lines)
Example #15
import tensorflow as tf
import numpy as np
from tokenizers import ByteLevelBPETokenizer as Tokenizer
from transformers import RobertaConfig as Config
import re

PATH = 'roberta-base'
MAX_SEQUENCE_LENGTH = 192

TOKENIZER = Tokenizer(vocab_file="roberta/vocab.json",
                      merges_file="roberta/merges.txt",
                      lowercase=True,
                      add_prefix_space=True)


def preprocess(tweet, selected_text, sentiment, training=True):
    """
    Will be used in tf.data.Dataset.from_generator(...)

    """

    # The original strings have been converted to
    # byte strings, so we need to decode them
    tweet = tweet.decode('utf-8')
    selected_text = selected_text.decode('utf-8')
    sentiment = sentiment.decode('utf-8')

    # Clean up the strings a bit
    tweet = " ".join(str(tweet).split())
    selected_text = " ".join(str(selected_text).split())
Example #16
    def train_new_from_iterator(
        self,
        text_iterator,
        vocab_size,
        length=None,
        new_special_tokens=None,
        special_tokens_map=None,
        **kwargs,
    ):
        """
        Trains a tokenizer on a new corpus with the same defaults (in terms of special tokens or tokenization pipeline)
        as the current one.

        Args:
            text_iterator (generator of `List[str]`):
                The training corpus. Should be a generator of batches of texts, for instance a list of lists of texts
                if you have everything in memory.
            vocab_size (`int`):
                The size of the vocabulary you want for your tokenizer.
            length (`int`, *optional*):
                The total number of sequences in the iterator. This is used to provide meaningful progress tracking
            new_special_tokens (list of `str` or `AddedToken`, *optional*):
                A list of new special tokens to add to the tokenizer you are training.
            special_tokens_map (`Dict[str, str]`, *optional*):
                If you want to rename some of the special tokens this tokenizer uses, pass along a mapping old special
                token name to new special token name in this argument.
            kwargs:
                Additional keyword arguments passed along to the trainer from the 🤗 Tokenizers library.

        Returns:
            [`PreTrainedTokenizerFast`]: A new tokenizer of the same type as the original one, trained on
            `text_iterator`.

        """
        tokenizer_json = json.loads(self._tokenizer.to_str())
        # Remove added tokens for now (uses IDs of tokens)
        added_tokens = tokenizer_json.pop("added_tokens")
        # Remove post processor for now (uses IDs of tokens)
        post_processor = tokenizer_json.pop("post_processor")

        unk_token = None
        # Remove vocab
        if tokenizer_json["model"]["type"] == "BPE":
            tokenizer_json["model"]["vocab"] = {}
            tokenizer_json["model"]["merges"] = []
        elif tokenizer_json["model"]["type"] == "Unigram":
            if tokenizer_json["model"]["unk_id"] is not None:
                unk_id = tokenizer_json["model"]["unk_id"]
                unk_token = tokenizer_json["model"]["vocab"][unk_id][0]
                if special_tokens_map is not None and unk_token in special_tokens_map:
                    unk_token = special_tokens_map[unk_token]
                tokenizer_json["model"]["unk_id"] = 0
                tokenizer_json["model"]["vocab"] = [[unk_token, 0.0]]
        elif tokenizer_json["model"]["type"] in ["WordLevel", "WordPiece"]:
            tokenizer_json["model"]["vocab"] = {}
        else:
            raise ValueError(
                f"This method does not support this type of tokenizer (found {tokenizer_json['model']['type']}) "
                "only BPE, Unigram, WordLevel and WordPiece.")

        if (special_tokens_map is not None
                and "unk_token" in tokenizer_json["model"] and
                tokenizer_json["model"]["unk_token"] in special_tokens_map):
            tokenizer_json["model"]["unk_token"] = special_tokens_map[
                tokenizer_json["model"]["unk_token"]]

        tokenizer = TokenizerFast.from_str(json.dumps(tokenizer_json))

        # Get the special tokens from the current tokenizer if none are specified.
        special_tokens = []
        for added_token in added_tokens:
            special = added_token.pop("special", None)
            _ = added_token.pop("id", None)
            if tokenizer_json["model"]["type"] != "Unigram" and not special:
                continue
            if special_tokens_map is not None and added_token[
                    "content"] in special_tokens_map:
                added_token["content"] = special_tokens_map[
                    added_token["content"]]
            special_tokens.append(AddedToken(**added_token))

        if new_special_tokens is not None:
            special_tokens.extend(new_special_tokens)

        # Trainer needs to know the end of word / continuing subword thingies in BPE
        if (tokenizer_json["model"]["type"] == "BPE"
                and "continuing_subword_prefix" not in kwargs
                and tokenizer_json["model"]["continuing_subword_prefix"]
                is not None):
            kwargs["continuing_subword_prefix"] = tokenizer_json["model"][
                "continuing_subword_prefix"]
        if (tokenizer_json["model"]["type"] == "BPE"
                and "end_of_word_suffix" not in kwargs
                and tokenizer_json["model"]["end_of_word_suffix"] is not None):
            kwargs["end_of_word_suffix"] = tokenizer_json["model"][
                "end_of_word_suffix"]
        if tokenizer_json["model"][
                "type"] == "Unigram" and unk_token is not None:
            kwargs["unk_token"] = unk_token

        trainer_class = MODEL_TO_TRAINER_MAPPING[tokenizer_json["model"]
                                                 ["type"]]
        trainer = trainer_class(vocab_size=vocab_size,
                                special_tokens=special_tokens,
                                **kwargs)
        tokenizer.train_from_iterator(text_iterator,
                                      length=length,
                                      trainer=trainer)

        if post_processor is not None:
            trained_tokenizer_json = json.loads(tokenizer.to_str())
            # Almost done, we just have to adjust the token IDs in the post processor
            if "special_tokens" in post_processor:
                for key in post_processor["special_tokens"]:
                    tokens = post_processor["special_tokens"][key]["tokens"]
                    if special_tokens_map is not None:
                        tokens = [
                            special_tokens_map.get(token, token)
                            for token in tokens
                        ]
                    post_processor["special_tokens"][key]["tokens"] = tokens
                    post_processor["special_tokens"][key]["ids"] = [
                        tokenizer.token_to_id(token) for token in tokens
                    ]

            for special_token in ["cls", "sep"]:
                if special_token in post_processor:
                    token, _ = post_processor[special_token]
                    if special_tokens_map is not None and token in special_tokens_map:
                        token = special_tokens_map[token]
                    token_id = tokenizer.token_to_id(token)
                    post_processor[special_token] = [token, token_id]

            trained_tokenizer_json["post_processor"] = post_processor
            tokenizer = TokenizerFast.from_str(
                json.dumps(trained_tokenizer_json))

        kwargs = self.init_kwargs.copy()
        # Map pad/cls/mask token at the Transformers level
        special_tokens_list = SpecialTokensMixin.SPECIAL_TOKENS_ATTRIBUTES.copy(
        )
        special_tokens_list.remove("additional_special_tokens")
        for token in special_tokens_list:
            # Get the private one to avoid unnecessary warnings.
            if getattr(self, f"_{token}") is not None:
                special_token = getattr(self, token)
                if special_tokens_map is not None and special_token in special_tokens_map:
                    special_token = special_tokens_map[special_token]

                special_token_full = getattr(self, f"_{token}")
                if isinstance(special_token_full, AddedToken):
                    # Create an added token with the same parameters except the content
                    kwargs[token] = AddedToken(
                        special_token,
                        single_word=special_token_full.single_word,
                        lstrip=special_token_full.lstrip,
                        rstrip=special_token_full.rstrip,
                        normalized=special_token_full.normalized,
                    )
                else:
                    kwargs[token] = special_token

        additional_special_tokens = self.additional_special_tokens
        if new_special_tokens is not None:
            additional_special_tokens.extend(new_special_tokens)
        if len(additional_special_tokens) > 0:
            kwargs["additional_special_tokens"] = additional_special_tokens

        return self.__class__(tokenizer_object=tokenizer, **kwargs)
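
# A hypothetical usage sketch for the method above: retrain an existing fast
# tokenizer on a new corpus while keeping its pipeline. The checkpoint name
# and the tiny corpus below are placeholders only.
from transformers import AutoTokenizer

old_tok = AutoTokenizer.from_pretrained("bert-base-uncased")
corpus = iter([["some text", "more text"], ["another small batch of text"]])
new_tok = old_tok.train_new_from_iterator(corpus, vocab_size=8000)
new_tok.save_pretrained("retrained-tokenizer")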
def preprocess_data(args):

    label_counter = Counter([])
    examples_per_file = Counter()

    print("Reading all files for labels.")
    for input_file in args.input_files:
        with xopen(input_file, "rt") as f:
            for example, labels in input_readers[args.task](f):
                examples_per_file[input_file] += 1
                label_counter.update(labels)

    if args.top_n_labels > 0:
        mlb_full = MultiLabelBinarizer(sparse_output=True)
        mlb_full = mlb_full.fit(label_counter.keys())
        label_counter = dict(label_counter.most_common(args.top_n_labels))

    mlb = MultiLabelBinarizer(sparse_output=True)
    # Passing a list in a list because that's what the function wants.
    mlb = mlb.fit([[pair for pair in label_counter]])

    # Save list of partial -> full mapping if doing top N labels.
    if args.top_n_labels > 0:

        label_mapping = np.where(np.in1d(mlb_full.classes_,
                                         mlb.classes_))[0].tolist()

        with xopen(args.label_mapping, "wt") as f:
            f.write(json.dumps(label_mapping))

        # Also save the full labels.
        with xopen(args.full_labels, "wt") as f:
            f.write(json.dumps(list(mlb_full.classes_)))

    # Save list of labels.
    with xopen(args.labels_out, "wt") as f:
        f.write(json.dumps(list(mlb.classes_)))

    # Set parallel tokenization thread count.
    os.environ["RAYON_NUM_THREADS"] = str(args.processes)

    from tokenizers import Tokenizer, decoders, trainers
    from tokenizers.models import WordPiece
    from tokenizers.normalizers import BertNormalizer
    from tokenizers.pre_tokenizers import BertPreTokenizer
    from tokenizers.processors import BertProcessing

    if args.task == 'cafa':
        # Define our custom tokenizer.
        # It is exactly the same as the default BERT tokenizer, except for max_input_chars_per_word
        # being 20000 instead of 100. This tokenizer is very slow on the long protein sequences.
        tokenizer = WordPiece.from_files(args.vocab,
                                         unk_token="[UNK]",
                                         max_input_chars_per_word=20000)
        tokenizer = Tokenizer(tokenizer)
        tokenizer.add_special_tokens(["[UNK]", "[SEP]", "[CLS]"])
        tokenizer.normalizer = BertNormalizer(lowercase=args.do_lower_case)
        tokenizer.pre_tokenizer = BertPreTokenizer()
        tokenizer.post_processor = BertProcessing(
            ("[SEP]", tokenizer.token_to_id("[SEP]")),
            ("[CLS]", tokenizer.token_to_id("[CLS]")))
        tokenizer.decoder = decoders.WordPiece(prefix='##')
    else:
        tokenizer = BertWordPieceTokenizer(args.vocab,
                                           lowercase=args.do_lower_case)

    tokenizer.enable_padding(max_length=args.seq_len)
    tokenizer.enable_truncation(max_length=args.seq_len)

    for input_file in args.input_files:
        with xopen(input_file, 'rt') as in_f:

            file_name = generate_out_filename(input_file, args)

            with xopen(file_name, "wt") as out_f:
                print("Processing to: ", file_name)

                # Write the shape as the first row, useful for the finetuning.
                out_f.write(
                    json.dumps((examples_per_file[input_file],
                                len(label_counter))) + '\n')

                batch_size = min(examples_per_file[input_file],
                                 args.processes * 100)
                example_batch = []
                labels_batch = []

                with ParallelGenerator(input_readers[args.task](in_f),
                                       max_lookahead=batch_size) as g:
                    for example, labels in g:

                        example_batch.append(example)
                        labels_batch.append(labels)

                        if len(example_batch) == batch_size:
                            example_batch = tokenizer.encode_batch(
                                example_batch)
                            labels_batch = mlb.transform(labels_batch)

                            for example, labels in zip(example_batch,
                                                       labels_batch):
                                # Convert sparse arrays to python lists for json dumping.
                                # print(labels);input()
                                labels = labels.nonzero()[1].tolist()
                                out_f.write(
                                    json.dumps([example.ids, labels]) + '\n')

                            example_batch = []
                            labels_batch = []

                    # Write out whatever is left in the last smaller batch.
                    example_batch = tokenizer.encode_batch(example_batch)
                    labels_batch = mlb.transform(labels_batch)

                    for example, labels in zip(example_batch, labels_batch):
                        # Convert sparse arrays to python lists for json dumping.
                        # print(labels);input()
                        labels = labels.nonzero()[1].tolist()
                        out_f.write(json.dumps([example.ids, labels]) + '\n')
Example #18
        return torch.tensor(self.examples[i])


configuration = BertConfig()
model = BertModel(configuration)
configuration = model.config

#tokenizer = Tokenizer(BPE(unk_token="[UNK]"))
#trainer = BpeTrainer(special_tokens=["[UNK]", "[CLS]", "[SEP]", "[PAD]", "[MASK]"])
#tokenizer.pre_tokenizer = Whitespace()
#files = ['./processed_wiki_ko.txt']
#tokenizer.train(files=files, trainer=trainer)

#tokenizer = Tokenizer.from_file("./wiki_tokenizer.json")
#fast_tokenizer = PreTrainedTokenizerFast(tokenizer_file="wiki_tokenizer.json")
tokenizer = Tokenizer.from_file("./wiki_tokenizer.json")
tokenizer.enable_truncation(max_length=512)

#tokenizer._tokenizer.post_processor = BertProcessing(
#        single="[CLS] $A [SEP]",
#        pair="[CLS] $A [SEP] $B:1 [SEP]:1",
#        special_tokens=[
#             ("[CLS]", tokenizer.token_to_id("[CLS]")),
#             ("[SEP]", tokenizer.token_to_id("[SEP]")),
#        ],
#)

tokenizer.post_processor = BertProcessing(sep=("[SEP]",
                                               tokenizer.token_to_id("[SEP]")),
                                          cls=("[CLS]",
                                               tokenizer.token_to_id("[CLS]")))
    "CHEF_CHECK": 6,
    "CHEF_DO": 7,
    "MOVE_CONTENTS": 8,
}
k = len(output_vocab)
with open("../data/res2idx.json", 'r') as f:
    for w, i in json.load(f).items():
        output_vocab[w] = k
        k += 1
with open("../data/arg2idx.json", 'r') as f:
    for w, i in json.load(f).items():
        output_vocab[w.replace('-', '_')] = k
        k += 1

output_vocab = {w: i for i, w in enumerate(output_vocab)}
output_tokenizer = Tokenizer(WordLevel(output_vocab))
output_tokenizer.pre_tokenizer = Whitespace()

t = output_tokenizer.encode_batch(
    ["SERVE MOVE_CONTENTS", "SERVE MOVE_CONTENTS PUT"])
# print (t)

csv_file = '../data/seq2seq_4335716.csv'
input_tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')
input_tokenizer.bos_token = input_tokenizer.cls_token
input_tokenizer.eos_token = input_tokenizer.sep_token

val_data = load_dataset('csv', data_files=csv_file, split='train[90%:]')
train_data = load_dataset('csv', data_files=csv_file, split='train[:90%]')
# print(val_data)
# print(train_data)
Example #20
def main():
    batch_size = 4
    vocab_size = 16384
    max_source_length = 1024
    max_target_length = 1024
    num_workers = 3

    dataset = nlp.load_dataset("iwslt2017.py", "nl-en")

    # Train tokenizer
    tokenizer_filename = "tokenizer.json"
    if os.path.exists(tokenizer_filename):
        tokenizer = Tokenizer.from_file(tokenizer_filename)
    else:
        data_filename = "whole_data.txt"
        with open(data_filename, "w") as f:
            for item in dataset["train"]:
                f.write(item["source"] + "\n")
                f.write(item["target"] + "\n\n")

        tokenizer = CharBPETokenizer()
        tokenizer.train([data_filename], vocab_size=vocab_size)
        pad_token = AddedToken("[PAD]", lstrip=False, rstrip=False)
        tokenizer.add_tokens([pad_token])
        tokenizer.save(tokenizer_filename)

    tokenizer.pad_token_id = vocab_size

    # Loaders
    train_dataset = Seq2SeqDataset(tokenizer, dataset["train"],
                                   max_source_length, max_target_length)
    val_dataset = Seq2SeqDataset(tokenizer, dataset["validation"],
                                 max_source_length, max_target_length)
    train_loader = DataLoader(
        train_dataset,
        batch_size=batch_size,
        shuffle=True,
        collate_fn=train_dataset.collate_fn,
        num_workers=num_workers,
    )
    val_loader = DataLoader(
        val_dataset,
        batch_size=batch_size,
        collate_fn=val_dataset.collate_fn,
        num_workers=num_workers,
    )

    # Train model
    config = BartConfig(
        vocab_size=vocab_size + 1,  # Pad
        d_model=1024,
        encoder_ffn_dim=1024,
        encoder_layers=6,
        encoder_attention_heads=4,
        decoder_ffn_dim=1024,
        decoder_layers=6,
        decoder_attention_heads=4,
    )
    model = BartForConditionalGeneration(config)
    translator = Translate(model, tokenizer)

    trainer = pl.Trainer(gpus=1)
    trainer.fit(translator, train_loader, val_loader)
Example #21
import json

data_path = Path('/workspace/poetry2021.gt/data/pan_tadeusz5')
dataset_path = data_path / 'dataset'
vocab_path = data_path / 'vocab.json'
tokenizer_tmp_path = data_path / 'tokenizer_tmp'
tokenizer_path = data_path / 'tokenizer'

text_tokenizer = TextTokenizer(dataset_path)
text_tokenizer.load_vocab(vocab_path)

vocab = text_tokenizer.vocab
vocab_count = len(vocab.keys())
vocab.update({'<|endoftext|>': vocab_count})

tokenizer_tmp = Tokenizer(WordLevel(text_tokenizer.vocab))
tokenizer_tmp.pre_tokenizer = CharDelimiterSplit(' ')

tokenizer_tmp.post_processor = BertProcessing(
    ("<|endoftext|>", tokenizer_tmp.token_to_id("<|endoftext|>")),
    ("<|endoftext|>", tokenizer_tmp.token_to_id("<|endoftext|>")),
)

tokenizer_tmp_path.mkdir(parents=True, exist_ok=True)
tokenizer_tmp.save(str(tokenizer_tmp_path / "tokenizer.json"))

# Re-create as GPT2 compatible tokenizer


class GPT2CompatibleTokenizer(PreTrainedTokenizerFast):
    def save_vocabulary(self,
Example #22
from tokenizers import Tokenizer
from tokenizers.models import BPE
from tokenizers.trainers import BpeTrainer
#from tokenizers.pre_tokenizers import Whitespace

tokenizer = Tokenizer(BPE(unk_token="[UNK]"))
trainer = BpeTrainer(
    special_tokens=["[UNK]", "[CLS]", "[SEP]", "[PAD]", "[MASK]"])
#tokenizer.pre_tokenizer = Whitespace()
files = ['./processed/processed_wiki_ko.txt']
tokenizer.train(files, trainer)

tokenizer.save("wiki_tokenizer.json")
    def test_padding(self):
        tokenizer = Tokenizer(BPE())
        tokenizer.add_tokens(["my", "name", "is", "john", "pair"])

        # By default it does nothing when encoding single sequence
        tokenizer.enable_padding()
        output = tokenizer.encode("my name")
        assert output.tokens == ["my", "name"]

        # Can pad to the longest in a batch
        output = tokenizer.encode_batch(["my name", "my name is john"])
        assert all([len(encoding) == 4 for encoding in output])

        # Can pad to the specified length otherwise
        tokenizer.enable_padding(length=4)
        output = tokenizer.encode("my name")
        assert output.tokens == ["my", "name", "[PAD]", "[PAD]"]
        output = tokenizer.encode("my name", "pair")
        assert output.tokens == ["my", "name", "pair", "[PAD]"]

        # Can get the params and give them to enable_padding
        padding = tokenizer.padding
        tokenizer.enable_padding(**padding)
Example #24
    def __init__(
        self,
        vocab: Optional[Union[str, Dict[str, int]]] = None,
        unk_token: Union[str, AddedToken] = "[UNK]",
        sep_token: Union[str, AddedToken] = "[SEP]",
        cls_token: Union[str, AddedToken] = "[CLS]",
        pad_token: Union[str, AddedToken] = "[PAD]",
        mask_token: Union[str, AddedToken] = "[MASK]",
        clean_text: bool = True,
        handle_chinese_chars: bool = True,
        strip_accents: Optional[bool] = None,
        lowercase: bool = True,
        wordpieces_prefix: str = "##",
    ):

        if vocab is not None:
            tokenizer = Tokenizer(WordPiece(vocab, unk_token=str(unk_token)))
        else:
            tokenizer = Tokenizer(WordPiece(unk_token=str(unk_token)))

        # Let the tokenizer know about special tokens if they are part of the vocab
        if tokenizer.token_to_id(str(unk_token)) is not None:
            tokenizer.add_special_tokens([str(unk_token)])
        if tokenizer.token_to_id(str(sep_token)) is not None:
            tokenizer.add_special_tokens([str(sep_token)])
        if tokenizer.token_to_id(str(cls_token)) is not None:
            tokenizer.add_special_tokens([str(cls_token)])
        if tokenizer.token_to_id(str(pad_token)) is not None:
            tokenizer.add_special_tokens([str(pad_token)])
        if tokenizer.token_to_id(str(mask_token)) is not None:
            tokenizer.add_special_tokens([str(mask_token)])

        tokenizer.normalizer = BertNormalizer(
            clean_text=clean_text,
            handle_chinese_chars=handle_chinese_chars,
            strip_accents=strip_accents,
            lowercase=lowercase,
        )
        tokenizer.pre_tokenizer = BertPreTokenizer()

        if vocab is not None:
            sep_token_id = tokenizer.token_to_id(str(sep_token))
            if sep_token_id is None:
                raise TypeError("sep_token not found in the vocabulary")
            cls_token_id = tokenizer.token_to_id(str(cls_token))
            if cls_token_id is None:
                raise TypeError("cls_token not found in the vocabulary")

            tokenizer.post_processor = BertProcessing(
                (str(sep_token), sep_token_id), (str(cls_token), cls_token_id))
        tokenizer.decoder = decoders.WordPiece(prefix=wordpieces_prefix)

        parameters = {
            "model": "BertWordPiece",
            "unk_token": unk_token,
            "sep_token": sep_token,
            "cls_token": cls_token,
            "pad_token": pad_token,
            "mask_token": mask_token,
            "clean_text": clean_text,
            "handle_chinese_chars": handle_chinese_chars,
            "strip_accents": strip_accents,
            "lowercase": lowercase,
            "wordpieces_prefix": wordpieces_prefix,
        }

        super().__init__(tokenizer, parameters)
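
# A brief usage sketch of what appears to be the library's
# tokenizers.implementations.BertWordPieceTokenizer, using a toy in-memory
# vocab; a real setup would pass a full BERT vocab file or dict.
from tokenizers.implementations import BertWordPieceTokenizer

toy_vocab = {"[UNK]": 0, "[CLS]": 1, "[SEP]": 2, "[PAD]": 3, "[MASK]": 4,
             "hello": 5, "!": 6}
bert_tok = BertWordPieceTokenizer(toy_vocab)
print(bert_tok.encode("Hello!").tokens)  # ['[CLS]', 'hello', '!', '[SEP]']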

    def test_multiprocessing_with_parallelism(self):
        tokenizer = Tokenizer(BPE())
        multiprocessing_with_parallelism(tokenizer, False)
        multiprocessing_with_parallelism(tokenizer, True)
Example #26
def train_tokenizer(args):
    """[summary]

    Arguments:
        args {[dictionary]} -- [arguments객체]
    """

    # Tokenizer train
    morpheme_func = None

    if args.tokenizer.pretokenizer_type == "khaiii":
        api = KhaiiiApi()
        morpheme_func = api.analyze
    elif args.tokenizer.pretokenizer_type == "mecab":
        mecab = Mecab()
        morpheme_func = mecab.morphs

    # tokenizer-type", type=str, choices=["bbpe", "cbpe", "wp"], default="bbpe"
    if args.tokenizer.tokenizer_type == "bbpe":
        # tokenizer = BytelevelBPETokenizer()
        tokenizer = Tokenizer(BPE())
        # tokenizer.pre_tokenizer = BertPreTokenizer()
        trainer = BpeTrainer(
            special_tokens=omegalist_to_list(args.tokenizer.special_tokens),
            vocab_size=args.tokenizer.vocab_size,
            min_frequency=args.tokenizer.min_frequency,
        )
    elif args.tokenizer.tokenizer_type == "cbpe":
        tokenizer = Tokenizer(BPE())
        tokenizer.pre_tokenizer = CharDelimiterSplit(" ")  # must be an instance; space delimiter is an assumption
        trainer = BpeTrainer(
            special_tokens=omegalist_to_list(args.tokenizer.special_tokens),
            vocab_size=args.tokenizer.vocab_size,
            min_frequency=args.tokenizer.min_frequency,
        )
    elif args.tokenizer.tokenizer_type == "wp":
        tokenizer = Tokenizer(WordPiece())
        # tokenizer.pre_tokenizer = Whitespace
        trainer = WordPieceTrainer(
            special_tokens=omegalist_to_list(args.tokenizer.special_tokens),
            vocab_size=args.tokenizer.vocab_size,
            min_frequency=args.tokenizer.min_frequency,
        )

    tokenizer.train_from_iterator(get_pretokenize_generator(morpheme_func),
                                  trainer=trainer)

    tokenizer.save(f"../vocab/{args.tokenizer.tokenizer_type}.vocab")
    test_string = "안녕하세요 이것은 테스트입니다. 구름은 하늘에 떠 있고 우리는 여기있어"
    output = tokenizer.encode(test_string)
    print(f"output:{output}")
    print(f"tokens:{output.tokens}")
    print(f"ids   :{output.ids}")
    print(f"offset:{output.offsets}")
    print(f"decode:{tokenizer.decode(output.ids)}")

    datasets = get_datasets(args.tokenizer.data_path)

    for line in datasets:
        print(line)
        break
Example #27
In the face of ambiguity, refuse the temptation to guess.
There should be one-- and preferably only one --obvious way to do it.
Although that way may not be obvious at first unless you're Dutch.
Now is better than never.
Although never is often better than *right* now.
If the implementation is hard to explain, it's a bad idea.
If the implementation is easy to explain, it may be a good idea.
Namespaces are one honking great idea -- let's do more of those!
""".split("\n")

if args.type == "gpt2":
    print("Running GPT-2 tokenizer")
    tok_p = GPT2Tokenizer.from_pretrained('gpt2')

    # Create a Tokenizer using BPE
    tok_r = Tokenizer(BPE.from_files(args.vocab, args.merges))
    # Use ByteLevel PreTokenizer
    tok_r.pre_tokenizer = pre_tokenizers.ByteLevel(add_prefix_space=False)
    # Use ByteLevel Decoder
    tok_r.decoder = decoders.ByteLevel()
elif args.type == "bert":
    print("Running Bert tokenizer")
    tok_p = BertTokenizer.from_pretrained(args.vocab)

    tok_r = Tokenizer(
        WordPiece.from_files(args.vocab,
                             unk_token="[UNK]",
                             max_input_chars_per_word=100))
    tok_r.normalizer = BertNormalizer(
        clean_text=True,
        handle_chinese_chars=True,
Example #28
from tokenizers import Tokenizer
from tokenizers.decoders import ByteLevel as ByteLevelDecoder
from tokenizers.models import BPE
from tokenizers.normalizers import Lowercase, NFKC, Sequence
from tokenizers.pre_tokenizers import ByteLevel
from tokenizers.trainers import BpeTrainer

path_data = "../../ml-datasets/wmt14/tokenizer/"

path_train_src = "../../ml-datasets/wmt14/train.en"
path_train_tgt = "../../ml-datasets/wmt14/train.de"

tokenizer = Tokenizer(BPE())
tokenizer.normalizer = Sequence([
    NFKC(),
    Lowercase()
])

tokenizer.pre_tokenizer = ByteLevel()
tokenizer.decoder = ByteLevelDecoder()

trainer = BpeTrainer(vocab_size=25000, show_progress=True, initial_alphabet=ByteLevel.alphabet(),
                     min_frequency=2, special_tokens=["<pad>", "<s>", "</s>", "<unk>", "<mask>", ])
tokenizer.train(trainer, [path_train_src, path_train_tgt])

print("Trained vocab size: {}".format(tokenizer.get_vocab_size()))

tokenizer.model.save(path_data)
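
# A follow-up sketch (assuming vocab.json / merges.txt were written by
# tokenizer.model.save above) that reloads the trained model into a fresh
# Tokenizer:
reloaded = Tokenizer(BPE.from_file(path_data + "vocab.json",
                                   path_data + "merges.txt"))
reloaded.pre_tokenizer = ByteLevel()
reloaded.decoder = ByteLevelDecoder()
print(reloaded.encode("ein kleiner test").tokens)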

class CustomNormalizer:
    def normalize(self, normalized: NormalizedString):
        # Most of these can be replaced by a `Sequence` combining some of the provided Normalizers,
        # (i.e. Sequence([ NFKC(), Replace(Regex(r"\s+"), " "), Lowercase() ]))
        # and it should be the preferred way. That being said, here is an example of the kind
        # of things that can be done here:
        normalized.nfkc()
        normalized.filter(lambda char: not char.isnumeric())
        normalized.replace(Regex("\s+"), " ")
        normalized.lowercase()


# This section shows how to attach these custom components to the Tokenizer
tok = Tokenizer(BPE())
tok.normalizer = Normalizer.custom(CustomNormalizer())
tok.pre_tokenizer = PreTokenizer.custom(JiebaPreTokenizer())
tok.decoder = Decoder.custom(CustomDecoder())

input = "永和服装饰品有限公司"
print("PreTokenize:", input)
print(tok.pre_tokenizer.pre_tokenize_str(input))
# [('永和', (0, 2)), ('服装', (2, 4)), ('饰品', (4, 6)), ('有限公司', (6, 10))]

input = "112233"
print("PreTokenize:", input)
print(tok.pre_tokenizer.pre_tokenize_str(input))
# [('1', (0, 1)), ('122', (1, 4)), ('3', (4, 5)), ('3', (5, 6))]

input = "1234 ℌ𝔢𝔩𝔩𝔬    𝔱𝔥𝔢𝔯𝔢 𝓂𝓎 𝒹ℯ𝒶𝓇 𝕕𝕖𝕒𝕣    𝕗𝕣𝕚𝕖𝕟𝕕!"
Example #30
                    type=str,
                    help="Path to the output directory, where the files will be saved")
parser.add_argument("--name",
                    default="bpe-bytelevel",
                    type=str,
                    help="The name of the output vocab files")
args = parser.parse_args()

files = glob.glob(args.files)
if not files:
    print(f"File does not exist: {args.files}")
    exit(1)


# Initialize an empty tokenizer
tokenizer = Tokenizer(models.BPE.empty())

# Customize pre-tokenization and decoding
tokenizer.pre_tokenizer = pre_tokenizers.ByteLevel.new(add_prefix_space=False)
tokenizer.decoder = decoders.ByteLevel.new()

# And then train
trainer = trainers.BpeTrainer.new(
    vocab_size=50000,
    min_frequency=2,
    show_progress=True,
    special_tokens=[ "<s>", "<pad>", "</s>" ],
    initial_alphabet=pre_tokenizers.ByteLevel.alphabet()
)
tokenizer.train(trainer, files)
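
# The script above uses the very early tokenizers API (.empty() / .new()). A
# sketch of the same setup against the current API (reusing the `files` list
# from above) might look like this:
from tokenizers import Tokenizer, decoders, models, pre_tokenizers, trainers

tokenizer = Tokenizer(models.BPE())
tokenizer.pre_tokenizer = pre_tokenizers.ByteLevel(add_prefix_space=False)
tokenizer.decoder = decoders.ByteLevel()
trainer = trainers.BpeTrainer(
    vocab_size=50000,
    min_frequency=2,
    show_progress=True,
    special_tokens=["<s>", "<pad>", "</s>"],
    initial_alphabet=pre_tokenizers.ByteLevel.alphabet(),
)
tokenizer.train(files, trainer=trainer)  # newer versions take the file list first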