Example #1
def Tok_Train(input_file_path, vocab_size, output_path):
    """Train a simple byte-level BPE tokenizer and save it to output_path."""
    GPTToken = ByteLevelBPETokenizer(lowercase=True)
    GPTToken.enable_padding(pad_token="PAD")  # match the special token defined during training below
    GPTToken.train([input_file_path], vocab_size=vocab_size, min_frequency=2, special_tokens=["PAD"])
    GPTToken.save_model(output_path)
    return None
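A minimal usage sketch for Tok_Train, assuming a hypothetical corpus file corpus.txt and output directory tokenizer_out/; save_model requires the directory to exist and writes vocab.json and merges.txt, which can be reloaded with ByteLevelBPETokenizer:

import os

from tokenizers import ByteLevelBPETokenizer

os.makedirs("tokenizer_out", exist_ok=True)          # save_model needs an existing directory
Tok_Train("corpus.txt", vocab_size=30000, output_path="tokenizer_out")

# Reload the trained tokenizer from the files written by save_model.
reloaded = ByteLevelBPETokenizer(
    "tokenizer_out/vocab.json",
    "tokenizer_out/merges.txt",
    lowercase=True,  # match the training-time setting
)
print(reloaded.encode("hello world").tokens)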
Example #2
def main():
    parser = ArgumentParser(description="Training tokenizer on text files.")
    parser.add_argument("text_dir", nargs="?", help="Path to the directory containgin the text files (any .txt file).")
    parser.add_argument("-t", "--tokenizer_path", default=TOKENIZER_PATH, help="Path to the saved trained tokenizer.")
    args = parser.parse_args()
    text_dir = args.text_dir
    tokenizer_path = args.tokenizer_path
    if Path(tokenizer_path).exists():
        paths = [str(x) for x in Path(text_dir).glob("**/*.txt")]
        tokenizer = ByteLevelBPETokenizer()
        tokenizer.pre_tokenizer = ByteLevel()  # note: ByteLevelBPETokenizer already applies ByteLevel pre-tokenization internally
        tokenizer.train(
            files=paths,
            vocab_size=config.vocab_size,
            min_frequency=2,
            special_tokens=[
                "<s>",
                "<pad>",
                "</s>",
                "<unk>",  # probably not needed if using ByteLevel pretokenization
                "<mask>",
            ]
        )
        tokenizer.save_model(tokenizer_path)
    else:
        print(f"{tokenizer_path} does not exists, will not be able to save tokenizer. Create dir first and re-run the command.")
Example #3
def train_tokenizer(paths, vocab_size=21128, min_frequency=2):
    """
    Train the tokenizer and save it locally; this can take a long time for large datasets.

    Args:
        paths: path(s) of the text file(s) used for training
        vocab_size: vocabulary size
        min_frequency: tokens occurring fewer than this many times are filtered out

    Returns:
        Saves the vocabulary locally and returns the tokenizer object.
    """
    # Initialize a tokenizer
    tokenizer = ByteLevelBPETokenizer()
    # Customize training
    tokenizer.train(files=paths,
                    vocab_size=vocab_size,
                    min_frequency=min_frequency,
                    special_tokens=[
                        "<s>",
                        "<pad>",
                        "</s>",
                        "<unk>",
                        "<mask>",
                    ])
    tokenizer.save_model("data")  # 保存分词器(其实就是个词典)

    return tokenizer
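A short usage sketch for the function above, assuming a hypothetical corpus_zh.txt training file; note that save_model("data") expects the data/ directory to exist:

import os

os.makedirs("data", exist_ok=True)        # directory expected by save_model("data") above
tok = train_tokenizer(["corpus_zh.txt"])  # hypothetical training file
print(tok.encode("你好，世界").tokens)     # byte-level BPE represents Chinese text as raw bytes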
Example #4
    def train_BPE_tokenizer(self) -> None:
        bytebpe_tokenizer = ByteLevelBPETokenizer()
        bytebpe_tokenizer.train(files=['./train.txt', './test.txt'],
                                vocab_size=10000,
                                special_tokens=["[PAD]"])

        bytebpe_tokenizer.save_model("nlpbook/bbpe")
Example #5
def train_tokenizer(data_path, wiki_text_file_path):
    # ToDo := Load if weights exists, else setup
    tokenizer_en = GPT2TokenizerFast.from_pretrained("gpt2")
    tokenizer_en.pad_token = tokenizer_en.eos_token
    vocab_size = tokenizer_en.vocab_size
    max_length = 1024

    tokenizer_es = ByteLevelBPETokenizer()
    tokenizer_es.train(
        files=[str(wiki_text_file_path)],
        vocab_size=vocab_size,
        min_frequency=2,
        special_tokens=[EOF_TOKEN]
    )
    tokenizer_es.enable_truncation(max_length=max_length)

    tokenizer_es_path = data_path/"BLBPE_tokenizer_es"
    tokenizer_es_path.mkdir(exist_ok=True, parents=True)
    tokenizer_es.save_model(str(tokenizer_es_path))

    tokenizer_es = GPT2TokenizerFast.from_pretrained(
        str(tokenizer_es_path), pad_token=EOF_TOKEN
    )
    tokenizer_es.model_max_length = max_length

    # tokenizer_es = ByteLevelBPETokenizer(
    #     vocab_file=str(tokenizer_es_path/"vocab.json"),
    #     merges_file=str(tokenizer_es_path/"merges.txt"),
    # )
    # tokenizer_es.enable_truncation(max_length=1024)

    # ToDo := is this necessary
    # tokenizer_en.pad_token = tokenizer_en.eos_token
    return tokenizer_en, tokenizer_es
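A sketch of how the two returned tokenizers might be used, assuming data_path and wiki_text_file_path are hypothetical pathlib.Path locations and EOF_TOKEN is the end-of-text marker used above; both tokenizers pad with their end-of-text token:

from pathlib import Path

# Hypothetical paths; train_tokenizer trains the Spanish tokenizer and reuses the GPT-2 English one.
tokenizer_en, tokenizer_es = train_tokenizer(Path("data"), Path("data/wiki_es.txt"))

en_batch = tokenizer_en(["Hello world", "A longer English sentence"], padding=True)
es_batch = tokenizer_es(["Hola mundo", "Una frase un poco más larga"], padding=True)
print(len(en_batch["input_ids"]), len(es_batch["input_ids"]))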
Example #6
def train_tokenizer(
    files: Union[str, List[str]],
    dropout: float = None,
    vocab_size: int = 1000,
    min_frequency: int = 2,
    save_path: str = "",
    added_tokens: List[str] = [],
    bos_token: str = "<|endoftext|>",
    eos_token: str = "<|endoftext|>",
    unk_token: str = "<|endoftext|>",
    serialize: bool = False,
) -> None:
    """
    Trains a tokenizer on the given text file(s), wrapping the tokenizers package.
    See: https://huggingface.co/blog/how-to-train

    For consistency, this function makes opinionated assumptions.

    :param files: path to file(s) to train tokenizer on
    :param dropout: Training dropout
    :param vocab_size: Final vocabulary size
    :param min_frequency: Minimum number of occurrences to add to vocab
    :param save_path: Where to save the final tokenizer
    :param added_tokens: List of tokens to add to the tokenizer (currently not working)
    :param bos_token: Beginning-of-string special token
    :param eos_token: End-of-string special token
    :param unk_token: Unknown special token
    """

    assert isinstance(files, str) or isinstance(
        files, list), "files must be a string or a list."

    assert isinstance(added_tokens, list), "added_tokens must be a list."

    if isinstance(files, str):
        files = [files]

    tokenizer = ByteLevelBPETokenizer(dropout=dropout)

    tokenizer.train(
        files=files,
        vocab_size=vocab_size - len(added_tokens),
        min_frequency=min_frequency,
        special_tokens=[bos_token, eos_token, unk_token],
    )

    tokenizer.add_tokens(added_tokens)

    PREFIX = "aitextgen"
    save_path_str = "the current directory" if save_path == "" else save_path
    if serialize:
        logger.info(f"Saving {PREFIX}.tokenizer.json to {save_path_str}. " +
                    "You will need this file to build the GPT2Tokenizer.")
        tokenizer.save(f"{PREFIX}.tokenizer.json")
    else:
        logger.info(
            f"Saving {PREFIX}-vocab.json and {PREFIX}-merges.txt to {save_path_str}. "
            + "You will need both files to build the GPT2Tokenizer.")
        tokenizer.save_model(save_path, PREFIX)
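As the log messages note, the saved files are meant to be loaded back as a GPT-2 style tokenizer. A minimal sketch, assuming the function above ran with save_path="" and serialize=False, so it wrote aitextgen-vocab.json and aitextgen-merges.txt in the current directory:

from transformers import GPT2TokenizerFast

# File names follow the PREFIX used above; the special tokens mirror the training defaults.
gpt2_tokenizer = GPT2TokenizerFast(
    vocab_file="aitextgen-vocab.json",
    merges_file="aitextgen-merges.txt",
    bos_token="<|endoftext|>",
    eos_token="<|endoftext|>",
    unk_token="<|endoftext|>",
)
print(gpt2_tokenizer("hello world")["input_ids"])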
Example #7
def tokenize_cards(
        files=['./dataset/cards_train.txt', './dataset/cards_val.txt'],
        output_dir='./tokenizer'):
    tokenizer = ByteLevelBPETokenizer()
    tokenizer.pre_tokenizer = Whitespace()

    tokenizer.train(files=files, special_tokens=SPECIAL_TOKENS + OTHER_TOKENS)
    tokenizer.save_model(output_dir)
Example #8
def main():
    # Instantiate argument parser
    parser = argparse.ArgumentParser()

    # Required parameters
    parser.add_argument(
        "--train_data_file",
        default=None,
        type=str,
        required=True,
        help=
        "The input training data file or a path to a directory with multiple training data files."
    )
    parser.add_argument(
        "--output_dir",
        type=str,
        required=True,
        help="The output directory where the tokenizer model will be written.")
    # Optional parameters
    parser.add_argument("--vocab_size",
                        default=5000,
                        type=int,
                        help="Vocabulary maximum size, default 5000.")
    parser.add_argument("--min_freq",
                        default=2,
                        type=int,
                        help="Minimum number of occurrences, default 2")

    # Generate args
    args = parser.parse_args()

    # Initialize a tokenizer
    tokenizer = ByteLevelBPETokenizer()

    # Get training files
    paths = os.path.abspath(args.train_data_file)
    if not args.train_data_file.endswith(".txt"):
        paths = [str(x) for x in Path(paths).glob("**/*.txt")]

    # Customize training
    tokenizer.train(files=paths,
                    vocab_size=args.vocab_size,
                    min_frequency=args.min_freq,
                    special_tokens=[
                        "<s>",
                        "<pad>",
                        "</s>",
                        "<unk>",
                        "<mask>",
                    ])

    tokenizer.add_special_tokens(["<x>", "<z>"])

    # Save files to disk
    output_dir = os.path.abspath(args.output_dir)
    if not os.path.exists(output_dir):
        os.makedirs(output_dir)
    tokenizer.save_model(output_dir)
Example #9
def build_tokenizer(file_paths,
                    vocab_size,
                    output_file="UNKNOWN_BERT_tokenizer"):
    tokenizer = ByteLevelBPETokenizer()
    # tokenizer = SentencePieceBPETokenizer(vocab_file=None, unk_token="<unk>")
    tokenizer.train(files=file_paths,
                    vocab_size=vocab_size,
                    min_frequency=2,
                    special_tokens=["<s>", "<pad>", "</s>", "<unk>", "<mask>"])
    tokenizer.save_model(".", output_file)
Example #10
 def prepare_data(self):
     if not Path(self.tokenizer_name_or_path).exists():
         tokenizer = ByteLevelBPETokenizer()
         tokenizer.train(self.files,
                         vocab_size=self.max_vocab_size,
                         min_frequency=self.min_frequency,
                         special_tokens=self.special_tokens)
         Path(self.tokenizer_name_or_path).mkdir(parents=True,
                                                 exist_ok=True)
         tokenizer.save_model(self.tokenizer_name_or_path)
Example #11
def train_tokenizer(
    files: Union[str, List[str]],
    dropout: float = None,
    vocab_size: int = 1000,
    min_frequency: int = 2,
    prefix: str = "aitextgen",
    save_path: str = "",
    added_tokens: List[str] = [],
    bos_token: str = "<|endoftext|>",
    eos_token: str = "<|endoftext|>",
    unk_token: str = "<|endoftext|>",
    serialize: bool = True,
    trim_offsets: bool = True,
) -> None:
    """
    Trains a tokenizer on the given text file(s), wrapping the tokenizers package.
    See: https://huggingface.co/blog/how-to-train

    For consistency, this function makes opinionated assumptions.

    :param files: path to file(s) to train tokenizer on
    :param dropout: Training dropout
    :param vocab_size: Final vocabulary size
    :param min_frequency: Minimum number of occurrences to add to vocab
    :param prefix: File name prefix of the final tokenizer
    :param save_path: Where to save the final tokenizer
    :param added_tokens: List of tokens to add to the tokenizer (currently not working)
    :param bos_token: Beginning-of-string special token
    :param eos_token: End-of-string special token
    :param unk_token: Unknown special token
    """

    assert isinstance(files, str) or isinstance(
        files, list), "files must be a string or a list."

    assert isinstance(added_tokens, list), "added_tokens must be a list."

    if isinstance(files, str):
        files = [files]

    tokenizer = ByteLevelBPETokenizer(dropout=dropout,
                                      trim_offsets=trim_offsets)

    tokenizer.train(
        files=files,
        vocab_size=vocab_size,
        min_frequency=min_frequency,
        special_tokens=[bos_token, eos_token, unk_token] + added_tokens,
    )

    if serialize:
        tokenizer.save(f"{prefix}.tokenizer.json")
    else:
        tokenizer.save_model(save_path, prefix)
Example #12
def generate_tokenizer(args):

    langs = args.languages

    if "all" in langs:
        langs = ["python", "java", "javascript", "go", "ruby", "php"]

    if args.combined:
        for size in args.sizes:
            lang = "_combined"
            paths = list(glob("data/train{}_{}.txt".format(lang, size)))

            tokenizer = ByteLevelBPETokenizer(lowercase=False)

            tokenizer.train(
                files=paths,
                vocab_size=32000,
                min_frequency=3,
                special_tokens=[
                    "<s>",
                    "<pad>",
                    "</s>",
                    "<unk>",
                    "<mask>",
                ],
            )

            os.makedirs("tokenizer{}".format(lang), exist_ok=True)
            tokenizer.save_model("tokenizer{}".format(lang))

    else:
        for language in langs:
            for size in args.sizes:
                lang = "_{}".format(language)

                paths = list(glob("data/train{}_{}.txt".format(lang, size)))

                tokenizer = ByteLevelBPETokenizer(lowercase=False)

                tokenizer.train(
                    files=paths,
                    vocab_size=32000,
                    min_frequency=3,
                    special_tokens=[
                        "<s>",
                        "<pad>",
                        "</s>",
                        "<unk>",
                        "<mask>",
                    ],
                )

                os.makedirs("tokenizer{}".format(lang), exist_ok=True)
                tokenizer.save_model("tokenizer{}".format(lang))
Example #13
def train_tokenizer(input_path, output_path, vocab_size=10000):
    tokenizer = ByteLevelBPETokenizer()
    tokenizer.train(files=[input_path],
                    vocab_size=vocab_size,
                    special_tokens=["[PAD]", "<s>", "</s>", "<unk>"])
    tokenizer._tokenizer.post_processor = BertProcessing(
        ("</s>", tokenizer.token_to_id("</s>")),
        ("<s>", tokenizer.token_to_id("<s>")),
    )
    tokenizer.save_model(output_path)
    return tokenizer
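A brief check of the post-processor attached above, assuming hypothetical input/output paths (the output directory must already exist for save_model); BertProcessing wraps every encoding in <s> ... </s>:

import os

os.makedirs("tokenizer_out", exist_ok=True)   # required by save_model(output_path)
tok = train_tokenizer("corpus.txt", "tokenizer_out", vocab_size=10000)
enc = tok.encode("hello world")
print(enc.tokens)  # starts with '<s>' and ends with '</s>' because of BertProcessing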
Example #14
    def create_tokenizer(self):
        tokenizer = ByteLevelBPETokenizer()
        tokenizer.train(files=self.files,
                        vocab_size=self.vocab_size,
                        min_frequency=self.min_frequency,
                        special_tokens=self.special_tokens)
        vocab_path = os.path.join(self.save_directory)
        if not os.path.exists(vocab_path):
            os.makedirs(vocab_path)
        tokenizer.save_model(vocab_path)

        return tokenizer
Example #15
def get_tokenizer(train_data, vocab_size):
    """
    Trains and returns a byte-level BPE tokenizer.
    If a cached tokenizer with these parameters exists it is loaded instead of training a new tokenizer.
    :param train_data: list of dataset files
    :param vocab_size: BPE vocab size
    :return: GPT2TokenizerFast with the requested parameters.
    """
    assert vocab_size >= 257, 'vocab size must cover all possible bytes and one special token'

    # calculate the name of the cached file
    m = hashlib.md5()
    m.update(str(vocab_size).encode())
    for file in train_data:
        m.update(file.encode())
    cache_id = m.hexdigest()

    cached_tokenizer_file = os.path.join(CACHE_DIR,
                                         'tokenizer_{}'.format(cache_id))

    train_new_tokenizer = not os.path.exists(cached_tokenizer_file)
    if train_new_tokenizer:
        start = time.time()
        os.makedirs(cached_tokenizer_file)
        tokenizer = ByteLevelBPETokenizer()
        tokenizer.train(
            train_data,
            vocab_size=vocab_size,
            special_tokens=['<|endoftext|>'],
            show_progress=False,
        )
        tokenizer.save_model(cached_tokenizer_file)
        logger.info(f"Trained tokenizer {cached_tokenizer_file} [took %.3f s]",
                    time.time() - start)

    start = time.time()
    tokenizer = GPT2TokenizerFast.from_pretrained(cached_tokenizer_file)
    tokenizer.cache_id = cache_id

    if not train_new_tokenizer:
        logger.info(
            f"Loaded tokenizer from {cached_tokenizer_file} [took %.3f s]",
            time.time() - start)

    return tokenizer
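A usage sketch of the caching behaviour described in the docstring, assuming CACHE_DIR and logger are configured elsewhere and a hypothetical data/train.txt file; a second call with identical arguments hashes to the same cache id and loads the saved tokenizer instead of retraining:

files = ["data/train.txt"]                     # hypothetical dataset file
tok_a = get_tokenizer(files, vocab_size=5000)  # trains and writes the cache directory
tok_b = get_tokenizer(files, vocab_size=5000)  # same arguments, so this loads from the cache
assert tok_a.cache_id == tok_b.cache_id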
Example #16
def save_tmp_tokenizer():
    paths = [str(dataset_path / 'oscar.eo.1000.txt')]

    # Initialize a tokenizer
    tokenizer_tmp = ByteLevelBPETokenizer()

    # Customize training
    tokenizer_tmp.train(files=paths, vocab_size=10_000, min_frequency=2, special_tokens=[
        "<s>",
        "<pad>",
        "</s>",
        "<unk>",
        "<mask>",
    ])

    # Save files to disk
    tokenizer_tmp_path.mkdir(parents=True, exist_ok=True)
    tokenizer_tmp.save_model(str(tokenizer_tmp_path))
Example #17
def main(args):
    # set the corpus
    random.seed(42)
    proj_dir = Path()
    tokenizers_dir = proj_dir / "tokenizers"

    if not tokenizers_dir.exists():
        tokenizers_dir.mkdir(parents=True)

    corpus_dir = proj_dir / "corpus"
    comment_dir = corpus_dir / "comment"
    source_path = comment_dir / "20190101_20200611_v2.txt"
    sample_path = comment_dir / "sample.txt"

    # sampling source
    source_io = open(source_path, mode="r", encoding="utf-8")
    sample_io = open(sample_path, mode="w", encoding="utf-8")

    for line in source_io:
        if random.random() > (1 - args.sample_rate):
            sample_io.write(line)
    sample_io.close()
    source_io.close()

    # Initialize a tokenizer
    tokenizer = ByteLevelBPETokenizer(add_prefix_space=False)

    # Customize training
    tokenizer.train(
        files=str(sample_path),
        vocab_size=args.vocab_size,
        min_frequency=args.min_freq,
        show_progress=True,
        special_tokens=["<unk>", "<s>", "</s>", "<pad>", "<mask>"],
    )
    tokenizer.save_model(directory=str(tokenizers_dir))
Example #18
def train():
    from tokenizers import ByteLevelBPETokenizer
    '''
    Initialize a tokenizer
    '''
    tokenizer = ByteLevelBPETokenizer()
    '''
    Customize training

    For example, if we need <s> as a start-of-sentence token, </s> as an end-of-sentence token,
    and <sep> as a separator between sub-sentences, we specify them here as special tokens.
    These tokens are not broken into subword tokens by the tokenizer.

    '''
    paths = ['data/wiki_data.txt']
    tokenizer.train(
        files=paths,
        vocab_size=40000,
        min_frequency=2,
        special_tokens=["<s>", "<pad>", "</s>", "<unk>", "<mask>", "<sep>"])
    '''
    Save tokenizer
    '''
    tokenizer.save_model("./tok_checkpoints", "tokenizer_model")
Example #19
    def Do(self, input_dict: Dict[Text, List[types.Artifact]],
           output_dict: Dict[Text, List[types.Artifact]],
           exec_properties: Dict[Text, Any]) -> None:
        encoding = exec_properties["encoding"]
        text_token_size = exec_properties["text_token_size"]
        end_token = exec_properties["end_token"]
        model_dir = get_single_uri(input_dict["model_dir"])
        merged_text_dir = get_single_uri(input_dict["merged_text_dir"])
        encoding_dir = get_single_uri(output_dict["encoding_dir"])
        logging.info("encoding as: {}".format(encoding))
        logging.info("text token size: {}".format(text_token_size))
        logging.info("end token: {}".format(end_token))
        logging.info("model directory: {}".format(model_dir))
        logging.info("merged text directory: {}".format(merged_text_dir))
        logging.info("encoding directory: {}".format(encoding_dir))

        logging.info("Training BPE Tokenizer")
        tokenizer = ByteLevelBPETokenizer(lowercase=False,
                                          end_of_word_suffix=end_token)
        file_paths = []
        for (dirpath, _, fnames) in os.walk(merged_text_dir):
            for fname in fnames:
                file_path = os.path.join(dirpath, fname)
                if os.path.isfile(file_path):
                    logging.info("training on {}".format(file_path))
                    file_paths.append(file_path)
        # Train once over all files: calling train() per file would retrain the model
        # from scratch each time and keep only the last file's statistics.
        tokenizer.train(file_paths, vocab_size=text_token_size)

        logging.info("Storing BPE Tokenizer")
        # save_model returns [vocab.json, merges.txt]; rename them to GPT-2's encoder.json / vocab.bpe
        encoder_file, vocab_file = tokenizer.save_model(encoding_dir)
        os.rename(encoder_file, os.path.join(encoding_dir, "encoder.json"))
        os.rename(vocab_file, os.path.join(encoding_dir, "vocab.bpe"))
        # load hparams and store with new value
        with open(os.path.join(model_dir, 'hparams.json')) as f:
            hparams = json.load(f)
        hparams["n_vocab"] = text_token_size
        with open(os.path.join(encoding_dir, "hparams.json"),
                  'w') as json_file:
            json.dump(hparams, json_file)
Example #20
# Initialize a tokenizer
tokenizer = ByteLevelBPETokenizer()

#tokenizer.train(files=paths, vocab_size=52_000, min_frequency=2, special_tokens=["<s>","<pad>", "</s>","<unk>", "<mask>",])
tokenizer.train(files="kant.txt",
                vocab_size=52_000,
                min_frequency=2,
                special_tokens=[
                    "<s>",
                    "<pad>",
                    "</s>",
                    "<unk>",
                    "<mask>",
                ])

tokenizer.save_model(SAVE_MODEL)

tokenizer = ByteLevelBPETokenizer(
    SAVE_MODEL + "/vocab.json",
    SAVE_MODEL + "/merges.txt",
)

tokenizer._tokenizer.post_processor = BertProcessing(
    ("</s>", tokenizer.token_to_id("</s>")),
    ("<s>", tokenizer.token_to_id("<s>")),
)
tokenizer.enable_truncation(max_length=512)

print(tokenizer.encode("For it is in reality vain to profess"))

config = RobertaConfig(
Example #21
parser = argparse.ArgumentParser()
parser.add_argument('--name', type=str)
parser.add_argument('--vocab_size', type=int)
args = parser.parse_args()

# Initialize a tokenizer
tokenizer = ByteLevelBPETokenizer()

# Customize training
tokenizer.train(files=paths,
                vocab_size=args.vocab_size,
                special_tokens=[
                    "<s>",
                    "<pad>",
                    "</s>",
                    "<unk>",
                    "<mask>",
                ])

model_path = f'{current_path}../models/{args.name}'
if not os.path.exists(model_path):
    os.mkdir(model_path)
tokenizer.save_model(model_path)

config = RobertaConfig(vocab_size=args.vocab_size)

tokenizer = RobertaTokenizerFast.from_pretrained(model_path, max_len=512)
tokenizer.save_pretrained(model_path)
config.save_pretrained(model_path)
Example #22
# Initialize a tokenizer
tokenizer = ByteLevelBPETokenizer()

# Customize training
tokenizer.train(files=paths,
                vocab_size=52_000,
                min_frequency=2,
                special_tokens=[
                    "<s>",
                    "<pad>",
                    "</s>",
                    "<unk>",
                    "<mask>",
                ])

tokenizer.save_model("EsperBERTo")

from tokenizers.implementations import ByteLevelBPETokenizer
from tokenizers.processors import BertProcessing

tokenizer = ByteLevelBPETokenizer(
    "./EsperBERTo/vocab.json",
    "./EsperBERTo/merges.txt",
)

tokenizer._tokenizer.post_processor = BertProcessing(
    ("</s>", tokenizer.token_to_id("</s>")),
    ("<s>", tokenizer.token_to_id("<s>")),
)
tokenizer.enable_truncation(max_length=512)
Example #23
# use this when training BPE tokenizer from scratch
from pathlib import Path

from tokenizers import ByteLevelBPETokenizer

paths = ['../../data/jw300.en-tw.tw',
         '../../data/asante_twi_bible.txt']  # dataset location

# Initialize a tokenizer
tokenizer = ByteLevelBPETokenizer()

# Customize training
tokenizer.train(
    files=paths,
    vocab_size=52_000,
    min_frequency=2,
    special_tokens=[
        "[CLS]",
        "[PAD]",
        "[SEP]",
        "[UNK]",
        "[MASK]",
    ]
)  # which special tokens to use for start, padding, end, unknown and mask respectively

# Save files to disk - make sure these directories exist
tokenizer.save_model("distilbako-base-akuapem-twi-cased")  # akuapem
Example #24
from pathlib import Path
from tokenizers import ByteLevelBPETokenizer

paths = ['./dataset/oscar.eo.txt']

tokenizer = ByteLevelBPETokenizer()

# Customize training
tokenizer.train(files=paths,
                vocab_size=52000,
                min_frequency=2,
                special_tokens=[
                    "<s>",
                    "<pad>",
                    "</s>",
                    "<unk>",
                    "<mask>",
                ])

tokenizer.save_model("./bert-tokenizer")
Example #25
tokenizer = ByteLevelBPETokenizer()

# Customize training
tokenizer.train(files=dpath,
                vocab_size=52_000,
                min_frequency=2,
                special_tokens=[
                    "<s>",
                    "<pad>",
                    "</s>",
                    "<unk>",
                    "<mask>",
                ])

# !mkdir bert-model
tokenizer.save_model("bert-model2")

from tokenizers.implementations import ByteLevelBPETokenizer
from tokenizers.processors import BertProcessing

tokenizer = ByteLevelBPETokenizer(
    "./bert-model2/vocab.json",
    "./bert-model2/merges.txt",
)

tokenizer._tokenizer.post_processor = BertProcessing(
    ("</s>", tokenizer.token_to_id("</s>")),
    ("<s>", tokenizer.token_to_id("<s>")),
)
tokenizer.enable_truncation(max_length=512)
tokenizer.encode("Mi estas Julien.")
Example #26
from pathlib import Path
from tokenizers import ByteLevelBPETokenizer

#https://huggingface.co/blog/how-to-train

paths = [str(x) for x in Path("../results/").glob("**/*.txt")]
#utf-8 problem
paths = [p.encode('utf-8', 'replace').decode() for p in paths]

# Initialize a tokenizer
tokenizer = ByteLevelBPETokenizer()

# Customize training
tokenizer.train(files=paths,
                vocab_size=52_000,
                min_frequency=2,
                special_tokens=[
                    "<s>",
                    "<pad>",
                    "</s>",
                    "<unk>",
                    "<mask>",
                ])

# Save files to disk
tokenizer.save_model(".", "latentbert")
paths = "CDLI_Data/Sumerian_monolingual_processed.txt"
# Initialize a tokenizer
tokenizer = ByteLevelBPETokenizer()

# Customize training
tokenizer.train(files=paths,
                vocab_size=52_000,
                min_frequency=1,
                special_tokens=[
                    "<s>",
                    "<pad>",
                    "</s>",
                    "<unk>",
                    "<mask>",
                ])
tokenizer.save_model("BERT/sumerianBERTo")

tokenizer = ByteLevelBPETokenizer(
    "BERT/sumerianBERTo/vocab.json",
    "BERT/sumerianBERTo/merges.txt",
)

tokenizer._tokenizer.post_processor = BertProcessing(
    ("</s>", tokenizer.token_to_id("</s>")),
    ("<s>", tokenizer.token_to_id("<s>")),
)
tokenizer.enable_truncation(max_length=512)
tokenizer.encode("dumu a-li2-wa-aq-rum")
print(tokenizer.encode("dumu a-li2-wa-aq-rum").tokens)

Example #28
# Configuration
paths = ["../results_file_clean.txt"]

# Initialize a tokenizer
tokenizer = ByteLevelBPETokenizer()

# Customize training
tokenizer.train(files=paths, vocab_size=52000, min_frequency=2, special_tokens=[
    "<s>",
    "<pad>",
    "</s>",
    "<unk>",
    "<mask>",
])

os.makedirs('roberta_we4lkd', exist_ok=True) 
tokenizer.save_model("roberta_we4lkd")

from tokenizers.implementations import ByteLevelBPETokenizer
from tokenizers.processors import BertProcessing


tokenizer = ByteLevelBPETokenizer(
    "./roberta_we4lkd/vocab.json",
    "./roberta_we4lkd/merges.txt",
)

tokenizer._tokenizer.post_processor = BertProcessing(
    ("</s>", tokenizer.token_to_id("</s>")),
    ("<s>", tokenizer.token_to_id("<s>")),
)
tokenizer.enable_truncation(max_length=512)
Example #29
class WikiText2DataModule(pl.LightningDataModule):
    def __init__(self,
                 data_dir: str = 'data/wikitext-2',
                 train_batch_size: int = 64,
                 val_batch_size: int = 64,
                 dataloader_num_workers: int = 4,
                 seq_length: int = 64,
                 vocab_size=30000):
        super().__init__()
        self.train_batch_size = train_batch_size
        self.val_batch_size = val_batch_size
        self.dataloader_num_workers = dataloader_num_workers
        self.seq_length = seq_length
        self.vocab_size = vocab_size

        self.tokenizer = ByteLevelBPETokenizer(add_prefix_space=True)

    def prepare_data(self, *args, **kwargs):
        dataset = load_dataset("wikitext",
                               "wikitext-103-raw-v1",
                               split="train+test+validation")
        column_names = dataset.column_names

        def batch_iterator(batch_size=1000):
            for i in range(0, len(dataset), batch_size):
                yield dataset[i:i + batch_size]["text"]

        if (not os.path.exists("data/wiki-vocab.json")) or (
                not os.path.exists("data/wiki-merges.txt")):
            print('TRAIN TOKENIZER')
            self.tokenizer.train_from_iterator(batch_iterator(),
                                               vocab_size=self.vocab_size)
            self.tokenizer.save_model("data/", "wiki")
        else:
            self.tokenizer = ByteLevelBPETokenizer("data/wiki-vocab.json",
                                                   "data/wiki-merges.txt",
                                                   add_prefix_space=True)

        dataset = load_dataset("wikitext", "wikitext-103-raw-v1")

        def tokenize_function(examples):
            return {
                'input_ids':
                list(
                    map(lambda x: x.ids,
                        self.tokenizer.encode_batch(examples['text'])))
            }

        self.tokenized_dataset = dataset.map(tokenize_function,
                                             batched=True,
                                             remove_columns=column_names,
                                             num_proc=4)

    def setup(self, stage: Optional[str] = None):
        # datasets = load_dataset('text',
        #                         data_dir=self.data_dir,
        #                         data_files={'train': 'wiki.train.small.raw',
        #                                     'valid': 'wiki.valid.small.raw'})

        def group_text(examples):
            # Concatenate all texts.
            concatenated_examples = {
                k: sum(examples[k], [])
                for k in examples.keys()
            }
            total_length = len(concatenated_examples[list(examples.keys())[0]])
            # We drop the small remainder; we could instead pad if the model supported it.
            # Customize this part to your needs.
            total_length = (total_length // self.seq_length) * self.seq_length
            # Split by chunks of max_len.
            result = {
                k: [
                    t[i:i + self.seq_length]
                    for i in range(0, total_length, self.seq_length)
                ]
                for k, t in concatenated_examples.items()
            }
            result["labels"] = result["input_ids"].copy()
            return result

        lm_dataset = self.tokenized_dataset.map(group_text,
                                                batched=True,
                                                num_proc=4)

        train_dataset = lm_dataset['train']
        eval_dataset = lm_dataset['validation']
        self.train_dataset = train_dataset
        self.eval_dataset = eval_dataset
        self.test_dataset = lm_dataset['test']

    def collate_fn(self, features):
        batch = {}
        batch['inputs_ids'] = torch.tensor([f['input_ids'] for f in features],
                                           dtype=torch.long)
        batch['labels'] = batch['inputs_ids']
        return batch

    def train_dataloader(self) -> DataLoader:
        return DataLoader(self.train_dataset,
                          batch_size=self.train_batch_size,
                          collate_fn=self.collate_fn,
                          num_workers=self.dataloader_num_workers)

    def val_dataloader(self) -> DataLoader:
        return DataLoader(self.eval_dataset,
                          batch_size=self.val_batch_size,
                          collate_fn=self.collate_fn,
                          num_workers=self.dataloader_num_workers)

    def test_dataloader(self) -> DataLoader:
        return DataLoader(self.test_dataset,
                          batch_size=self.val_batch_size,
                          collate_fn=self.collate_fn,
                          num_workers=self.dataloader_num_workers)
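A usage sketch for the data module above, following the usual PyTorch Lightning flow; the batch sizes and other arguments are assumptions:

# Hypothetical driver code for the data module defined above.
dm = WikiText2DataModule(train_batch_size=32, seq_length=64, vocab_size=30000)
dm.prepare_data()   # trains or reloads the byte-level BPE tokenizer and tokenizes the corpus
dm.setup()          # groups token ids into fixed-length blocks and builds the splits
batch = next(iter(dm.train_dataloader()))
print(batch["inputs_ids"].shape, batch["labels"].shape)  # key name follows collate_fn above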
Example #30
# (start of snippet truncated) write the train/validation splits to text files;
# the loop header is reconstructed and the file names are assumptions
for path, text in zip(["train.txt", "valid.txt"],
                      [train_texts, valid_texts]):
  with open(f"{data_dir}/{path}", "w") as f:
    f.write(text)

paths = [str(x) for x in Path(f"{data_dir}/").glob("**/*.txt")]
tokenizer = ByteLevelBPETokenizer()

tokenizer.train(files=paths, vocab_size=52_000, min_frequency=2, special_tokens=[
    "<s>",
    "<pad>",
    "</s>",
    "<unk>",
    "<mask>",
])

tokenizer.save_model(tokenizer_dir)

tokenizer = ByteLevelBPETokenizer(
    "tokenizer/vocab.json",
    "tokenizer/merges.txt",
)

tokenizer._tokenizer.post_processor = BertProcessing(
    ("</s>", tokenizer.token_to_id("</s>")),
    ("<s>", tokenizer.token_to_id("<s>")),
)
tokenizer.enable_truncation(max_length=512)

config = T5Config(
    vocab_size=52_000,
    max_position_embeddings=514,