    def train_BPE_tokenizer(self) -> None:
        bytebpe_tokenizer = ByteLevelBPETokenizer()
        bytebpe_tokenizer.train(files=['./train.txt', './test.txt'],
                                vocab_size=10000,
                                special_tokens=["[PAD]"])

        bytebpe_tokenizer.save_model("nlpbook/bbpe")
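The save_model() call above writes a vocab.json / merges.txt pair into nlpbook/bbpe. A minimal reload sketch (not part of the original snippet; it assumes training finished and the directory exists):

from tokenizers import ByteLevelBPETokenizer

# Reload the trained vocabulary and merge rules saved above.
bbpe = ByteLevelBPETokenizer(
    "nlpbook/bbpe/vocab.json",
    "nlpbook/bbpe/merges.txt",
)
encoding = bbpe.encode("Hello, byte-level BPE!")
print(encoding.tokens)
print(encoding.ids)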
Example #2
def create_tokenizer(args):

    # Directory for storing
    directory = args.store_files

    # Train the tokenizer
    # paths = [str(x) for x in Path("./eo_data/").glob("**/*.txt")]
    paths = [args.file]

    # Initialize a tokenizer
    tokenizer = ByteLevelBPETokenizer()

    # Customize training
    tokenizer.train(files=paths, vocab_size=args.vocab_size, min_frequency=2, special_tokens=[
        "<s>",
        "<pad>",
        "</s>",
        "<unk>",
        "<mask>",
    ])

    # Save files to disk
    tokenizer.save(args.store_files)

    tokenizer_config = {
        "max_len": 512
    }

    with open("{}/tokenizer_config.json".format(args.store_files), 'w') as fp:
        json.dump(tokenizer_config, fp)
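Once trained, the tokenizer object above can be exercised directly; a hedged sketch (the sample string is arbitrary, and `tokenizer` refers to the in-memory ByteLevelBPETokenizer from the function above). Note that, depending on the installed tokenizers version, tokenizer.save() expects either a directory (older releases) or a single .json file path (newer releases).

# Sketch only: `tokenizer` is the trained ByteLevelBPETokenizer from create_tokenizer().
encoding = tokenizer.encode("this is a small example")
print(encoding.tokens)                 # byte-level subword tokens
print(tokenizer.token_to_id("<pad>"))  # id assigned to the padding special token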
def _create_train_files_and_regenerate_vocab():
    print("pass")
    r = run("split -l1000000 train.txt --verbose")
    if r.ok:
        print("Train splits generated")
    if r.ok:
        try:
            shutil.rmtree("td")
        except FileNotFoundError:
            pass
        os.mkdir("td")
        r = run(
            "mv xaa td/xaa.txt && mv xab td/xab.txt && mv xac td/xac.txt && mv xad td/xad.txt && mv xae td/xae.txt && mv xaf td/xaf.txt"
        )
        if r.ok:
            paths = [str(x) for x in Path(".").glob("td/*.txt")]
            tokenizer = ByteLevelBPETokenizer()

            # Customize training
            tokenizer.train(
                files=paths,
                vocab_size=52_000,
                min_frequency=2,
                special_tokens=["<s>", "<pad>", "</s>", "<unk>", "<mask>"])
            try:
                shutil.rmtree("codeBERT")
            except FileNotFoundError:
                pass
            os.mkdir("codeBERT")
            tokenizer.save("codeBERT")
Example #4
def train_tokenizer(data_path, wiki_text_file_path):
    # ToDo := Load if weights exist, else set up
    tokenizer_en = GPT2TokenizerFast.from_pretrained("gpt2")
    tokenizer_en.pad_token = tokenizer_en.eos_token
    vocab_size = tokenizer_en.vocab_size
    max_length = 1024

    tokenizer_es = ByteLevelBPETokenizer()
    tokenizer_es.train(
        files=[str(wiki_text_file_path)],
        vocab_size=vocab_size,
        min_frequency=2,
        special_tokens=[EOF_TOKEN]
    )
    tokenizer_es.enable_truncation(max_length=max_length)

    tokenizer_es_path = data_path/"BLBPE_tokenizer_es"
    tokenizer_es_path.mkdir(exist_ok=True, parents=True)
    tokenizer_es.save_model(str(tokenizer_es_path))

    tokenizer_es = GPT2TokenizerFast.from_pretrained(
        str(tokenizer_es_path), pad_token=EOF_TOKEN
    )
    tokenizer_es.model_max_length = max_length

    # tokenizer_es = ByteLevelBPETokenizer(
    #     vocab_file=str(tokenizer_es_path/"vocab.json"),
    #     merges_file=str(tokenizer_es_path/"merges.txt"),
    # )
    # tokenizer_es.enable_truncation(max_length=1024)

    # ToDo := is this necessary
    # tokenizer_en.pad_token = tokenizer_en.eos_token
    return tokenizer_en, tokenizer_es
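A possible way to use the returned pair (sketch only; data_path, wiki_text_file_path, and EOF_TOKEN are assumed to be defined as in the function above):

tokenizer_en, tokenizer_es = train_tokenizer(data_path, wiki_text_file_path)

# Both objects are GPT2TokenizerFast instances, so the usual __call__ API applies.
batch = tokenizer_es(
    ["una frase de ejemplo", "otra frase corta"],
    padding=True,
    truncation=True,
)
print(batch["input_ids"])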
Example #5
def build_tokenizer(data_path, save_path):
    r"""
        Creates a tokenizer for the Bert Model based on the given data corpus

        Args:
            data_path (:obj:`str`):
            	Path to the data corpus
            save_path (:obj:`str`):
				Path where the custom tokenizer should be saved
    """

    # Initialize a tokenizer
    tokenizer = ByteLevelBPETokenizer()
    # Customize training
    tokenizer.train(files=data_path,
                    vocab_size=52000,
                    min_frequency=2,
                    special_tokens=[
                        "<s>",
                        "<pad>",
                        "</s>",
                        "<unk>",
                        "<mask>",
                    ])
    tokenizer.save(save_path)
Example #6
def train_tokenizer(paths, vocab_size=21128, min_frequency=2):
    """
    训练tokenizer,并保存到本地; 如果数据量大可能会很耗时.

    Args:
        paths: 训练用的文本文件目录
        vocab_size: 词典大小
        min_frequency: 出现次数小于该值的单词被过滤掉

    Returns:
        将词典保存到本地,返回分词器对象
    """
    # Initialize a tokenizer
    tokenizer = ByteLevelBPETokenizer()
    # Customize training
    tokenizer.train(files=paths,
                    vocab_size=vocab_size,
                    min_frequency=min_frequency,
                    special_tokens=[
                        "<s>",
                        "<pad>",
                        "</s>",
                        "<unk>",
                        "<mask>",
                    ])
    tokenizer.save_model("data")  # 保存分词器(其实就是个词典)

    return tokenizer
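A minimal round-trip sketch with the returned tokenizer (the file path is a placeholder):

tok = train_tokenizer(["data/corpus.txt"], vocab_size=21128)
ids = tok.encode("今天天气不错").ids
print(tok.decode(ids))  # should reproduce the input text (modulo whitespace handling)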
Example #7
def main():
    parser = ArgumentParser(description="Training tokenizer on text files.")
    parser.add_argument("text_dir", nargs="?", help="Path to the directory containgin the text files (any .txt file).")
    parser.add_argument("-t", "--tokenizer_path", default=TOKENIZER_PATH, help="Path to the saved trained tokenizer.")
    args = parser.parse_args()
    text_dir = args.text_dir
    tokenizer_path = args.tokenizer_path
    if Path(tokenizer_path).exists():
        paths = [str(x) for x in Path(text_dir).glob("**/*.txt")]
        tokenizer = ByteLevelBPETokenizer()
        tokenizer.pre_tokenizer = ByteLevel
        tokenizer.train(
            files=paths,
            vocab_size=config.vocab_size,
            min_frequency=2,
            special_tokens=[
                "<s>",
                "<pad>",
                "</s>",
                "<unk>",  # probably not needed if using ByteLevel pretokenization
                "<mask>",
            ]
        )
        tokenizer.save_model(tokenizer_path)
    else:
        print(f"{tokenizer_path} does not exists, will not be able to save tokenizer. Create dir first and re-run the command.")
Example #8
def Tok_Train(input_file_path, vocab_size, output_path):
    """Train a Simple BPE Tokenizer"""
    GPTToken = ByteLevelBPETokenizer(lowercase=True)
    GPTToken.enable_padding()
    GPTToken.train([input_file_path], vocab_size=vocab_size, min_frequency=2, special_tokens=["PAD"])
    GPTToken.save_model(output_path)
    return None
Example #9
def train_tokenizer(
    files: Union[str, List[str]],
    dropout: float = None,
    vocab_size: int = 1000,
    min_frequency: int = 2,
    save_path: str = "",
    added_tokens: List[str] = [],
    bos_token: str = "<|endoftext|>",
    eos_token: str = "<|endoftext|>",
    unk_token: str = "<|endoftext|>",
    serialize: bool = False,
) -> None:
    """
    Trains a tokenizer on the given text file(s), wrapping the tokenizers package.
    See: https://huggingface.co/blog/how-to-train

    For consistency, this function makes opinionated assumptions.

    :param files: path to file(s) to train tokenizer on
    :param dropout: Training dropout
    :param vocab_size: Final vocabulary size
    :param min_frequency: Minimum number of occurrences to add to vocab
    :param save_path: Where to save the final tokenizer
    :param added_tokens: List of tokens to add to the tokenizer (currently not working)
    :param bos_token: Beginning-of-string special token
    :param eos_token: End-of-string special token
    :param unk_token: Unknown special token
    """

    assert isinstance(files, str) or isinstance(
        files, list), "files must be a string or a list."

    assert isinstance(added_tokens, list), "added_tokens must be a list."

    if isinstance(files, str):
        files = [files]

    tokenizer = ByteLevelBPETokenizer(dropout=dropout)

    tokenizer.train(
        files=files,
        vocab_size=vocab_size - len(added_tokens),
        min_frequency=min_frequency,
        special_tokens=[bos_token, eos_token, unk_token],
    )

    tokenizer.add_tokens(added_tokens)

    PREFIX = "aitextgen"
    save_path_str = "the current directory" if save_path == "" else save_path
    if serialize:
        logger.info(f"Saving {PREFIX}.tokenizer.json to {save_path_str}. " +
                    "You will need this file to build the GPT2Tokenizer.")
        tokenizer.save(f"{PREFIX}.tokenizer.json")
    else:
        logger.info(
            f"Saving {PREFIX}-vocab.json and {PREFIX}-merges.txt to {save_path_str}. "
            + "You will need both files to build the GPT2Tokenizer.")
        tokenizer.save_model(save_path, PREFIX)
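As the log message says, the two saved files can be used to build a GPT-2-style tokenizer in transformers. A hedged sketch, assuming serialize=False and save_path="" so the files sit in the current directory:

from transformers import GPT2TokenizerFast

gpt2_tokenizer = GPT2TokenizerFast(
    vocab_file="aitextgen-vocab.json",
    merges_file="aitextgen-merges.txt",
    bos_token="<|endoftext|>",
    eos_token="<|endoftext|>",
    unk_token="<|endoftext|>",
)
print(gpt2_tokenizer("hello world")["input_ids"])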
def tokenize_cards(
        files=['./dataset/cards_train.txt', './dataset/cards_val.txt'],
        output_dir='./tokenizer'):
    tokenizer = ByteLevelBPETokenizer()
    tokenizer.pre_tokenizer = Whitespace()

    tokenizer.train(files=files, special_tokens=SPECIAL_TOKENS + OTHER_TOKENS)
    tokenizer.save_model(output_dir)
def main():
    # Instantiate argument parser
    parser = argparse.ArgumentParser()

    # Required parameters
    parser.add_argument(
        "--train_data_file",
        default=None,
        type=str,
        required=True,
        help=
        "The input training data file or a path to a directory with multiple training data files."
    )
    parser.add_argument(
        "--output_dir",
        type=str,
        required=True,
        help="The output directory where the tokenizer model will be written.")
    # Optional parameters
    parser.add_argument("--vocab_size",
                        default=5000,
                        type=int,
                        help="Vocabulary maximum size, default 5000.")
    parser.add_argument("--min_freq",
                        default=2,
                        type=int,
                        help="Minimum number of occurrences, default 2")

    # Generate args
    args = parser.parse_args()

    # Initialize a tokenizer
    tokenizer = ByteLevelBPETokenizer()

    # Get training files
    paths = os.path.abspath(args.train_data_file)
    if not args.train_data_file.endswith(".txt"):
        paths = [str(x) for x in Path(paths).glob("**/*.txt")]

    # Customize training
    tokenizer.train(files=paths,
                    vocab_size=args.vocab_size,
                    min_frequency=args.min_freq,
                    special_tokens=[
                        "<s>",
                        "<pad>",
                        "</s>",
                        "<unk>",
                        "<mask>",
                    ])

    tokenizer.add_special_tokens(["<x>", "<z>"])

    # Save files to disk
    output_dir = os.path.abspath(args.output_dir)
    if not os.path.exists(output_dir):
        os.makedirs(output_dir)
    tokenizer.save_model(output_dir)
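A minimal reload sketch for the files written by the script above (the directory name is a placeholder for whatever was passed as --output_dir). Note that save_model() writes only the BPE vocab/merges pair, so the <x> and <z> tokens added after training travel with the in-memory tokenizer rather than with these two files.

import os
from tokenizers import ByteLevelBPETokenizer

out_dir = "tokenizer_out"  # placeholder for the --output_dir used above
reloaded = ByteLevelBPETokenizer(
    os.path.join(out_dir, "vocab.json"),
    os.path.join(out_dir, "merges.txt"),
)
print(reloaded.encode("a quick sanity check").tokens)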
    def train_tokenizer(self,
                        train_files,
                        tokenizer_name=None,
                        output_dir=None,
                        use_trained_tokenizer=True):
        """
        Train a new tokenizer on `train_files`.

        Args:

        - train_files: List of files to be used when training the tokenizer.

        - tokenizer_name: Name of a pretrained tokenizer or a path to a directory containing a tokenizer.

        - output_dir (optional): The directory where model files will be saved. If not given, self.args['output_dir']
        will be used.

        - use_trained_tokenizer (optional): Load the trained tokenizer once training completes.

        Returns: None
        """

        if not isinstance(train_files, list):
            train_files = [train_files]

        if not output_dir:
            output_dir = self.args["output_dir"]

        tokenizer = ByteLevelBPETokenizer()

        tokenizer.train(
            files=train_files,
            vocab_size=self.args["vocab_size"],
            min_frequency=self.args["min_frequency"],
            special_tokens=self.args["special_tokens"],
        )

        os.makedirs(output_dir, exist_ok=True)

        tokenizer.save(output_dir)
        logger.info(" Training of {} tokenizer complete. Saved to {}.".format(
            tokenizer_name, output_dir))

        _, _, tokenizer_class = MODEL_CLASSES[self.args["model_type"]]
        tokenizer = tokenizer_class.from_pretrained(output_dir)

        if use_trained_tokenizer:
            self.tokenizer = tokenizer
            self.args["tokenizer_name"] = output_dir

            try:
                model_to_resize = self.model.module if hasattr(
                    self.model, "module") else self.model
                model_to_resize.resize_token_embeddings(len(self.tokenizer))
            except AttributeError:
                pass
Example #13
    def prepare_data(self):
        if not Path(self.tokenizer_name_or_path).exists():
            tokenizer = ByteLevelBPETokenizer()
            tokenizer.train(self.files,
                            vocab_size=self.max_vocab_size,
                            min_frequency=self.min_frequency,
                            special_tokens=self.special_tokens)
            Path(self.tokenizer_name_or_path).mkdir(parents=True,
                                                    exist_ok=True)
            tokenizer.save_model(self.tokenizer_name_or_path)
Example #14
def build_tokenizer(file_paths,
                    vocab_size,
                    output_file="UNKNOWN_BERT_tokenizer"):
    tokenizer = ByteLevelBPETokenizer()
    # tokenizer = SentencePieceBPETokenizer(vocab_file=None, unk_token="<unk>")
    tokenizer.train(files=file_paths,
                    vocab_size=vocab_size,
                    min_frequency=2,
                    special_tokens=["<s>", "<pad>", "</s>", "<unk>", "<mask>"])
    tokenizer.save_model(".", output_file)
Example #15
def train_tokenizer(
    files: Union[str, List[str]],
    dropout: float = None,
    vocab_size: int = 1000,
    min_frequency: int = 2,
    prefix: str = "aitextgen",
    save_path: str = "",
    added_tokens: List[str] = [],
    bos_token: str = "<|endoftext|>",
    eos_token: str = "<|endoftext|>",
    unk_token: str = "<|endoftext|>",
    serialize: bool = True,
    trim_offsets: bool = True,
) -> None:
    """
    Trains a tokenizer on the given text file(s), wrapping the tokenizers package.
    See: https://huggingface.co/blog/how-to-train

    For consistency, this function makes opinionated assumptions.

    :param files: path to file(s) to train tokenizer on
    :param dropout: Training dropout
    :param vocab_size: Final vocabulary size
    :param min_frequency: Minimum number of occurrences to add to vocab
    :param prefix: File name prefix of the final tokenizer
    :param save_path: Where to save the final tokenizer
    :param added_tokens: List of tokens to add to the tokenizer (currently not working)
    :param bos_token: Beginning-of-string special token
    :param eos_token: End-of-string special token
    :param unk_token: Unknown special token
    """

    assert isinstance(files, str) or isinstance(
        files, list), "files must be a string or a list."

    assert isinstance(added_tokens, list), "added_tokens must be a list."

    if isinstance(files, str):
        files = [files]

    tokenizer = ByteLevelBPETokenizer(dropout=dropout,
                                      trim_offsets=trim_offsets)

    tokenizer.train(
        files=files,
        vocab_size=vocab_size,
        min_frequency=min_frequency,
        special_tokens=[bos_token, eos_token, unk_token] + added_tokens,
    )

    if serialize:
        tokenizer.save(f"{prefix}.tokenizer.json")
    else:
        tokenizer.save_model(save_path, prefix)
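With serialize=True (the default here), a single {prefix}.tokenizer.json file is produced. A sketch of loading it into a transformers fast tokenizer (the file name assumes prefix="aitextgen"):

from transformers import PreTrainedTokenizerFast

loaded = PreTrainedTokenizerFast(
    tokenizer_file="aitextgen.tokenizer.json",
    bos_token="<|endoftext|>",
    eos_token="<|endoftext|>",
    unk_token="<|endoftext|>",
)
print(loaded.encode("hello world"))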
Example #16
def save_sentense_piece_model():
    paths = [str(x) for x in Path("./data/").glob("**/*.txt")]
    print(paths)

    special_token = ["<s>", "<pad>", "</s>", "<unk>", "<mask>"]
    tokenizer = ByteLevelBPETokenizer()
    tokenizer.train(files=paths,
                    vocab_size=32000,
                    min_frequency=2,
                    special_tokens=special_token)
    tokenizer.save(".", "ko")
def tokenize(filename, vocab_size):

    tokenizer = ByteLevelBPETokenizer()
    tokenizer.train(files=filename,
                    vocab_size=vocab_size,
                    min_frequency=2,
                    special_tokens=['<|endoftext|>'])
    # '<bos>', '<eos>', '<unk>', '<pad>', '<mask>'])
    tokenizer.save(corpus)  # `corpus` is presumably a save path defined elsewhere in the original module

    return tokenizer
Example #18
def generate_tokenizer(args):

    langs = args.languages

    if "all" in langs:
        langs = ["python", "java", "javascript", "go", "ruby", "php"]

    if args.combined:
        for size in args.sizes:
            lang = "_combined"
            paths = list(glob("data/train{}_{}.txt".format(lang, size)))

            tokenizer = ByteLevelBPETokenizer(lowercase=False)

            tokenizer.train(
                files=paths,
                vocab_size=32000,
                min_frequency=3,
                special_tokens=[
                    "<s>",
                    "<pad>",
                    "</s>",
                    "<unk>",
                    "<mask>",
                ],
            )

            os.makedirs("tokenizer{}".format(lang), exist_ok=True)
            tokenizer.save_model("tokenizer{}".format(lang))

    else:
        for language in langs:
            for size in args.sizes:
                lang = "_{}".format(language)

                paths = list(glob("data/train{}_{}.txt".format(lang, size)))

                tokenizer = ByteLevelBPETokenizer(lowercase=False)

                tokenizer.train(
                    files=paths,
                    vocab_size=32000,
                    min_frequency=3,
                    special_tokens=[
                        "<s>",
                        "<pad>",
                        "</s>",
                        "<unk>",
                        "<mask>",
                    ],
                )

                os.makedirs("tokenizer{}".format(lang), exist_ok=True)
                tokenizer.save_model("tokenizer{}".format(lang))
Example #19
def save_sentense_piece_model():
    ko_paths = ['./data/korean-english-park.dev.ko', './data/korean-english-park.train.ko']
    en_paths = ['./data/korean-english-park.dev.en', './data/korean-english-park.train.en']

    special_token = ["<pad>", "<bos>", "<eos>", "<unk>", "<mask>"]
    tokenizer = ByteLevelBPETokenizer()
    tokenizer.train(files=ko_paths, vocab_size=32000, min_frequency=2, special_tokens=special_token)
    tokenizer.save("./create_spm", "ko")

    tokenizer.train(files=en_paths, vocab_size=32000, min_frequency=2, special_tokens=special_token)
    tokenizer.save("./create_spm", "en")
Example #20
def train_tokenizer(input_path, output_path, vocab_size=10000):
    tokenizer = ByteLevelBPETokenizer()
    tokenizer.train(files=[input_path],
                    vocab_size=vocab_size,
                    special_tokens=["[PAD]", "<s>", "</s>", "<unk>"])
    tokenizer._tokenizer.post_processor = BertProcessing(
        ("</s>", tokenizer.token_to_id("</s>")),
        ("<s>", tokenizer.token_to_id("<s>")),
    )
    tokenizer.save_model(output_path)
    return tokenizer
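A quick check (sketch only; the paths are placeholders and the output directory must already exist) that the post-processor wraps each encoding in <s> ... </s>:

tok = train_tokenizer("corpus.txt", "tokenizer_out")
enc = tok.encode("a short sentence")
print(enc.tokens[0], enc.tokens[-1])  # expected: <s> </s>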
Example #21
    def create_tokenizer(self):
        tokenizer = ByteLevelBPETokenizer()
        tokenizer.train(files=self.files,
                        vocab_size=self.vocab_size,
                        min_frequency=self.min_frequency,
                        special_tokens=self.special_tokens)
        vocab_path = os.path.join(self.save_directory)
        if not os.path.exists(vocab_path):
            os.makedirs(vocab_path)
        tokenizer.save_model(vocab_path)

        return tokenizer
def get_tokenizer(train_data, vocab_size):
    """
    Trains and returns a byte-level BPE tokenizer.
    If a cached tokenizer with these parameters exists it is loaded instead of training a new tokenizer.
    :param train_data: list of dataset files
    :param vocab_size: BPE vocab size
    :return: GPT2TokenizerFast with the requested parameters.
    """
    assert vocab_size >= 257, 'vocab size must cover all possible bytes and one special token'

    # calculate the name of the cached file
    m = hashlib.md5()
    m.update(str(vocab_size).encode())
    for file in train_data:
        m.update(file.encode())
    cache_id = m.hexdigest()

    cached_tokenizer_file = os.path.join(CACHE_DIR,
                                         'tokenizer_{}'.format(cache_id))

    train_new_tokenizer = not os.path.exists(cached_tokenizer_file)
    if train_new_tokenizer:
        start = time.time()
        os.makedirs(cached_tokenizer_file)
        tokenizer = ByteLevelBPETokenizer()
        tokenizer.train(
            train_data,
            vocab_size=vocab_size,
            special_tokens=['<|endoftext|>'],
            show_progress=False,
        )
        tokenizer.save_model(cached_tokenizer_file)
        logger.info(f"Trained tokenizer {cached_tokenizer_file} [took %.3f s]",
                    time.time() - start)

    start = time.time()
    tokenizer = GPT2TokenizerFast.from_pretrained(cached_tokenizer_file)
    tokenizer.cache_id = cache_id

    if not train_new_tokenizer:
        logger.info(
            f"Loaded tokenizer from {cached_tokenizer_file} [took %.3f s]",
            time.time() - start)

    return tokenizer
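Hypothetical usage of the helper above (the file name, vocab size, and CACHE_DIR are assumptions carried over from the surrounding module):

tok = get_tokenizer(["data/train.txt"], vocab_size=16384)
print(tok("an example sentence")["input_ids"])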
Example #23
def save_tmp_tokenizer():
    paths = [str(dataset_path / 'oscar.eo.1000.txt')]

    # Initialize a tokenizer
    tokenizer_tmp = ByteLevelBPETokenizer()

    # Customize training
    tokenizer_tmp.train(files=paths, vocab_size=10_000, min_frequency=2, special_tokens=[
        "<s>",
        "<pad>",
        "</s>",
        "<unk>",
        "<mask>",
    ])

    # Save files to disk
    tokenizer_tmp_path.mkdir(parents=True, exist_ok=True)
    tokenizer_tmp.save_model(str(tokenizer_tmp_path))
def train_tokenizer(data_file_paths, vocab_size):
    special_tokens = ["<s>", "<pad>", "</s>", "<unk>", "<mask>"]
    wordpieces_prefix = None
    if tokenizer_type == 'byte':
        t = ByteLevelBPETokenizer()
    elif tokenizer_type == 'char':
        t = CharBPETokenizer()
    else:
        t = BertWordPieceTokenizer()
        special_tokens = ["[PAD]", "[UNK]", "[CLS]", "[SEP]", "[MASK]"]
        wordpieces_prefix = "##"
    t.train(files=data_file_paths,
            vocab_size=vocab_size,
            min_frequency=2,
            show_progress=True,
            special_tokens=special_tokens,
            limit_alphabet=1000,
            wordpieces_prefix=wordpieces_prefix)
    return t
Example #25
    def _bbpe(self):
        tokenizer = ByteLevelBPETokenizer(
            vocab=self.conf.vocab,
            merges=self.conf.merges,
            add_prefix_space=self.conf.add_prefix_space,
            lowercase=self.conf.lowercase,
            dropout=self.conf.dropout,
            unicode_normalizer=self.conf.unicode_normalizer,
            continuing_subword_prefix=self.conf.continuing_subword_prefix,
            end_of_word_suffix=self.conf.end_of_word_suffix,
            trim_offsets=self.conf.trim_offsets,
        )

        tokenizer.train(
            files=self.files,
            vocab_size=self.conf.vocab_size,
            min_frequency=self.conf.min_frequency,
            special_tokens=self.conf.bbpe_special_tokens,
        )

        return tokenizer
Example #26
def main(args):
    paths = [path for path in args.input.split(":")]

    # Initialize a tokenizer
    tokenizer = ByteLevelBPETokenizer()

    # Customize training
    tokenizer.train(
        files=paths,
        vocab_size=args.vocab_size,
        min_frequency=args.min_freq,
        special_tokens=["<s>", "<pad>", "</s>", "<unk>"],
    )

    # Save files to disk
    tokenizer.save("{}.json".format(args.name), pretty=True)

    tok_spec = json.loads(tokenizer.to_str())
    with open("{}-vocab.json".format(args.name), "w") as fp:
        json.dump(tok_spec["model"]["vocab"], fp, indent=4)
    with open("{}-merges.txt".format(args.name), "w") as fp:
        fp.write("\n".join(tok_spec["model"]["merges"]))
Example #27
def get_french_vocab(model_name):
    root = Path(os.getcwd()).parent.parent.parent
    french_corpus = "Datasets/corpora/fr/text"
    fr_corpus_path = os.path.join(root, french_corpus)
    files = []
    for dir_ in os.listdir(fr_corpus_path):
        fr_corpus_dir = os.path.join(fr_corpus_path, dir_)
        for text_file in os.listdir(fr_corpus_dir):
            text_file = os.path.join(fr_corpus_dir, text_file)
            files.append(text_file)

    tokenizer = ByteLevelBPETokenizer(add_prefix_space=True)
    tokenizer.pre_tokenizer = Whitespace()

    tokenizer.train(files,
                    vocab_size=20000,
                    min_frequency=2,
                    show_progress=True,
                    special_tokens=["<sos>", "<pad>", "<eos>", "<unk>"])

    print(tokenizer.encode("c'est la meilleure des phrases françaises").tokens)
    tokenizer.save(model_name)
def main(args):
    # set the corpus
    random.seed(42)
    proj_dir = Path()
    tokenizers_dir = proj_dir / "tokenizers"

    if not tokenizers_dir.exists():
        tokenizers_dir.mkdir(parents=True)

    corpus_dir = proj_dir / "corpus"
    comment_dir = corpus_dir / "comment"
    source_path = comment_dir / "20190101_20200611_v2.txt"
    sample_path = comment_dir / "sample.txt"

    # sampling source
    source_io = open(source_path, mode="r", encoding="utf-8")
    sample_io = open(sample_path, mode="w", encoding="utf-8")

    for line in source_io:
        if random.random() > (1 - args.sample_rate):
            sample_io.write(line)

    sample_io.close()
    source_io.close()

    # Initialize a tokenizer
    tokenizer = ByteLevelBPETokenizer(add_prefix_space=False)

    # Customize training
    tokenizer.train(
        files=str(sample_path),
        vocab_size=args.vocab_size,
        min_frequency=args.min_freq,
        show_progress=True,
        special_tokens=["<unk>", "<s>", "</s>", "<pad>", "<mask>"],
    )
    tokenizer.save_model(directory=str(tokenizers_dir))
Example #29
    def Do(self, input_dict: Dict[Text, List[types.Artifact]],
           output_dict: Dict[Text, List[types.Artifact]],
           exec_properties: Dict[Text, Any]) -> None:
        encoding = exec_properties["encoding"]
        text_token_size = exec_properties["text_token_size"]
        end_token = exec_properties["end_token"]
        model_dir = get_single_uri(input_dict["model_dir"])
        merged_text_dir = get_single_uri(input_dict["merged_text_dir"])
        encoding_dir = get_single_uri(output_dict["encoding_dir"])
        logging.info("encoding as: {}".format(encoding))
        logging.info("text token size: {}".format(text_token_size))
        logging.info("end token: {}".format(end_token))
        logging.info("model directory: {}".format(model_dir))
        logging.info("merged text directory: {}".format(merged_text_dir))
        logging.info("encoding directory: {}".format(encoding_dir))

        logging.info("Training BPE Tokenizer")
        tokenizer = ByteLevelBPETokenizer(lowercase=False,
                                          end_of_word_suffix=end_token)
        for (dirpath, _, fnames) in os.walk(merged_text_dir):
            for fname in fnames:
                file_path = os.path.join(dirpath, fname)
                if os.path.isfile(file_path):
                    logging.info("training on {}".format(file_path))
                    tokenizer.train([file_path], vocab_size=text_token_size)

        logging.info("Storing BPE Tokenizer")
        encoder_file, vocab_file = tokenizer.save_model(encoding_dir)
        os.rename(encoder_file, os.path.join(encoding_dir, "encoder.json"))
        os.rename(vocab_file, os.path.join(encoding_dir, "vocab.bpe"))
        # load hparams and store with new value
        with open(os.path.join(model_dir, 'hparams.json')) as f:
            hparams = json.load(f)
        hparams["n_vocab"] = text_token_size
        with open(os.path.join(encoding_dir, "hparams.json"),
                  'w') as json_file:
            json.dump(hparams, json_file)
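The renamed encoder.json / vocab.bpe pair follows the original GPT-2 file naming. A hedged sketch of reloading it with the tokenizers library (the directory is a placeholder for encoding_dir, and end_token is the same suffix assumed during training above):

from tokenizers import ByteLevelBPETokenizer

bpe = ByteLevelBPETokenizer(
    "encoding_dir/encoder.json",   # placeholder: the renamed vocab file
    "encoding_dir/vocab.bpe",      # placeholder: the renamed merges file
    lowercase=False,
    end_of_word_suffix=end_token,  # mirror the suffix used when training
)
print(bpe.encode("some merged text").tokens)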
def train():
    from tokenizers import ByteLevelBPETokenizer
    '''
    Initialize a tokenizer
    '''
    tokenizer = ByteLevelBPETokenizer()
    '''
    Customize training

    Ex: if we need <s> as a start-of-sentence token, </s> as an end-of-sentence token,
    and a <sep> token to separate sub-sentences, we specify them as special tokens.
    These tokens are not broken into subword tokens by the tokenizer.

    '''
    paths = ['data/wiki_data.txt']
    tokenizer.train(
        files=paths,
        vocab_size=40000,
        min_frequency=2,
        special_tokens=["<s>", "<pad>", "</s>", "<unk>", "<mask>", "<sep>"])
    '''
    Save tokenizer
    '''
    tokenizer.save_model("./tok_checkpoints", "tokenizer_model")