Example #1
0
    def __init__(self, evaluate: bool = False):
        """Tokenize every text file under ./data/montecristo/ into self.examples.

        Note: `evaluate` is accepted for interface compatibility but is not
        used by this loader.
        """
        # Byte-level BPE tokenizer built from a pre-trained vocab/merges pair.
        bpe = ByteLevelBPETokenizer(
            "./roberta-lm/vocab.json",
            "./roberta-lm/merges.txt",
        )
        # Post-process each encoding so it is wrapped in <s> ... </s>.
        bpe._tokenizer.post_processor = BertProcessing(
            ("</s>", bpe.token_to_id("</s>")),
            ("<s>", bpe.token_to_id("<s>")),
        )
        bpe.enable_truncation(max_length=512)
        # or use the RobertaTokenizer from `transformers` directly.

        # Token-id sequences, one entry per input line across all files.
        self.examples = []

        for src_file in Path("./data/montecristo/").glob("**/*.txt"):
            print("🔥", src_file)
            text_lines = src_file.read_text(encoding="utf-8").splitlines()
            self.examples.extend(enc.ids for enc in bpe.encode_batch(text_lines))
Example #2
0
    def __init__(self, file_path: str = None, tokenizer_path: str = None):
        """Tokenize the non-blank lines of *file_path* into self.examples.

        Args:
            file_path: text file to load, one example per line.
            tokenizer_path: directory containing vocab.json and merges.txt.
        """
        bpe = ByteLevelBPETokenizer(
            tokenizer_path + "/vocab.json",
            tokenizer_path + "/merges.txt",
        )
        # Wrap every encoded sequence in the <s> ... </s> special tokens.
        bpe._tokenizer.post_processor = BertProcessing(
            ("</s>", bpe.token_to_id("</s>")),
            ("<s>", bpe.token_to_id("<s>")),
        )
        bpe.enable_truncation(max_length=512)

        self.examples = []

        # Drop empty and whitespace-only lines before batch-encoding.
        with open(file_path, encoding="utf-8") as handle:
            kept = [ln for ln in handle.readlines() if ln and not ln.isspace()]
        self.examples += [enc.ids for enc in bpe.encode_batch(kept)]
Example #3
0
    def __init__(self, evaluate=False):
        """Tokenize one data split into self.examples.

        Args:
            evaluate: when True, load the validation split
                (/home/zheng/sde/data/valid.txt); otherwise load
                /home/zheng/sde/data/test.txt.
        """
        tokenizer = ByteLevelBPETokenizer(
            "/home/zheng/sde/previous_small_model/bpe/esperberto_10000size-vocab.json",
            "/home/zheng/sde/previous_small_model/bpe/esperberto_10000size-merges.txt",
        )
        # Wrap every encoded sequence in the <s> ... </s> special tokens.
        tokenizer._tokenizer.post_processor = BertProcessing(
            ("</s>", tokenizer.token_to_id("</s>")),
            ("<s>", tokenizer.token_to_id("<s>")),
        )
        tokenizer.enable_truncation(max_length=512)
        # or use the RobertaTokenizer from `transformers` directly.

        self.examples = []
        if evaluate:
            src_files = ["/home/zheng/sde/data/valid.txt"]
        else:
            src_files = ["/home/zheng/sde/data/test.txt"]

        for src_file in src_files:
            print(src_file)
            # Fix: the original opened the file without ever closing it
            # (handle leak). A context manager guarantees release, and an
            # explicit utf-8 encoding replaces the platform default,
            # matching the sibling dataset loaders in this file.
            with open(src_file, 'r', encoding="utf-8") as f:
                lines = f.readlines()
            self.examples += [x.ids for x in tokenizer.encode_batch(lines)]