Example #1
    def __init__(self, path_to_text_file: str, tokenizer_in: Tokenizer,
                 tokenizer_out: Tokenizer, max_sequence_length: int, sep: str,
                 **kwargs):

        logger.info("Processing file: {}".format(path_to_text_file))

        self.pad_token_in = tokenizer_in.get_vocab()['<PAD>']
        self.pad_token_out = tokenizer_out.get_vocab()['<PAD>']
        self.max_sequence_length = max_sequence_length

        with open(path_to_text_file, "r") as file:
            texts = file.readlines()

        texts = list(map(lambda x: x.split(sep), texts))
        texts = list(map(lambda x: x[0:2], texts))

        self.texts_in = []
        self.texts_in_length = []
        self.texts_out = []

        for i in tqdm(range(len(texts)), desc="Tokenization...."):
            text_in_ids = tokenizer_in.encode(texts[i][0]).ids
            texts_out_ids = tokenizer_out.encode(texts[i][1]).ids

            # Keep only pairs where both sides fit within max_sequence_length
            if (len(text_in_ids) <= max_sequence_length
                    and len(texts_out_ids) <= max_sequence_length):
                self.texts_in.append(text_in_ids)
                self.texts_in_length.append(len(text_in_ids))
                self.texts_out.append(texts_out_ids)

        logger.info("# Texts: {}".format(len(self.texts_in)))
Example #2
    def test_encode_add_special_tokens(self, roberta_files):
        with pytest.deprecated_call():
            tokenizer = Tokenizer(
                BPE(roberta_files["vocab"], roberta_files["merges"]))
        tokenizer.add_special_tokens(["<s>", "</s>"])

        tokenizer.pre_tokenizer = ByteLevel(add_prefix_space=True)
        tokenizer.post_processor = RobertaProcessing(
            ("</s>", tokenizer.token_to_id("</s>")),
            ("<s>", tokenizer.token_to_id("<s>")),
        )

        # Can encode with special tokens
        output_with_specials = tokenizer.encode("My name is John",
                                                add_special_tokens=True)
        assert output_with_specials.tokens == [
            "<s>", "ĠMy", "Ġname", "Ġis", "ĠJohn", "</s>"
        ]

        # Can encode without special tokens
        output_without_specials = tokenizer.encode("My name is John",
                                                   add_special_tokens=False)
        assert output_without_specials.tokens == [
            "ĠMy", "Ġname", "Ġis", "ĠJohn"
        ]
    def test_encode(self):
        tokenizer = Tokenizer(BPE())
        tokenizer.add_tokens(["my", "name", "is", "john", "pair"])

        # Can encode single sequence
        output = tokenizer.encode("my name is john")
        assert output.tokens == ["my", "name", "is", "john"]
        assert type(output.ids) == list
        assert type(output.type_ids) == list
        assert type(output.offsets) == list
        with pytest.warns(DeprecationWarning):
            assert type(output.words) == list
        assert type(output.word_ids) == list
        assert type(output.special_tokens_mask) == list
        assert type(output.attention_mask) == list
        assert type(output.overflowing) == list

        # Can encode a pair of sequences
        output = tokenizer.encode("my name is john", "pair")
        assert output.tokens == ["my", "name", "is", "john", "pair"]
        assert isinstance(pickle.loads(pickle.dumps(output)), Encoding)

        # Can encode a single pre-tokenized sequence
        output = tokenizer.encode(["my", "name", "is", "john"], is_pretokenized=True)
        assert output.tokens == ["my", "name", "is", "john"]

        # Can encode a batch with both a single sequence and a pair of sequences
        output = tokenizer.encode_batch(["my name is john", ("my name is john", "pair")])
        assert len(output) == 2
Example #4
    def test_roberta_parity(self):
        tokenizer = Tokenizer(BPE())
        tokenizer.add_special_tokens(["<s>", "</s>"])
        tokenizer.add_tokens(["my", "name", "is", "john", "pair"])
        tokenizer.post_processor = RobertaProcessing(("</s>", 1), ("<s>", 0))

        original = tokenizer.encode("my name is john", "pair")
        tokenizer.post_processor = self.get_roberta()
        template = tokenizer.encode("my name is john", "pair")
        assert original.ids == template.ids
Example #5
    def test_bert_parity(self):
        tokenizer = Tokenizer(BPE())
        tokenizer.add_special_tokens(["[SEP]", "[CLS]"])
        tokenizer.add_tokens(["my", "name", "is", "john", "pair"])
        tokenizer.post_processor = BertProcessing(("[SEP]", 0), ("[CLS]", 1))

        original = tokenizer.encode("my name", "pair")

        tokenizer.post_processor = self.get_bert()
        template = tokenizer.encode("my name", "pair")
        assert original.ids == template.ids
Example #6
    def test_truncation(self):
        tokenizer = Tokenizer(BPE())
        tokenizer.add_tokens(["my", "name", "is", "john", "pair"])
        tokenizer.enable_truncation(2)

        # Can truncate single sequences
        output = tokenizer.encode("my name is john")
        assert output.tokens == ["my", "name"]

        # Can truncate pair sequences as well
        output = tokenizer.encode("my name is john", "pair")
        assert output.tokens == ["my", "pair"]
Example #7
    def test_processing(self, roberta_files):
        tokenizer = Tokenizer(BPE.from_files(roberta_files["vocab"], roberta_files["merges"]))
        tokenizer.pre_tokenizer = ByteLevelPreTokenizer(add_prefix_space=True)

        # Keeps original offsets
        output = tokenizer.encode("My name is John")
        assert output.tokens == ["ĠMy", "Ġname", "Ġis", "ĠJohn"]
        assert output.offsets == [(0, 2), (2, 7), (7, 10), (10, 15)]

        # Trims offsets when activated
        tokenizer.post_processor = ByteLevel(trim_offsets=True)
        output = tokenizer.encode("My name is John")
        assert output.tokens == ["ĠMy", "Ġname", "Ġis", "ĠJohn"]
        assert output.offsets == [(0, 2), (3, 7), (8, 10), (11, 15)]
    def test_post_process(self):
        tokenizer = Tokenizer(BPE())
        tokenizer.add_tokens(["my", "name", "is", "john", "pair"])
        tokenizer.enable_truncation(2)
        tokenizer.enable_padding(length=4)

        encoding = tokenizer.encode("my name is john")
        pair_encoding = tokenizer.encode("pair")

        # Can post process a single encoding
        output = tokenizer.post_process(encoding)
        assert output.tokens == ["my", "name", "[PAD]", "[PAD]"]

        # Can post process a pair of encodings
        output = tokenizer.post_process(encoding, pair_encoding)
        assert output.tokens == ["my", "pair", "[PAD]", "[PAD]"]
Example #9
def train_tokenizer(args):
    """[summary]

    Arguments:
        args {[dictionary]} -- [arguments객체]
    """

    # Tokenizer train
    morpheme_func = None

    if args.tokenizer.pretokenizer_type == "khaiii":
        api = KhaiiiApi()
        morpheme_func = api.analyze
    elif args.tokenizer.pretokenizer_type == "mecab":
        mecab = Mecab()
        morpheme_func = mecab.morphs

    # tokenizer-type", type=str, choices=["bbpe", "cbpe", "wp"], default="bbpe"
    if args.tokenizer.tokenizer_type == "bbpe":
        # tokenizer = BytelevelBPETokenizer()
        tokenizer = Tokenizer(BPE())
        # tokenizer.pre_tokenizer = BertPreTokenizer()
        trainer = BpeTrainer(
            special_tokens=omegalist_to_list(args.tokenizer.special_tokens),
            vocab_size=args.tokenizer.vocab_size,
            min_frequency=args.tokenizer.min_frequency,
        )
    elif args.tokenizer.tokenizer_type == "cbpe":
        tokenizer = Tokenizer(BPE())
        # The original assigned the class itself; CharDelimiterSplit needs to be
        # instantiated with a delimiter (a space is assumed here)
        tokenizer.pre_tokenizer = CharDelimiterSplit(" ")
        trainer = BpeTrainer(
            special_tokens=omegalist_to_list(args.tokenizer.special_tokens),
            vocab_size=args.tokenizer.vocab_size,
            min_frequency=args.tokenizer.min_frequency,
        )
    elif args.tokenizer.tokenizer_type == "wp":
        tokenizer = Tokenizer(WordPiece())
        # tokenizer.pre_tokenizer = Whitespace
        trainer = WordPieceTrainer(
            special_tokens=omegalist_to_list(args.tokenizer.special_tokens),
            vocab_size=args.tokenizer.vocab_size,
            min_frequency=args.tokenizer.min_frequency,
        )

    tokenizer.train_from_iterator(get_pretokenize_generator(morpheme_func),
                                  trainer=trainer)

    tokenizer.save(f"../vocab/{args.tokenizer.tokenizer_type}.vocab")
    test_string = "안녕하세요 이것은 테스트입니다. 구름은 하늘에 떠 있고 우리는 여기있어"
    output = tokenizer.encode(test_string)
    print(f"output:{output}")
    print(f"tokens:{output.tokens}")
    print(f"ids   :{output.ids}")
    print(f"offset:{output.offsets}")
    print(f"decode:{tokenizer.decode(output.ids)}")

    datasets = get_datasets(args.tokenizer.data_path)

    for line in datasets:
        print(line)
        break
Example #10
def gen(tokenizer_tgt: Tokenizer,
        model: GPT2LMHeadModel,
        device,
        prompt=None,
        n=10,
        tokenizer_eng=None,
        token_id_map=[],
        cfg={}):
    input_ids = None
    if prompt is not None and prompt.strip() != '':
        prompt = prompt.strip()
        if type(tokenizer_tgt) == Tokenizer:
            ids = [model.config.bos_token_id] + tokenizer_tgt.encode(
                prompt, None).ids
        else:
            ids = tokenizer_tgt.encode(prompt)
        input_ids = torch.LongTensor(ids).unsqueeze(0).to(device)

    for _ in range(max(n // 5, 1)):
        m = min(5, n)

        batch_ids = model.generate(input_ids=input_ids,
                                   num_return_sequences=m,
                                   max_length=200,
                                   do_sample=True,
                                   top_k=10,
                                   top_p=0.9,
                                   temperature=2.0,
                                   repetition_penalty=10.0,
                                   num_beams=10,
                                   pad_token_id=cfg['pad_token_id'],
                                   bos_token_id=cfg['bos_token_id'],
                                   eos_token_id=cfg['eos_token_id'],
                                   no_repeat_ngram_size=4)

        for i in range(m):
            ids_tgt = batch_ids[i].flatten().tolist()
            txt_tgt = tokenizer_tgt.decode(ids_tgt,
                                           skip_special_tokens=True).strip()
            if tokenizer_eng is not None:
                ids_eng = [token_id_map[i] for i in ids_tgt if i not in [1, 2]]
                txt_eng = tokenizer_eng.decode(
                    ids_eng, skip_special_tokens=True).strip()
                yield txt_tgt, txt_eng
                continue
            yield txt_tgt
def main():
    # See all possible arguments in src/transformers/training_args.py
    # or by passing the --help flag to this script.
    # We now keep distinct sets of args, for a cleaner separation of concerns.

    parser = HfArgumentParser((DataTrainingArguments, CustomOthersArguments))

    (data_args, custom_args) = parser.parse_args_into_dataclasses()

    train_files = list(sorted(glob.glob(f'{data_args.train_dir}/*.{data_args.ext}')))
    validation_files = list(sorted(glob.glob(f'{data_args.eval_dir}/*.{data_args.ext}')))

    additional_special_tokens = ADDITIONAL_SPECIAL_TOKENS

    pre_tokenizer_func = PRE_TOKENIZERS_MAP.get(custom_args.pre_tokenizer_type, None)
    if pre_tokenizer_func is None:
        raise NotImplementedError
    elif custom_args.pre_tokenizer_type == 'sefr_cut':
        raise ValueError('sefr_cut is slow; use fake_sefr_cut with sefr_cut_pre_tokenizer instead')

    if not os.path.exists(custom_args.output_file) or custom_args.overwrite_output_file:
        trainer = WordLevelTrainer(pre_tokenize_func=pre_tokenizer_func,
                                   vocab_size=custom_args.vocab_size,
                                   vocab_min_freq=custom_args.vocab_min_freq,
                                   input_files=train_files,
                                   additional_special_tokens=additional_special_tokens)
        trainer.count_parallel()
        trainer.save_vocab(custom_args.output_file)
    if custom_args.pre_tokenizer_type == 'fake_sefr_cut':
        custom_pre_tokenizer = pre_tokenizers.PreTokenizer.custom(
            FakeSefrCustomTokenizer(PRE_TOKENIZERS_MAP['fake_sefr_cut_keep_split_token']))
    else:
        custom_pre_tokenizer = pre_tokenizers.PreTokenizer.custom(
            CustomPreTokenizer(pre_tokenizer_func))
    tokenizer = Tokenizer(models.WordLevel.from_file(custom_args.output_file, unk_token='<unk>'))
    tokenizer.pre_tokenizer = custom_pre_tokenizer

    if custom_args.debug:
        print('Tokenize following text.')
        texts = ['<s>โรนัลโดเขาได้เล่นกับทีม</s>', 'โปรตุเกสมีโรนัลโด',
                 'โรนัลโดเขาได้เล่นกับทีม\nโปรตุเกสมีโรนัลโด']
        ids = [e.ids for e in tokenizer.encode_batch(texts)]
        decoded_texts = tokenizer.decode_batch(ids)
        decoded_texts = [text.replace(' ', '') for text in decoded_texts]
        for text, i, decoded_text in zip(texts, ids, decoded_texts):
            print('Text: ', text, '>>', 'Tokenized: ', i, '>>', 'Decoded: ', decoded_text)
        with open(validation_files[0], 'r') as f:
            while True:
                line = f.readline()
                if line:
                    line = line.strip()
                    if len(line) > 0 and not line.isspace():
                        encoded = tokenizer.encode(line)
                        decoded = tokenizer.decode(encoded.ids).replace(' ', '')
                        print('Text: ', line, '>>', encoded.ids, '>>', decoded)
                else:
                    break
Example #12
    def test_processing(self):
        tokenizer = Tokenizer(BPE())
        tokenizer.add_special_tokens(["<s>", "</s>"])
        tokenizer.add_tokens(["my", "name", "is", "john", "pair"])
        tokenizer.post_processor = RobertaProcessing(("</s>", 1), ("<s>", 0))

        output = tokenizer.encode("my name", "pair")
        assert output.tokens == ["<s>", "my", "name", "</s>", "</s>", "pair", "</s>"]
        assert output.ids == [0, 2, 3, 1, 1, 6, 1]
Example #13
    def test_processing(self):
        tokenizer = Tokenizer(BPE())
        tokenizer.add_special_tokens(["[SEP]", "[CLS]"])
        tokenizer.add_tokens(["my", "name", "is", "john", "pair"])
        tokenizer.post_processor = BertProcessing(("[SEP]", 0), ("[CLS]", 1))

        output = tokenizer.encode("my name", "pair")
        assert output.tokens == ["[CLS]", "my", "name", "[SEP]", "pair", "[SEP]"]
        assert output.ids == [1, 2, 3, 0, 6, 0]
Example #14
    def test_padding(self):
        tokenizer = Tokenizer(BPE())
        tokenizer.add_tokens(["my", "name", "is", "john", "pair"])

        # By default it does nothing when encoding single sequence
        tokenizer.enable_padding()
        output = tokenizer.encode("my name")
        assert output.tokens == ["my", "name"]

        # Can pad to the longest in a batch
        output = tokenizer.encode_batch(["my name", "my name is john"])
        assert all([len(encoding) == 4 for encoding in output])

        # Can pad to the specified max length otherwise
        tokenizer.enable_padding(max_length=4)
        output = tokenizer.encode("my name")
        assert output.tokens == ["my", "name", "[PAD]", "[PAD]"]
        output = tokenizer.encode("my name", "pair")
        assert output.tokens == ["my", "name", "pair", "[PAD]"]
Example #15
class TextDataset(Dataset):
    def __init__(
        self,
        path_src,
        path_tgt,
        path_tokenizer,
        path_root: Optional[str] = '',
    ):
        self.path_src = path_root + path_src
        self.path_tgt = path_root + path_tgt
        self.len = 0
        self.max_len = 512

        self.tokenizer = Tokenizer(
            BPE(
                path_root + path_tokenizer + 'vocab.json',
                path_root + path_tokenizer + 'merges.txt',
            ))
        self.tokenizer.normalizer = Sequence([NFKC(), Lowercase()])

        with open(self.path_src, 'r+') as f:
            lines_src = f.readlines()

        with open(self.path_tgt, 'r+') as f:
            lines_tgt = f.readlines()

        self.len = len(lines_src)
        self.example = list(zip(lines_src, lines_tgt))

    def _encode(self, src_line, tgt_line):
        src = self.tokenizer.encode(str(src_line)).ids
        tgt = self.tokenizer.encode(str(tgt_line)).ids

        if len(src) > self.max_len:
            self.max_len = len(src)

        if len(tgt) > self.max_len:
            self.max_len = len(tgt)

        return torch.tensor(src), torch.tensor(tgt), len(src), len(tgt)

    def __len__(self):
        return self.len

    def __getitem__(self, i):
        return self._encode(*self.example[i])

    @staticmethod
    def pad_collate(batch):
        (x, y, x_len, y_len) = zip(*batch)

        x_pad = pad_sequence(x, batch_first=True, padding_value=0)
        y_pad = pad_sequence(y, batch_first=True, padding_value=0)

        return x_pad, y_pad, x_len, y_len
Example #16
def tokenize_corpus(src_dir, dst_dir, tokenizer: Tokenizer, force):
    for i, doc_path in enumerate(sorted(src_dir.glob('*.txt')), start=1):
        cat = doc_path.name.replace('.txt', '')
        dst_path = dst_dir / f'{cat}.npy'
        print(f'[{i:>3,}] {cat} ({dst_path})')

        if dst_path.exists() and not force:
            print(f' > destination path {dst_path} already exists. skipping')
            continue

        token_ids = []

        print(f' > reading {doc_path}')
        if cat == 'full':
            n_lines = 500_000
            print(f'reading in chunks of {n_lines:,} lines')

            with open(doc_path) as f:
                lines = []
                for line in f:
                    lines.append(line)

                    if len(lines) >= n_lines:
                        print(f' > tokenizing {len(lines):,} lines')
                        token_ids.extend(tokenizer.encode(''.join(lines)).ids)
                        lines = []

            if len(lines) > 0:
                print(f' > tokenizing {len(lines):,}')
                token_ids.extend(tokenizer.encode(''.join(lines)).ids)

            token_ids = np.array(token_ids)
        else:
            with open(doc_path) as f:
                txt = f.read()

            print(' > tokenizing')
            token_ids = np.array(tokenizer.encode(txt).ids)

        print(f' > saving to {dst_path}')
        np.save(dst_path, token_ids)
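
A brief usage sketch for tokenize_corpus above; the directory paths and the tokenizer file name are illustrative assumptions, not taken from the original script.

from pathlib import Path
from tokenizers import Tokenizer

tok = Tokenizer.from_file('tokenizer.json')          # assumed tokenizer file
tokenize_corpus(Path('corpus/raw'), Path('corpus/tokenized'), tok, force=False)
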
Example #17
class BPETokenizer(object):
    def __init__(self,
                 vocab_size=25000,
                 min_freq=5,
                 lang="en",
                 files=[None, None]) -> None:
        """

        Args:
            vocab_size: (int)
            min_freq: minimum frequency
            lang: (str) language tag, used to name the saved model path
            files: (List[str]) ["vocab.json", "merge.txt"]
        """
        super(BPETokenizer, self).__init__()

        self.tokenizer = Tokenizer(BPE(files[0], files[1]))

        self.lang = lang
        self.trainer = BpeTrainer(vocab_size=vocab_size,
                                  min_frequency=min_freq,
                                  special_tokens=["[PAD]", "[SEP]"],
                                  initial_alphabet=ByteLevel.alphabet())

        # https://huggingface.co/docs/tokenizers/python/latest/components.html#normalizers
        self.tokenizer.normalizer = Sequence([NFKC(), Lowercase()])
        # https://huggingface.co/docs/tokenizers/python/latest/components.html#pre-tokenizers
        self.tokenizer.pre_tokenizer = ByteLevel()
        self.tokenizer.decoder = ByteLevelDecoder()

    def train(self, files=None) -> None:

        if files is None:
            # files looks like: ["test.txt", "train.txt", "valid.txt"]
            files = [
                f"data/wikitext-103-raw/wiki.{split}.raw"
                for split in ["test", "train", "valid"]
            ]

        self.tokenizer.train(files, self.trainer)

    def save(self) -> None:

        self.tokenizer.model.save(f"data/tokenizer/{self.lang}")

    def encode(self, input: Union[str, List[str], Tuple[str]]) -> Encoding:

        return self.tokenizer.encode(input)

    def decode(self, input: Encoding) -> str:

        # Note that type(input) == Encoding
        return self.tokenizer.decode(input.ids)
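
A minimal usage sketch for the BPETokenizer wrapper above, assuming the default wikitext-103-raw files it points to are available; the sample text is illustrative only.

bpe = BPETokenizer(vocab_size=5000, min_freq=2, lang="en")
bpe.train()                              # trains on the default wikitext files
encoding = bpe.encode("Hello world")     # returns a tokenizers Encoding
print(encoding.tokens, encoding.ids)
print(bpe.decode(encoding))              # byte-level decode back to text
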
Example #18
    def __init__(self,
                 tokenizer: Tokenizer,
                 args,
                 file_paths: str,
                 block_size=512):
        assert all([os.path.isfile(file_path) for file_path in file_paths])

        block_size = block_size - 2  # Reduce by 2 to account for [CLS] and [SEP] tokens

        directory, filename = os.path.split(file_paths[0])
        cached_features_file = os.path.join(
            directory, args.model_type + "_cached_lm_" + str(block_size) +
            "_" + Path(filename).stem)

        if os.path.exists(cached_features_file) and not args.overwrite_cache:
            logger.info("Loading features from cached file %s",
                        cached_features_file)
            with open(cached_features_file, "rb") as handle:
                self.examples = pickle.load(handle)
        else:
            logger.info("Reading dataset at %s", file_paths)
            text = []
            for file_path in file_paths:
                with open(file_path, encoding="utf-8") as f:
                    text += f.readlines()

            logger.info("Creating features from dataset file at %s", directory)
            # Get all token IDs except [CLS] and [SEP] and flat map IDs
            tokenized_text = [
                t for tokenized in tokenizer.encode_batch(text)
                for t in tokenized.ids[1:-1]
            ]
            cls_token, sep_token = tokenizer.encode('').ids

            self.examples = []
            for i in range(0,
                           len(tokenized_text) - block_size + 1,
                           block_size):  # Truncate in block of block_size
                self.examples.append([cls_token] +
                                     tokenized_text[i:i + block_size] +
                                     [sep_token])
            # Note that we are losing the last truncated example here for the sake of simplicity (no padding)
            # If your dataset is small, first you should look for a bigger one :-) and second you
            # can change this behavior by adding (model specific) padding.

            logger.info("Saving features into cached file %s",
                        cached_features_file)
            Path(cached_features_file).parent.mkdir(exist_ok=True,
                                                    parents=True)
            with open(cached_features_file, "wb") as handle:
                pickle.dump(self.examples,
                            handle,
                            protocol=pickle.HIGHEST_PROTOCOL)
Example #19
    def test_encode(self):
        tokenizer = Tokenizer(BPE())
        tokenizer.add_tokens(["my", "name", "is", "john", "pair"])

        # Can encode single sequence
        output = tokenizer.encode("my name is john")
        assert output.tokens == ["my", "name", "is", "john"]
        assert type(output.ids) == list
        assert type(output.type_ids) == list
        assert type(output.offsets) == list
        assert type(output.words) == list
        assert type(output.special_tokens_mask) == list
        assert type(output.attention_mask) == list
        assert type(output.overflowing) == list

        # Can encode a pair of sequences
        output = tokenizer.encode("my name is john", "pair")
        assert output.tokens == ["my", "name", "is", "john", "pair"]

        # Can encode a batch with both a single sequence and a pair of sequences
        output = tokenizer.encode_batch(["my name is john", ("my name is john", "pair")])
        assert len(output) == 2
Example #20
    def test_train_with_special_tokens(self):
        filename = "tests/data/dummy-unigram-special_tokens-train.txt"
        with open(filename, "w") as f:
            f.write(
                """
[CLS] The Zen of Python, by Tim Peters [SEP]
[CLS] Beautiful is better than ugly. [SEP]
[CLS] Explicit is better than implicit. [SEP]
[CLS] Simple is better than complex. [SEP]
[CLS] Complex is better than complicated. [SEP]
[CLS] Flat is better than nested. [SEP]
[CLS] Sparse is better than dense. [SEP]
[CLS] Readability counts. [SEP]
[CLS] Special cases aren't special enough to break the rules. [SEP]
[CLS] Although practicality beats purity. [SEP]
[CLS] Errors should never pass silently. [SEP]
[CLS] Unless explicitly silenced. [SEP]
[CLS] In the face of ambiguity, refuse the temptation to guess. [SEP]
[CLS] There should be one-- and preferably only one --obvious way to do it. [SEP]
[CLS] Although that way may not be obvious at first unless you're Dutch. [SEP]
[CLS] Now is better than never. [SEP]
[CLS] Although never is often better than *right* now. [SEP]
[CLS] If the implementation is hard to explain, it's a bad idea. [SEP]
[CLS] If the implementation is easy to explain, it may be a good idea. [SEP]
[CLS] Namespaces are one honking great idea -- let's do more of those! [SEP]
            """
            )

        tokenizer = Tokenizer(models.Unigram())
        trainer = trainers.UnigramTrainer(
            show_progress=False, special_tokens=["[PAD]", "[SEP]", "[CLS]"], unk_token="[UNK]"
        )

        tokenizer.train([filename], trainer=trainer)

        assert tokenizer.encode("[CLS] This is a test [SEP]").tokens == [
            "[CLS]",
            " T",
            "h",
            "i",
            "s",
            " is ",
            "a",
            " ",
            "te",
            "s",
            "t ",
            "[SEP]",
        ]
    def test_truncation(self):
        tokenizer = Tokenizer(BPE())
        tokenizer.add_tokens(["my", "name", "is", "john", "pair"])
        tokenizer.enable_truncation(2)

        # Can truncate single sequences
        output = tokenizer.encode("my name is john")
        assert output.tokens == ["my", "name"]

        # Can truncate pair sequences as well
        output = tokenizer.encode("my name is john", "pair")
        assert output.tokens == ["my", "pair"]

        # Can get the params and give them to enable_truncation
        trunc = tokenizer.truncation
        tokenizer.enable_truncation(**trunc)

        # Left truncation direction
        tokenizer.enable_truncation(2, direction="left")
        output = tokenizer.encode("my name is john")
        assert output.tokens == ["is", "john"]

        output = tokenizer.encode("my name is john", "pair")
        assert output.tokens == ["john", "pair"]
Example #22
def wordpiece_tokenize(line):
    tokenizer = Tokenizer(WordPiece(wordpiece_dict3))
    tokenizer.enable_padding(length=200)
    tokenizer.enable_truncation(max_length=200)
    tokenizer.pre_tokenizer = WhitespaceSplit()
    tokenizer.post_processor = TemplateProcessing(
        single="[CLS] $A [SEP]",
        pair="[CLS] $A [SEP] $B:1 [SEP]:1",
        special_tokens=[
            ("[CLS]", 1),
            ("[SEP]", 2),
        ],
    )
    output = tokenizer.encode(line)
    return (output.ids)
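
A self-contained sketch (not from the original example) of what the TemplateProcessing template above does; the tiny WordLevel vocabulary and its token ids are illustrative assumptions.

from tokenizers import Tokenizer
from tokenizers.models import WordLevel
from tokenizers.pre_tokenizers import WhitespaceSplit
from tokenizers.processors import TemplateProcessing

demo_vocab = {"[UNK]": 0, "[CLS]": 1, "[SEP]": 2, "hello": 3, "world": 4}
demo_tok = Tokenizer(WordLevel(demo_vocab, unk_token="[UNK]"))
demo_tok.pre_tokenizer = WhitespaceSplit()
demo_tok.post_processor = TemplateProcessing(
    single="[CLS] $A [SEP]",
    pair="[CLS] $A [SEP] $B:1 [SEP]:1",
    special_tokens=[("[CLS]", 1), ("[SEP]", 2)],
)
print(demo_tok.encode("hello world").tokens)
# ['[CLS]', 'hello', 'world', '[SEP]']
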
def main(args):
    if args.do_train:
        # Initialize a tokenizer
        files = get_smi_files(args.training_files)
        print("Training BPE tokenizer using the following files:{}".format(
            files))
        tokenizer = Tokenizer(models.BPE(unk_token="<unk>"))
        tokenizer.enable_padding(pad_id=args.vocab_size + 2,
                                 pad_token="<pad>",
                                 length=args.pad_len)
        tokenizer.enable_truncation(max_length=args.pad_len,
                                    strategy='only_first')
        tokenizer.normalizer = Sequence([NFKC()])
        tokenizer.pre_tokenizer = pre_tokenizers.ByteLevel(
            add_prefix_space=False)
        tokenizer.decoder = decoders.ByteLevel()
        tokenizer.post_processor = processors.ByteLevel(trim_offsets=True)
        # Train the tokenizer
        trainer = trainers.BpeTrainer(show_progress=True,
                                      vocab_size=args.vocab_size,
                                      min_frequency=args.min_frequency)
        tokenizer.train(files, trainer=trainer)
        tokenizer.add_tokens(["<start>", "<end>"])
        tokenizer.save(os.path.join('tokenizers', args.tokenizer_name),
                       pretty=True)
        print("Trained vocab size: {}".format(tokenizer.get_vocab_size()))

    if args.do_test:
        # Test the tokenizer
        tokenizer = Tokenizer.from_file(
            os.path.join('tokenizers', args.tokenizer_name))
        print("Testing with SMILES String: {}".format(args.test_string))
        encoding = tokenizer.encode(args.test_string)
        print("Encoded string: {}".format(encoding.tokens))
        print(encoding.ids)
        decoded = tokenizer.decode(encoding.ids)
        print("Decoded string: {}".format(decoded))
Example #24
class LitTokenizer:
    def __init__(self,
                 padding=False,
                 truncation=False,
                 max_length=None,
                 lower=False,
                 lang=None):
        super().__init__()
        self.UNK_WORD = '[UNK]'
        self.PAD_WORD = '[PAD]'
        self.MASK_WORD = '[MASK]'
        self.SOS_WORD = '[SOS]'
        self.EOS_WORD = '[EOS]'
        self.special_tokens = [
            self.UNK_WORD, self.PAD_WORD, self.MASK_WORD, self.SOS_WORD,
            self.EOS_WORD
        ]

        # Define tokenizer
        self.tokenizer = None
        self.configure_tokenizers(padding, truncation, max_length, lower)

        # Other
        self.lang = lang

    def get_vocab_size(self):
        return self.tokenizer.get_vocab_size()

    def configure_tokenizers(self, padding, truncation, max_length, lower):
        # Settings
        pad_length = None
        if padding in {True, "longest"}:
            pass
        elif padding in {"max_length"}:
            pad_length = max_length
        elif padding in {False, "do_not_pad"}:
            pass
        else:
            raise ValueError("Unknown padding type")

        # SRC tokenizer
        tok_normalizers = [NFD(), Strip()]
        if lower:
            tok_normalizers += [Lowercase()]

        self.tokenizer = Tokenizer(tok_model())  # unk_token=... not working
        self.tokenizer.add_special_tokens(self.special_tokens)
        self.tokenizer.pre_tokenizer = pre_tokenizers.Sequence(
            [WhitespaceSplit()])
        self.tokenizer.normalizer = normalizers.Sequence(
            tok_normalizers)  # StripAccents requires NFD
        self.tokenizer.decoder = tok_decoder()

        # Define template (Needed for the sos/eos tokens)
        basic_template = TemplateProcessing(
            single=f"{self.SOS_WORD} $A {self.EOS_WORD}",
            pair=
            f"{self.SOS_WORD} $A {self.EOS_WORD} {self.SOS_WORD} $B {self.EOS_WORD}",
            special_tokens=[
                (self.SOS_WORD, self.tokenizer.token_to_id(self.SOS_WORD)),
                (self.EOS_WORD, self.tokenizer.token_to_id(self.EOS_WORD))
            ],
        )
        self.tokenizer.post_processor = basic_template

        if padding:
            self.tokenizer.enable_padding(pad_id=self.tokenizer.token_to_id(
                self.PAD_WORD),
                                          pad_token=self.PAD_WORD,
                                          length=pad_length)
        if truncation:
            self.tokenizer.enable_truncation(max_length,
                                             stride=0,
                                             strategy='longest_first')

    def load_vocab(self, vocab, merges):
        vocab, merges = tok_model.read_file(vocab, merges)
        self.tokenizer.model = tok_model(vocab, merges)

    def train_vocab(self, files, vocab_size=32000, min_frequency=3):
        # Train trainer
        trainer = tok_trainer(vocab_size=vocab_size,
                              min_frequency=min_frequency)
        self.tokenizer.train(files, trainer)

    def save_vocab(self, output_dir, prefix):
        self.tokenizer.model.save(output_dir, prefix)

    def pad(self, examples, keys=None):
        pad_idx = self.special_tokens.index(self.PAD_WORD)

        # Keys to modify
        if not keys:
            keys = list(examples[0].keys())

        d = {}
        for k in keys:
            # Collect same-type items (list of IDs, list of masks,...)
            d[k] = [x[k] for x in examples]

            # Get max length (value to pad)
            max_length = max([x.shape[-1] for x in d[k]])

            # Apply padding
            for i, x in enumerate(examples):
                unpadded_t = x[k]
                if k == "ids":
                    tmp = torch.full((max_length, ),
                                     fill_value=pad_idx,
                                     device=unpadded_t.device)  # All padding
                elif k == "attention_mask":
                    tmp = torch.full(
                        (max_length, ), fill_value=0,
                        device=unpadded_t.device)  # No attention mask
                else:
                    raise TypeError("Unknown key")
                tmp[:unpadded_t.shape[-1]] = unpadded_t
                d[k][i] = tmp
        return d

    def encode(self, x):
        return self.tokenizer.encode(x)

    def decode(self, x):
        if isinstance(x, torch.Tensor):
            assert len(x.shape) == 2
            x = x.detach().cpu().numpy()
        return [self.tokenizer.decode(x_i) for x_i in x]
Example #25
char_map = {k: v + 1 for k, v in bpe_vocab.items() if len(k) == 1}
print(f"Char map size: {len(char_map)}\n")

MAX_LEN_OF_WORD = max([len(w) for w in bpe_vocab])
print(f"Max length of word: {MAX_LEN_OF_WORD}\n")

if ZERO_PAD:
    word_map = {
        k: [char_map[c] for c in k] + [0] * (MAX_LEN_OF_WORD - len(k))
        for k in bpe_vocab
    }
else:
    word_map = {k: [char_map[c] for c in k] for k in bpe_vocab}

name_bpe_words = {n: tokenizer.encode(w).tokens for n, w in name_words.items()}
MAX_LEN_OF_MENTION = max([len(v) for v in name_bpe_words.values()])
print(f"Max length of names: {MAX_LEN_OF_MENTION}\n")

gen_data = GenData(name_bpe_words, word_map, bpe_vocab)
data_sets = gen_data(data)
# train_data, test_data, dev_data = [
#     dict(dt) for dt in list(gen_data(data).values())]
train_data, test_data, dev_data = [
    data_sets[k] for k in ['train', "test", "dev"]
]

global_var = {
    "MAX_LEN_OF_MENTION": MAX_LEN_OF_MENTION,
    "MAX_LEN_OF_WORD": MAX_LEN_OF_WORD
}
Example #26
class MLMPreprocessor:
    def __init__(
        self,
        load_from: str = None,
        vocab_size: int = 10000,
        max_example_len: int = 128,
        batch_size: int = 16,
        num_stopwords: int = 250,
        mask_output_len: int = 4,
    ):
        self.char_dict: Dict[str, int] = {}
        self.char_rev: Dict[int, str] = {}
        self.token_dict: Dict[str, int] = {}
        self.token_rev: Dict[int, str] = {}
        self.vocab_size = vocab_size
        self.max_example_len = max_example_len
        self.batch_size = batch_size
        self.num_stopwords = num_stopwords
        self.mask_output_len = mask_output_len
        self.tokenizer_fit = False
        self.tokenizer = Tokenizer(BPE(unk_token="[UNK]"))
        self.tokenizer.pre_tokenizer = Whitespace()
        self.tokenizer.normalizer = Sequence(
            [NFD(), Lowercase(), StripAccents()])
        self.tok_trainer = BpeTrainer(special_tokens=["[UNK]", "[MASK]"],
                                      vocab_size=self.vocab_size)
        if load_from:
            self._load(load_from)

    def fit(self,
            data: List[str],
            min_char_freq: int = 1,
            progbar: bool = True):
        """
        Create a character-level dictionary based on a list of strings
        """
        if not self.tokenizer_fit:
            self.tokenizer.train_from_iterator(data, trainer=self.tok_trainer)
        char_counter: Counter = Counter()
        token_counter: Counter = Counter()
        iterator_: Iterable = data
        if progbar:
            iterator_ = tqdm(data)
        for example in iterator_:
            chars = Counter(example)
            for char, char_count in chars.items():
                try:
                    char_counter[char] += char_count
                except KeyError:
                    char_counter[char] = char_count

        counts = [k for k, v in char_counter.items() if v >= min_char_freq]
        self.char_rev = {0: "", 1: "?", 2: "?", 3: ""}

        for c in sorted(counts):
            n = len(self.char_rev)
            self.char_rev[n] = c
            self.char_dict[c] = n

    def scrub(self, token):
        """
        Normalize a token by removing punctuation. Used to build a vocabulary
        and to choose tokens to mask during pretraining.
        """
        token = token.lower()
        while len(token) > 0 and token[0] in PUNCT:
            token = token[1:]
        while len(token) > 0 and token[-1] in PUNCT:
            token = token[:-1]
        token = re.sub(r"\d", "#", token)
        return token

    def tokenize(self, string_to_tokenize: str) -> Encoding:
        return self.tokenizer.encode(string_to_tokenize)

    def string_to_array(self, string_in, length, padding_pre=False):
        # truncate
        if padding_pre:
            s = string_in[-length:]
        else:
            s = string_in[:length]
        # map char -> int and left-zero-pad
        mapped = np.ones((len(s)))
        for n, char in enumerate(s):
            try:
                mapped[n] = self.char_dict[char]
            except KeyError:
                pass
        # mapped = [self.char_dict.get(x, 1) for x in s]
        if padding_pre:
            r = np.pad(mapped, (length - len(s), 0),
                       "constant",
                       constant_values=(0, 0))
        else:
            r = np.pad(mapped, (0, length - len(s)),
                       "constant",
                       constant_values=(0, 0))
        return r

    def string_to_example(self,
                          example,
                          return_example=False,
                          allow_null_examples=False):
        # simple tokenization
        sp = [tok.strip() for tok in example.split(" ") if tok.strip() != ""]
        # normalize to see what we can replace
        normed = [self.scrub(tok) for tok in sp]
        # see which tokens are in the vocabulary
        replaceable_tokens = [(t, i) for i, t in enumerate(normed)
                              if t in self.token_dict]
        assert (
            len(sp) >= 2
        ), "minimum length of an example is 2 tokens (white-space delimited)"
        assert (
            len(replaceable_tokens) > 0 or allow_null_examples
        ), f"called string_to_example on string with no tokens that are in the vocabulary and allow_null_examples=True\n{example}"
        if len(replaceable_tokens) == 0 and allow_null_examples:
            return None
        # choose a token to replace
        rep_ind = np.random.randint(0, len(replaceable_tokens))

        rep, rep_ind = replaceable_tokens[rep_ind]

        # get the index of the token for the output
        mask_ind = self.token_dict[rep]
        label_array = np.zeros(self.vocab_size)
        label_array[mask_ind] = 1

        # piece the masked input back together
        left = " ".join(sp[:rep_ind])
        right = " ".join(sp[rep_ind + 1:]) if len(sp) > rep_ind + 1 else ""
        left_len = len(left)
        right_len = len(right)
        thresh = (self.max_example_len - self.mask_output_len - 2) // 2
        right_diff = thresh - left_len if left_len < thresh else 0
        left_diff = thresh - right_len if right_len < thresh else 0
        left_sub = left[-(thresh + left_diff):]
        right_sub = right[:(thresh + right_diff)]
        combo = left_sub + " " + "?" * 4 + " " + right_sub
        encoded = self.string_to_array(combo, self.max_example_len)
        ret_val = [encoded, label_array]
        if return_example:
            ret_val += [combo]
        return ret_val

    def strings_to_examples(self, strings):
        enc = np.zeros((len(strings), self.max_example_len))
        labels = np.zeros((len(strings), self.vocab_size))
        for n in range(len(strings)):
            enc[n], labels[n] = self.string_to_example(strings[n])
        return [enc, labels]

    def examples_generator(self, strings):
        assert len(strings) >= 1
        while True:
            ind = 0
            while ind < len(strings):
                yield self.strings_to_examples(strings[ind:ind +
                                                       self.batch_size])

                ind += self.batch_size

    def save(self, path):
        """
        Write a Preprocessor object to a .JSON config
        """
        config = {
            "char_rev": self.char_rev,
            "char_dict": self.char_dict,
            "max_example_len": self.max_example_len,
        }
        with open(os.path.join(path, "cclm_config.json"), "w") as f:
            json.dump(config, f)

    def _load(self, path):
        """
        Load a Preprocessor object from disk
        """
        with open(path, "rb") as f:
            result = json.load(f)
        for key, value in result.items():
            setattr(self, key, value)
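
A brief usage sketch for the MLMPreprocessor above; the toy corpus and the parameter values are illustrative assumptions, not part of the original code.

pp = MLMPreprocessor(vocab_size=500, max_example_len=64)
pp.fit(["the quick brown fox jumps over the lazy dog",
        "pack my box with five dozen liquor jugs"], progbar=False)
enc = pp.tokenize("the quick fox")               # a tokenizers Encoding
arr = pp.string_to_array("the quick fox", 64)    # char-level int array, length 64
print(enc.tokens, arr.shape)
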
Example #27
# https://github.com/huggingface/tokenizers/tree/master/bindings/python#train-a-new-tokenizer
from tokenizers import Tokenizer, models, pre_tokenizers, decoders, trainers, processors
# Initialize a tokenizer
tokenizer = Tokenizer(models.BPE())
# Customize pre-tokenization and decoding
tokenizer.pre_tokenizer = pre_tokenizers.ByteLevel(add_prefix_space=False)

# TODO True

tokenizer.decoder = decoders.ByteLevel()
tokenizer.post_processor = processors.ByteLevel(trim_offsets=True)
trainer = trainers.BpeTrainer(vocab_size=10000, min_frequency=2)
tokenizer.train(['bar'], trainer=trainer)

encoded = tokenizer.encode(seq)
print(encoded.tokens)

# TODO: Use a clustered set of proteins like UniRef50
# -- https://www.uniprot.org/help/uniref

# TODO: Use an LSTM to train on sequences, then freeze early layers and add
# classification backend, retrain.

# https://github.com/huggingface/tokenizers/tree/master/bindings/python
# https://github.com/huggingface/tokenizers/tree/master/bindings/python#provided-tokenizers
from tokenizers import CharBPETokenizer

tokenizer = CharBPETokenizer(bert_normalizer=False)
tokenizer.train(['./bar'], vocab_size=1000, min_frequency=2)
# tokenizer.encode(seq).tokens
Example #28
class SentencePieceBPETokenizer:
    """Custom SentencePiece tokenizer"""
    unk_token = '<unk>'
    pad_token = '<pad>'

    def __init__(self,
                 vocab: Dict[str, int] = None,
                 merges: List[Tuple[str, str]] = None,
                 dropout: float = None,
                 max_length: Optional[int] = 64) -> None:
        """Constructor

        Args:
            vocab (Dict[str, int]): A dictionary of string keys and their ids.
            merges (List[Tuple[str, str]]): A list of pairs of tokens.
            dropout (float): BPE dropout
            max_length (int, optional): The max length at which to truncate.
                Defaults to `64`.
        """
        self.tokenizer = Tokenizer(
            BPE(vocab, merges, dropout=dropout, unk_token=self.unk_token))
        self.tokenizer.normalizer = BertNormalizer()  # noqa
        self.tokenizer.pre_tokenizer = pre_tokenizers.Metaspace()  # noqa
        self.tokenizer.decoder = decoders.Metaspace()  # noqa
        self.tokenizer.add_special_tokens([self.pad_token, self.unk_token])

        self.tokenizer.enable_padding(pad_token=self.pad_token)
        self.tokenizer.enable_truncation(max_length)

    @classmethod
    def train(cls,
              dataset: Sequence[str],
              vocab_size: int = 1000,
              min_frequency: int = 2,
              dropout: float = 0.0,
              max_length: Optional[int] = 64) -> 'SentencePieceBPETokenizer':
        instance = cls(dropout=dropout, max_length=max_length)
        trainer = trainers.BpeTrainer(
            vocab_size=vocab_size,
            min_frequency=min_frequency,
            special_tokens=[cls.pad_token, cls.unk_token])
        instance.tokenizer.train_from_iterator(dataset, trainer=trainer)
        instance.tokenizer.model.dropout = None
        return instance

    @property
    def vocab_size(self):
        return len(self.tokenizer.get_vocab())

    def serialize(self):
        return self.tokenizer.to_str()

    @classmethod
    def deserialize(cls, s: str) -> 'SentencePieceBPETokenizer':
        tokenizer = cls()
        tokenizer.tokenizer = Tokenizer.from_str(s)
        return tokenizer

    def encode(self, text: str) -> Dict[str, Any]:
        encoding = self.tokenizer.encode(text)
        outputs = {
            'ids': torch.tensor(encoding.ids),
            'mask': torch.tensor(encoding.attention_mask),
            'spans': encoding.offsets,
        }
        return outputs

    def encode_batch(self, batch: List[str]):
        encodings = self.tokenizer.encode_batch(batch)
        outputs = {
            'ids': torch.tensor([e.ids for e in encodings]),
            'mask': torch.tensor([e.attention_mask for e in encodings]),
            'spans': [e.offsets for e in encodings],
        }
        return outputs
        # instantiate trainer
        trainer = BpeTrainer(
            special_tokens=["[UNK]", "[CLS]", "[SEP]", "[PAD]", "[MASK]"],
            min_frequency=2)

        # get files
        files = [os.path.join(args.data_dir, f"{f}-sentences.txt")]

        # train tokenizer
        tokenizer.train(files=files, trainer=trainer)

        # save tokenizer config file
        tokenizer.save(os.path.join(args.save_dir, f"tokenizer-{f}.json"))

    # load trained tokenizers
    for f in ['ewe-fon', "ewe", "fon"]:
        print(f'Using {f} tokenizer : \n')
        try:
            tokenizer = Tokenizer.from_file(
                os.path.join(args.save_dir, f"tokenizer-{f}.json"))
            output = tokenizer.encode(
                "Gbadanu tɛgbɛ ɔ, Noah tuun ɖɔ e nɔ cɛ emi")
            print(output.tokens)
            print(output.ids)
            print(output.offsets[9])
        except Exception as ex:
            print(ex)

        print("\n")
Example #30
    bert_tokenizer.train_from_iterator(sentences, trainer=trainer)
    if serialize_path:
        bert_tokenizer.save(serialize_path)
    return bert_tokenizer



ids = bert_tokenizer.encode(sentences[10]).ids
bert_tokenizer.decode(ids)


from tokenizers import Tokenizer, models, normalizers, pre_tokenizers, decoders, trainers

tokenizer = Tokenizer(models.Unigram())
tokenizer.normalizer = normalizers.NFKC()
tokenizer.pre_tokenizer = pre_tokenizers.ByteLevel()
tokenizer.decoder = decoders.ByteLevel()

trainer = trainers.UnigramTrainer(
    vocab_size=20000,
    initial_alphabet=pre_tokenizers.ByteLevel.alphabet(),
    special_tokens=["<PAD>", "<BOS>", "<EOS>"],
)

tokenizer.train_from_iterator(sentences, trainer=trainer)
tokenizer.encode(sentences[4]).ids
tokenizer.decode(tokenizer.encode(sentences[4]).ids)
tokenizer.save('bert_out/test2')

# tokenizer.save_pretrained('bert_out/test')  # save_pretrained is a transformers API;
# a plain tokenizers.Tokenizer is saved with save(), as done above