def converted(self) -> Tokenizer:
        tokenizer_info_str = "#version:"
        token_suffix = "</w>"

        vocab = self.original_tokenizer.encoder
        merges = list(self.original_tokenizer.bpe_ranks.keys())
        # Drop the "#version: ..." header line if it survived as the first merge entry
        if tokenizer_info_str in merges[0][0]:
            merges = merges[1:]

        tokenizer = Tokenizer(
            BPE(
                vocab,
                merges,
                dropout=None,
                unk_token=self.original_tokenizer.unk_token,
                end_of_word_suffix=token_suffix,
            ))

        tokenizer.normalizer = normalizers.BertNormalizer(lowercase=False,
                                                          strip_accents=False)
        tokenizer.pre_tokenizer = pre_tokenizers.BertPreTokenizer()
        tokenizer.decoder = decoders.BPEDecoder(suffix=token_suffix)
        tokenizer.post_processor = processors.BertProcessing(
            sep=(self.original_tokenizer.sep_token,
                 self.original_tokenizer.sep_token_id),
            cls=(self.original_tokenizer.cls_token,
                 self.original_tokenizer.cls_token_id),
        )

        return tokenizer
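
# A hedged aside (not part of the original converter): the "#version:" check
# above exists because merges.txt files produced by BPE tooling usually start
# with a version header line; if the slow tokenizer kept that line in
# bpe_ranks, it shows up as a bogus first "merge" and has to be dropped.
# The toy data below is invented purely to illustrate that branch.
toy_bpe_ranks = {
    ("#version:", "0.2"): 0,  # header line parsed as if it were a merge
    ("h", "e"): 1,
    ("he", "llo</w>"): 2,
}
toy_merges = list(toy_bpe_ranks.keys())
if "#version:" in toy_merges[0][0]:
    toy_merges = toy_merges[1:]
print(toy_merges)  # [('h', 'e'), ('he', 'llo</w>')]
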
    def converted(self) -> Tokenizer:
        vocab = self.original_tokenizer.vocab
        tokenizer = Tokenizer(
            WordPiece(vocab, unk_token=str(self.original_tokenizer.unk_token)))

        # # Let the tokenizer know about special tokens if they are part of the vocab
        # if tokenizer.token_to_id(str(self.original_tokenizer.unk_token)) is not None:
        #     tokenizer.add_special_tokens([str(self.original_tokenizer.unk_token)])
        # if tokenizer.token_to_id(str(self.original_tokenizer.sep_token)) is not None:
        #     tokenizer.add_special_tokens([str(self.original_tokenizer.sep_token)])
        # if tokenizer.token_to_id(str(self.original_tokenizer.cls_token)) is not None:
        #     tokenizer.add_special_tokens([str(self.original_tokenizer.cls_token)])
        # if tokenizer.token_to_id(str(self.original_tokenizer.pad_token)) is not None:
        #     tokenizer.add_special_tokens([str(self.original_tokenizer.pad_token)])
        # if tokenizer.token_to_id(str(self.original_tokenizer.mask_token)) is not None:
        #     tokenizer.add_special_tokens([str(self.original_tokenizer.mask_token)])

        tokenize_chinese_chars = False
        strip_accents = False
        do_lower_case = False
        if hasattr(self.original_tokenizer, "basic_tokenizer"):
            tokenize_chinese_chars = self.original_tokenizer.basic_tokenizer.tokenize_chinese_chars
            strip_accents = self.original_tokenizer.basic_tokenizer.strip_accents
            do_lower_case = self.original_tokenizer.basic_tokenizer.do_lower_case

        tokenizer.normalizer = normalizers.BertNormalizer(
            clean_text=True,
            handle_chinese_chars=tokenize_chinese_chars,
            strip_accents=strip_accents,
            lowercase=do_lower_case,
        )
        tokenizer.pre_tokenizer = pre_tokenizers.BertPreTokenizer()

        cls = str(self.original_tokenizer.cls_token)
        sep = str(self.original_tokenizer.sep_token)
        cls_token_id = self.original_tokenizer.cls_token_id
        sep_token_id = self.original_tokenizer.sep_token_id

        tokenizer.post_processor = processors.TemplateProcessing(
            # token_type_id is 2 for the [CLS] token in Funnel Transformer
            single=f"{cls}:2 $A:0 {sep}:0",
            pair=f"{cls}:2 $A:0 {sep}:0 $B:1 {sep}:1",
            special_tokens=[
                (cls, cls_token_id),
                (sep, sep_token_id),
            ],
        )
        tokenizer.decoder = decoders.WordPiece(prefix="##")

        return tokenizer
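
# Hedged illustration (not from the original file): a tiny self-contained
# tokenizer using the same TemplateProcessing layout as above, to show that
# the [CLS] slot receives token_type_id 2 while ordinary tokens get 0.
# The toy vocabulary is invented for the example.
from tokenizers import Tokenizer, processors
from tokenizers.models import WordPiece
from tokenizers.pre_tokenizers import Whitespace

toy_vocab = {"[UNK]": 0, "[CLS]": 1, "[SEP]": 2, "hello": 3, "world": 4}
toy = Tokenizer(WordPiece(toy_vocab, unk_token="[UNK]"))
toy.pre_tokenizer = Whitespace()
toy.post_processor = processors.TemplateProcessing(
    single="[CLS]:2 $A:0 [SEP]:0",
    pair="[CLS]:2 $A:0 [SEP]:0 $B:1 [SEP]:1",
    special_tokens=[("[CLS]", 1), ("[SEP]", 2)],
)
enc = toy.encode("hello world")
print(enc.tokens)    # ['[CLS]', 'hello', 'world', '[SEP]']
print(enc.type_ids)  # [2, 0, 0, 0]
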
    def converted(self) -> Tokenizer:
        vocab = self.original_tokenizer.vocab
        tokenizer = Tokenizer(
            WordPiece(vocab, unk_token=str(self.original_tokenizer.unk_token)))

        tokenize_chinese_chars = False
        strip_accents = False
        do_lower_case = False
        if hasattr(self.original_tokenizer, "basic_tokenizer"):
            tokenize_chinese_chars = self.original_tokenizer.basic_tokenizer.tokenize_chinese_chars
            strip_accents = self.original_tokenizer.basic_tokenizer.strip_accents
            do_lower_case = self.original_tokenizer.basic_tokenizer.do_lower_case

        tokenizer.normalizer = normalizers.BertNormalizer(
            clean_text=True,
            handle_chinese_chars=tokenize_chinese_chars,
            strip_accents=strip_accents,
            lowercase=do_lower_case,
        )
        tokenizer.pre_tokenizer = pre_tokenizers.BertPreTokenizer()

        cls = str(self.original_tokenizer.cls_token)
        sep = str(self.original_tokenizer.sep_token)
        question = str(self.original_tokenizer.question_token)
        dot = "."
        cls_token_id = self.original_tokenizer.cls_token_id
        sep_token_id = self.original_tokenizer.sep_token_id
        question_token_id = self.original_tokenizer.question_token_id
        dot_token_id = self.original_tokenizer.convert_tokens_to_ids(".")

        # The [QUESTION] token and the "." marker follow the question segment:
        # segment A when padding on the right, segment B when padding on the left.
        if self.original_tokenizer.padding_side == "right":
            pair = f"{cls}:0 $A:0 {question} {dot} {sep}:0 $B:1 {sep}:1"
        else:
            pair = f"{cls}:0 $A:0 {sep}:0 $B:1 {question} {dot} {sep}:1"

        tokenizer.post_processor = processors.TemplateProcessing(
            single=f"{cls}:0 $A:0 {sep}:0",
            pair=pair,
            special_tokens=[
                (cls, cls_token_id),
                (sep, sep_token_id),
                (question, question_token_id),
                (dot, dot_token_id),
            ],
        )
        tokenizer.decoder = decoders.WordPiece(prefix="##")

        return tokenizer
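
# Hedged illustration (not from the original file): a toy reproduction of the
# padding_side == "right" branch above, showing where the question token and
# the "." marker end up relative to the two segments. All token names and ids
# below are invented for the example.
from tokenizers import Tokenizer, processors
from tokenizers.models import WordPiece
from tokenizers.pre_tokenizers import Whitespace

toy_vocab = {"[UNK]": 0, "[CLS]": 1, "[SEP]": 2, "[QUESTION]": 3, ".": 4,
             "who": 5, "context": 6}
toy = Tokenizer(WordPiece(toy_vocab, unk_token="[UNK]"))
toy.pre_tokenizer = Whitespace()
toy.post_processor = processors.TemplateProcessing(
    single="[CLS]:0 $A:0 [SEP]:0",
    pair="[CLS]:0 $A:0 [QUESTION] . [SEP]:0 $B:1 [SEP]:1",
    special_tokens=[("[CLS]", 1), ("[SEP]", 2), ("[QUESTION]", 3), (".", 4)],
)
enc = toy.encode("who", "context")
print(enc.tokens)
# ['[CLS]', 'who', '[QUESTION]', '.', '[SEP]', 'context', '[SEP]']
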
    def converted(self) -> Tokenizer:
        vocab = self.original_tokenizer.vocab
        tokenizer = Tokenizer(
            WordPiece(vocab, unk_token=str(self.original_tokenizer.unk_token)))

        tokenize_chinese_chars = False
        strip_accents = False
        do_lower_case = False
        if hasattr(self.original_tokenizer, "basic_tokenizer"):
            tokenize_chinese_chars = self.original_tokenizer.basic_tokenizer.tokenize_chinese_chars
            strip_accents = self.original_tokenizer.basic_tokenizer.strip_accents
            do_lower_case = self.original_tokenizer.basic_tokenizer.do_lower_case

        tokenizer.normalizer = normalizers.BertNormalizer(
            clean_text=True,
            handle_chinese_chars=tokenize_chinese_chars,
            strip_accents=strip_accents,
            lowercase=do_lower_case,
        )
        tokenizer.pre_tokenizer = pre_tokenizers.BertPreTokenizer()

        cls = str(self.original_tokenizer.cls_token)
        sep = str(self.original_tokenizer.sep_token)
        cls_token_id = self.original_tokenizer.cls_token_id
        sep_token_id = self.original_tokenizer.sep_token_id

        tokenizer.post_processor = processors.TemplateProcessing(
            single=f"{cls}:0 $A:0 {sep}:0",
            # MPNet uses two [SEP] tokens between the pair segments
            pair=f"{cls}:0 $A:0 {sep}:0 {sep}:0 $B:1 {sep}:1",
            special_tokens=[
                (cls, cls_token_id),
                (sep, sep_token_id),
            ],
        )
        tokenizer.decoder = decoders.WordPiece(prefix="##")

        return tokenizer
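
# Hedged illustration (not from the original file): a toy tokenizer wired up
# the same way, used to show the two consecutive [SEP] tokens that the MPNet
# pair template inserts between the segments. The vocabulary is invented.
from tokenizers import Tokenizer, processors
from tokenizers.models import WordPiece
from tokenizers.pre_tokenizers import Whitespace

toy_vocab = {"[UNK]": 0, "[CLS]": 1, "[SEP]": 2, "first": 3, "second": 4}
toy = Tokenizer(WordPiece(toy_vocab, unk_token="[UNK]"))
toy.pre_tokenizer = Whitespace()
toy.post_processor = processors.TemplateProcessing(
    single="[CLS]:0 $A:0 [SEP]:0",
    pair="[CLS]:0 $A:0 [SEP]:0 [SEP]:0 $B:1 [SEP]:1",
    special_tokens=[("[CLS]", 1), ("[SEP]", 2)],
)
enc = toy.encode("first", "second")
print(enc.tokens)
# ['[CLS]', 'first', '[SEP]', '[SEP]', 'second', '[SEP]']
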
    def __init__(self):
        self._pre_tokenizer = pre_tokenizers.BertPreTokenizer()

        spacy.prefer_gpu()
        spacy_tokenizer = spacy.load("en_core_web_lg")
        #spacy_tokenizer.add_pipe(spacy_tokenizer.create_pipe("merge_noun_chunks"))

        #  spacy_tokenizer.add_pipe("merge_noun_chunks")
        #  spacy_tokenizer.add_pipe("merge_entities")
        # verb phrase: verb (+ optional pronoun) + one or more prepositions, e.g. 'sit up'
        pattern1 = [
            {'POS': 'VERB'},
            {'POS': 'PRON', 'OP': '?'},
            {'POS': 'ADP', 'OP': '+'},
        ]

        # preposition phrase: prep + noun + prep
        pattern2 = [{'POS': 'ADP'}, {'POS': 'NOUN'}, {'POS': 'ADP'}]

        # instantiate a Matcher instance (spaCy v2 `Matcher.add` signature)
        phrase_matcher = Matcher(spacy_tokenizer.vocab)

        #phrase_matcher.add("Verb ADP ADP phrase", None, pattern2)
        phrase_matcher.add("Verb Phrase", None, pattern1)
        phrase_matcher.add("Preposition Phrase", None, pattern2)

        spacy_tokenizer.add_pipe(spacy_tokenizer.create_pipe("merge_entities"))

        # A Matcher is not a pipeline component (it returns matches, not a Doc),
        # so wrap it in a small component that merges the matched phrases.
        def merge_phrases(doc):
            spans = [doc[start:end] for _, start, end in phrase_matcher(doc)]
            with doc.retokenize() as retokenizer:
                for span in spacy.util.filter_spans(spans):
                    retokenizer.merge(span)
            return doc

        spacy_tokenizer.add_pipe(merge_phrases)

        self.spacy_tokenizer = spacy_tokenizer
        self.spacy_tokenizer.tokenizer = self._custom_tokenizer
        print(self.spacy_tokenizer.pipe_names)
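
# Hedged illustration (not part of the original class): what the two Matcher
# patterns are intended to capture. This assumes an English model such as
# "en_core_web_sm" is installed and uses the spaCy v2 Matcher.add signature
# seen above; the example sentence is invented, and the exact matches depend
# on the model's POS tags.
import spacy
from spacy.matcher import Matcher

nlp = spacy.load("en_core_web_sm")
matcher = Matcher(nlp.vocab)
matcher.add("Verb Phrase", None,
            [{'POS': 'VERB'}, {'POS': 'PRON', 'OP': '?'}, {'POS': 'ADP', 'OP': '+'}])
matcher.add("Preposition Phrase", None,
            [{'POS': 'ADP'}, {'POS': 'NOUN'}, {'POS': 'ADP'}])
doc = nlp("Please sit up and put the box on top of the shelf.")
for match_id, start, end in matcher(doc):
    print(nlp.vocab.strings[match_id], "->", doc[start:end].text)
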
    def converted(self) -> Tokenizer:
        vocab = self.original_tokenizer.encoder
        merges = list(self.original_tokenizer.bpe_ranks.keys())
        unk_token = self.original_tokenizer.unk_token

        tokenizer = Tokenizer(
            BPE(
                vocab=vocab,
                merges=merges,
                dropout=None,
                unk_token=str(unk_token),
                end_of_word_suffix="</w>",
                fuse_unk=False,
            ))

        if tokenizer.token_to_id(str(unk_token)) is not None:
            tokenizer.add_special_tokens([str(unk_token)])

        tokenizer.normalizer = normalizers.BertNormalizer(lowercase=True)
        tokenizer.pre_tokenizer = pre_tokenizers.BertPreTokenizer()
        tokenizer.decoder = decoders.BPEDecoder(suffix="</w>")

        return tokenizer
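
# Hedged illustration (not from the original file): a minimal BPE model set up
# like the one above, showing the "</w>" end-of-word suffix on the produced
# tokens and the BPEDecoder turning it back into spaces. Vocab and merges are
# invented for the example.
from tokenizers import Tokenizer, decoders, pre_tokenizers
from tokenizers.models import BPE

toy_vocab = {"<unk>": 0, "h": 1, "i</w>": 2, "hi</w>": 3}
toy_merges = [("h", "i</w>")]
toy = Tokenizer(BPE(vocab=toy_vocab, merges=toy_merges, unk_token="<unk>",
                    end_of_word_suffix="</w>"))
toy.pre_tokenizer = pre_tokenizers.BertPreTokenizer()
toy.decoder = decoders.BPEDecoder(suffix="</w>")
enc = toy.encode("hi hi")
print(enc.tokens)           # ['hi</w>', 'hi</w>']
print(toy.decode(enc.ids))  # "hi hi" (the suffix is mapped back to spaces)
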
Example #7
elif args.type == "bert":
    print("Running Bert tokenizer")
    tok_p = BertTokenizer.from_pretrained(args.vocab)

    tok_r = Tokenizer(
        # NOTE: `WordPiece.from_files` is the old (pre-0.9) tokenizers API;
        # newer releases expose `WordPiece.from_file` instead.
        WordPiece.from_files(args.vocab,
                             unk_token="[UNK]",
                             max_input_chars_per_word=100))
    tok_r.normalizer = BertNormalizer(
        clean_text=True,
        handle_chinese_chars=True,
        strip_accents=True,
        lowercase=True,
    )
    # tok_r.pre_tokenizer = pre_tokenizers.Whitespace()
    tok_r.pre_tokenizer = pre_tokenizers.BertPreTokenizer()
    tok_r.decoder = decoders.WordPiece()
    tok_r.post_processor = BertProcessing(
        ("[SEP]", tok_r.token_to_id("[SEP]")),
        ("[CLS]", tok_r.token_to_id("[CLS]")),
    )
else:
    raise Exception(f"Unknown type {args.type}")


def tokenize_r():
    return tok_r.encode_batch(text)


def tokenize_p():
    return [