Ejemplo n.º 1
0
 def test_tokenizer_no_lower(self):
     """Tokenize a mixed-case sentence with ``do_lower_case=False`` and compare the pieces."""
     sentence = "I was born in 92000, and this is falsé."
     # Expected pieces, grouped by the word of the sentence they come from.
     expected_pieces = (
         [SPIECE_UNDERLINE + "I", SPIECE_UNDERLINE + "was"]
         + [SPIECE_UNDERLINE + "b", "or", "n"]
         + [SPIECE_UNDERLINE + "in"]
         + [SPIECE_UNDERLINE + "", "9", "2", "0", "0", "0", ","]
         + [SPIECE_UNDERLINE + "and", SPIECE_UNDERLINE + "this", SPIECE_UNDERLINE + "is"]
         + [SPIECE_UNDERLINE + "f", "al", "se", "."]
     )

     cased_tokenizer = XLNetTokenizer(SAMPLE_VOCAB, do_lower_case=False)
     pieces = cased_tokenizer.tokenize(sentence)

     self.assertListEqual(pieces, expected_pieces)
Ejemplo n.º 2
0
 def test_tokenizer_no_lower(self):
     """Tokenize a mixed-case sentence with ``do_lower_case=False`` and compare the pieces."""
     sentence = u"I was born in 92000, and this is falsé."
     # Expected pieces, one row per word of the sentence.
     expected_pieces = [
         SPIECE_UNDERLINE + u'I',
         SPIECE_UNDERLINE + u'was',
         SPIECE_UNDERLINE + u'b', u'or', u'n',
         SPIECE_UNDERLINE + u'in',
         SPIECE_UNDERLINE + u'', u'9', u'2', u'0', u'0', u'0', u',',
         SPIECE_UNDERLINE + u'and',
         SPIECE_UNDERLINE + u'this',
         SPIECE_UNDERLINE + u'is',
         SPIECE_UNDERLINE + u'f', u'al', u'se', u'.',
     ]

     cased_tokenizer = XLNetTokenizer(SAMPLE_VOCAB, do_lower_case=False)
     pieces = cased_tokenizer.tokenize(sentence)

     self.assertListEqual(pieces, expected_pieces)
Ejemplo n.º 3
0
    def load(cls,
             pretrained_model_name_or_path,
             tokenizer_class=None,
             **kwargs):
        """
        Enables loading of different Tokenizer classes with a uniform interface. Either infer the class from
        `pretrained_model_name_or_path` or define it manually via `tokenizer_class`.

        :param pretrained_model_name_or_path:  The path of the saved pretrained model or its name (e.g. `bert-base-uncased`)
        :type pretrained_model_name_or_path: str
        :param tokenizer_class: (Optional) Name of the tokenizer class to load (e.g. `BertTokenizer`)
        :type tokenizer_class: str
        :param kwargs: forwarded unchanged to the selected class's `from_pretrained`
        :return: Tokenizer
        :raises ValueError: if `tokenizer_class` is None and no class can be inferred from the name
        :raises Exception: if `tokenizer_class` does not name one of the supported tokenizers
        """

        pretrained_model_name_or_path = str(pretrained_model_name_or_path)
        # guess tokenizer type from name
        if tokenizer_class is None:
            name = pretrained_model_name_or_path.lower()
            # Order matters: more specific substrings must be tested before the
            # generic "roberta" / "bert" ones they contain.
            if "albert" in name:
                tokenizer_class = "AlbertTokenizer"
            elif "xlm-roberta" in name:
                tokenizer_class = "XLMRobertaTokenizer"
            elif "roberta" in name:
                tokenizer_class = "RobertaTokenizer"
            elif "distilbert" in name:
                tokenizer_class = "DistilBertTokenizer"
            elif "bert" in name:
                tokenizer_class = "BertTokenizer"
            elif "xlnet" in name:
                tokenizer_class = "XLNetTokenizer"
            else:
                # The message names the real argument and the values the
                # dispatch chain below actually accepts.
                raise ValueError(
                    f"Could not infer tokenizer_class from name '{pretrained_model_name_or_path}'. Set "
                    f"arg `tokenizer_class` in Tokenizer.load() to one of: AlbertTokenizer, "
                    f"XLMRobertaTokenizer, RobertaTokenizer, DistilBertTokenizer, BertTokenizer, or "
                    f"XLNetTokenizer.")
            logger.info(f"Loading tokenizer of type '{tokenizer_class}'")
        # return appropriate tokenizer object
        # Initialise so an unsupported `tokenizer_class` reaches the explicit
        # error below instead of failing with UnboundLocalError.
        ret = None
        if tokenizer_class == "AlbertTokenizer":
            ret = AlbertTokenizer.from_pretrained(
                pretrained_model_name_or_path, keep_accents=True, **kwargs)
        elif tokenizer_class == "XLMRobertaTokenizer":
            ret = XLMRobertaTokenizer.from_pretrained(
                pretrained_model_name_or_path, **kwargs)
        elif tokenizer_class == "RobertaTokenizer":
            ret = RobertaTokenizer.from_pretrained(
                pretrained_model_name_or_path, **kwargs)
        elif tokenizer_class == "DistilBertTokenizer":
            ret = DistilBertTokenizer.from_pretrained(
                pretrained_model_name_or_path, **kwargs)
        elif tokenizer_class == "BertTokenizer":
            ret = BertTokenizer.from_pretrained(pretrained_model_name_or_path,
                                                **kwargs)
        elif tokenizer_class == "XLNetTokenizer":
            ret = XLNetTokenizer.from_pretrained(pretrained_model_name_or_path,
                                                 keep_accents=True,
                                                 **kwargs)
        if ret is None:
            raise Exception("Unable to load tokenizer")
        return ret
Ejemplo n.º 4
0
def xlnetTokenizer(*args, **kwargs):
    """
    Build an XLNet sentencepiece tokenizer from a pre-trained vocab file.

    All positional and keyword arguments are forwarded unchanged to
    ``XLNetTokenizer.from_pretrained``.

    Peculiarities:
        - requires Google sentencepiece (https://github.com/google/sentencepiece)

    Args:
    pretrained_model_name_or_path: Path to pretrained model archive
                                   or one of pre-trained vocab configs below.
                                       * xlnet-large-cased
    Keyword args:
    special_tokens: Special tokens in vocabulary that are not pretrained
                    Default: None
    max_len: An artificial maximum length to truncate tokenized sequences to;
             Effective maximum length is always the minimum of this
             value (if specified) and the underlying model's
             sequence length.
             Default: None

    Example:
        >>> import torch
        >>> tokenizer = torch.hub.load('huggingface/pytorch-transformers', 'xlnetTokenizer', 'xlnet-large-cased')

        >>> text = "Who was Jim Henson ?"
        >>> indexed_tokens = tokenizer.encode(text)
    """
    return XLNetTokenizer.from_pretrained(*args, **kwargs)
Ejemplo n.º 5
0
    def test_full_tokenizer(self):
        """Check tokenize, tokens->ids and ids->tokens on the fixture vocab with ``keep_accents=True``."""
        tokenizer = XLNetTokenizer(SAMPLE_VOCAB, keep_accents=True)

        # Short ASCII sentence: expected pieces and their ids.
        simple_pieces = tokenizer.tokenize(u'This is a test')
        expected_simple_pieces = [u'▁This', u'▁is', u'▁a', u'▁t', u'est']
        self.assertListEqual(simple_pieces, expected_simple_pieces)

        simple_ids = tokenizer.convert_tokens_to_ids(simple_pieces)
        self.assertListEqual(simple_ids, [285, 46, 10, 170, 382])

        # Sentence containing digits and an accented character.
        expected_pieces = [
            SPIECE_UNDERLINE + u'I',
            SPIECE_UNDERLINE + u'was',
            SPIECE_UNDERLINE + u'b', u'or', u'n',
            SPIECE_UNDERLINE + u'in',
            SPIECE_UNDERLINE + u'', u'9', u'2', u'0', u'0', u'0', u',',
            SPIECE_UNDERLINE + u'and',
            SPIECE_UNDERLINE + u'this',
            SPIECE_UNDERLINE + u'is',
            SPIECE_UNDERLINE + u'f', u'al', u's', u'é', u'.',
        ]
        tokens = tokenizer.tokenize(u"I was born in 92000, and this is falsé.")
        self.assertListEqual(tokens, expected_pieces)

        expected_ids = [
            8, 21, 84, 55, 24, 19, 7, 0, 602, 347, 347, 347, 3, 12, 66, 46, 72,
            80, 6, 0, 4,
        ]
        ids = tokenizer.convert_tokens_to_ids(tokens)
        self.assertListEqual(ids, expected_ids)

        # In the expected round trip the two positions whose id is 0
        # ('9' and 'é') are '<unk>'.
        expected_back_tokens = [
            SPIECE_UNDERLINE + u'I',
            SPIECE_UNDERLINE + u'was',
            SPIECE_UNDERLINE + u'b', u'or', u'n',
            SPIECE_UNDERLINE + u'in',
            SPIECE_UNDERLINE + u'', u'<unk>', u'2', u'0', u'0', u'0', u',',
            SPIECE_UNDERLINE + u'and',
            SPIECE_UNDERLINE + u'this',
            SPIECE_UNDERLINE + u'is',
            SPIECE_UNDERLINE + u'f', u'al', u's', u'<unk>', u'.',
        ]
        back_tokens = tokenizer.convert_ids_to_tokens(ids)
        self.assertListEqual(back_tokens, expected_back_tokens)
Ejemplo n.º 6
0
    def test_sequence_builders(self):
        """Check where ``build_inputs_with_special_tokens`` places ids 4 and 3 for one and two sequences."""
        tokenizer = XLNetTokenizer.from_pretrained("xlnet-base-cased")

        # Encode without special tokens so they can be added explicitly below.
        first_ids = tokenizer.encode("sequence builders", add_special_tokens=False)
        second_ids = tokenizer.encode("multi-sequence build", add_special_tokens=False)

        single_with_specials = tokenizer.build_inputs_with_special_tokens(first_ids)
        pair_with_specials = tokenizer.build_inputs_with_special_tokens(first_ids, second_ids)

        # presumably 4 is the separator id and 3 the classification id — confirm against the vocab
        assert single_with_specials == first_ids + [4, 3]
        assert pair_with_specials == first_ids + [4] + second_ids + [4, 3]
Ejemplo n.º 7
0
    def test_sequence_builders(self):
        """Check where the ``add_special_tokens_*`` helpers place ids 4 and 3 for one and two sequences."""
        tokenizer = XLNetTokenizer.from_pretrained("xlnet-base-cased")

        first_ids = tokenizer.encode("sequence builders")
        second_ids = tokenizer.encode("multi-sequence build")

        single_with_specials = tokenizer.add_special_tokens_single_sequence(first_ids)
        pair_with_specials = tokenizer.add_special_tokens_sequence_pair(first_ids, second_ids)

        # presumably 4 is the separator id and 3 the classification id — confirm against the vocab
        assert single_with_specials == first_ids + [4, 3]
        assert pair_with_specials == first_ids + [4] + second_ids + [4, 3]
    def setUp(self):
        """Run the parent ``setUp`` and save a fixture-based tokenizer into ``self.tmpdirname``."""
        super().setUp()

        # Build the tokenizer from the SentencePiece test fixture, sanitize its
        # special tokens, then save it to the temp dir.
        fixture_tokenizer = XLNetTokenizer(SAMPLE_VOCAB, keep_accents=True)
        fixture_tokenizer.sanitize_special_tokens()
        fixture_tokenizer.save_pretrained(self.tmpdirname)
Ejemplo n.º 9
0
    def test_full_tokenizer(self):
        """Run the shared tokenizer checks, then verify tokenize / tokens->ids / ids->tokens on the fixture."""
        tokenizer = XLNetTokenizer(SAMPLE_VOCAB, keep_accents=True)

        with TemporaryDirectory() as scratch_dir:
            tokenizer.save_pretrained(scratch_dir)

            # Shared checks: same sample text passed as both input and output.
            sample_in = u"This is a test"
            sample_out = u"This is a test"
            create_and_check_tokenizer_commons(
                self, sample_in, sample_out, XLNetTokenizer, scratch_dir)

            # Short ASCII sentence: expected pieces and their ids.
            simple_pieces = tokenizer.tokenize(u'This is a test')
            expected_simple_pieces = [u'▁This', u'▁is', u'▁a', u'▁t', u'est']
            self.assertListEqual(simple_pieces, expected_simple_pieces)

            simple_ids = tokenizer.convert_tokens_to_ids(simple_pieces)
            self.assertListEqual(simple_ids, [285, 46, 10, 170, 382])

            # Sentence containing digits and an accented character.
            expected_pieces = [
                SPIECE_UNDERLINE + u'I',
                SPIECE_UNDERLINE + u'was',
                SPIECE_UNDERLINE + u'b', u'or', u'n',
                SPIECE_UNDERLINE + u'in',
                SPIECE_UNDERLINE + u'', u'9', u'2', u'0', u'0', u'0', u',',
                SPIECE_UNDERLINE + u'and',
                SPIECE_UNDERLINE + u'this',
                SPIECE_UNDERLINE + u'is',
                SPIECE_UNDERLINE + u'f', u'al', u's', u'é', u'.',
            ]
            tokens = tokenizer.tokenize(
                u"I was born in 92000, and this is falsé.")
            self.assertListEqual(tokens, expected_pieces)

            expected_ids = [
                8, 21, 84, 55, 24, 19, 7, 0, 602, 347, 347, 347, 3, 12, 66, 46,
                72, 80, 6, 0, 4,
            ]
            ids = tokenizer.convert_tokens_to_ids(tokens)
            self.assertListEqual(ids, expected_ids)

            # In the expected round trip the two positions whose id is 0
            # ('9' and 'é') are '<unk>'.
            expected_back_tokens = [
                SPIECE_UNDERLINE + u'I',
                SPIECE_UNDERLINE + u'was',
                SPIECE_UNDERLINE + u'b', u'or', u'n',
                SPIECE_UNDERLINE + u'in',
                SPIECE_UNDERLINE + u'', u'<unk>', u'2', u'0', u'0', u'0', u',',
                SPIECE_UNDERLINE + u'and',
                SPIECE_UNDERLINE + u'this',
                SPIECE_UNDERLINE + u'is',
                SPIECE_UNDERLINE + u'f', u'al', u's', u'<unk>', u'.',
            ]
            back_tokens = tokenizer.convert_ids_to_tokens(ids)
            self.assertListEqual(back_tokens, expected_back_tokens)
Ejemplo n.º 10
0
    def setUp(self):
        """Run the parent ``setUp`` and save a fixture-based tokenizer into ``self.tmpdirname``."""
        super(XLNetTokenizationTest, self).setUp()

        # Build the tokenizer from the SentencePiece test fixture and save it
        # to the temp dir.
        fixture_tokenizer = XLNetTokenizer(SAMPLE_VOCAB, keep_accents=True)
        fixture_tokenizer.save_pretrained(self.tmpdirname)
Ejemplo n.º 11
0
def read_instance(input_file, word_alphabet, biword_alphabet, label_alphabet,
                  number_normalized, max_sent_length, bertpath):
    """
    Read a tab-separated token/label file and build per-sentence text and id lists.

    Lines longer than two characters are token lines: the first tab-separated
    field is the word (a single space when the line has only one field) and the
    last field is the label. Any shorter line ends the current sentence and
    flushes it into the results.

    :param input_file: path of the UTF-8 encoded input file
    :param word_alphabet: object whose `get_index` maps a lower-cased word to an id
    :param biword_alphabet: object whose `get_index` maps a lower-cased biword to an id
    :param label_alphabet: object whose `get_index` maps a label to an id
    :param number_normalized: if truthy, words are passed through `normalize_word`
    :param max_sent_length: maximum number of words/ids kept per sentence
    :param bertpath: name or path handed to `BertTokenizer.from_pretrained`
    :return: tuple `(instence_texts, instence_Ids)`; each text entry is
        `[words, biwords, labels]` and each id entry is
        `[word_Ids, biword_Ids, label_Ids, bert_text_ids, xlnet_text_ids]`
    :raises ValueError: if a sentence boundary is reached with no words collected
    """
    tokenizer = BertTokenizer.from_pretrained(bertpath, do_lower_case=True)
    xlnet_tokenizer = XLNetTokenizer.from_pretrained(
        'transformer_cpt/chinese_xlnet_base_pytorch/',
        add_special_tokens=False)
    # Read all lines up front and close the handle right away (the lookahead
    # below needs random access to the following line).
    with open(input_file, 'r', encoding="utf-8") as in_file:
        in_lines = in_file.readlines()
    instence_texts = []
    instence_Ids = []
    word_Ids = []
    biword_Ids = []
    label_Ids = []

    words = []
    biwords = []
    labels = []

    last_idx = len(in_lines) - 1
    for idx, line in enumerate(in_lines):
        if len(line) > 2:
            pairs = line.strip().split('\t')
            if len(pairs) == 1:
                # Only one field on the line: use a single space as the word.
                word = ' '
            else:
                word = pairs[0]
            if number_normalized:
                word = normalize_word(word)
            label = pairs[-1]
            if idx < last_idx and len(in_lines[idx + 1]) > 2:
                # Biword = current word + first whitespace-separated field of the next line.
                biword = word + in_lines[idx + 1].strip().split()[0]
                # todo
                biword = normalize_word(biword)
            else:
                # Last word of a sentence (or of the file) is paired with NULLKEY.
                biword = word + NULLKEY
            biwords.append(biword)
            words.append(word.lower())
            labels.append(label)
            word_Ids.append(word_alphabet.get_index(word.lower()))
            biword_index = biword_alphabet.get_index(biword.lower())
            biword_Ids.append(biword_index)
            label_Ids.append(label_alphabet.get_index(label))

        else:
            # todo: the sentence is simply truncated here; that is not
            # acceptable for medical data
            if len(words) <= 0:
                raise ValueError('len(words) <= 0')
            texts = ['[CLS]'] + words[:max_sent_length] + ['[SEP]']

            bert_text_ids = tokenizer.convert_tokens_to_ids(texts)
            xlnet_text_ids = xlnet_tokenizer.convert_tokens_to_ids(
                words[:max_sent_length])
            # The text entry keeps the full, untruncated lists; only the ids
            # below are cut to max_sent_length.
            instence_texts.append([words, biwords, labels])

            word_Ids = word_Ids[:max_sent_length]
            biword_Ids = biword_Ids[:max_sent_length]
            label_Ids = label_Ids[:max_sent_length]

            assert len(texts) - 2 == len(word_Ids)
            instence_Ids.append([
                word_Ids, biword_Ids, label_Ids, bert_text_ids, xlnet_text_ids
            ])

            # Start fresh accumulators for the next sentence.
            words = []
            biwords = []
            labels = []

            word_Ids = []
            biword_Ids = []
            label_Ids = []

    # NOTE(review): a final sentence that is not followed by a short/blank line
    # is never flushed — confirm input files always end with one.
    return instence_texts, instence_Ids
Ejemplo n.º 12
0
    def load(cls,
             pretrained_model_name_or_path,
             tokenizer_class=None,
             use_fast=False,
             **kwargs):
        """
        Enables loading of different Tokenizer classes with a uniform interface. Either infer the class from
        `pretrained_model_name_or_path` or define it manually via `tokenizer_class`.

        :param pretrained_model_name_or_path:  The path of the saved pretrained model or its name (e.g. `bert-base-uncased`)
        :type pretrained_model_name_or_path: str
        :param tokenizer_class: (Optional) Name of the tokenizer class to load (e.g. `BertTokenizer`)
        :type tokenizer_class: str
        :param use_fast: (Optional, False by default) Indicate if FARM should try to load the fast version of the tokenizer (True) or
            use the Python one (False).
            Fast versions are loaded for DistilBERT, BERT, Electra and the two DPR encoders; for every
            other class an error is logged and the Python tokenizer is loaded instead.
        :type use_fast: bool
        :param kwargs: forwarded unchanged to the selected class's loader
        :return: Tokenizer
        :raises ValueError: if `tokenizer_class` is None and no class can be inferred from the name
        :raises NotImplementedError: if the name contains both "codebert" and "mlm"
        :raises Exception: if `tokenizer_class` does not name one of the supported tokenizers
        """

        pretrained_model_name_or_path = str(pretrained_model_name_or_path)
        # guess tokenizer type from name
        if tokenizer_class is None:
            name = pretrained_model_name_or_path.lower()
            # Order matters: more specific substrings must be tested before the
            # generic "roberta" / "bert" ones they contain.
            if "albert" in name:
                tokenizer_class = "AlbertTokenizer"
            elif "xlm-roberta" in name:
                tokenizer_class = "XLMRobertaTokenizer"
            elif "roberta" in name:
                tokenizer_class = "RobertaTokenizer"
            elif "codebert" in name:
                if "mlm" in name:
                    raise NotImplementedError(
                        "MLM part of codebert is currently not supported in FARM"
                    )
                tokenizer_class = "RobertaTokenizer"
            # "umberto" is matched on the lower-cased name like every other
            # key, so a mixed-case "UmBERTo" no longer falls through to "bert".
            elif "camembert" in name or "umberto" in name:
                tokenizer_class = "CamembertTokenizer"
            elif "distilbert" in name:
                tokenizer_class = "DistilBertTokenizer"
            elif "bert" in name:
                tokenizer_class = "BertTokenizer"
            elif "xlnet" in name:
                tokenizer_class = "XLNetTokenizer"
            elif "electra" in name:
                tokenizer_class = "ElectraTokenizer"
            elif "word2vec" in name or "glove" in name or "fasttext" in name:
                tokenizer_class = "EmbeddingTokenizer"
            elif "minilm" in name:
                tokenizer_class = "BertTokenizer"
            elif "dpr-question_encoder" in name:
                tokenizer_class = "DPRQuestionEncoderTokenizer"
            elif "dpr-ctx_encoder" in name:
                tokenizer_class = "DPRContextEncoderTokenizer"
            else:
                raise ValueError(
                    f"Could not infer tokenizer_class from name '{pretrained_model_name_or_path}'. Set "
                    f"arg `tokenizer_class` in Tokenizer.load() to one of: AlbertTokenizer, "
                    f"XLMRobertaTokenizer, RobertaTokenizer, DistilBertTokenizer, BertTokenizer, or "
                    f"XLNetTokenizer.")
            logger.info(f"Loading tokenizer of type '{tokenizer_class}'")
        # return appropriate tokenizer object
        # For classes without a supported fast variant, `use_fast` only logs an
        # error; the Python tokenizer is loaded either way.
        ret = None
        if tokenizer_class == "AlbertTokenizer":
            if use_fast:
                logger.error(
                    'AlbertTokenizerFast is not supported! Using AlbertTokenizer instead.'
                )
            ret = AlbertTokenizer.from_pretrained(
                pretrained_model_name_or_path, keep_accents=True, **kwargs)
        elif tokenizer_class == "XLMRobertaTokenizer":
            if use_fast:
                logger.error(
                    'XLMRobertaTokenizerFast is not supported! Using XLMRobertaTokenizer instead.'
                )
            ret = XLMRobertaTokenizer.from_pretrained(
                pretrained_model_name_or_path, **kwargs)
        elif "RobertaTokenizer" in tokenizer_class:  # "in" so a "...Fast" class name matches too
            if use_fast:
                logger.error(
                    'RobertaTokenizerFast is not supported! Using RobertaTokenizer instead.'
                )
            ret = RobertaTokenizer.from_pretrained(
                pretrained_model_name_or_path, **kwargs)
        elif "DistilBertTokenizer" in tokenizer_class:  # "in" so a "...Fast" class name matches too
            if use_fast:
                ret = DistilBertTokenizerFast.from_pretrained(
                    pretrained_model_name_or_path, **kwargs)
            else:
                ret = DistilBertTokenizer.from_pretrained(
                    pretrained_model_name_or_path, **kwargs)
        elif "BertTokenizer" in tokenizer_class:  # "in" so a "...Fast" class name matches too
            if use_fast:
                ret = BertTokenizerFast.from_pretrained(
                    pretrained_model_name_or_path, **kwargs)
            else:
                ret = BertTokenizer.from_pretrained(
                    pretrained_model_name_or_path, **kwargs)
        elif tokenizer_class == "XLNetTokenizer":
            if use_fast:
                logger.error(
                    'XLNetTokenizerFast is not supported! Using XLNetTokenizer instead.'
                )
            ret = XLNetTokenizer.from_pretrained(
                pretrained_model_name_or_path, keep_accents=True, **kwargs)
        elif "ElectraTokenizer" in tokenizer_class:  # "in" so a "...Fast" class name matches too
            if use_fast:
                ret = ElectraTokenizerFast.from_pretrained(
                    pretrained_model_name_or_path, **kwargs)
            else:
                ret = ElectraTokenizer.from_pretrained(
                    pretrained_model_name_or_path, **kwargs)
        elif tokenizer_class == "EmbeddingTokenizer":
            if use_fast:
                logger.error(
                    'EmbeddingTokenizerFast is not supported! Using EmbeddingTokenizer instead.'
                )
            ret = EmbeddingTokenizer.from_pretrained(
                pretrained_model_name_or_path, **kwargs)
        elif tokenizer_class == "CamembertTokenizer":
            if use_fast:
                logger.error(
                    'CamembertTokenizerFast is not supported! Using CamembertTokenizer instead.'
                )
            ret = CamembertTokenizer._from_pretrained(
                pretrained_model_name_or_path, **kwargs)
        elif tokenizer_class == "DPRQuestionEncoderTokenizer" or tokenizer_class == "DPRQuestionEncoderTokenizerFast":
            if use_fast or tokenizer_class == "DPRQuestionEncoderTokenizerFast":
                ret = DPRQuestionEncoderTokenizerFast.from_pretrained(
                    pretrained_model_name_or_path, **kwargs)
            else:
                ret = DPRQuestionEncoderTokenizer.from_pretrained(
                    pretrained_model_name_or_path, **kwargs)
        elif tokenizer_class == "DPRContextEncoderTokenizer" or tokenizer_class == "DPRContextEncoderTokenizerFast":
            if use_fast or tokenizer_class == "DPRContextEncoderTokenizerFast":
                ret = DPRContextEncoderTokenizerFast.from_pretrained(
                    pretrained_model_name_or_path, **kwargs)
            else:
                ret = DPRContextEncoderTokenizer.from_pretrained(
                    pretrained_model_name_or_path, **kwargs)
        if ret is None:
            raise Exception("Unable to load tokenizer")
        return ret
Ejemplo n.º 13
0
    def load(cls, pretrained_model_name_or_path, tokenizer_class=None, use_fast=False, **kwargs):
        """
        Load a tokenizer through one uniform entry point.

        The tokenizer class is either given explicitly via `tokenizer_class` or, when that is None,
        obtained from `cls._infer_tokenizer_class`.

        :param pretrained_model_name_or_path:  The path of the saved pretrained model or its name (e.g. `bert-base-uncased`)
        :type pretrained_model_name_or_path: str
        :param tokenizer_class: (Optional) Name of the tokenizer class to load (e.g. `BertTokenizer`)
        :type tokenizer_class: str
        :param use_fast: (Optional, False by default) Indicate if FARM should try to load the fast version of the tokenizer (True) or
            use the Python one (False).
            Fast versions are loaded for DistilBERT, BERT, Electra and the two DPR encoders; for every
            other class an error is logged and the Python tokenizer is loaded instead.
        :type use_fast: bool
        :param kwargs: forwarded unchanged to the selected class's loader
        :return: Tokenizer
        """
        source = str(pretrained_model_name_or_path)

        if tokenizer_class is None:
            tokenizer_class = cls._infer_tokenizer_class(source)

        logger.info(f"Loading tokenizer of type '{tokenizer_class}'")

        # Pick the matching tokenizer. Branch order matters because several
        # branches use substring matching on the class name.
        loaded = None
        if tokenizer_class == "AlbertTokenizer":
            if use_fast:
                logger.error('AlbertTokenizerFast is not supported! Using AlbertTokenizer instead.')
            loaded = AlbertTokenizer.from_pretrained(source, keep_accents=True, **kwargs)
        elif tokenizer_class == "XLMRobertaTokenizer":
            if use_fast:
                logger.error('XLMRobertaTokenizerFast is not supported! Using XLMRobertaTokenizer instead.')
            loaded = XLMRobertaTokenizer.from_pretrained(source, **kwargs)
        elif "RobertaTokenizer" in tokenizer_class:  # substring test also matches a "...Fast" name
            if use_fast:
                logger.error('RobertaTokenizerFast is not supported! Using RobertaTokenizer instead.')
            loaded = RobertaTokenizer.from_pretrained(source, **kwargs)
        elif "DistilBertTokenizer" in tokenizer_class:  # substring test also matches a "...Fast" name
            chosen = DistilBertTokenizerFast if use_fast else DistilBertTokenizer
            loaded = chosen.from_pretrained(source, **kwargs)
        elif "BertTokenizer" in tokenizer_class:  # substring test also matches a "...Fast" name
            chosen = BertTokenizerFast if use_fast else BertTokenizer
            loaded = chosen.from_pretrained(source, **kwargs)
        elif tokenizer_class == "XLNetTokenizer":
            if use_fast:
                logger.error('XLNetTokenizerFast is not supported! Using XLNetTokenizer instead.')
            loaded = XLNetTokenizer.from_pretrained(source, keep_accents=True, **kwargs)
        elif "ElectraTokenizer" in tokenizer_class:  # substring test also matches a "...Fast" name
            chosen = ElectraTokenizerFast if use_fast else ElectraTokenizer
            loaded = chosen.from_pretrained(source, **kwargs)
        elif tokenizer_class == "EmbeddingTokenizer":
            if use_fast:
                logger.error('EmbeddingTokenizerFast is not supported! Using EmbeddingTokenizer instead.')
            loaded = EmbeddingTokenizer.from_pretrained(source, **kwargs)
        elif tokenizer_class == "CamembertTokenizer":
            if use_fast:
                logger.error('CamembertTokenizerFast is not supported! Using CamembertTokenizer instead.')
            loaded = CamembertTokenizer._from_pretrained(source, **kwargs)
        elif tokenizer_class in ("DPRQuestionEncoderTokenizer", "DPRQuestionEncoderTokenizerFast"):
            # The fast variant is used when requested by flag or by class name.
            if use_fast or tokenizer_class == "DPRQuestionEncoderTokenizerFast":
                loaded = DPRQuestionEncoderTokenizerFast.from_pretrained(source, **kwargs)
            else:
                loaded = DPRQuestionEncoderTokenizer.from_pretrained(source, **kwargs)
        elif tokenizer_class in ("DPRContextEncoderTokenizer", "DPRContextEncoderTokenizerFast"):
            if use_fast or tokenizer_class == "DPRContextEncoderTokenizerFast":
                loaded = DPRContextEncoderTokenizerFast.from_pretrained(source, **kwargs)
            else:
                loaded = DPRContextEncoderTokenizer.from_pretrained(source, **kwargs)

        if loaded is None:
            raise Exception("Unable to load tokenizer")
        return loaded
Ejemplo n.º 14
0
    def load(cls,
             pretrained_model_name_or_path,
             tokenizer_class=None,
             **kwargs):
        """
        Enables loading of different Tokenizer classes with a uniform interface. Either infer the class from
        `pretrained_model_name_or_path` or define it manually via `tokenizer_class`.

        :param pretrained_model_name_or_path:  The path of the saved pretrained model or its name (e.g. `bert-base-uncased`)
        :type pretrained_model_name_or_path: str
        :param tokenizer_class: (Optional) Name of the tokenizer class to load (e.g. `BertTokenizer`)
        :type tokenizer_class: str
        :param kwargs: forwarded unchanged to the selected class's loader
        :return: Tokenizer
        :raises ValueError: if `tokenizer_class` is None and no class can be inferred from the name
        :raises NotImplementedError: if the name contains both "codebert" and "mlm"
        :raises Exception: if `tokenizer_class` does not name one of the supported tokenizers
        """

        pretrained_model_name_or_path = str(pretrained_model_name_or_path)
        # guess tokenizer type from name
        if tokenizer_class is None:
            name = pretrained_model_name_or_path.lower()
            # Order matters: more specific substrings must be tested before the
            # generic "roberta" / "bert" ones they contain.
            if "albert" in name:
                tokenizer_class = "AlbertTokenizer"
            elif "xlm-roberta" in name:
                tokenizer_class = "XLMRobertaTokenizer"
            elif "roberta" in name:
                tokenizer_class = "RobertaTokenizer"
            elif "codebert" in name:
                if "mlm" in name:
                    raise NotImplementedError(
                        "MLM part of codebert is currently not supported in FARM"
                    )
                tokenizer_class = "RobertaTokenizer"
            # "umberto" is matched on the lower-cased name like every other
            # key, so a mixed-case "UmBERTo" no longer falls through to "bert".
            elif "camembert" in name or "umberto" in name:
                tokenizer_class = "CamembertTokenizer"
            elif "distilbert" in name:
                tokenizer_class = "DistilBertTokenizer"
            elif "bert" in name:
                tokenizer_class = "BertTokenizer"
            elif "xlnet" in name:
                tokenizer_class = "XLNetTokenizer"
            elif "electra" in name:
                tokenizer_class = "ElectraTokenizer"
            elif "word2vec" in name or "glove" in name or "fasttext" in name:
                tokenizer_class = "EmbeddingTokenizer"
            elif "minilm" in name:
                tokenizer_class = "BertTokenizer"
            else:
                raise ValueError(
                    f"Could not infer tokenizer_class from name '{pretrained_model_name_or_path}'. Set "
                    f"arg `tokenizer_class` in Tokenizer.load() to one of: AlbertTokenizer, "
                    f"XLMRobertaTokenizer, RobertaTokenizer, DistilBertTokenizer, BertTokenizer, or "
                    f"XLNetTokenizer.")
            logger.info(f"Loading tokenizer of type '{tokenizer_class}'")
        # return appropriate tokenizer object
        # Initialise so an unsupported `tokenizer_class` reaches the explicit
        # error below instead of failing with UnboundLocalError.
        ret = None
        if tokenizer_class == "AlbertTokenizer":
            ret = AlbertTokenizer.from_pretrained(
                pretrained_model_name_or_path, keep_accents=True, **kwargs)
        elif tokenizer_class == "XLMRobertaTokenizer":
            ret = XLMRobertaTokenizer.from_pretrained(
                pretrained_model_name_or_path, **kwargs)
        elif tokenizer_class == "RobertaTokenizer":
            ret = RobertaTokenizer.from_pretrained(
                pretrained_model_name_or_path, **kwargs)
        elif tokenizer_class == "DistilBertTokenizer":
            ret = DistilBertTokenizer.from_pretrained(
                pretrained_model_name_or_path, **kwargs)
        elif tokenizer_class == "BertTokenizer":
            ret = BertTokenizer.from_pretrained(pretrained_model_name_or_path,
                                                **kwargs)
        elif tokenizer_class == "XLNetTokenizer":
            ret = XLNetTokenizer.from_pretrained(pretrained_model_name_or_path,
                                                 keep_accents=True,
                                                 **kwargs)
        elif tokenizer_class == "ElectraTokenizer":
            ret = ElectraTokenizer.from_pretrained(
                pretrained_model_name_or_path, **kwargs)
        elif tokenizer_class == "EmbeddingTokenizer":
            ret = EmbeddingTokenizer.from_pretrained(
                pretrained_model_name_or_path, **kwargs)
        elif tokenizer_class == "CamembertTokenizer":
            ret = CamembertTokenizer._from_pretrained(
                pretrained_model_name_or_path, **kwargs)
        if ret is None:
            raise Exception("Unable to load tokenizer")
        return ret
Ejemplo n.º 15
0
 def get_tokenizer(self, **kwargs):
     """Return an ``XLNetTokenizer`` loaded from ``self.tmpdirname``.

     Any keyword arguments are forwarded unchanged to
     ``XLNetTokenizer.from_pretrained``.
     """
     load_pretrained = XLNetTokenizer.from_pretrained
     return load_pretrained(self.tmpdirname, **kwargs)
Ejemplo n.º 16
0
    def test_full_tokenizer(self):
        """Check tokenize / tokens->ids / ids->tokens with ``keep_accents=True``.

        Two sentences are run through a tokenizer built from ``SAMPLE_VOCAB``
        and the pieces, ids and round-tripped pieces are compared against
        fixed expectations.
        """
        tokenizer = XLNetTokenizer(SAMPLE_VOCAB, keep_accents=True)

        # --- Plain ASCII sentence: pieces and their ids. ---
        simple_pieces = tokenizer.tokenize("This is a test")
        self.assertListEqual(simple_pieces, ["▁This", "▁is", "▁a", "▁t", "est"])

        simple_ids = tokenizer.convert_tokens_to_ids(simple_pieces)
        self.assertListEqual(simple_ids, [285, 46, 10, 170, 382])

        # --- Sentence with digits and an accented character. ---
        # The expected pieces are assembled from shared segments so that the
        # two positions which differ after the id round trip stand out.
        before_nine = [
            SPIECE_UNDERLINE + "I",
            SPIECE_UNDERLINE + "was",
            SPIECE_UNDERLINE + "b",
            "or",
            "n",
            SPIECE_UNDERLINE + "in",
            SPIECE_UNDERLINE + "",
        ]
        nine_to_accent = [
            "2",
            "0",
            "0",
            "0",
            ",",
            SPIECE_UNDERLINE + "and",
            SPIECE_UNDERLINE + "this",
            SPIECE_UNDERLINE + "is",
            SPIECE_UNDERLINE + "f",
            "al",
            "s",
        ]
        expected_pieces = before_nine + ["9"] + nine_to_accent + ["é", "."]
        # "9" and "é" are expected to map to id 0 and come back as "<unk>".
        expected_round_trip = (
            before_nine + ["<unk>"] + nine_to_accent + ["<unk>", "."]
        )
        expected_ids = [
            8, 21, 84, 55, 24, 19, 7, 0, 602, 347, 347, 347, 3, 12, 66, 46, 72,
            80, 6, 0, 4,
        ]

        pieces = tokenizer.tokenize("I was born in 92000, and this is falsé.")
        self.assertListEqual(pieces, expected_pieces)

        piece_ids = tokenizer.convert_tokens_to_ids(pieces)
        self.assertListEqual(piece_ids, expected_ids)

        round_trip = tokenizer.convert_ids_to_tokens(piece_ids)
        self.assertListEqual(round_trip, expected_round_trip)
Ejemplo n.º 17
0
    def __init__(self,
                 vocab: Vocabulary,
                 model_name: str,
                 k=12,
                 output_dim=1,
                 freeze_embeddings=False,
                 temperature=1,
                 train_with_regular_softmax=False,
                 use_similarity=False,
                 pass_probabilities_to_classifier=False,
                 use_straight_through_gumbel_softmax=False,
                 anneal_temperature=False,
                 train_generator=True,
                 use_kld_loss=False,
                 generate_until_dot=False,
                 lm_loss_coeff=1,
                 use_cls=False,
                 pass_only_generated=False,
                 sim_coeff=1,
                 dropout=0.1,
                 train_with_just_sim_loss_for_epochs_num=-1,
                 decouple_gen_and_cls_embs=False,
                 initializer: InitializerApplicator = InitializerApplicator(),
                 load_weights=False,
                 zero_generated_out=False,
                 output_several_results_on_every_step=False,
                 results_each_step=0,
                 use_repetition_loss=False,
                 sequence_ngram_n=1,
                 rep_coeff=1,
                 use_similarity_btw_question_and_answers=False,
                 anneal_repetition_loss=False,
                 anneal_kld_loss=False,
                 add_cls_after_epoch_num=-1,
                 train_lm_generator=False,
                 gen_lm_loss_coeff=1,
                 train_cls_without_lm_loss=False):
        """
        Load the XLNet generator (plus optional classifier encoder and frozen LM)
        and store the training configuration on the instance.

        Most keyword arguments are simply stored as attributes of the same name.
        Only the arguments with a visible effect inside this constructor are
        described below.

        :param vocab: Passed to the parent constructor.
        :param model_name: Name or path given to every ``from_pretrained`` call.
            Also selects the input width of ``self.cls``: 768 when the string
            contains ``'base'``, otherwise 1024.
        :param output_dim: Output size of the linear layer ``self.cls``.
        :param freeze_embeddings: If true, disables gradients for the generator's
            word embedding.
        :param train_generator: If false, disables gradients for the whole
            generator and forces ``generate_until_dot`` to ``True``.
        :param use_kld_loss: If true, loads a second ``XLNetLMHeadModel`` with
            gradients disabled as ``self.freezed_lm``.
        :param use_cls: If true, loads a separate ``XLNetModel`` as
            ``self.cls_model`` and exposes its word embedding.
        :param dropout: Passed to the generator's ``from_pretrained`` call and
            used for ``self.dropout``.
        :param initializer: Called as ``initializer(self)`` only when
            ``load_weights`` is true.
        :param load_weights: Whether to apply ``initializer`` to this module.
        :param output_several_results_on_every_step: Stored as ``self.topk_gs``.
        :param train_lm_generator: Stored as ``self.supervised_generator``.
        :param train_cls_without_lm_loss: Stored as
            ``self.train_cls_without_sup_gen``.
        """
        super(GeneralGenerationForClassfiication, self).__init__(vocab)
        self.gen_model = XLNetLMHeadModel.from_pretrained(model_name,
                                                          dropout=dropout)
        self.tokenizer = XLNetTokenizer.from_pretrained(model_name)
        self.gen_word_embedding = self.gen_model.transformer.word_embedding
        self.gen_embeddings_weight = self.gen_word_embedding.weight

        if use_cls:
            # Separate encoder for classification, with its own embeddings.
            self.cls_model = XLNetModel.from_pretrained(model_name)
            self.cls_word_embedding = self.cls_model.word_embedding
            self.cls_embeddings_weight = self.cls_word_embedding.weight
        if use_kld_loss:
            # Extra copy of the LM head model, kept with gradients disabled.
            self.freezed_lm = XLNetLMHeadModel.from_pretrained(model_name)
            self.freezed_lm.requires_grad_(False)

        # Hidden width is inferred from the model name rather than the config.
        n_embd = 768 if 'base' in model_name else 1024
        self.cls = nn.Linear(n_embd, output_dim, bias=True)
        self.use_cls = use_cls
        self.use_similarity = use_similarity
        self.train_generator = train_generator
        self.dropout = nn.Dropout(dropout)
        self.k = k

        self.use_kld_loss = use_kld_loss
        self.lm_loss_coeff = lm_loss_coeff
        self.anneal_kld_loss = anneal_kld_loss
        self.sim_coeff = sim_coeff
        self.use_repetition_loss = use_repetition_loss
        self.rep_coeff = rep_coeff
        self.anneal_repetition_loss = anneal_repetition_loss
        self.sequence_ngram_n = sequence_ngram_n

        if freeze_embeddings:
            self.gen_embeddings_weight.requires_grad = False
            # Fixed spelling: this was `requries_grad_`, which is not an
            # attribute of the embedding module and raised AttributeError
            # whenever freeze_embeddings was enabled.
            self.gen_word_embedding.requires_grad_(False)

        if not train_generator:
            self.gen_model.requires_grad_(False)
            self.gen_embeddings_weight.requires_grad = False
            # With a frozen generator the caller's generate_until_dot value is
            # overridden.
            generate_until_dot = True

        self.temperature = temperature
        self.train_with_regular_softmax = train_with_regular_softmax
        self.use_straight_through_gumbel_softmax = use_straight_through_gumbel_softmax
        self.anneal_temperature = anneal_temperature
        self.topk_gs = output_several_results_on_every_step
        self.results_each_step = results_each_step

        self.generate_until_dot = generate_until_dot
        self.pass_only_generated = pass_only_generated

        self.train_with_just_sim_loss_for_epochs_num = train_with_just_sim_loss_for_epochs_num
        self.add_cls_after_epoch_num = add_cls_after_epoch_num
        self.use_similarity_btw_question_and_answers = use_similarity_btw_question_and_answers
        self.decouple_gen_and_cls_embs = decouple_gen_and_cls_embs
        self.pass_probabilities_to_classifier = pass_probabilities_to_classifier
        self.zero_generated_out = zero_generated_out
        self.supervised_generator = train_lm_generator
        self.gen_lm_loss_coeff = gen_lm_loss_coeff
        self.train_cls_without_sup_gen = train_cls_without_lm_loss

        if load_weights:
            initializer(self)

        self.metrics = {
            "accuracy": CategoricalAccuracy(),
            "sim_accuracy": CategoricalAccuracy(),
            "kld_loss": Average(),
            "repetition_loss": Average(),
            "classification_loss": Average(),
            "similarity_loss": Average(),
        }