def test_tokenizer_no_lower(self):
    tokenizer = XLNetTokenizer(SAMPLE_VOCAB, do_lower_case=False)
    tokens = tokenizer.tokenize("I was born in 92000, and this is falsé.")
    self.assertListEqual(
        tokens,
        [
            SPIECE_UNDERLINE + "I", SPIECE_UNDERLINE + "was", SPIECE_UNDERLINE + "b", "or", "n",
            SPIECE_UNDERLINE + "in", SPIECE_UNDERLINE + "", "9", "2", "0", "0", "0", ",",
            SPIECE_UNDERLINE + "and", SPIECE_UNDERLINE + "this", SPIECE_UNDERLINE + "is",
            SPIECE_UNDERLINE + "f", "al", "se", ".",
        ],
    )
def test_tokenizer_no_lower(self):
    tokenizer = XLNetTokenizer(SAMPLE_VOCAB, do_lower_case=False)
    tokens = tokenizer.tokenize(u"I was born in 92000, and this is falsé.")
    self.assertListEqual(tokens, [
        SPIECE_UNDERLINE + u'I', SPIECE_UNDERLINE + u'was', SPIECE_UNDERLINE + u'b', u'or', u'n',
        SPIECE_UNDERLINE + u'in', SPIECE_UNDERLINE + u'', u'9', u'2', u'0', u'0', u'0', u',',
        SPIECE_UNDERLINE + u'and', SPIECE_UNDERLINE + u'this', SPIECE_UNDERLINE + u'is',
        SPIECE_UNDERLINE + u'f', u'al', u'se', u'.'])
def load(cls, pretrained_model_name_or_path, tokenizer_class=None, **kwargs):
    """
    Enables loading of different Tokenizer classes with a uniform interface. Either infer the class from
    `pretrained_model_name_or_path` or define it manually via `tokenizer_class`.

    :param pretrained_model_name_or_path: The path of the saved pretrained model or its name (e.g. `bert-base-uncased`)
    :type pretrained_model_name_or_path: str
    :param tokenizer_class: (Optional) Name of the tokenizer class to load (e.g. `BertTokenizer`)
    :type tokenizer_class: str
    :param kwargs:
    :return: Tokenizer
    """
    pretrained_model_name_or_path = str(pretrained_model_name_or_path)
    # guess tokenizer type from name
    if tokenizer_class is None:
        if "albert" in pretrained_model_name_or_path.lower():
            tokenizer_class = "AlbertTokenizer"
        elif "xlm-roberta" in pretrained_model_name_or_path.lower():
            tokenizer_class = "XLMRobertaTokenizer"
        elif "roberta" in pretrained_model_name_or_path.lower():
            tokenizer_class = "RobertaTokenizer"
        elif "distilbert" in pretrained_model_name_or_path.lower():
            tokenizer_class = "DistilBertTokenizer"
        elif "bert" in pretrained_model_name_or_path.lower():
            tokenizer_class = "BertTokenizer"
        elif "xlnet" in pretrained_model_name_or_path.lower():
            tokenizer_class = "XLNetTokenizer"
        else:
            raise ValueError(
                f"Could not infer tokenizer_class from name '{pretrained_model_name_or_path}'. "
                f"Set arg `tokenizer_class` in Tokenizer.load() to one of: AlbertTokenizer, "
                f"XLMRobertaTokenizer, RobertaTokenizer, DistilBertTokenizer, BertTokenizer, XLNetTokenizer."
            )
    logger.info(f"Loading tokenizer of type '{tokenizer_class}'")
    # return appropriate tokenizer object
    ret = None
    if tokenizer_class == "AlbertTokenizer":
        ret = AlbertTokenizer.from_pretrained(pretrained_model_name_or_path, keep_accents=True, **kwargs)
    elif tokenizer_class == "XLMRobertaTokenizer":
        ret = XLMRobertaTokenizer.from_pretrained(pretrained_model_name_or_path, **kwargs)
    elif tokenizer_class == "RobertaTokenizer":
        ret = RobertaTokenizer.from_pretrained(pretrained_model_name_or_path, **kwargs)
    elif tokenizer_class == "DistilBertTokenizer":
        ret = DistilBertTokenizer.from_pretrained(pretrained_model_name_or_path, **kwargs)
    elif tokenizer_class == "BertTokenizer":
        ret = BertTokenizer.from_pretrained(pretrained_model_name_or_path, **kwargs)
    elif tokenizer_class == "XLNetTokenizer":
        ret = XLNetTokenizer.from_pretrained(pretrained_model_name_or_path, keep_accents=True, **kwargs)
    if ret is None:
        raise Exception("Unable to load tokenizer")
    else:
        return ret
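# Minimal usage sketch for the load() helper above. This assumes load() is exposed as a
# classmethod on a FARM-style `Tokenizer` class; the model name and directory are
# illustrative only, not taken from the snippet itself.
tokenizer = Tokenizer.load(pretrained_model_name_or_path="xlnet-base-cased")
# or pin the class explicitly instead of inferring it from the name:
tokenizer = Tokenizer.load("some/local/checkpoint_dir", tokenizer_class="XLNetTokenizer")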
def xlnetTokenizer(*args, **kwargs):
    """
    Instantiate an XLNet sentencepiece tokenizer from a pre-trained vocab file.

    Peculiarities:
        - requires Google sentencepiece (https://github.com/google/sentencepiece)

    Args:
        pretrained_model_name_or_path: Path to pretrained model archive
                                       or one of pre-trained vocab configs below.
                                           * xlnet-large-cased

    Keyword args:
        special_tokens: Special tokens in vocabulary that are not pretrained.
                        Default: None
        max_len: An artificial maximum length to truncate tokenized sequences to;
                 the effective maximum length is always the minimum of this
                 value (if specified) and the underlying model's sequence length.
                 Default: None

    Example:
        >>> import torch
        >>> tokenizer = torch.hub.load('huggingface/pytorch-transformers', 'xlnetTokenizer', 'xlnet-large-cased')

        >>> text = "Who was Jim Henson ?"
        >>> indexed_tokens = tokenizer.encode(text)
    """
    tokenizer = XLNetTokenizer.from_pretrained(*args, **kwargs)
    return tokenizer
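# Follow-up sketch for the hub entry point above: encode a sentence and decode it back.
# The local call is shown for brevity; the torch.hub route in the docstring is equivalent.
tok = xlnetTokenizer('xlnet-large-cased')
ids = tok.encode("Who was Jim Henson ?")
print(tok.decode(ids))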
def test_full_tokenizer(self):
    tokenizer = XLNetTokenizer(SAMPLE_VOCAB, keep_accents=True)

    tokens = tokenizer.tokenize(u'This is a test')
    self.assertListEqual(tokens, [u'▁This', u'▁is', u'▁a', u'▁t', u'est'])
    self.assertListEqual(tokenizer.convert_tokens_to_ids(tokens), [285, 46, 10, 170, 382])

    tokens = tokenizer.tokenize(u"I was born in 92000, and this is falsé.")
    self.assertListEqual(tokens, [
        SPIECE_UNDERLINE + u'I', SPIECE_UNDERLINE + u'was', SPIECE_UNDERLINE + u'b', u'or', u'n',
        SPIECE_UNDERLINE + u'in', SPIECE_UNDERLINE + u'', u'9', u'2', u'0', u'0', u'0', u',',
        SPIECE_UNDERLINE + u'and', SPIECE_UNDERLINE + u'this', SPIECE_UNDERLINE + u'is',
        SPIECE_UNDERLINE + u'f', u'al', u's', u'é', u'.'])
    ids = tokenizer.convert_tokens_to_ids(tokens)
    self.assertListEqual(ids, [8, 21, 84, 55, 24, 19, 7, 0, 602, 347, 347, 347, 3, 12, 66, 46, 72, 80, 6, 0, 4])

    back_tokens = tokenizer.convert_ids_to_tokens(ids)
    self.assertListEqual(back_tokens, [
        SPIECE_UNDERLINE + u'I', SPIECE_UNDERLINE + u'was', SPIECE_UNDERLINE + u'b', u'or', u'n',
        SPIECE_UNDERLINE + u'in', SPIECE_UNDERLINE + u'', u'<unk>', u'2', u'0', u'0', u'0', u',',
        SPIECE_UNDERLINE + u'and', SPIECE_UNDERLINE + u'this', SPIECE_UNDERLINE + u'is',
        SPIECE_UNDERLINE + u'f', u'al', u's', u'<unk>', u'.'])
def test_sequence_builders(self): tokenizer = XLNetTokenizer.from_pretrained("xlnet-base-cased") text = tokenizer.encode("sequence builders", add_special_tokens=False) text_2 = tokenizer.encode("multi-sequence build", add_special_tokens=False) encoded_sentence = tokenizer.build_inputs_with_special_tokens(text) encoded_pair = tokenizer.build_inputs_with_special_tokens(text, text_2) assert encoded_sentence == text + [4, 3] assert encoded_pair == text + [4] + text_2 + [4, 3]
def test_sequence_builders(self): tokenizer = XLNetTokenizer.from_pretrained("xlnet-base-cased") text = tokenizer.encode("sequence builders") text_2 = tokenizer.encode("multi-sequence build") encoded_sentence = tokenizer.add_special_tokens_single_sequence(text) encoded_pair = tokenizer.add_special_tokens_sequence_pair(text, text_2) assert encoded_sentence == text + [4, 3] assert encoded_pair == text + [4] + text_2 + [4, 3]
def setUp(self):
    super().setUp()

    # We have a SentencePiece fixture for testing
    tokenizer = XLNetTokenizer(SAMPLE_VOCAB, keep_accents=True)
    tokenizer.sanitize_special_tokens()
    tokenizer.save_pretrained(self.tmpdirname)
def test_full_tokenizer(self):
    tokenizer = XLNetTokenizer(SAMPLE_VOCAB, keep_accents=True)

    with TemporaryDirectory() as tmpdirname:
        tokenizer.save_pretrained(tmpdirname)

        input_text = u"This is a test"
        output_text = u"This is a test"
        create_and_check_tokenizer_commons(self, input_text, output_text, XLNetTokenizer, tmpdirname)

    tokens = tokenizer.tokenize(u'This is a test')
    self.assertListEqual(tokens, [u'▁This', u'▁is', u'▁a', u'▁t', u'est'])
    self.assertListEqual(tokenizer.convert_tokens_to_ids(tokens), [285, 46, 10, 170, 382])

    tokens = tokenizer.tokenize(u"I was born in 92000, and this is falsé.")
    self.assertListEqual(tokens, [
        SPIECE_UNDERLINE + u'I', SPIECE_UNDERLINE + u'was', SPIECE_UNDERLINE + u'b', u'or', u'n',
        SPIECE_UNDERLINE + u'in', SPIECE_UNDERLINE + u'', u'9', u'2', u'0', u'0', u'0', u',',
        SPIECE_UNDERLINE + u'and', SPIECE_UNDERLINE + u'this', SPIECE_UNDERLINE + u'is',
        SPIECE_UNDERLINE + u'f', u'al', u's', u'é', u'.'])
    ids = tokenizer.convert_tokens_to_ids(tokens)
    self.assertListEqual(ids, [8, 21, 84, 55, 24, 19, 7, 0, 602, 347, 347, 347, 3, 12, 66, 46, 72, 80, 6, 0, 4])

    back_tokens = tokenizer.convert_ids_to_tokens(ids)
    self.assertListEqual(back_tokens, [
        SPIECE_UNDERLINE + u'I', SPIECE_UNDERLINE + u'was', SPIECE_UNDERLINE + u'b', u'or', u'n',
        SPIECE_UNDERLINE + u'in', SPIECE_UNDERLINE + u'', u'<unk>', u'2', u'0', u'0', u'0', u',',
        SPIECE_UNDERLINE + u'and', SPIECE_UNDERLINE + u'this', SPIECE_UNDERLINE + u'is',
        SPIECE_UNDERLINE + u'f', u'al', u's', u'<unk>', u'.'])
def setUp(self):
    super(XLNetTokenizationTest, self).setUp()

    # We have a SentencePiece fixture for testing
    tokenizer = XLNetTokenizer(SAMPLE_VOCAB, keep_accents=True)
    tokenizer.save_pretrained(self.tmpdirname)
def read_instance(input_file, word_alphabet, biword_alphabet, label_alphabet,
                  number_normalized, max_sent_length, bertpath):
    tokenizer = BertTokenizer.from_pretrained(bertpath, do_lower_case=True)
    xlnet_tokenizer = XLNetTokenizer.from_pretrained(
        'transformer_cpt/chinese_xlnet_base_pytorch/', add_special_tokens=False)
    in_lines = open(input_file, 'r', encoding="utf-8").readlines()
    instence_texts = []
    instence_Ids = []
    word_Ids = []
    biword_Ids = []
    label_Ids = []
    words = []
    biwords = []
    labels = []
    for idx in range(len(in_lines)):
        line = in_lines[idx]
        if len(line) > 2:
            pairs = line.strip().split('\t')
            if len(pairs) == 1:
                word = ' '
                # print('word == ')
            else:
                word = pairs[0]
            if number_normalized:
                word = normalize_word(word)
            label = pairs[-1]
            if idx < len(in_lines) - 1 and len(in_lines[idx + 1]) > 2:
                biword = word + in_lines[idx + 1].strip().split()[0]  # todo
                biword = normalize_word(biword)
            else:
                biword = word + NULLKEY
            biwords.append(biword)
            words.append(word.lower())
            labels.append(label)
            word_Ids.append(word_alphabet.get_index(word.lower()))
            biword_index = biword_alphabet.get_index(biword.lower())
            biword_Ids.append(biword_index)
            label_Ids.append(label_alphabet.get_index(label))
        else:
            # TODO: the sentence is simply truncated here; for medical-domain data this is not acceptable
            if len(words) <= 0:
                raise ValueError('len(words) <= 0')
            texts = ['[CLS]'] + words[:max_sent_length] + ['[SEP]']
            bert_text_ids = tokenizer.convert_tokens_to_ids(texts)
            xlnet_text_ids = xlnet_tokenizer.convert_tokens_to_ids(words[:max_sent_length])
            instence_texts.append([words, biwords, labels])
            word_Ids = word_Ids[:max_sent_length]
            biword_Ids = biword_Ids[:max_sent_length]
            label_Ids = label_Ids[:max_sent_length]
            assert len(texts) - 2 == len(word_Ids)
            instence_Ids.append([word_Ids, biword_Ids, label_Ids, bert_text_ids, xlnet_text_ids])
            words = []
            biwords = []
            labels = []
            word_Ids = []
            biword_Ids = []
            label_Ids = []
    return instence_texts, instence_Ids
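# Sketch of the input read_instance() expects, inferred from the code above: one
# character per line, tab-separated from its label, with a blank (short) line ending a
# sentence. The file path and alphabet objects below are hypothetical placeholders.
#
#     患	B-SYM
#     者	I-SYM
#     (blank line ends the sentence)
#
# texts, ids = read_instance("data/train.char.bmes", word_alphabet, biword_alphabet,
#                            label_alphabet, number_normalized=True,
#                            max_sent_length=250, bertpath="bert-base-chinese")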
def load(cls, pretrained_model_name_or_path, tokenizer_class=None, use_fast=False, **kwargs):
    """
    Enables loading of different Tokenizer classes with a uniform interface. Either infer the class from
    `pretrained_model_name_or_path` or define it manually via `tokenizer_class`.

    :param pretrained_model_name_or_path: The path of the saved pretrained model or its name (e.g. `bert-base-uncased`)
    :type pretrained_model_name_or_path: str
    :param tokenizer_class: (Optional) Name of the tokenizer class to load (e.g. `BertTokenizer`)
    :type tokenizer_class: str
    :param use_fast: (Optional, False by default) Indicate if FARM should try to load the fast version of the tokenizer
        (True) or use the Python one (False). Only DistilBERT, BERT and Electra fast tokenizers are supported.
    :type use_fast: bool
    :param kwargs:
    :return: Tokenizer
    """
    pretrained_model_name_or_path = str(pretrained_model_name_or_path)
    # guess tokenizer type from name
    if tokenizer_class is None:
        if "albert" in pretrained_model_name_or_path.lower():
            tokenizer_class = "AlbertTokenizer"
        elif "xlm-roberta" in pretrained_model_name_or_path.lower():
            tokenizer_class = "XLMRobertaTokenizer"
        elif "roberta" in pretrained_model_name_or_path.lower():
            tokenizer_class = "RobertaTokenizer"
        elif 'codebert' in pretrained_model_name_or_path.lower():
            if "mlm" in pretrained_model_name_or_path.lower():
                raise NotImplementedError("MLM part of codebert is currently not supported in FARM")
            else:
                tokenizer_class = "RobertaTokenizer"
        elif "camembert" in pretrained_model_name_or_path.lower() or "umberto" in pretrained_model_name_or_path:
            tokenizer_class = "CamembertTokenizer"
        elif "distilbert" in pretrained_model_name_or_path.lower():
            tokenizer_class = "DistilBertTokenizer"
        elif "bert" in pretrained_model_name_or_path.lower():
            tokenizer_class = "BertTokenizer"
        elif "xlnet" in pretrained_model_name_or_path.lower():
            tokenizer_class = "XLNetTokenizer"
        elif "electra" in pretrained_model_name_or_path.lower():
            tokenizer_class = "ElectraTokenizer"
        elif "word2vec" in pretrained_model_name_or_path.lower() or \
                "glove" in pretrained_model_name_or_path.lower() or \
                "fasttext" in pretrained_model_name_or_path.lower():
            tokenizer_class = "EmbeddingTokenizer"
        elif "minilm" in pretrained_model_name_or_path.lower():
            tokenizer_class = "BertTokenizer"
        elif "dpr-question_encoder" in pretrained_model_name_or_path.lower():
            tokenizer_class = "DPRQuestionEncoderTokenizer"
        elif "dpr-ctx_encoder" in pretrained_model_name_or_path.lower():
            tokenizer_class = "DPRContextEncoderTokenizer"
        else:
            raise ValueError(
                f"Could not infer tokenizer_class from name '{pretrained_model_name_or_path}'. Set "
                f"arg `tokenizer_class` in Tokenizer.load() to one of: AlbertTokenizer, "
                f"XLMRobertaTokenizer, RobertaTokenizer, DistilBertTokenizer, BertTokenizer, or "
                f"XLNetTokenizer.")
    logger.info(f"Loading tokenizer of type '{tokenizer_class}'")
    # return appropriate tokenizer object
    ret = None
    if tokenizer_class == "AlbertTokenizer":
        if use_fast:
            logger.error('AlbertTokenizerFast is not supported! Using AlbertTokenizer instead.')
            ret = AlbertTokenizer.from_pretrained(pretrained_model_name_or_path, keep_accents=True, **kwargs)
        else:
            ret = AlbertTokenizer.from_pretrained(pretrained_model_name_or_path, keep_accents=True, **kwargs)
    elif tokenizer_class == "XLMRobertaTokenizer":
        if use_fast:
            logger.error('XLMRobertaTokenizerFast is not supported! Using XLMRobertaTokenizer instead.')
            ret = XLMRobertaTokenizer.from_pretrained(pretrained_model_name_or_path, **kwargs)
        else:
            ret = XLMRobertaTokenizer.from_pretrained(pretrained_model_name_or_path, **kwargs)
    elif "RobertaTokenizer" in tokenizer_class:  # because it also might be a fast tokenizer we use "in"
        if use_fast:
            logger.error('RobertaTokenizerFast is not supported! Using RobertaTokenizer instead.')
            ret = RobertaTokenizer.from_pretrained(pretrained_model_name_or_path, **kwargs)
        else:
            ret = RobertaTokenizer.from_pretrained(pretrained_model_name_or_path, **kwargs)
    elif "DistilBertTokenizer" in tokenizer_class:  # because it also might be a fast tokenizer we use "in"
        if use_fast:
            ret = DistilBertTokenizerFast.from_pretrained(pretrained_model_name_or_path, **kwargs)
        else:
            ret = DistilBertTokenizer.from_pretrained(pretrained_model_name_or_path, **kwargs)
    elif "BertTokenizer" in tokenizer_class:  # because it also might be a fast tokenizer we use "in"
        if use_fast:
            ret = BertTokenizerFast.from_pretrained(pretrained_model_name_or_path, **kwargs)
        else:
            ret = BertTokenizer.from_pretrained(pretrained_model_name_or_path, **kwargs)
    elif tokenizer_class == "XLNetTokenizer":
        if use_fast:
            logger.error('XLNetTokenizerFast is not supported! Using XLNetTokenizer instead.')
            ret = XLNetTokenizer.from_pretrained(pretrained_model_name_or_path, keep_accents=True, **kwargs)
        else:
            ret = XLNetTokenizer.from_pretrained(pretrained_model_name_or_path, keep_accents=True, **kwargs)
    elif "ElectraTokenizer" in tokenizer_class:  # because it also might be a fast tokenizer we use "in"
        if use_fast:
            ret = ElectraTokenizerFast.from_pretrained(pretrained_model_name_or_path, **kwargs)
        else:
            ret = ElectraTokenizer.from_pretrained(pretrained_model_name_or_path, **kwargs)
    elif tokenizer_class == "EmbeddingTokenizer":
        if use_fast:
            logger.error('EmbeddingTokenizerFast is not supported! Using EmbeddingTokenizer instead.')
            ret = EmbeddingTokenizer.from_pretrained(pretrained_model_name_or_path, **kwargs)
        else:
            ret = EmbeddingTokenizer.from_pretrained(pretrained_model_name_or_path, **kwargs)
    elif tokenizer_class == "CamembertTokenizer":
        if use_fast:
            logger.error('CamembertTokenizerFast is not supported! Using CamembertTokenizer instead.')
            ret = CamembertTokenizer._from_pretrained(pretrained_model_name_or_path, **kwargs)
        else:
            ret = CamembertTokenizer._from_pretrained(pretrained_model_name_or_path, **kwargs)
    elif tokenizer_class == "DPRQuestionEncoderTokenizer" or tokenizer_class == "DPRQuestionEncoderTokenizerFast":
        if use_fast or tokenizer_class == "DPRQuestionEncoderTokenizerFast":
            ret = DPRQuestionEncoderTokenizerFast.from_pretrained(pretrained_model_name_or_path, **kwargs)
        else:
            ret = DPRQuestionEncoderTokenizer.from_pretrained(pretrained_model_name_or_path, **kwargs)
    elif tokenizer_class == "DPRContextEncoderTokenizer" or tokenizer_class == "DPRContextEncoderTokenizerFast":
        if use_fast or tokenizer_class == "DPRContextEncoderTokenizerFast":
            ret = DPRContextEncoderTokenizerFast.from_pretrained(pretrained_model_name_or_path, **kwargs)
        else:
            ret = DPRContextEncoderTokenizer.from_pretrained(pretrained_model_name_or_path, **kwargs)
    if ret is None:
        raise Exception("Unable to load tokenizer")
    else:
        return ret
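# Usage sketch for the use_fast flag in the load() variant above (model names are
# illustrative; behaviour follows the branches in the code, not an external API claim):
fast_tok = Tokenizer.load("bert-base-uncased", use_fast=True)   # resolves to BertTokenizerFast
slow_tok = Tokenizer.load("xlnet-base-cased", use_fast=True)    # logs an error, falls back to XLNetTokenizer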
def load(cls, pretrained_model_name_or_path, tokenizer_class=None, use_fast=False, **kwargs):
    """
    Enables loading of different Tokenizer classes with a uniform interface. Either infer the class from
    model config or define it manually via `tokenizer_class`.

    :param pretrained_model_name_or_path: The path of the saved pretrained model or its name (e.g. `bert-base-uncased`)
    :type pretrained_model_name_or_path: str
    :param tokenizer_class: (Optional) Name of the tokenizer class to load (e.g. `BertTokenizer`)
    :type tokenizer_class: str
    :param use_fast: (Optional, False by default) Indicate if FARM should try to load the fast version of the tokenizer
        (True) or use the Python one (False). Only DistilBERT, BERT and Electra fast tokenizers are supported.
    :type use_fast: bool
    :param kwargs:
    :return: Tokenizer
    """
    pretrained_model_name_or_path = str(pretrained_model_name_or_path)

    if tokenizer_class is None:
        tokenizer_class = cls._infer_tokenizer_class(pretrained_model_name_or_path)

    logger.info(f"Loading tokenizer of type '{tokenizer_class}'")
    # return appropriate tokenizer object
    ret = None
    if tokenizer_class == "AlbertTokenizer":
        if use_fast:
            logger.error('AlbertTokenizerFast is not supported! Using AlbertTokenizer instead.')
            ret = AlbertTokenizer.from_pretrained(pretrained_model_name_or_path, keep_accents=True, **kwargs)
        else:
            ret = AlbertTokenizer.from_pretrained(pretrained_model_name_or_path, keep_accents=True, **kwargs)
    elif tokenizer_class == "XLMRobertaTokenizer":
        if use_fast:
            logger.error('XLMRobertaTokenizerFast is not supported! Using XLMRobertaTokenizer instead.')
            ret = XLMRobertaTokenizer.from_pretrained(pretrained_model_name_or_path, **kwargs)
        else:
            ret = XLMRobertaTokenizer.from_pretrained(pretrained_model_name_or_path, **kwargs)
    elif "RobertaTokenizer" in tokenizer_class:  # because it also might be a fast tokenizer we use "in"
        if use_fast:
            logger.error('RobertaTokenizerFast is not supported! Using RobertaTokenizer instead.')
            ret = RobertaTokenizer.from_pretrained(pretrained_model_name_or_path, **kwargs)
        else:
            ret = RobertaTokenizer.from_pretrained(pretrained_model_name_or_path, **kwargs)
    elif "DistilBertTokenizer" in tokenizer_class:  # because it also might be a fast tokenizer we use "in"
        if use_fast:
            ret = DistilBertTokenizerFast.from_pretrained(pretrained_model_name_or_path, **kwargs)
        else:
            ret = DistilBertTokenizer.from_pretrained(pretrained_model_name_or_path, **kwargs)
    elif "BertTokenizer" in tokenizer_class:  # because it also might be a fast tokenizer we use "in"
        if use_fast:
            ret = BertTokenizerFast.from_pretrained(pretrained_model_name_or_path, **kwargs)
        else:
            ret = BertTokenizer.from_pretrained(pretrained_model_name_or_path, **kwargs)
    elif tokenizer_class == "XLNetTokenizer":
        if use_fast:
            logger.error('XLNetTokenizerFast is not supported! Using XLNetTokenizer instead.')
            ret = XLNetTokenizer.from_pretrained(pretrained_model_name_or_path, keep_accents=True, **kwargs)
        else:
            ret = XLNetTokenizer.from_pretrained(pretrained_model_name_or_path, keep_accents=True, **kwargs)
    elif "ElectraTokenizer" in tokenizer_class:  # because it also might be a fast tokenizer we use "in"
        if use_fast:
            ret = ElectraTokenizerFast.from_pretrained(pretrained_model_name_or_path, **kwargs)
        else:
            ret = ElectraTokenizer.from_pretrained(pretrained_model_name_or_path, **kwargs)
    elif tokenizer_class == "EmbeddingTokenizer":
        if use_fast:
            logger.error('EmbeddingTokenizerFast is not supported! Using EmbeddingTokenizer instead.')
            ret = EmbeddingTokenizer.from_pretrained(pretrained_model_name_or_path, **kwargs)
        else:
            ret = EmbeddingTokenizer.from_pretrained(pretrained_model_name_or_path, **kwargs)
    elif tokenizer_class == "CamembertTokenizer":
        if use_fast:
            logger.error('CamembertTokenizerFast is not supported! Using CamembertTokenizer instead.')
            ret = CamembertTokenizer._from_pretrained(pretrained_model_name_or_path, **kwargs)
        else:
            ret = CamembertTokenizer._from_pretrained(pretrained_model_name_or_path, **kwargs)
    elif tokenizer_class == "DPRQuestionEncoderTokenizer" or tokenizer_class == "DPRQuestionEncoderTokenizerFast":
        if use_fast or tokenizer_class == "DPRQuestionEncoderTokenizerFast":
            ret = DPRQuestionEncoderTokenizerFast.from_pretrained(pretrained_model_name_or_path, **kwargs)
        else:
            ret = DPRQuestionEncoderTokenizer.from_pretrained(pretrained_model_name_or_path, **kwargs)
    elif tokenizer_class == "DPRContextEncoderTokenizer" or tokenizer_class == "DPRContextEncoderTokenizerFast":
        if use_fast or tokenizer_class == "DPRContextEncoderTokenizerFast":
            ret = DPRContextEncoderTokenizerFast.from_pretrained(pretrained_model_name_or_path, **kwargs)
        else:
            ret = DPRContextEncoderTokenizer.from_pretrained(pretrained_model_name_or_path, **kwargs)
    if ret is None:
        raise Exception("Unable to load tokenizer")
    else:
        return ret
def load(cls, pretrained_model_name_or_path, tokenizer_class=None, **kwargs):
    """
    Enables loading of different Tokenizer classes with a uniform interface. Either infer the class from
    `pretrained_model_name_or_path` or define it manually via `tokenizer_class`.

    :param pretrained_model_name_or_path: The path of the saved pretrained model or its name (e.g. `bert-base-uncased`)
    :type pretrained_model_name_or_path: str
    :param tokenizer_class: (Optional) Name of the tokenizer class to load (e.g. `BertTokenizer`)
    :type tokenizer_class: str
    :param kwargs:
    :return: Tokenizer
    """
    pretrained_model_name_or_path = str(pretrained_model_name_or_path)
    # guess tokenizer type from name
    if tokenizer_class is None:
        if "albert" in pretrained_model_name_or_path.lower():
            tokenizer_class = "AlbertTokenizer"
        elif "xlm-roberta" in pretrained_model_name_or_path.lower():
            tokenizer_class = "XLMRobertaTokenizer"
        elif "roberta" in pretrained_model_name_or_path.lower():
            tokenizer_class = "RobertaTokenizer"
        elif 'codebert' in pretrained_model_name_or_path.lower():
            if "mlm" in pretrained_model_name_or_path.lower():
                raise NotImplementedError("MLM part of codebert is currently not supported in FARM")
            else:
                tokenizer_class = "RobertaTokenizer"
        elif "camembert" in pretrained_model_name_or_path.lower() or "umberto" in pretrained_model_name_or_path:
            tokenizer_class = "CamembertTokenizer"
        elif "distilbert" in pretrained_model_name_or_path.lower():
            tokenizer_class = "DistilBertTokenizer"
        elif "bert" in pretrained_model_name_or_path.lower():
            tokenizer_class = "BertTokenizer"
        elif "xlnet" in pretrained_model_name_or_path.lower():
            tokenizer_class = "XLNetTokenizer"
        elif "electra" in pretrained_model_name_or_path.lower():
            tokenizer_class = "ElectraTokenizer"
        elif "word2vec" in pretrained_model_name_or_path.lower() or \
                "glove" in pretrained_model_name_or_path.lower() or \
                "fasttext" in pretrained_model_name_or_path.lower():
            tokenizer_class = "EmbeddingTokenizer"
        elif "minilm" in pretrained_model_name_or_path.lower():
            tokenizer_class = "BertTokenizer"
        else:
            raise ValueError(
                f"Could not infer tokenizer_class from name '{pretrained_model_name_or_path}'. Set "
                f"arg `tokenizer_class` in Tokenizer.load() to one of: AlbertTokenizer, "
                f"XLMRobertaTokenizer, RobertaTokenizer, DistilBertTokenizer, BertTokenizer, or "
                f"XLNetTokenizer.")
    logger.info(f"Loading tokenizer of type '{tokenizer_class}'")
    # return appropriate tokenizer object
    ret = None
    if tokenizer_class == "AlbertTokenizer":
        ret = AlbertTokenizer.from_pretrained(pretrained_model_name_or_path, keep_accents=True, **kwargs)
    elif tokenizer_class == "XLMRobertaTokenizer":
        ret = XLMRobertaTokenizer.from_pretrained(pretrained_model_name_or_path, **kwargs)
    elif tokenizer_class == "RobertaTokenizer":
        ret = RobertaTokenizer.from_pretrained(pretrained_model_name_or_path, **kwargs)
    elif tokenizer_class == "DistilBertTokenizer":
        ret = DistilBertTokenizer.from_pretrained(pretrained_model_name_or_path, **kwargs)
    elif tokenizer_class == "BertTokenizer":
        ret = BertTokenizer.from_pretrained(pretrained_model_name_or_path, **kwargs)
    elif tokenizer_class == "XLNetTokenizer":
        ret = XLNetTokenizer.from_pretrained(pretrained_model_name_or_path, keep_accents=True, **kwargs)
    elif tokenizer_class == "ElectraTokenizer":
        ret = ElectraTokenizer.from_pretrained(pretrained_model_name_or_path, **kwargs)
    elif tokenizer_class == "EmbeddingTokenizer":
        ret = EmbeddingTokenizer.from_pretrained(pretrained_model_name_or_path, **kwargs)
    elif tokenizer_class == "CamembertTokenizer":
        ret = CamembertTokenizer._from_pretrained(pretrained_model_name_or_path, **kwargs)
    if ret is None:
        raise Exception("Unable to load tokenizer")
    else:
        return ret
def get_tokenizer(self, **kwargs):
    return XLNetTokenizer.from_pretrained(self.tmpdirname, **kwargs)
def test_full_tokenizer(self): tokenizer = XLNetTokenizer(SAMPLE_VOCAB, keep_accents=True) tokens = tokenizer.tokenize("This is a test") self.assertListEqual(tokens, ["▁This", "▁is", "▁a", "▁t", "est"]) self.assertListEqual(tokenizer.convert_tokens_to_ids(tokens), [285, 46, 10, 170, 382]) tokens = tokenizer.tokenize("I was born in 92000, and this is falsé.") self.assertListEqual( tokens, [ SPIECE_UNDERLINE + "I", SPIECE_UNDERLINE + "was", SPIECE_UNDERLINE + "b", "or", "n", SPIECE_UNDERLINE + "in", SPIECE_UNDERLINE + "", "9", "2", "0", "0", "0", ",", SPIECE_UNDERLINE + "and", SPIECE_UNDERLINE + "this", SPIECE_UNDERLINE + "is", SPIECE_UNDERLINE + "f", "al", "s", "é", ".", ], ) ids = tokenizer.convert_tokens_to_ids(tokens) self.assertListEqual(ids, [ 8, 21, 84, 55, 24, 19, 7, 0, 602, 347, 347, 347, 3, 12, 66, 46, 72, 80, 6, 0, 4 ]) back_tokens = tokenizer.convert_ids_to_tokens(ids) self.assertListEqual( back_tokens, [ SPIECE_UNDERLINE + "I", SPIECE_UNDERLINE + "was", SPIECE_UNDERLINE + "b", "or", "n", SPIECE_UNDERLINE + "in", SPIECE_UNDERLINE + "", "<unk>", "2", "0", "0", "0", ",", SPIECE_UNDERLINE + "and", SPIECE_UNDERLINE + "this", SPIECE_UNDERLINE + "is", SPIECE_UNDERLINE + "f", "al", "s", "<unk>", ".", ], )
def __init__(self,
             vocab: Vocabulary,
             model_name: str,
             k=12,
             output_dim=1,
             freeze_embeddings=False,
             temperature=1,
             train_with_regular_softmax=False,
             use_similarity=False,
             pass_probabilities_to_classifier=False,
             use_straight_through_gumbel_softmax=False,
             anneal_temperature=False,
             train_generator=True,
             use_kld_loss=False,
             generate_until_dot=False,
             lm_loss_coeff=1,
             use_cls=False,
             pass_only_generated=False,
             sim_coeff=1,
             dropout=0.1,
             train_with_just_sim_loss_for_epochs_num=-1,
             decouple_gen_and_cls_embs=False,
             initializer: InitializerApplicator = InitializerApplicator(),
             load_weights=False,
             zero_generated_out=False,
             output_several_results_on_every_step=False,
             results_each_step=0,
             use_repetition_loss=False,
             sequence_ngram_n=1,
             rep_coeff=1,
             use_similarity_btw_question_and_answers=False,
             anneal_repetition_loss=False,
             anneal_kld_loss=False,
             add_cls_after_epoch_num=-1,
             train_lm_generator=False,
             gen_lm_loss_coeff=1,
             train_cls_without_lm_loss=False):
    super(GeneralGenerationForClassfiication, self).__init__(vocab)
    self.gen_model = XLNetLMHeadModel.from_pretrained(model_name, dropout=dropout)
    self.tokenizer = XLNetTokenizer.from_pretrained(model_name)
    self.gen_word_embedding = self.gen_model.transformer.word_embedding
    self.gen_embeddings_weight = self.gen_word_embedding.weight
    if use_cls:
        self.cls_model = XLNetModel.from_pretrained(model_name)
        self.cls_word_embedding = self.cls_model.word_embedding
        self.cls_embeddings_weight = self.cls_word_embedding.weight
    if use_kld_loss:
        self.freezed_lm = XLNetLMHeadModel.from_pretrained(model_name)
        self.freezed_lm.requires_grad_(False)
    n_embd = 768 if 'base' in model_name else 1024
    self.cls = nn.Linear(n_embd, output_dim, bias=True)
    self.use_cls = use_cls
    self.use_similarity = use_similarity
    self.train_generator = train_generator
    self.dropout = nn.Dropout(dropout)
    self.k = k
    self.use_kld_loss = use_kld_loss
    self.lm_loss_coeff = lm_loss_coeff
    self.anneal_kld_loss = anneal_kld_loss
    self.sim_coeff = sim_coeff
    self.use_repetition_loss = use_repetition_loss
    self.rep_coeff = rep_coeff
    self.anneal_repetition_loss = anneal_repetition_loss
    self.sequence_ngram_n = sequence_ngram_n
    if freeze_embeddings:
        self.gen_embeddings_weight.requires_grad = False
        self.gen_word_embedding.requires_grad_(False)
    if not train_generator:
        self.gen_model.requires_grad_(False)
        self.gen_embeddings_weight.requires_grad = False
        generate_until_dot = True
    self.temperature = temperature
    self.train_with_regular_softmax = train_with_regular_softmax
    self.use_straight_through_gumbel_softmax = use_straight_through_gumbel_softmax
    self.anneal_temperature = anneal_temperature
    self.topk_gs = output_several_results_on_every_step
    self.results_each_step = results_each_step
    self.generate_until_dot = generate_until_dot
    self.pass_only_generated = pass_only_generated
    self.train_with_just_sim_loss_for_epochs_num = train_with_just_sim_loss_for_epochs_num
    self.add_cls_after_epoch_num = add_cls_after_epoch_num
    self.use_similarity_btw_question_and_answers = use_similarity_btw_question_and_answers
    self.decouple_gen_and_cls_embs = decouple_gen_and_cls_embs
    self.pass_probabilities_to_classifier = pass_probabilities_to_classifier
    self.zero_generated_out = zero_generated_out
    self.supervised_generator = train_lm_generator
    self.gen_lm_loss_coeff = gen_lm_loss_coeff
    self.train_cls_without_sup_gen = train_cls_without_lm_loss
    if load_weights:
        initializer(self)
    self.metrics = {
        "accuracy": CategoricalAccuracy(),
        "sim_accuracy": CategoricalAccuracy(),
        "kld_loss": Average(),
        "repetition_loss": Average(),
        "classification_loss": Average(),
        "similarity_loss": Average(),
    }