def test_tokenizer_from_pretrained(self):
    """Smoke-test that AutoTokenizer dispatches to the right tokenizer class.

    Takes one checkpoint name from each archive map (BERT and GPT-2) and
    checks that loading it yields a non-empty tokenizer of the expected type.
    """
    logging.basicConfig(level=logging.INFO)
    cases = [
        (BERT_PRETRAINED_MODEL_ARCHIVE_MAP, BertTokenizer),
        (GPT2_PRETRAINED_MODEL_ARCHIVE_MAP, GPT2Tokenizer),
    ]
    for archive_map, expected_cls in cases:
        # Only the first checkpoint of each family, to keep the test cheap.
        for model_name in list(archive_map.keys())[:1]:
            tokenizer = AutoTokenizer.from_pretrained(model_name)
            self.assertIsNotNone(tokenizer)
            self.assertIsInstance(tokenizer, expected_cls)
            self.assertGreater(len(tokenizer), 0)
def test_wordpiece_to_token_correct(base_model):
    """Verify the wordpiece→token index grouping produced by the transformer.

    Covers four cases: a long sentence with multi-piece words, a short
    sentence, a single token, and the empty string.
    """
    transformer = WordPieceListTransformer(
        name="wordpiece-to-token", base_model=base_model
    )
    tokenizer = AutoTokenizer.from_pretrained(base_model)
    samples = [
        # Long text with words that split into several wordpieces.
        "Some strange text sssasd sdafds dfv vc a more strange",
        "Short sentence",
        "OneToken",
        "",
    ]
    encoded = [tokenizer.encode(text) for text in samples]
    _, context = transformer.transform(encoded)
    mapping = context["wordpiece_to_token_list"]

    expected_long = [
        (1,),
        (2,),
        (3,),
        (4, 5, 6, 7),
        (8, 9, 10),
        (11, 12),
        (13, 14),
        (15,),
        (16,),
        (17,),
    ]
    assert mapping[0] == expected_long
    assert mapping[1] == [(1,), (2,)]
    assert mapping[2] == [(1, 2)]
    assert mapping[3] == []
def tokenizer(*args, **kwargs):
    r"""Thin torch.hub entry point that forwards to ``AutoTokenizer.from_pretrained``.

    Usage via torch.hub::

        import torch
        # Download vocabulary from S3 and cache.
        tokenizer = torch.hub.load('huggingface/pytorch-transformers', 'tokenizer', 'bert-base-uncased')
        # E.g. tokenizer was saved using `save_pretrained('./test/saved_model/')`
        tokenizer = torch.hub.load('huggingface/pytorch-transformers', 'tokenizer', './test/bert_saved_model/')
    """
    return AutoTokenizer.from_pretrained(*args, **kwargs)
def __init__(
    self,
    base_model=None,
    max_seq_len=512,
    do_lower_case=True,
    do_basic_tokenize=False,
    num_of_special_tokens=2,
    **kwargs,
):
    """Initialize the transform and load the pretrained tokenizer.

    Args:
        base_model: Model name or path passed to ``AutoTokenizer.from_pretrained``.
        max_seq_len: Stored on the instance as ``self.max_length``
            (NOTE(review): attribute name differs from the parameter —
            presumably consumers read ``max_length``; confirm).
        do_lower_case: Forwarded to the tokenizer.
        do_basic_tokenize: Forwarded to the tokenizer.
        num_of_special_tokens: Count of special tokens reserved per sequence.
        **kwargs: Passed through to the parent constructor.
    """
    super().__init__(**kwargs)
    self.base_model = base_model
    self.num_of_special_tokens = num_of_special_tokens
    self.max_length = max_seq_len
    self._tokenizer = AutoTokenizer.from_pretrained(
        base_model,
        do_lower_case=do_lower_case,
        do_basic_tokenize=do_basic_tokenize,
    )
def __init__(self, base_model=None, add_to_context=True, **kwargs):
    """Set up the tokenizer and the per-model transform callable.

    Args:
        base_model: Model name or path for ``AutoTokenizer.from_pretrained``
            and for selecting the transform function via ``_get_func``.
        add_to_context: Flag stored on the instance; consumed elsewhere.
        **kwargs: Passed through to the parent constructor.
    """
    super().__init__(**kwargs)
    self.base_model = base_model
    self.add_to_context = add_to_context
    self._tokenizer = AutoTokenizer.from_pretrained(base_model)
    # Dispatch on the base model to pick the concrete transform function.
    self.func = type(self)._get_func(base_model)
network = NumericallyAugmentedBertNet(bert_model, hidden_size=bert_model.config.hidden_size, dropout_prob=0.0, use_gcn=args.use_gcn, gcn_steps=args.gcn_steps) if args.cuda: network.cuda() print("Load from pre path {}.".format(args.pre_path)) network.load_state_dict(torch.load(args.pre_path)) print("Load data from {}.".format(args.inf_path)) if args.eng != 0: tokenizer = RobertaTokenizer.from_pretrained(args.roberta_model) else: # import pdb; pdb.set_trace() tokenizer = AutoTokenizer.from_pretrained(args.roberta_model) if args.tag_mspan: inf_iter = TDropBatchGen(args, tokenizer, TDropReader(tokenizer, passage_length_limit=463, question_length_limit=46, is_eng=args.eng) ._read(args.inf_path)) else: inf_iter = DropBatchGen(args, tokenizer, DropReader(tokenizer, passage_length_limit=463, question_length_limit=46)._read(args.inf_path)) print("Start inference...") result = {} network.eval() # myf = open(args.dump_path, 'w', encoding="utf8") # myf.close() # myf = open(args.dump_path, 'a', encoding="utf8") with torch.no_grad(): for batch in tqdm(inf_iter):
import sys from pytorch_transformers import AutoTokenizer # dataset = sys.argv[1] # model_name_or_path = sys.argv[2] # max_len = int(sys.argv[3]) dataset = r'/media/alvinai/Documents/baidu_nlp/data/GermEval/train.txt.tmp' model_name_or_path = 'bert-base-multilingual-cased' max_len = 128 subword_len_counter = 0 tokenizer = AutoTokenizer.from_pretrained(model_name_or_path) # max_len -= tokenizer.num_special_tokens_to_add() max_len -= 2 with open(dataset, "rt") as f_p: for line in f_p: line = line.rstrip() if not line: print(line) subword_len_counter = 0 continue token = line.split()[0] current_subwords_len = len(tokenizer.tokenize(token)) # Token contains strange control characters like \x96 or \x95 # Just filter out the complete line if current_subwords_len == 0: continue if (subword_len_counter + current_subwords_len) > max_len:
def __init__(self, vocab_path=None, do_lower_case=None):
    """Load the pretrained multilingual-cased BERT tokenizer.

    NOTE(review): ``vocab_path`` and ``do_lower_case`` are accepted for
    backward compatibility but currently unused — the checkpoint name is
    hard-coded. Confirm whether any caller still relies on them.
    """
    self.tokenizer = AutoTokenizer.from_pretrained(
        "bert-base-multilingual-cased")
def __init__(self, model_name_or_path: str) -> None:
    """Construct the reader eagerly (``lazy=False``) with its tokenizer.

    Args:
        model_name_or_path: Checkpoint name or local path forwarded to
            ``AutoTokenizer.from_pretrained``.
    """
    # Eager mode: the parent reads the whole dataset up front.
    super().__init__(lazy=False)
    self.tokenizer = AutoTokenizer.from_pretrained(model_name_or_path)