def test_tokenizer_from_pretrained(self):
    cache_dir = "/tmp/pytorch_pretrained_bert_test/"
    for model_name in list(PRETRAINED_VOCAB_ARCHIVE_MAP.keys())[:1]:
        tokenizer = GPT2Tokenizer.from_pretrained(model_name, cache_dir=cache_dir)
        shutil.rmtree(cache_dir)
        self.assertIsNotNone(tokenizer)
def gpt2Tokenizer(*args, **kwargs):
    """
    Instantiate a GPT-2 BPE tokenizer for OpenAI GPT-2 from a pre-trained/customized vocab file.
    Peculiarities:
        - Byte-level BPE

    Args:
    pretrained_model_name_or_path: Path to pretrained model archive
                                   or one of pre-trained vocab configs below.
                                       * gpt2
    Keyword args:
    special_tokens: Special tokens in vocabulary that are not pretrained ([SEP], [CLS]...)
                    Default: None
    max_len: An artificial maximum length to truncate tokenized sequences to;
             Effective maximum length is always the minimum of this
             value (if specified) and the underlying model's sequence length.
             Default: None

    Example:
        >>> import torch
        >>> tokenizer = torch.hub.load('huggingface/pytorch-pretrained-BERT', 'gpt2Tokenizer', 'gpt2')

        >>> text = "Who was Jim Henson ?"
        >>> indexed_tokens = tokenizer.encode(text)
    """
    tokenizer = GPT2Tokenizer.from_pretrained(*args, **kwargs)
    return tokenizer
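For reference, a minimal usage sketch of this hub entry point, following the docstring above; the decode round trip is illustrative only, and the hub repo name is taken from the docstring, not verified here.

# Minimal sketch of driving the gpt2Tokenizer hub entry point end to end
# (hub repo name as given in the docstring above).
import torch

tokenizer = torch.hub.load('huggingface/pytorch-pretrained-BERT', 'gpt2Tokenizer', 'gpt2')

text = "Who was Jim Henson ?"
indexed_tokens = tokenizer.encode(text)      # byte-level BPE ids
decoded = tokenizer.decode(indexed_tokens)   # round-trip back to a string
print(indexed_tokens)
print(decoded)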
def test_full_tokenizer(self):
    """ Adapted from Sennrich et al. 2015 and https://github.com/rsennrich/subword-nmt """
    vocab = ["l", "o", "w", "e", "r", "s", "t", "i", "d", "n",
             "lo", "low", "er",
             "low", "lowest", "newer", "wider"]
    vocab_tokens = dict(zip(vocab, range(len(vocab))))
    merges = ["#version: 0.2", "l o", "lo w", "e r", ""]
    with open("/tmp/openai_tokenizer_vocab_test.json", "w") as fp:
        fp.write(json.dumps(vocab_tokens))
        vocab_file = fp.name
    with open("/tmp/openai_tokenizer_merges_test.txt", "w") as fp:
        fp.write("\n".join(merges))
        merges_file = fp.name

    tokenizer = GPT2Tokenizer(vocab_file, merges_file, special_tokens=["<unk>", "<pad>"])
    print("encoder", tokenizer.byte_encoder)
    os.remove(vocab_file)
    os.remove(merges_file)

    text = "lower"
    bpe_tokens = ["low", "er"]
    tokens = tokenizer.tokenize(text)
    self.assertListEqual(tokens, bpe_tokens)

    input_tokens = tokens + ["<unk>"]
    input_bpe_tokens = [13, 12, 16]
    self.assertListEqual(
        tokenizer.convert_tokens_to_ids(input_tokens), input_bpe_tokens)

    vocab_file, merges_file, special_tokens_file = tokenizer.save_vocabulary(
        vocab_path="/tmp/")
    tokenizer_2 = GPT2Tokenizer.from_pretrained("/tmp/")
    os.remove(vocab_file)
    os.remove(merges_file)
    os.remove(special_tokens_file)

    self.assertListEqual(
        [tokenizer.encoder, tokenizer.decoder, tokenizer.bpe_ranks,
         tokenizer.special_tokens, tokenizer.special_tokens_decoder],
        [tokenizer_2.encoder, tokenizer_2.decoder, tokenizer_2.bpe_ranks,
         tokenizer_2.special_tokens, tokenizer_2.special_tokens_decoder])
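The same round trip can be sketched outside the unittest harness; the toy vocab and merges below mirror the fixture above, the file paths are arbitrary, and the import path is an assumption about the package these snippets come from.

# Standalone sketch of building a toy byte-level BPE tokenizer from local
# vocab/merges files, mirroring the test fixture above (paths are arbitrary).
import json
import os

from pytorch_pretrained_bert import GPT2Tokenizer  # import path is an assumption

vocab = ["l", "o", "w", "e", "r", "s", "t", "i", "d", "n",
         "lo", "low", "er", "low", "lowest", "newer", "wider"]
merges = ["#version: 0.2", "l o", "lo w", "e r", ""]

vocab_file = "/tmp/toy_gpt2_vocab.json"
merges_file = "/tmp/toy_gpt2_merges.txt"
with open(vocab_file, "w") as fp:
    json.dump(dict(zip(vocab, range(len(vocab)))), fp)
with open(merges_file, "w") as fp:
    fp.write("\n".join(merges))

tokenizer = GPT2Tokenizer(vocab_file, merges_file, special_tokens=["<unk>", "<pad>"])
print(tokenizer.tokenize("lower"))                     # ['low', 'er']
print(tokenizer.convert_tokens_to_ids(["low", "er"]))  # [13, 12]

os.remove(vocab_file)
os.remove(merges_file)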
def __init__(self, cache_size: int = 100) -> None:
    """
    Each cache element is about 8MB, so size accordingly.
    """
    # Cache stores tuples, so default value is a tuple
    self._cache = LRUCache(cache_size, default_value=(None, None))

    self.tokenizer = GPT2Tokenizer.from_pretrained('gpt2')
    self.model = GPT2LMHeadModel.from_pretrained('gpt2')

    # The end of text marker.
    self.END_OF_TEXT = self.tokenizer.encoder["<|endoftext|>"]
def __init__(self, model_name: str = MEDIUM_MODEL, cache_size: int = 0) -> None:
    """
    Each cache element is about 8MB, so size accordingly.
    """
    # No cache is created in this variant; cache_size is unused.
    self.tokenizer = GPT2Tokenizer.from_pretrained('gpt2')
    self.model = GPT2LMHeadModel.from_pretrained(model_name)

    # The end of text marker.
    self.END_OF_TEXT = self.tokenizer.encoder["<|endoftext|>"]
def __init__(self, model_name: str = MEDIUM_MODEL, cache_size: int = 0) -> None:
    """
    Each cache element is about 8MB, so size accordingly.
    """
    # No cache is created in this variant; cache_size is unused.
    # self.tokenizer = T5Tokenizer.from_pretrained(MEDIUM_MODEL)
    # self.model = AutoModelForCausalLM.from_pretrained(MEDIUM_MODEL)
    self.tokenizer = GPT2Tokenizer.from_pretrained(MEDIUM_MODEL)
    self.model = GPT2LMHeadModel.from_pretrained(MEDIUM_MODEL)
def __init__(self, model_name: str = '117M', cache_size: int = 0) -> None:
    """
    Each cache element is about 8MB, so size accordingly.
    """
    # Cache stores tuples, so default value is a tuple
    self._cache = LRUCache(cache_size, default_value=(None, None))

    self.tokenizer = GPT2Tokenizer.from_pretrained('gpt2')
    if model_name == '117M':
        self.model = GPT2LMHeadModel.from_pretrained('gpt2')
    elif model_name == '345M':
        self.model = GPT2LMHeadModel.from_pretrained(MEDIUM_MODEL)
    else:
        exit("model name not found")

    # The end of text marker.
    self.END_OF_TEXT = self.tokenizer.encoder["<|endoftext|>"]
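A rough sketch of how the tokenizer/model pair and END_OF_TEXT marker set up in these constructors might be used for greedy continuation. It is deliberately standalone, since the surrounding class is not shown in these excerpts; the import path and the forward-call convention (logits at index 0 of the output) are assumptions, consistent with the predict() snippet below.

# Greedy-continuation sketch using the same tokenizer/model pairing as the
# constructors above. Standalone on purpose: the wrapper class is not shown
# here, and the forward-call convention (logits at outputs[0]) follows the
# predict() snippet below; adjust if your library version differs.
import torch

from pytorch_pretrained_bert import GPT2Tokenizer, GPT2LMHeadModel  # import path is an assumption

tokenizer = GPT2Tokenizer.from_pretrained('gpt2')
model = GPT2LMHeadModel.from_pretrained('gpt2')
model.eval()

end_of_text = tokenizer.encoder["<|endoftext|>"]
indexed_tokens = tokenizer.encode("Who was Jim Henson ?")

with torch.no_grad():
    for _ in range(20):                   # cap the continuation length
        tokens_tensor = torch.tensor([indexed_tokens])
        logits = model(tokens_tensor)[0]  # [batch, seq_len, vocab]
        next_id = torch.argmax(logits[0, -1, :]).item()
        if next_id == end_of_text:        # stop at the end-of-text marker
            break
        indexed_tokens.append(next_id)

print(tokenizer.decode(indexed_tokens))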
def predict():
    text_return = ''
    if request.method == 'POST':
        post_text = request.form.to_dict()
        for k in post_text:
            print(k)

            # Load pre-trained model tokenizer (vocabulary)
            tokenizer = GPT2Tokenizer.from_pretrained('gpt2')

            # Encode text input
            text = k
            indexed_tokens = tokenizer.encode(text)

            # Convert indexed tokens to a PyTorch tensor
            tokens_tensor = torch.tensor([indexed_tokens])

            # Load pre-trained model (weights)
            model = GPT2LMHeadModel.from_pretrained(
                'https://storage.googleapis.com/allennlp/models/gpt2-345M-dump'
            )

            # Set the model in evaluation mode to deactivate the dropout modules
            model.eval()

            # Run everything on CPU
            tokens_tensor = tokens_tensor.to('cpu')
            model.to('cpu')

            # Predict all tokens
            with torch.no_grad():
                outputs = model(tokens_tensor)
                predictions = outputs[0]

            # Get the predicted next sub-word
            predicted_index = torch.argmax(predictions[0, -1, :]).item()
            predicted_text = tokenizer.decode(indexed_tokens + [predicted_index])

            # Character-diff the completed phrase against the original fragment
            # to isolate the newly predicted word
            original_fragment = text
            completed_phrase = predicted_text
            output_list = [
                li for li in difflib.ndiff(original_fragment, completed_phrase)
                if li[0] != ' '
            ]
            output_list = [s.replace('+', '') for s in output_list[1:]]
            output_word = ""
            for x in output_list:
                output_word += x
            output_word = output_word.replace(" ", "")
            print(output_word)
            text_return = output_word
    return text_return
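The difflib block in predict() recovers the new word by character-diffing the original fragment against the completed phrase. A simpler route, sketched here under the same model setup, is to decode only the newly predicted token id; the helper name is illustrative, and the tokenizer/model are assumed to be built as in predict() above.

# Simpler alternative to the difflib diff above: decode only the new token id
# instead of diffing strings. Pass in a tokenizer/model built as in predict();
# the function name is illustrative.
import torch


def next_word(text, tokenizer, model):
    indexed_tokens = tokenizer.encode(text)
    tokens_tensor = torch.tensor([indexed_tokens])
    with torch.no_grad():
        predictions = model(tokens_tensor)[0]
    predicted_index = torch.argmax(predictions[0, -1, :]).item()
    # Decoding just the new id yields the predicted sub-word (possibly with a
    # leading space, since GPT-2's byte-level BPE folds spaces into tokens).
    return tokenizer.decode([predicted_index]).strip()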