def main():
    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
    enc = GPT2Tokenizer.from_pretrained('gpt2-medium')
    model = GPT2LMHeadModel.from_pretrained('gpt2-medium')  # 'gpt2' is the smaller option
    model.eval()
    model.to(device)

    # The context is the universal trigger
    trigger = "TH PEOPLEMan goddreams Blacks"
    trigger_tokens = enc.encode(trigger)

    # Get samples from the model
    for _ in range(100):
        out = sample_sequence(model=model, length=200,
                              context=trigger_tokens,
                              temperature=1.5, top_k=5)
        out = out[:, len(trigger_tokens):].tolist()
        for i in range(1):
            text = enc.decode(out[i])
            if text.find('<|endoftext|>') > 0:
                text = text[0:text.find('<|endoftext|>')]
            print("Prompt: " + trigger)
            print("Output: " + text)
            print("=" * 80)
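# Hedged sketch of the sample_sequence helper that main() above relies on: a
# minimal top-k ancestral sampler for pytorch_transformers' GPT2LMHeadModel.
# Names and defaults are illustrative, not the original implementation.
import torch
import torch.nn.functional as F

def sample_sequence(model, length, context, temperature=1.0, top_k=0):
    device = next(model.parameters()).device  # follow the model's device
    generated = torch.tensor([context], dtype=torch.long, device=device)
    with torch.no_grad():
        for _ in range(length):
            logits = model(generated)[0][0, -1, :] / temperature  # last-position logits
            if top_k > 0:
                kth = torch.topk(logits, top_k)[0][-1]  # k-th largest logit
                logits[logits < kth] = -float('Inf')
            probs = F.softmax(logits, dim=-1)
            next_token = torch.multinomial(probs, num_samples=1)
            generated = torch.cat([generated, next_token.unsqueeze(0)], dim=1)
    return generated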
def load(self):
    try:
        self._tokenizer = GPT2Tokenizer.from_pretrained("gpt2-medium")
        self._model = GPT2LMHeadModel.from_pretrained("gpt2-medium")
    except Exception:  # avoid a bare except; any load failure leaves the model unset
        self._model = None
    return self
def generate_samples(args):
    """Use a pre-trained GPT-2 model to generate a set of samples from scratch."""
    # Set seed
    set_random_seeds(args.random_seed)

    # Initialize device
    device = torch.device('cuda:0' if torch.cuda.is_available() else 'cpu')
    print('Device: {}'.format(str(device)))

    # Load pre-trained network weights
    print('Loading pre-trained model...')
    config = GPT2Config.from_pretrained(args.gpt2_version)
    model = GPT2LMHeadModel(config)
    model.load_state_dict(torch.load(args.model_load_path))
    model = model.to(device)
    model.eval()

    # Create tokenizer
    tokenizer = GPT2Tokenizer.from_pretrained(args.gpt2_version)

    # Generate some samples
    print('Generating...')
    generated = generate_sequence(model, tokenizer,
                                  context=args.context,
                                  max_length=args.max_gen_len,
                                  num_samples=args.num_samples,
                                  top_k=args.sampling_top_k,
                                  device=device)
    print('Generated samples:')
    print(*generated, sep="\n---\n")
def sample_sequence(cfg, model: JointSentiGPT2Model, tokenizer: GPT2Tokenizer,
                    context_token: torch.Tensor, token_type: torch.Tensor,
                    context_emotion: torch.Tensor, cls_mask: torch.Tensor,
                    emotion_pad=0, speaker1_state=2, decoding_strategy='sampling'):
    cls_mask_extra = torch.LongTensor([[[1], [0], [0], [0]]]).to(cfg.device)
    context_len = context_token.shape[1]
    generated = context_token
    past, pred_response_emotion = None, None
    result = []
    for step in range(cfg.max_decode_length):
        inputs = {
            'input_ids': generated,
            'token_type_ids': token_type,
            'emotion_ids': context_emotion,
            'pred_response_emotion_vector': pred_response_emotion,
            'cls_mask': cls_mask,
            'past': past,
            'decoding': True
        }
        # Note: 'past' caches hidden states, as with GPT-2/Transfo-XL/XLNet
        outputs = model.decoding(**inputs)
        pred_response_emotion, past = outputs[1:]
        next_token_logits = outputs[0][0, -1, :] / cfg.sampling_temperature
        if decoding_strategy == 'sampling':
            filtered_logits = top_k_top_p_filtering(next_token_logits,
                                                    top_k=cfg.top_k,
                                                    top_p=cfg.top_p)
            prob = F.softmax(filtered_logits, dim=-1)
            next_token = torch.multinomial(prob, num_samples=1)
        else:
            next_token = torch.argmax(next_token_logits, dim=-1)
            next_token = next_token.unsqueeze(0)
        if next_token.item() == tokenizer.eos_token_id and step >= cfg.min_decode_length:  # fixed: was cfg_gpt, an undefined name
            break
        result.append(next_token.item())
        # With cached 'past', only the newly sampled token is fed at the next step
        generated = next_token.unsqueeze(0)
        token_type = torch.LongTensor([[speaker1_state]]).to(cfg.device)
        cls_mask = torch.cat((cls_mask, cls_mask_extra), dim=-1)
    result = [token_id for token_id in result
              if token_id not in cfg.special_id_list]
    text = tokenizer.decode(result, skip_special_tokens=True,
                            clean_up_tokenization_spaces=False)
    text = text.replace("\n", "").replace("\r", "")
    return text
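# The top_k_top_p_filtering helper used above is not defined in this snippet; a
# minimal sketch following the widely circulated top-k / nucleus filtering
# recipe is below (hedged: assumes 1-D logits, as in the call above).
import torch
import torch.nn.functional as F

def top_k_top_p_filtering(logits, top_k=0, top_p=0.0, filter_value=-float('Inf')):
    if top_k > 0:
        # Remove all tokens whose logit is below the k-th largest logit
        indices_to_remove = logits < torch.topk(logits, top_k)[0][..., -1, None]
        logits[indices_to_remove] = filter_value
    if top_p > 0.0:
        # Remove tokens with cumulative probability above the threshold
        sorted_logits, sorted_indices = torch.sort(logits, descending=True)
        cumulative_probs = torch.cumsum(F.softmax(sorted_logits, dim=-1), dim=-1)
        sorted_indices_to_remove = cumulative_probs > top_p
        # Shift right so the first token above the threshold is kept as well
        sorted_indices_to_remove[..., 1:] = sorted_indices_to_remove[..., :-1].clone()
        sorted_indices_to_remove[..., 0] = 0
        indices_to_remove = sorted_indices[sorted_indices_to_remove]
        logits[indices_to_remove] = filter_value
    return logits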
def get_tokenizer(model_path=None, name="bert"):
    tokenizer = None
    if name == "bert":
        from pytorch_transformers import BertTokenizer
        tokenizer = BertTokenizer.from_pretrained(model_path)
        tokenizer = Seq2SeqAdapterTokenizer(tokenizer)
    elif name == "gpt2":
        from pytorch_transformers import GPT2Tokenizer
        tokenizer = GPT2Tokenizer.from_pretrained(model_path)
        tokenizer = Seq2SeqAdapterTokenizer(tokenizer)
    elif name == "xlnet":
        from pytorch_transformers import XLNetTokenizer
        tokenizer = XLNetTokenizer.from_pretrained(model_path)
        tokenizer = Seq2SeqAdapterTokenizer(tokenizer)
    elif name == "roberta":
        tokenizer = RoBertaTokenizer(model_path)
    elif name == "simple":
        tokenizer = SimpleTokenizer()
    elif name == "spacy":
        tokenizer = SpacyTokenizer()
    elif name == "corenlp":
        tokenizer = CoreNLPTokenizer()
    if tokenizer is None:
        raise RuntimeError("tokenizer:{} is not supported!".format(name))
    return tokenizer
def get_tokenizer(tokenizer_name):
    log.info(f"\tLoading Tokenizer {tokenizer_name}")
    if tokenizer_name.startswith("bert-"):
        do_lower_case = tokenizer_name.endswith("uncased")
        tokenizer = BertTokenizer.from_pretrained(tokenizer_name, do_lower_case=do_lower_case)
    elif tokenizer_name.startswith("roberta-"):
        tokenizer = RobertaTokenizer.from_pretrained(tokenizer_name)
    elif tokenizer_name.startswith("xlnet-"):
        do_lower_case = tokenizer_name.endswith("uncased")
        tokenizer = XLNetTokenizer.from_pretrained(tokenizer_name, do_lower_case=do_lower_case)
    elif tokenizer_name.startswith("openai-gpt"):
        tokenizer = OpenAIGPTTokenizer.from_pretrained(tokenizer_name)
    elif tokenizer_name.startswith("gpt2"):
        tokenizer = GPT2Tokenizer.from_pretrained(tokenizer_name)
    elif tokenizer_name.startswith("transfo-xl-"):
        # TransformerXL is trained on data pretokenized with MosesTokenizer
        tokenizer = MosesTokenizer()
    elif tokenizer_name.startswith("xlm-"):
        tokenizer = XLMTokenizer.from_pretrained(tokenizer_name)
    elif tokenizer_name == "MosesTokenizer":
        tokenizer = MosesTokenizer()
    elif tokenizer_name == "SplitChars":
        tokenizer = SplitCharsTokenizer()
    elif tokenizer_name == "":
        tokenizer = SpaceTokenizer()
    else:
        tokenizer = None
    return tokenizer
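# Hedged usage sketch for get_tokenizer above (assumes the tokenizer classes it
# dispatches to are importable): the name prefix picks the class, and for
# BERT/XLNet the "uncased" suffix also turns on lowercasing.
for name in ("bert-base-uncased", "roberta-base", "gpt2-medium", "MosesTokenizer"):
    tok = get_tokenizer(name)
    print(name, type(tok).__name__)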
def __init__(
    self,
    pretrained_model_name_or_path: str = "gpt2-medium",
    layers: str = "1",
    pooling_operation: str = "first_last",
    use_scalar_mix: bool = False,
):
    """OpenAI GPT-2 embeddings, as proposed in Radford et al. 2019.

    :param pretrained_model_name_or_path: name or path of OpenAI GPT-2 model
    :param layers: comma-separated list of layers
    :param pooling_operation: defines pooling operation for subwords
    :param use_scalar_mix: defines the usage of scalar mix for specified layer(s)
    """
    super().__init__()

    self.tokenizer = GPT2Tokenizer.from_pretrained(pretrained_model_name_or_path)
    self.model = GPT2Model.from_pretrained(
        pretrained_model_name_or_path=pretrained_model_name_or_path,
        output_hidden_states=True
    )
    self.name = pretrained_model_name_or_path
    self.layers: List[int] = [int(layer) for layer in layers.split(",")]
    self.pooling_operation = pooling_operation
    self.use_scalar_mix = use_scalar_mix
    self.static_embeddings = True

    dummy_sentence: Sentence = Sentence()
    dummy_sentence.add_token(Token("hello"))
    embedded_dummy = self.embed(dummy_sentence)
    self.__embedding_length: int = len(
        embedded_dummy[0].get_token(1).get_embedding()
    )
def __init__(self, model_path='gpt2', top_k=None, top_p=None, device=None):
    super().__init__(device, top_k=top_k, top_p=top_p)
    self.model_path = model_path
    self.tokenizer = GPT2Tokenizer.from_pretrained(model_path)
    self.model = GPT2LMHeadModel.from_pretrained(model_path)
    self.model.to(device)
    self.model.eval()
def __init__(self):
    self.device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
    # TODO: maybe use a smaller GPT-2 model separately
    self.tokenizer = GPT2Tokenizer.from_pretrained('gpt2-medium')
    self.model = GPT2LMHeadModel.from_pretrained('gpt2-medium')
    self.model.to(self.device)
    self.model.eval()
def get_special_token_ids(cfg, tokenizer: GPT2Tokenizer):
    special_id_list = []
    for key, value in cfg.SPECIAL_tokens.items():
        if key == 'additional_special_tokens':
            special_id_list.extend(value)  # this entry is itself a list of tokens
        else:
            special_id_list.append(value)
    return tokenizer.convert_tokens_to_ids(special_id_list)
def __init__(self, **kwargs):
    self.beam_width = kwargs['beam_width']
    self.beam_depth = kwargs['beam_depth']
    self.timeout = kwargs['timeout']
    random.seed(kwargs['seed'])  # fixed: assigning to random.seed would overwrite the function instead of seeding
    self.model = GPT2LMHeadModel.from_pretrained('gpt2')
    self.tokenizer = GPT2Tokenizer.from_pretrained('gpt2')
def __init__(self, model_path='gpt2', device='cuda'):
    super().__init__()
    self.model_path = model_path
    self.device = device
    self.tokenizer = GPT2Tokenizer.from_pretrained(model_path)
    self.model = GPT2LMHeadModel.from_pretrained(model_path)
    self.model.to(device)
    self.model.eval()
def __init__(self):
    super(GPT2, self).__init__()
    self.model_type = "GPT2"
    # Load pre-trained model tokenizer (vocabulary)
    self.tokenizer = GPT2Tokenizer.from_pretrained("gpt2")
    # Load pre-trained model (weights)
    self.model = GPT2LMHeadModel.from_pretrained("gpt2")
def gpt2(self, prep_obj):
    self.vector_corpus = []
    model = GPT2LMHeadModel.from_pretrained('gpt2')
    token_maker = GPT2Tokenizer.from_pretrained('gpt2')
    for tweet in prep_obj.detokenized_corpus:
        text_index = token_maker.encode(tweet)
        # Look up the static input (wte) embeddings for each BPE token
        vector = model.transformer.wte.weight[text_index, :]
        vector = vector.detach().numpy()
        vector = np.sum(vector, axis=0)
        self.vector_corpus.append(vector)
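# Standalone sketch of the same wte-lookup idea as gpt2() above (hedged: the
# sentence is illustrative; assumes pytorch_transformers is installed): sum the
# static GPT-2 input embeddings of a text into a single vector.
import numpy as np
from pytorch_transformers import GPT2Tokenizer, GPT2LMHeadModel

model = GPT2LMHeadModel.from_pretrained('gpt2')
tokenizer = GPT2Tokenizer.from_pretrained('gpt2')
ids = tokenizer.encode("hello world")
vector = model.transformer.wte.weight[ids, :].detach().numpy().sum(axis=0)
print(vector.shape)  # (768,) for the base GPT-2 model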
def __init__(self, chunk_size=64, max_length=35, device=torch.device('cuda:0')):
    super(GPT2Client, self).__init__()
    self.chunk_size = chunk_size  # fixed typo: was chunck_size
    self.tokenizer = GPT2Tokenizer.from_pretrained('gpt2')
    self.max_length = max_length
    # Load the model
    self.model = GPT2Model.from_pretrained('gpt2')
    self.model.eval()
    self.device = device
    # Move the model to the device
    self.model.to(self.device)
def add_pytorch_transformers_vocab(vocab, tokenizer_name):
    """Add vocabulary from tokenizers in pytorch_transformers for use with pre-tokenized data.

    These tokenizers have a convert_tokens_to_ids method, but this doesn't do
    anything special, so we can just use the standard indexers.
    """
    do_lower_case = "uncased" in tokenizer_name

    if tokenizer_name.startswith("bert-"):
        tokenizer = BertTokenizer.from_pretrained(tokenizer_name, do_lower_case=do_lower_case)
    elif tokenizer_name.startswith("roberta-"):
        tokenizer = RobertaTokenizer.from_pretrained(tokenizer_name)
    elif tokenizer_name.startswith("xlnet-"):
        tokenizer = XLNetTokenizer.from_pretrained(tokenizer_name, do_lower_case=do_lower_case)
    elif tokenizer_name.startswith("openai-gpt"):
        tokenizer = OpenAIGPTTokenizer.from_pretrained(tokenizer_name)
    elif tokenizer_name.startswith("gpt2"):
        tokenizer = GPT2Tokenizer.from_pretrained(tokenizer_name)
    elif tokenizer_name.startswith("transfo-xl-"):
        tokenizer = TransfoXLTokenizer.from_pretrained(tokenizer_name)
    elif tokenizer_name.startswith("xlm-"):
        tokenizer = XLMTokenizer.from_pretrained(tokenizer_name)

    if (tokenizer_name.startswith("openai-gpt")
            or tokenizer_name.startswith("gpt2")
            or tokenizer_name.startswith("transfo-xl-")):  # fixed typo: was "transo-xl-", which never matched
        tokenizer.add_special_tokens({
            "bos_token": "<start>",
            "sep_token": "<delim>",
            "cls_token": "<extract>"
        })
    # TODO: this is another place that can be simplified by the "model-before-preprocess"
    # reorganization; we could pass the tokenizer created in the model here, see issue <TBD>
    vocab_size = len(tokenizer)
    # do not use tokenizer.vocab_size: it does not include newly added tokens
    if tokenizer_name.startswith("roberta-"):
        if tokenizer.convert_ids_to_tokens(vocab_size - 1) is None:
            vocab_size -= 1
        else:
            log.info("Time to delete vocab_size-1 in preprocess.py !!!")
        # due to a quirk in huggingface's file, the last token of RobertaTokenizer is None;
        # remove this when they fix the problem

    ordered_vocab = tokenizer.convert_ids_to_tokens(range(vocab_size))
    log.info("Added pytorch_transformers vocab (%s): %d tokens", tokenizer_name, len(ordered_vocab))
    for word in ordered_vocab:
        vocab.add_token_to_namespace(word, input_module_tokenizer_name(tokenizer_name))
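# Hedged illustration of the vocab-size caveat noted above: in
# pytorch_transformers, len(tokenizer) grows with newly added tokens while
# tokenizer.vocab_size stays at the base vocabulary size.
from pytorch_transformers import GPT2Tokenizer

tok = GPT2Tokenizer.from_pretrained("gpt2")
print(tok.vocab_size, len(tok))  # 50257 50257
tok.add_special_tokens({"bos_token": "<start>", "sep_token": "<delim>", "cls_token": "<extract>"})
print(tok.vocab_size, len(tok))  # 50257 50260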
def __init__(self, max_size=None, vocab_file=None):
    from pytorch_transformers import GPT2Tokenizer
    self.tokenizer = GPT2Tokenizer.from_pretrained('gpt2')
    self.EOT = self.tokenizer.encoder['<|endoftext|>']
    self.max_size = max_size
    self.vocab_file = vocab_file

    # Pad the vocabulary up to the next multiple of 8 with filler tokens
    pad = 8
    vocab_size = len(self.tokenizer)
    padded_vocab_size = (vocab_size + pad - 1) // pad * pad
    for i in range(0, padded_vocab_size - vocab_size):
        token = f'madeupword{i:09d}'
        self.tokenizer.add_tokens([token])
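# Worked example of the padding arithmetic above (hedged: 50257 is the GPT-2
# base vocabulary size):
#     vocab_size        = 50257
#     padded_vocab_size = (50257 + 8 - 1) // 8 * 8 = 50264
# so 7 filler tokens, madeupword000000000 .. madeupword000000006, are added,
# making the embedding table size divisible by 8.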
def __init__(self):
    if not os.path.exists(AGGREGATOR_DIR):
        os.makedirs(AGGREGATOR_DIR)
    if not os.path.isfile(AGGREGATOR_2015_2016):
        print("Downloading aggregators from s3...")
        wget.download(AGGREGATOR_2015_2016_URL, AGGREGATOR_2015_2016,
                      bar=self._download_progress_bar)
    if not os.path.isfile(AGGREGATOR_2015_2017):
        print("Downloading aggregators from s3...")
        wget.download(AGGREGATOR_2015_2017_URL, AGGREGATOR_2015_2017,
                      bar=self._download_progress_bar)
    if not os.path.isfile(AGGREGATOR_2015_2016_8_dim):
        print("Downloading aggregators from s3...")
        wget.download(AGGREGATOR_2015_2016_8_dim_URL, AGGREGATOR_2015_2016_8_dim,
                      bar=self._download_progress_bar)
    if not os.path.isfile(AGGREGATOR_2015_2017_8_dim):
        print("Downloading aggregators from s3...")
        wget.download(AGGREGATOR_2015_2017_8_dim_URL, AGGREGATOR_2015_2017_8_dim,
                      bar=self._download_progress_bar)
    if not os.path.isfile(ROBERTA_STS_PATH + '/checkpoint_best.pt'):
        print("Downloading ROBERTA STS model from s3...")
        wget.download(ROBERTA_STS_URL, ROBERTA_STS_PATH + '/checkpoint_best.pt',
                      bar=self._download_progress_bar)
    if not os.path.isfile(ROBERTA_MNLI_PATH + '/model_mnli.pt'):
        print("Downloading ROBERTA MNLI model from s3...")
        wget.download(ROBERTA_MNLI_URL, ROBERTA_MNLI_PATH + '/model_mnli.pt',
                      bar=self._download_progress_bar)

    self.roberta_STS = RobertaModel.from_pretrained(
        checkpoint_file='checkpoint_best.pt',
        model_name_or_path=ROBERTA_STS_PATH)
    self.roberta_STS.eval()
    self.roberta_MNLI = RobertaModel.from_pretrained(
        checkpoint_file='model_mnli.pt',
        model_name_or_path=ROBERTA_MNLI_PATH)
    self.roberta_MNLI.eval()
    self.tokenizer = GPT2Tokenizer.from_pretrained('gpt2')
    self.gpt_model = GPT2LMHeadModel.from_pretrained('gpt2')
    self.agg_one = load(AGGREGATOR_2015_2016)
    self.agg_two = load(AGGREGATOR_2015_2017)
    self.agg_one_8_dim = load(AGGREGATOR_2015_2016_8_dim)
    self.agg_two_8_dim = load(AGGREGATOR_2015_2017_8_dim)
def __init__(self, opt, shared=None):
    super(TransformerAgent, self).__init__(opt, shared)

    args = AttrDict(opt)  # to keep most commands identical to the interact.py script
    self.args = args

    logging.basicConfig(level=logging.INFO)
    self.logger = logging.getLogger(__file__)
    self.logger.info(pformat(args))

    random.seed(args.seed)
    torch.random.manual_seed(args.seed)
    torch.cuda.manual_seed(args.seed)

    if shared is None:
        self.logger.info("Get pretrained model and tokenizer")
        if args.model_checkpoint == "":
            args.model_checkpoint = download_pretrained_model()
        if 'gpt2' in args.model_checkpoint:
            self.tokenizer = GPT2Tokenizer.from_pretrained(args.model_checkpoint)
            model_class = GPT2DoubleHeadsModel if self.args.eval_type == "hits@1" else GPT2LMHeadModel
        else:
            self.tokenizer = OpenAIGPTTokenizer.from_pretrained(args.model_checkpoint)
            model_class = OpenAIGPTDoubleHeadsModel if self.args.eval_type == "hits@1" else OpenAIGPTLMHeadModel
        self.model_checkpoint = model_class.from_pretrained(args.model_checkpoint)
        self.model_checkpoint.to(args.device)

        self.logger.info("Build BPE prefix dictionary")
        convai_dict = build_dict()
        assert len(convai_dict) == 19304
        self.prefix2words = self.get_prefix2words(convai_dict)
    else:
        self.model_checkpoint = shared['model']
        self.tokenizer = shared['tokenizer']
        self.prefix2words = shared['prefix2words']

    add_special_tokens_(self.model_checkpoint, self.tokenizer)
    self.special_tokens_ids = self.tokenizer.convert_tokens_to_ids(SPECIAL_TOKENS)

    self.persona = []
    self.history = []
    self.labels = []

    self.reset()
def gpt_predictor(n=3):
    if request.method == 'GET':
        return render_template('index.html', value='hi')
    if request.method == 'POST':
        tok = GPT2Tokenizer.from_pretrained("gpt2")
        model = GPT2LMHeadModel.from_pretrained("gpt2")
        text = request.form.get('text')
        n = request.form.get('n')
        for i in range(int(n)):
            pred = get_pred(text, model, tok)
            if pred == "<|endoftext|>":
                break
            else:
                text += pred
        return render_template('result.html', text=text)
def __init__(self, type, file_prefix=None):
    self.type = type
    self.bos_token = None
    self.eos_token = None
    self.unk_token = None
    self.sep_token = None
    self.pad_token = None
    self.cls_token = None
    self.mask_token = None

    if type == "gpt2":
        from pytorch_transformers import GPT2Tokenizer
        self._tokenizer = GPT2Tokenizer.from_pretrained('gpt2')
        # add <PAD> special token
        self._tokenizer.add_special_tokens({'pad_token': '<PAD>'})
        for i in range(len(self._tokenizer)):
            token = self._tokenizer.convert_ids_to_tokens(i)
        # using _xxx_token instead of xxx_token to silence GPT2Tokenizer "not set" errors
        if self._tokenizer._bos_token:
            self.bos_token = self._tokenizer.bos_token
        if self._tokenizer._eos_token:
            self.eos_token = self._tokenizer.eos_token
        if self._tokenizer._unk_token:
            self.unk_token = self._tokenizer.unk_token
        if self._tokenizer._sep_token:
            self.sep_token = self._tokenizer.sep_token
        if self._tokenizer._pad_token:
            self.pad_token = self._tokenizer.pad_token
        if self._tokenizer._cls_token:
            self.cls_token = self._tokenizer.cls_token
        if self._tokenizer._mask_token:
            self.mask_token = self._tokenizer.mask_token

    if type == "bpe":
        self.bpe_vocab_size = 0
        self.bos_token = "<BOS>"
        self.eos_token = "<EOS>"
        self.unk_token = "<UNK>"
        self.sep_token = "<SEP>"
        self.pad_token = "<PAD>"
        self.cls_token = "<CLS>"
        self.mask_token = "<MASK>"
        self._recreate_special_tokens()

    if file_prefix:
        self.load(file_prefix)
def get_model(seed=1234, model_name='gpt2'):
    np.random.seed(seed)
    torch.random.manual_seed(seed)
    torch.cuda.manual_seed(seed)
    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

    enc = GPT2Tokenizer.from_pretrained(model_name)
    enc.unk_token = None
    enc.bos_token = None
    enc.eos_token = None

    model = GPT2LMHeadModel.from_pretrained(model_name)
    model.to(device)
    model.eval()
    # model.double()
    return enc, model
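# Hedged usage sketch for get_model above: greedy next-token prediction with
# the returned (tokenizer, model) pair. The prompt is illustrative only.
import torch

enc, model = get_model(seed=1234, model_name='gpt2')
device = next(model.parameters()).device
input_ids = torch.tensor([enc.encode("The quick brown fox")]).to(device)
with torch.no_grad():
    logits = model(input_ids)[0]  # pytorch_transformers returns a tuple; logits come first
print(enc.decode([int(torch.argmax(logits[0, -1]))]))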
def predict_next_word(phrase):
    """Process the phrase with GPT-2 and return it extended by the predicted next sub-word.

    :param phrase: input text to extend
    :return: the phrase plus the single most likely next token
    """
    # Load pre-trained model tokenizer (vocabulary)
    tokenizer = GPT2Tokenizer.from_pretrained('gpt2')

    # Tokenize the input phrase
    tokenized_phrase = tokenizer.encode(phrase)
    print("Tokenized Phrase: {}".format(tokenized_phrase))

    # Convert the tokenized phrase to a PyTorch tensor
    tokenized_phrase_tensor = torch.tensor([tokenized_phrase])
    print("Tokenized Phrase Tensor: {}".format(tokenized_phrase_tensor))

    # Load the pre-trained model (weights and biases)
    model = GPT2LMHeadModel.from_pretrained('gpt2')

    # Set the model to evaluation mode to deactivate dropout
    model.eval()

    try:
        tokenized_phrase_tensor = tokenized_phrase_tensor.to('cuda')
        model.to('cuda')
        print("CUDA present. Running code on GPU.")
    except AssertionError:
        print("Torch not compiled with CUDA. Running on CPU.")
    except Exception:
        print("CUDA not present. Running on CPU.")

    # Predict all tokens
    with torch.no_grad():
        outputs = model(tokenized_phrase_tensor)
        print("Outputs: {}".format(outputs))
        predictions = outputs[0]
        print("Prediction: {}".format(predictions))

    # Get the predicted next sub-word
    predicted_index = torch.argmax(predictions[0, -1, :]).item()
    predicted_text = tokenizer.decode(tokenized_phrase + [predicted_index])
    return predicted_text
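# Hedged usage sketch for predict_next_word above; the prompt is illustrative.
if __name__ == '__main__':
    extended = predict_next_word("The capital of France is")
    print(extended)  # the input phrase plus one predicted sub-word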
def main():
    nltk.data.path.append('/data/chuancen/pip_package/nltk_data')
    print(nltk.__version__)

    file_handler = open('../../result/reference_SR_only.txt', 'r')
    ref = file_handler.readlines()
    file_handler = open('../../result/SR_only.txt', 'r')
    hyp = file_handler.readlines()
    print("#ref{} #hyp{}".format(len(ref), len(hyp)))

    # Average METEOR over aligned reference/hypothesis pairs
    meteor_sum = 0
    for i in range(min(len(ref), len(hyp))):
        meteor_sum += meteor_score([ref[i]], hyp[i])
    meteor_sum /= min(len(ref), len(hyp))
    print(meteor_sum)

    tokenizer = GPT2Tokenizer.from_pretrained('/data/chuancen/LIT/models/345M_Alex')
def load_model(args):
    """
    Load the model and the corresponding tokenizer from pre-trained weights.

    :param args: The command line arguments.
    :return model: The main model.
    :return tokenizer: The tokenizer that comes with the main model.
    """
    USE_CUDA = torch.cuda.is_available()

    # ====== Load GPT2 model ========
    model_dir = '../models/' + args.model_dir
    model = GPT2LMHeadModel.from_pretrained(model_dir)
    # model = GPT2LMHeadModel.from_pretrained('gpt2')
    if USE_CUDA:
        model.cuda()
    tokenizer = GPT2Tokenizer.from_pretrained(model_dir)
    # tokenizer = GPT2Tokenizer.from_pretrained('gpt2')
    print('Model loaded.')
    return model, tokenizer
def evaluate_ppl_gpt(args):
    """Evaluate on raw text; use this with GPT-2, which has its own tokenizer."""
    if args.expanded_dataset:
        path = ".data/stories/story_commonsense/torchtext_expanded"
    else:
        path = ".data/stories/story_commonsense/torchtext"

    # Data
    test_src = [line.rstrip('\n') for line in open(path + "/test.src")]
    test_trg = [line.rstrip('\n') for line in open(path + "/test.trg")]

    # Model
    enc = GPT2Tokenizer.from_pretrained('gpt2')
    model = GPT2LMHeadModel.from_pretrained('gpt2')
    model.to(device)
    model.eval()

    loss = 0
    batch_size = 1
    print("Evaluating test set with GPT2")
    for i in trange(len(test_src)):
        src, trg = test_src[i], test_trg[i]
        context = enc.encode(src)
        target = enc.encode(trg)
        length = len(target)

        # Generate prediction
        out = utils.sample_sequence(model, length, batch_size=1,
                                    context=context, top_k=10, device=device)
        out = out[:, len(context):]

        # Get model loss
        target = torch.tensor([target]).to(device)
        with torch.no_grad():
            l = model(out, labels=target)[0]  # first element of the returned tuple is the LM loss
        loss += float(l)

    av_loss = loss / len(test_src)  # fixed: was len(loss), but loss is a float
    print(f"ppl: {math.exp(av_loss):.04f}")
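# Worked example of the perplexity formula above (hedged: the loss value is
# illustrative). Perplexity is the exponential of the average per-example
# cross-entropy, so an average loss of 3.0 nats gives:
import math
print(f"ppl: {math.exp(3.0):.04f}")  # 20.0855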
def __init__(self, gpt2_model, language, name, loi, cuda=False):
    super(GPT2, self).__init__()
    # Load pre-trained model tokenizer (vocabulary)
    # Crucially, do not do basic tokenization; PTB is tokenized. Just do wordpiece tokenization.
    if gpt2_model not in ['small', 'medium']:
        raise ValueError("GPT2 model must be small or medium")
    self.model = GPT2Model.from_pretrained(
        'gpt2{}'.format('' if gpt2_model == 'small' else '-medium'),
        output_hidden_states=True)
    self.tokenizer = GPT2Tokenizer.from_pretrained(
        'gpt2{}'.format('' if gpt2_model == 'small' else '-medium'))
    self.language = language
    self.LAYER_COUNT = parameters[gpt2_model]['LAYER_COUNT']
    self.FEATURE_COUNT = parameters[gpt2_model]['FEATURE_COUNT']
    self.name = name
    # loi: layers of interest
    self.loi = np.array(loi) if loi else np.arange(parameters[gpt2_model]['LAYER_COUNT'])
    self.cuda = cuda
def gpt_predictor(request, n=3):
    tok = GPT2Tokenizer.from_pretrained("gpt2")
    model = GPT2LMHeadModel.from_pretrained("gpt2")
    if request.method == 'GET':
        return "Welcome to GPT predictor"
    if request.method == 'POST':
        data = request.get_json()
        text = data["text"]
        res = []
        n = data["n"]
        for i in range(n):
            pred = get_pred(text, model, tok)
            if pred == "<|endoftext|>":
                break
            else:
                text += pred
        return text
def test_special_tokens_checkpoint_behavior(self):
    toks = [
        OpenAIGPTTokenizer.from_pretrained('openai-gpt'),
        GPT2Tokenizer.from_pretrained('gpt2')
    ]
    for tok in toks:
        self.assertEqual(len(tok.added_tokens_encoder), 0)
        tok.add_special_tokens(ATTR_TO_SPECIAL_TOKEN)
        self.assertEqual(len(tok.added_tokens_encoder), 5)
        # Make sure we never split
        self.assertEqual(len(tok.tokenize("<bos> <speaker1>")), 2)
        ids = tok.convert_tokens_to_ids(SPECIAL_TOKENS)
        self.assertTrue(
            all([x > 0 for x in ids]),
            f'some tokens failed to tokenize {SPECIAL_TOKENS} -> {ids}')
        # Need to maintain indices through save (this is also tested in pytorch-transformers)
        tok.save_pretrained(self.save_dir)
        tok_loaded = tok.from_pretrained(str(self.save_dir))
        ids2 = tok_loaded.convert_tokens_to_ids(SPECIAL_TOKENS)
        self.assertListEqual(ids, ids2)
def get_textgen(sentence: str) -> str:
    """
    Runs the text_generation GPT-2 model and returns the generated text.

    :param sentence: sentence taken from serializer.data.
    :return: Generated text.
    """
    output_dir = './models/text_gen'
    tokenizer = GPT2Tokenizer.from_pretrained(output_dir)
    model = GPT2LMHeadModel.from_pretrained(output_dir)

    tokens = tokenizer.encode(sentence)
    tokens_tensor = torch.tensor([tokens])

    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
    tokens_tensor = tokens_tensor.to(device)
    model.to(device)

    with torch.no_grad():
        outputs = model(tokens_tensor)
        predictions = outputs[0]

    predicted_index = torch.argmax(predictions[0, -1, :]).item()
    predicted_text = tokenizer.decode(tokens + [predicted_index])
    return predicted_text
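# Hedged usage sketch for get_textgen above: each call appends exactly one
# greedy BPE token, so repeated calls extend the text (note the model is
# reloaded on every call, which is slow but matches the function as written).
sentence = "Once upon a time"
for _ in range(5):
    sentence = get_textgen(sentence)
print(sentence)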