def get_answer(text: str, model: GPT2LMHeadModel, tokenizer: GPT2Tokenizer):
    # The tokenizer is expected to carry two additional special tokens:
    # a context marker and an answer marker.
    cntx_token_id, answer_token_id = tokenizer.additional_special_tokens_ids
    context = tokenizer.encode(text)
    # Wrap the encoded text as <bos> <context> ... <answer> so the model
    # continues the sequence with the answer.
    context = [tokenizer.bos_token_id] + [cntx_token_id] + context + [answer_token_id]
    context = torch.LongTensor([context])
    # Drop the leading BOS token and the trailing token before decoding.
    ans = model.generate(input_ids=context, max_length=100, temperature=0.7)[0][1:-1]
    return tokenizer.decode(ans)
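# A minimal usage sketch for get_answer, assuming a GPT-2 checkpoint fine-tuned
# with exactly two additional special tokens (context/answer markers); the
# checkpoint path and the question below are placeholders, not from the original.
#
# import torch
# from transformers import GPT2LMHeadModel, GPT2Tokenizer
#
# tokenizer = GPT2Tokenizer.from_pretrained("path/to/qa-checkpoint")
# model = GPT2LMHeadModel.from_pretrained("path/to/qa-checkpoint")
# model.eval()
# print(get_answer("What is the capital of France?", model, tokenizer))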
def generate_sentences(model: GPT2LMHeadModel,
                       tokenizer: GPT2Tokenizer,
                       top_k: int = 50,
                       top_p: float = 0.95,
                       max_length: int = 512,
                       num_return_sequences: int = 1,
                       prompt_tokens: torch.Tensor = None) -> List[str]:
    # Fall back to a single random token as the prompt if none is given.
    if prompt_tokens is None:
        prompt_tokens = torch.tensor(random.randint(1, 30000))[None, None]
    # Truncate overly long prompts so the prompt itself fits in `max_length`.
    if prompt_tokens.shape[1] > max_length:
        prompt_tokens = prompt_tokens[:, 0:max_length]
    sample_outputs = model.generate(input_ids=prompt_tokens,
                                    do_sample=True,
                                    top_k=top_k,
                                    top_p=top_p,
                                    max_length=max_length * 2,
                                    num_return_sequences=num_return_sequences)
    input_sentence = tokenizer.decode(
        prompt_tokens[0],
        skip_special_tokens=True,
    )
    sentence_list = []
    for sample_output in sample_outputs:
        # Decode only the continuation, i.e. everything after the prompt.
        generated_sentence = tokenizer.decode(
            sample_output[prompt_tokens.shape[1]:],
            skip_special_tokens=True,
        )
        sentence = (f"\n\n{' # ' * 32}\n\n"
                    f"INPUT:\n\n{input_sentence}"
                    f"\n\n{' # ' * 32}\n\n"
                    f"GENERATION:\n\n{generated_sentence}")
        sentence_list.append(sentence)
    return sentence_list
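# Sketch of how generate_sentences might be called with an explicit prompt.
# The stock "gpt2" weights and the prompt string are assumptions for illustration.
#
# import torch
# from transformers import GPT2LMHeadModel, GPT2Tokenizer
#
# tokenizer = GPT2Tokenizer.from_pretrained("gpt2")
# model = GPT2LMHeadModel.from_pretrained("gpt2")
# model.eval()
# prompt = torch.tensor([tokenizer.encode("The meaning of life is")])
# for s in generate_sentences(model, tokenizer, prompt_tokens=prompt):
#     print(s)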
def make_predictions(
    text: str,
    tokenizer: GPT2Tokenizer,
    gpt2: GPT2LMHeadModel,
    device: torch.device,
    max_output_length: int = 100,
) -> Sequence[str]:
    """Make predictions for text using GPT-2.

    Args:
        text: Input text.
        tokenizer: GPT-2 tokenizer.
        gpt2: GPT-2 model.
        device: GPT-2 device.
        max_output_length: Maximum length of generated sequence.

    Returns:
        List of predicted strings after the provided text, or an empty list
        if the input is over 300 tokens long.
    """
    text = unicodedata.normalize("NFKC", text)
    input_ids = tokenizer.encode(text)
    input_ids = torch.tensor([input_ids]).to(device)  # pylint: disable=not-callable
    input_id_length = len(input_ids[0])

    # Long inputs usually result in useless outputs, so no predictions are acceptable
    if input_id_length > 300:
        return []

    # Enforce maximum generated length to prevent memory issues
    max_length = min(input_id_length + max_output_length, 350)

    with torch.cuda.amp.autocast():  # Run with FP16
        sample_outputs = gpt2.generate(
            input_ids,
            do_sample=True,
            max_length=max_length,
            min_length=2,  # We want output that is at least two words
            temperature=0.8,
            top_k=50,
            top_p=0.8,
            num_return_sequences=40,
        )

    suggestions = []
    for output in sample_outputs:
        decoded_output = result_replace(tokenizer.decode(output[input_id_length:]))
        suggestions.append(decoded_output)
    return suggestions
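# Possible invocation of make_predictions, assuming a CUDA device. The helper
# result_replace used above is not defined in this snippet; the lambda below is
# a hypothetical stand-in for whatever post-processing the original applies.
#
# import torch
# from transformers import GPT2LMHeadModel, GPT2Tokenizer
#
# result_replace = lambda s: s.strip()  # hypothetical stand-in
# device = torch.device("cuda")
# tokenizer = GPT2Tokenizer.from_pretrained("gpt2")
# gpt2 = GPT2LMHeadModel.from_pretrained("gpt2").to(device).eval()
# print(make_predictions("Once upon a time", tokenizer, gpt2, device)[:3])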
def predict_next_token(
    words: str, gpt2_model: GPT2LMHeadModel, gpt2_tokenizer: GPT2Tokenizer, top: int = 3
) -> Tuple[Tuple[str, float], ...]:
    """
    Predict the next token, given some starting words.

    :param words: a string of a few words (max tokens: 1023)
    :param gpt2_model: GPT2LMHeadModel preferably
    :param gpt2_tokenizer: GPT2Tokenizer
    :param top: the number of probable tokens to return
    :return: a tuple of tuples (token, probability)

    ## OOME on circleci :-(
    # >>> gpt2_tokenizer = GPT2Tokenizer.from_pretrained('gpt2')
    # >>> gpt2_model = GPT2LMHeadModel.from_pretrained('gpt2')
    # >>> _ = gpt2_model.eval()
    # >>> predict_next_token('I am looking', gpt2_model, gpt2_tokenizer)
    # (('forward', 0.3665640652179718), ('for', 0.35346919298171997), ('to', 0.08423731476068497))
    """
    tokens_tensor = torch.tensor(  # pylint: disable=not-callable
        gpt2_tokenizer.encode(words, add_special_tokens=True)
    ).unsqueeze(
        0
    )  # Batch size 1
    if tokens_tensor.shape[1] > 1023:
        LOG.warning(
            "Too many tokens, should be 1023 or less, found %s", tokens_tensor.shape[1]
        )
    soft = torch.nn.Softmax(dim=1)
    gpt2_model.eval()
    with torch.no_grad():
        predictions = gpt2_model(tokens_tensor)[0].squeeze(0)
        predictions = soft(predictions)
        values, indices = torch.topk(  # pylint: disable=no-member
            predictions[-1, :], top
        )
        id_prob = list(zip(indices, values))
    return tuple(
        [  # type: ignore
            (gpt2_tokenizer.decode(int(tmp[0])).strip(), float(tmp[1]))
            for tmp in id_prob
        ]
    )
def generate(
    input_text: str,
    model: GPT2LMHeadModel,
    tokenizer: GPT2Tokenizer,
    max_generation_len: int = 200,
    max_context_len: int = 256,
):
    generated_sentence = input_text
    prompt_tokens = torch.tensor(
        tokenizer.encode(generated_sentence)).to("cuda").unsqueeze(0)
    text_form = st.empty()
    for _ in tqdm(range(max_generation_len)):
        # NOTE: uncomment this and remove `model.half()` if it fits into the GPU
        # with torch.cuda.amp.autocast():
        #     outputs = model(prompt_tokens)

        # Keep only the most recent `max_context_len` tokens as context.
        context_len = prompt_tokens.shape[1]
        if context_len > max_context_len:
            prompt_tokens = prompt_tokens[:, (context_len - max_context_len):]
        outputs = model(prompt_tokens)
        last_scores = outputs[0][:, -1, :]
        probs = torch.softmax(last_scores, dim=-1)
        predicted_token = predict_token(probs)
        predicted_token = predicted_token.to("cuda")
        prompt_tokens = torch.cat([prompt_tokens, predicted_token], dim=1)
        predicted_word = tokenizer.decode(
            predicted_token,
            skip_special_tokens=True,
        )
        generated_sentence += predicted_word
        # Update the Streamlit placeholder with the text generated so far.
        text_form.empty()
        text_form.text(generated_sentence)
    return generated_sentence
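# `predict_token` is not defined in this snippet. A minimal sketch of what such
# a helper might look like, assuming it samples one token id from the next-token
# distribution (this is an assumption, not the original helper):
#
# def predict_token(probs: torch.Tensor) -> torch.Tensor:
#     # probs has shape (batch, vocab); multinomial keeps the batch dimension,
#     # so the result has shape (batch, 1) and can be concatenated to the prompt.
#     return torch.multinomial(probs, num_samples=1)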
def compute_compression(model, data, context, batch_size, verbose=False,
                        tbw: SummaryWriter = None, tok: trf.GPT2Tokenizer = None, skip=0):
    """
    Compute the _compression_ of a dataset under a model. That is, given a model, in how many bits could we
    represent the dataset. This requires us to turn a given probability distribution into a code for the outcomes.

    See [this video](https://youtu.be/mSneVjDvzNQ) for an explanation.

    :param model: A sequence-to-sequence model that takes as input a (sub) sequence of integers and produces a
                  probability distribution on the output.
    :param data: A single list of integers representing the data.
    :param context: The number of preceding tokens the model may condition on for each prediction.
    :param batch_size: How many subsequences to push through the model at once.
    :param verbose: If true, show a progress bar.
    :param tbw: Optional tensorboard SummaryWriter for logging per-token and per-byte bit counts.
    :param tok: Optional GPT-2 tokenizer, used to convert token counts to character counts.
    :param skip: How many initial tokens of the data to skip.
    :return: The result of the computation in "bits per byte". That is, how many bits does the compressed
             representation spend on each byte (=ASCII character) of the raw data.
    """
    bits, tot = 0.0, 0
    batch = []  # Buffer, every time it fills up, we run it through the model

    # --- For the sake of speed we want to process the data in batches. For each token in the data, we make a
    #     prediction based on all the `context` tokens before it. This means that for each subsequence in the
    #     batch, we need to shift the start/end indices ahead by one token.
    #
    #     After we pass the batch through the model, we look at only the probabilities predicted for the last token.
    target_indices = []
    i, ic = 0, 0

    for current in tqdm.trange(skip, data.size(0)) if verbose else range(skip, data.size(0)):

        # `current` is the character which we will ultimately predict
        fr = max(0, current - context)
        to = current + 1

        instance = data[fr:to].to(torch.long)  # the subsequence of the data to add to the batch
        # -- slice out an instance of size context + 1 (or shorter at the start of the data)

        # if tok is not None:
        #     print(instance[:-1], tok.decode(instance[:-1]))
        #     print(instance[-1:], tok.decode(instance[-1:]))

        target_indices.append(instance.size(0) - 2)  # index of the last element of the input to the model

        if instance.size(0) < context + 1:
            assert skip < context  # We shouldn't get here if we skip the first `context` characters

            # the index in the output tensor of the character we want to predict
            # -- It's context + 1, because we clip off the last token as a target

            pad = torch.zeros(size=(context + 1 - instance.size(0),), dtype=torch.long)
            instance = torch.cat([instance, pad], dim=0)
            # -- the first tokens don't have enough tokens preceding them, so we pad them to the right size.

            assert instance.size(0) == context + 1  # all instances should be `context` + 1 long

        if torch.cuda.is_available():
            instance = instance.cuda()

        batch.append(instance[None, :])
        # -- We add a singleton dimension to concatenate along later.

        if len(batch) == batch_size or current == data.size(0) - 1:
            # batch is full or we are at the last instance, run it through the model

            b = len(batch)

            ti = torch.tensor(target_indices) + 1
            all = torch.cat(batch, dim=0)
            inputs = all[:, :-1]  # input
            target = all[torch.arange(b), ti]  # target values

            with torch.no_grad():
                if next(model.parameters()).is_cuda:
                    inputs = inputs.cuda()

                output = model(inputs)
                if type(output) != torch.Tensor:
                    output = torch.log_softmax(output.logits, dim=2)
                    # To make the method work for GPT2 models from Huggingface

            assert output.size()[:2] == (b, context), f'was: {output.size()}, should be {(b, context, -1)}'

            lnprobs = output[torch.arange(b, device=d()), target_indices, target]
            log2probs = lnprobs / LOGE2
            # -- The model produces natural logarithms of probabilities, but we need base-2 logarithms of the
            #    probabilities, since these give us bits.

            if tbw is not None:
                for j, lp in enumerate(log2probs):
                    i += 1
                    tbw.add_scalar('compression/bits-per-token', -lp, i)

                    if tok is not None:
                        nc = len(tok.decode(target[j]))
                        ic += nc
                        tbw.add_scalar('compression/bits-per-byte', -lp / nc, ic)

            bits += -log2probs.sum()
            # Add the bits for each character (the negative log_2 probabilities) to the running total

            batch, target_indices = [], []  # clear the buffer

    if isinstance(bits, torch.Tensor):
        bits = bits.item()

    if tok is not None:
        return bits, ic  # total nr of bits used, total nr of characters seen
    else:
        return bits  # total nr of bits used
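# compute_compression relies on two module-level helpers that are not shown here:
# LOGE2 (presumably math.log(2.0)) and d() (presumably returning the current torch
# device). A rough usage sketch under those assumptions; the text file path is a
# placeholder:
#
# import math
# import torch
# import transformers as trf
#
# LOGE2 = math.log(2.0)
# def d():
#     return 'cuda' if torch.cuda.is_available() else 'cpu'
#
# tok = trf.GPT2Tokenizer.from_pretrained('gpt2')
# model = trf.GPT2LMHeadModel.from_pretrained('gpt2').to(d()).eval()
# data = torch.tensor(tok.encode(open('some_text.txt').read()))
# bits = compute_compression(model, data, context=128, batch_size=16,
#                            verbose=True, skip=128)
# print(bits, 'bits total')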