import logging
import random
import unicodedata
from typing import List, Sequence

import numpy as np
import torch
from tokenizers import Tokenizer
from transformers import GPT2LMHeadModel, GPT2Tokenizer

logger = logging.getLogger(__name__)


def get_answer(text: str, model: GPT2LMHeadModel, tokenizer: GPT2Tokenizer) -> str:
    # Assumes the tokenizer carries exactly two additional special tokens:
    # a context marker and an answer marker.
    cntx_token_id, answer_token_id = tokenizer.additional_special_tokens_ids
    context = tokenizer.encode(text)
    # Prompt layout: <bos> <context marker> question tokens <answer marker>
    context = [tokenizer.bos_token_id] + [cntx_token_id] + context + [answer_token_id]
    context = torch.LongTensor([context])
    # Greedy decoding: `temperature` has no effect here unless do_sample=True is also passed.
    ans = model.generate(input_ids=context, max_length=100, temperature=0.7)[0][1:-1]
    return tokenizer.decode(ans)
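# Usage sketch for get_answer (not from the original source). Assumes a checkpoint
# fine-tuned for QA with two additional special tokens; the token strings
# "<context>" and "<answer>" and the stock "gpt2" weights are illustrative
# placeholders, so without fine-tuning the output will not be a meaningful answer.
qa_tokenizer = GPT2Tokenizer.from_pretrained("gpt2")
qa_tokenizer.add_special_tokens(
    {"additional_special_tokens": ["<context>", "<answer>"]})
qa_model = GPT2LMHeadModel.from_pretrained("gpt2")
qa_model.resize_token_embeddings(len(qa_tokenizer))  # make room for the new tokens
print(get_answer("What is the capital of France?", qa_model, qa_tokenizer))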
def make_predictions(
    text: str,
    tokenizer: GPT2Tokenizer,
    gpt2: GPT2LMHeadModel,
    device: torch.device,
    max_output_length: int = 100,
) -> Sequence[str]:
    """Make predictions for text using GPT-2.

    Args:
        text: Input text.
        tokenizer: GPT-2 tokenizer.
        gpt2: GPT-2 model.
        device: GPT-2 device.
        max_output_length: Maximum length of generated sequence.

    Returns:
        List of predicted strings after the provided text, or an empty list if
        the input is over 300 tokens long.
    """
    text = unicodedata.normalize("NFKC", text)
    input_ids = tokenizer.encode(text)
    input_ids = torch.tensor([input_ids]).to(device)  # pylint: disable=not-callable
    input_id_length = len(input_ids[0])
    # Long inputs usually result in useless outputs, so no predictions are acceptable
    if input_id_length > 300:
        return []
    # Enforce maximum generated length to prevent memory issues
    max_length = min(input_id_length + max_output_length, 350)
    with torch.cuda.amp.autocast():  # Run with FP16
        sample_outputs = gpt2.generate(
            input_ids,
            do_sample=True,
            max_length=max_length,
            min_length=2,  # We want output that is at least two words
            temperature=0.8,
            top_k=50,
            top_p=0.8,
            num_return_sequences=40,
        )
    suggestions = []
    for output in sample_outputs:
        decoded_output = result_replace(tokenizer.decode(output[input_id_length:]))
        suggestions.append(decoded_output)
    return suggestions
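# Usage sketch for make_predictions (not from the original source). result_replace
# is a project-specific post-processing helper that is not shown here; the identity
# stand-in below is an assumption made only so the call runs end to end.
def result_replace(text: str) -> str:
    return text

device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
suggest_tokenizer = GPT2Tokenizer.from_pretrained("gpt2")
suggest_model = GPT2LMHeadModel.from_pretrained("gpt2").to(device)
for suggestion in make_predictions("The weather today is", suggest_tokenizer,
                                   suggest_model, device)[:5]:
    print(repr(suggestion))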
def generate_packets(protocol, n_samples, model: GPT2LMHeadModel, tokenizer,
                     device='cpu', batch_limit=1024):
    logger.info(f'generating {n_samples} flows of "{protocol}"...')
    generated_flows = []
    # Split the request into full batches plus an optional remainder batch
    tokens_to_sample = [batch_limit] * (n_samples // batch_limit)
    if n_samples % batch_limit != 0:
        # add the remainder
        tokens_to_sample += [n_samples % batch_limit]

    counter = 0
    for batch_size in tokens_to_sample:
        # Seed every sequence in the batch with the protocol's start token
        input_ids = torch.tensor(
            [tokenizer.tokens_to_ids[protocol]] * batch_size,
            dtype=torch.long,
        ).view(batch_size, -1).to(device)

        # no_repeat_ngram_size=1 is a dirty hack to fix duplicating pairs for 2-packet protocols
        out = model.generate(
            input_ids,
            eos_token_id=tokenizer.eos_token_id,
            pad_token_id=tokenizer.pad_token_id,
            max_length=128,
            do_sample=True,
            num_return_sequences=1,
            top_k=len(tokenizer),
            no_repeat_ngram_size=int(protocol in ['DNS', 'NTP']),
            use_cache=True,
        ).cpu()
        torch.cuda.empty_cache()
        packets = tokenizer.batch_decode_packets(out)
        generated_flows.append(packets)
        counter += batch_size
        logger.info(f'generated {counter} flows')

    target_dim_size = max(x.shape[1] for x in generated_flows)
    # pad arrays to equal out their 2nd dim
    generated_flows = list(
        map(
            lambda x: np.pad(x, ((0, 0), (0, target_dim_size - x.shape[1])),
                             constant_values=np.nan),
            generated_flows))
    generated_flows = np.concatenate(generated_flows, axis=0)
    return generated_flows
def gen(tokenizer_tgt: Tokenizer, model: GPT2LMHeadModel, device, prompt=None, n=10,
        tokenizer_eng=None, token_id_map=[], cfg={}):
    input_ids = None
    if prompt is not None and prompt.strip() != '':
        prompt = prompt.strip()
        if type(tokenizer_tgt) == Tokenizer:
            ids = [model.config.bos_token_id] + tokenizer_tgt.encode(prompt, None).ids
        else:
            ids = tokenizer_tgt.encode(prompt)
        input_ids = torch.LongTensor(ids).unsqueeze(0).to(device)

    # Generate in batches of up to 5 sequences until roughly n samples are produced
    for _ in range(max(n // 5, 1)):
        m = min(5, n)
        batch_ids = model.generate(input_ids=input_ids,
                                   num_return_sequences=m,
                                   max_length=200,
                                   do_sample=True,
                                   top_k=10,
                                   top_p=0.9,
                                   temperature=2.0,
                                   repetition_penalty=10.0,
                                   num_beams=10,
                                   pad_token_id=cfg['pad_token_id'],
                                   bos_token_id=cfg['bos_token_id'],
                                   eos_token_id=cfg['eos_token_id'],
                                   no_repeat_ngram_size=4)
        for i in range(m):
            ids_tgt = batch_ids[i].flatten().tolist()
            txt_tgt = tokenizer_tgt.decode(ids_tgt, skip_special_tokens=True).strip()
            if tokenizer_eng is not None:
                # Map target-language ids to English ids (dropping ids 1 and 2,
                # the special tokens) and yield (target, English) pairs.
                ids_eng = [token_id_map[j] for j in ids_tgt if j not in [1, 2]]
                txt_eng = tokenizer_eng.decode(ids_eng, skip_special_tokens=True).strip()
                yield txt_tgt, txt_eng
                continue
            yield txt_tgt
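# Usage sketch for gen (not from the original source). The cfg keys mirror the ones
# gen reads; reusing GPT-2's eos id as the pad id and passing a stock GPT2Tokenizer
# instead of the project's trained tokenizers.Tokenizer are both assumptions. With
# tokenizer_eng left as None, only target-language text is yielded.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
gen_tokenizer = GPT2Tokenizer.from_pretrained("gpt2")
gen_model = GPT2LMHeadModel.from_pretrained("gpt2").to(device)
gen_cfg = {
    "pad_token_id": gen_tokenizer.eos_token_id,
    "bos_token_id": gen_tokenizer.bos_token_id,
    "eos_token_id": gen_tokenizer.eos_token_id,
}
for sample in gen(gen_tokenizer, gen_model, device,
                  prompt="Once upon a time", n=5, cfg=gen_cfg):
    print(sample)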
def generate_sentences(model: GPT2LMHeadModel,
                       tokenizer: GPT2Tokenizer,
                       top_k: int = 50,
                       top_p: float = 0.95,
                       max_length: int = 512,
                       num_return_sequences: int = 1,
                       prompt_tokens: torch.Tensor = None) -> List[str]:
    if prompt_tokens is None:
        # Fall back to a single random token id as the prompt
        prompt_tokens = torch.tensor(random.randint(1, 30000))[None, None]
    if prompt_tokens.shape[1] > max_length:
        prompt_tokens = prompt_tokens[:, 0:max_length]

    sample_outputs = model.generate(input_ids=prompt_tokens,
                                    do_sample=True,
                                    top_k=top_k,  # use the arguments instead of hard-coded values
                                    top_p=top_p,
                                    max_length=max_length * 2,
                                    num_return_sequences=num_return_sequences)

    sentence_list = []
    for sample_output in sample_outputs:
        input_sentence = tokenizer.decode(
            prompt_tokens[0],
            skip_special_tokens=True,
        )
        generated_sentence = tokenizer.decode(
            sample_output[prompt_tokens.shape[1]:],
            skip_special_tokens=True,
        )
        sentence = (f"\n\n{' # ' * 32}\n\n"
                    f"\n\n{' # ' * 32}\n\n"
                    f"INPUT:\n\n{input_sentence}"
                    f"\n\n{' # ' * 32}\n\n"
                    f"GENERATION:\n\n{generated_sentence}")
        sentence_list.append(sentence)
    return sentence_list
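# Usage sketch for generate_sentences (not from the original source). The prompt text,
# the stock "gpt2" weights, and the short max_length are illustrative assumptions.
sent_tokenizer = GPT2Tokenizer.from_pretrained("gpt2")
sent_model = GPT2LMHeadModel.from_pretrained("gpt2")
prompt = torch.tensor([sent_tokenizer.encode("The sea was calm that morning")])
for block in generate_sentences(sent_model, sent_tokenizer,
                                max_length=64, prompt_tokens=prompt):
    print(block)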