Example #1
def get_answer(text: str, model: GPT2LMHeadModel, tokenizer: GPT2Tokenizer):
    # The tokenizer is expected to define two additional special tokens:
    # a context marker and an answer marker.
    cntx_token_id, answer_token_id = tokenizer.additional_special_tokens_ids
    context = tokenizer.encode(text)
    # Frame the input as: <bos> <context-token> ...encoded text... <answer-token>
    context = [tokenizer.bos_token_id] + [cntx_token_id] + context + [answer_token_id]
    context = torch.LongTensor([context])
    # Take the single returned sequence and drop the leading BOS token
    # and the trailing token before decoding.
    ans = model.generate(input_ids=context, max_length=100,
                         temperature=0.7)[0][1:-1]
    return tokenizer.decode(ans)
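A minimal usage sketch for get_answer above. The plain gpt2 checkpoint and the two added special token names ("<context>", "<answer>") are assumptions; the original example presumably loads a fine-tuned model whose tokenizer already provides them.

import torch
from transformers import GPT2LMHeadModel, GPT2Tokenizer

# Assumed setup: base gpt2 extended with two special tokens (hypothetical names).
tokenizer = GPT2Tokenizer.from_pretrained("gpt2")
tokenizer.add_special_tokens({"additional_special_tokens": ["<context>", "<answer>"]})
model = GPT2LMHeadModel.from_pretrained("gpt2")
model.resize_token_embeddings(len(tokenizer))
model.eval()

print(get_answer("What is the capital of France?", model, tokenizer))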
Example #2
def make_predictions(
    text: str,
    tokenizer: GPT2Tokenizer,
    gpt2: GPT2LMHeadModel,
    device: torch.device,
    max_output_length: int = 100,
) -> Sequence[str]:
    """Make predictions for text using GPT-2.

    Args:
        text: Input text.
        tokenizer: GPT-2 tokenizer.
        gpt2: GPT-2 model.
        device: GPT-2 device.
        max_output_length: Maximum length of generated sequence.

    Returns:
        List of predicted strings after the provided text, or an empty list if the input is over 300
        tokens long.
    """
    text = unicodedata.normalize("NFKC", text)
    input_ids = tokenizer.encode(text)
    input_ids = torch.tensor([input_ids]).to(device)  # pylint: disable=not-callable
    input_id_length = len(input_ids[0])

    # Long inputs usually result in useless outputs, so no predictions are acceptable
    if input_id_length > 300:
        return []

    # Enforce maximum generated length to prevent memory issues
    max_length = min(input_id_length + max_output_length, 350)

    with torch.cuda.amp.autocast():  # Run with FP16
        sample_outputs = gpt2.generate(
            input_ids,
            do_sample=True,
            max_length=max_length,
            min_length=2,  # We want output that is at least two words
            temperature=0.8,
            top_k=50,
            top_p=0.8,
            num_return_sequences=40,
        )

    suggestions = []
    for output in sample_outputs:
        decoded_output = result_replace(tokenizer.decode(output[input_id_length:]))
        suggestions.append(decoded_output)

    return suggestions
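A usage sketch assuming the stock gpt2 checkpoint. make_predictions relies on the surrounding module's imports (unicodedata, torch, typing.Sequence) and its result_replace post-processing helper; a trivial stand-in for result_replace is used here so the call pattern is clear.

import torch
from transformers import GPT2LMHeadModel, GPT2Tokenizer

def result_replace(text: str) -> str:
    # Hypothetical stand-in for the module's own post-processing helper.
    return text.strip()

device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
tokenizer = GPT2Tokenizer.from_pretrained("gpt2")
gpt2 = GPT2LMHeadModel.from_pretrained("gpt2").to(device)

for suggestion in make_predictions("The meeting is scheduled for", tokenizer, gpt2, device):
    print(suggestion)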
Example #3
def generate_packets(protocol,
                     n_samples,
                     model: GPT2LMHeadModel,
                     tokenizer,
                     device='cpu',
                     batch_limit=1024):
    logger.info(f'generating {n_samples} flows of "{protocol}"...')

    generated_flows = []
    tokens_to_sample = [batch_limit] * (n_samples // batch_limit)
    if n_samples % batch_limit != 0:
        # add the remainder
        tokens_to_sample += [n_samples % batch_limit]

    counter = 0
    for batch_size in tokens_to_sample:
        # One row per flow, each seeded with the protocol's start token id.
        input_ids = torch.tensor([tokenizer.tokens_to_ids[protocol]] * batch_size,
                                 dtype=torch.long).view(batch_size, -1).to(device)

        # no_repeat_ngram_size=1 is a dirty hack to fix duplicating pairs for 2-packet protocols
        out = model.generate(
            input_ids,
            eos_token_id=tokenizer.eos_token_id,
            pad_token_id=tokenizer.pad_token_id,
            max_length=128,
            do_sample=True,
            num_return_sequences=1,
            top_k=len(tokenizer),
            no_repeat_ngram_size=int(protocol in ['DNS', 'NTP']),
            use_cache=True,
        ).cpu()
        torch.cuda.empty_cache()
        packets = tokenizer.batch_decode_packets(out)
        generated_flows.append(packets)
        counter += batch_size
        logger.info(f'generated {counter} flows')

    # pad arrays to equal out their 2nd dim before concatenating
    target_dim_size = max(x.shape[1] for x in generated_flows)
    generated_flows = [
        np.pad(x, ((0, 0), (0, target_dim_size - x.shape[1])),
               constant_values=np.nan) for x in generated_flows
    ]
    generated_flows = np.concatenate(generated_flows, axis=0)
    return generated_flows
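A heavily hedged usage sketch. The packet tokenizer's interface (tokens_to_ids, eos_token_id, pad_token_id, __len__, batch_decode_packets) and the checkpoint path are not shown in the original snippet, so a hypothetical stand-in class and path are used purely to illustrate what generate_packets expects; the function also relies on the module-level logger and numpy import shown below.

import logging
import numpy as np
import torch
from transformers import GPT2LMHeadModel

logging.basicConfig(level=logging.INFO)
logger = logging.getLogger(__name__)

class PacketTokenizerStub:
    """Hypothetical stand-in for the project's packet tokenizer."""
    tokens_to_ids = {"DNS": 3, "NTP": 4, "TCP": 5}  # assumed protocol start-token ids
    eos_token_id = 1  # assumed id
    pad_token_id = 0  # assumed id

    def __len__(self):
        return 267  # assumed vocabulary size

    def batch_decode_packets(self, ids):
        # The real tokenizer maps generated token ids back to packet feature rows.
        return ids.numpy().astype(float)

model = GPT2LMHeadModel.from_pretrained("path/to/packet-model")  # hypothetical checkpoint
flows = generate_packets("DNS", 2500, model, PacketTokenizerStub(), device="cpu")
print(flows.shape)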
Example #4
def gen(tokenizer_tgt: Tokenizer,
        model: GPT2LMHeadModel,
        device,
        prompt=None,
        n=10,
        tokenizer_eng=None,
        token_id_map=[],
        cfg={}):
    input_ids = None
    if prompt is not None and prompt.strip() != '':
        prompt = prompt.strip()
        if isinstance(tokenizer_tgt, Tokenizer):
            # Tokenizer.encode returns an Encoding object; prepend the BOS id manually.
            ids = [model.config.bos_token_id] + tokenizer_tgt.encode(prompt, None).ids
        else:
            ids = tokenizer_tgt.encode(prompt)
        input_ids = torch.LongTensor(ids).unsqueeze(0).to(device)

    # Generate in batches of at most 5 sequences until n have been produced.
    remaining = n
    while remaining > 0:
        m = min(5, remaining)
        remaining -= m

        batch_ids = model.generate(input_ids=input_ids,
                                   num_return_sequences=m,
                                   max_length=200,
                                   do_sample=True,
                                   top_k=10,
                                   top_p=0.9,
                                   temperature=2.0,
                                   repetition_penalty=10.0,
                                   num_beams=10,
                                   pad_token_id=cfg['pad_token_id'],
                                   bos_token_id=cfg['bos_token_id'],
                                   eos_token_id=cfg['eos_token_id'],
                                   no_repeat_ngram_size=4)

        for i in range(m):
            ids_tgt = batch_ids[i].flatten().tolist()
            txt_tgt = tokenizer_tgt.decode(ids_tgt,
                                           skip_special_tokens=True).strip()
            if tokenizer_eng is not None:
                # Map target-vocabulary ids to the English tokenizer's ids,
                # skipping token ids 1 and 2.
                ids_eng = [token_id_map[tid] for tid in ids_tgt if tid not in (1, 2)]
                txt_eng = tokenizer_eng.decode(
                    ids_eng, skip_special_tokens=True).strip()
                yield txt_tgt, txt_eng
                continue
            yield txt_tgt
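A usage sketch for gen above. The checkpoint and tokenizer paths are hypothetical, and it assumes the fine-tuned model's config defines the pad/bos/eos ids that gen reads from cfg.

import torch
from tokenizers import Tokenizer
from transformers import GPT2LMHeadModel

device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model = GPT2LMHeadModel.from_pretrained("path/to/finetuned-gpt2").to(device)  # hypothetical path
tokenizer_tgt = Tokenizer.from_file("path/to/tokenizer.json")  # hypothetical path

cfg = {
    "pad_token_id": model.config.pad_token_id,
    "bos_token_id": model.config.bos_token_id,
    "eos_token_id": model.config.eos_token_id,
}

for text in gen(tokenizer_tgt, model, device, prompt="Once upon a time", n=5, cfg=cfg):
    print(text)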
Example #5
def generate_sentences(model: GPT2LMHeadModel,
                       tokenizer: GPT2Tokenizer,
                       top_k: int = 50,
                       top_p: float = 0.95,
                       max_length: int = 512,
                       num_return_sequences: int = 1,
                       prompt_tokens: torch.Tensor = None) -> List[str]:
    if prompt_tokens is None:
        # Fall back to a single random token id as the prompt (shape [1, 1]).
        prompt_tokens = torch.tensor(random.randint(1, 30000))[None, None]

    # Truncate long prompts so there is room to generate within max_length * 2.
    if prompt_tokens.shape[1] > max_length:
        prompt_tokens = prompt_tokens[:, 0:max_length]

    sample_outputs = model.generate(input_ids=prompt_tokens,
                                    do_sample=True,
                                    top_k=top_k,
                                    top_p=top_p,
                                    max_length=max_length * 2,
                                    num_return_sequences=num_return_sequences)

    sentence_list = []
    for sample_output in sample_outputs:
        input_sentence = tokenizer.decode(
            prompt_tokens[0],
            skip_special_tokens=True,
        )
        generated_sentence = tokenizer.decode(
            sample_output[prompt_tokens.shape[1]:],
            skip_special_tokens=True,
        )

        sentence = (f"\n\n{' # ' * 32}\n\n"
                    f"\n\n{' # ' * 32}\n\n"
                    f"INPUT:\n\n{input_sentence}"
                    f"\n\n{' # ' * 32}\n\n"
                    f"GENERATION:\n\n{generated_sentence}")
        sentence_list.append(sentence)

    return sentence_list
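A usage sketch with the stock gpt2 checkpoint and an arbitrary prompt; the function's own module is assumed to import random and typing.List.

import torch
from transformers import GPT2LMHeadModel, GPT2Tokenizer

tokenizer = GPT2Tokenizer.from_pretrained("gpt2")
model = GPT2LMHeadModel.from_pretrained("gpt2")

prompt_tokens = torch.tensor([tokenizer.encode("The quick brown fox")])
for sentence in generate_sentences(model, tokenizer, max_length=64,
                                   num_return_sequences=2,
                                   prompt_tokens=prompt_tokens):
    print(sentence)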