Example #1
def get_answer(text: str, model: GPT2LMHeadModel, tokenizer: GPT2Tokenizer):
    # The tokenizer is expected to define two additional special tokens:
    # a context marker and an answer marker.
    cntx_token_id, answer_token_id = tokenizer.additional_special_tokens_ids
    context = tokenizer.encode(text)
    # Frame the input as: <bos> <context-token> ...encoded text... <answer-token>
    context = [tokenizer.bos_token_id] + [cntx_token_id] + context + [answer_token_id]
    context = torch.LongTensor([context])
    # Take the single returned sequence and drop the leading BOS token
    # and the trailing token before decoding.
    ans = model.generate(input_ids=context, max_length=100,
                         temperature=0.7)[0][1:-1]
    return tokenizer.decode(ans)
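A minimal usage sketch for get_answer above. The plain gpt2 checkpoint and the two added special token names ("<context>", "<answer>") are assumptions; the original example presumably loads a fine-tuned model whose tokenizer already provides them.

import torch
from transformers import GPT2LMHeadModel, GPT2Tokenizer

# Assumed setup: base gpt2 extended with two special tokens (hypothetical names).
tokenizer = GPT2Tokenizer.from_pretrained("gpt2")
tokenizer.add_special_tokens({"additional_special_tokens": ["<context>", "<answer>"]})
model = GPT2LMHeadModel.from_pretrained("gpt2")
model.resize_token_embeddings(len(tokenizer))
model.eval()

print(get_answer("What is the capital of France?", model, tokenizer))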
Example #2
def make_predictions(
    text: str,
    tokenizer: GPT2Tokenizer,
    gpt2: GPT2LMHeadModel,
    device: torch.device,
    max_output_length: int = 100,
) -> Sequence[str]:
    """Make predictions for text using GPT-2.

    Args:
        text: Input text.
        tokenizer: GPT-2 tokenizer.
        gpt2: GPT-2 model.
        device: GPT-2 device.
        max_output_length: Maximum length of generated sequence.

    Returns:
        List of predicted strings after the provided text, or an empty list if the input is over 300
        tokens long.
    """
    text = unicodedata.normalize("NFKC", text)
    input_ids = tokenizer.encode(text)
    input_ids = torch.tensor([input_ids]).to(device)  # pylint: disable=not-callable
    input_id_length = len(input_ids[0])

    # Long inputs usually result in useless outputs, so no predictions are acceptable
    if input_id_length > 300:
        return []

    # Enforce maximum generated length to prevent memory issues
    max_length = min(input_id_length + max_output_length, 350)

    with torch.cuda.amp.autocast():  # Run with FP16
        sample_outputs = gpt2.generate(
            input_ids,
            do_sample=True,
            max_length=max_length,
            min_length=2,  # We want output that is at least two words
            temperature=0.8,
            top_k=50,
            top_p=0.8,
            num_return_sequences=40,
        )

    suggestions = []
    for output in sample_outputs:
        decoded_output = result_replace(tokenizer.decode(output[input_id_length:]))
        suggestions.append(decoded_output)

    return suggestions
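A usage sketch assuming the stock gpt2 checkpoint. make_predictions relies on the surrounding module's imports (unicodedata, torch, typing.Sequence) and its result_replace post-processing helper; a trivial stand-in for result_replace is used here so the call pattern is clear.

import torch
from transformers import GPT2LMHeadModel, GPT2Tokenizer

def result_replace(text: str) -> str:
    # Hypothetical stand-in for the module's own post-processing helper.
    return text.strip()

device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
tokenizer = GPT2Tokenizer.from_pretrained("gpt2")
gpt2 = GPT2LMHeadModel.from_pretrained("gpt2").to(device)

for suggestion in make_predictions("The meeting is scheduled for", tokenizer, gpt2, device):
    print(suggestion)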
Example #3
def generate_packets(protocol,
                     n_samples,
                     model: GPT2LMHeadModel,
                     tokenizer,
                     device='cpu',
                     batch_limit=1024):
    logger.info(f'generating {n_samples} flows of "{protocol}"...')

    generated_flows = []
    tokens_to_sample = [batch_limit] * (n_samples // batch_limit)
    if n_samples % batch_limit != 0:
        # add the remainder
        tokens_to_sample += [n_samples % batch_limit]

    counter = 0
    for batch_size in tokens_to_sample:
        # One row per flow, each seeded with the protocol's start token id.
        input_ids = torch.tensor([tokenizer.tokens_to_ids[protocol]] * batch_size,
                                 dtype=torch.long).view(batch_size, -1).to(device)

        # no_repeat_ngram_size=1 is a dirty hack to fix duplicating pairs for 2-packet protocols
        out = model.generate(
            input_ids,
            eos_token_id=tokenizer.eos_token_id,
            pad_token_id=tokenizer.pad_token_id,
            max_length=128,
            do_sample=True,
            num_return_sequences=1,
            top_k=len(tokenizer),
            no_repeat_ngram_size=int(protocol in ['DNS', 'NTP']),
            use_cache=True,
        ).cpu()
        torch.cuda.empty_cache()
        packets = tokenizer.batch_decode_packets(out)
        generated_flows.append(packets)
        counter += batch_size
        logger.info(f'generated {counter} flows')

    # pad arrays to equal out their 2nd dim before concatenating
    target_dim_size = max(x.shape[1] for x in generated_flows)
    generated_flows = [
        np.pad(x, ((0, 0), (0, target_dim_size - x.shape[1])),
               constant_values=np.nan) for x in generated_flows
    ]
    generated_flows = np.concatenate(generated_flows, axis=0)
    return generated_flows
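A heavily hedged usage sketch. The packet tokenizer's interface (tokens_to_ids, eos_token_id, pad_token_id, __len__, batch_decode_packets) and the checkpoint path are not shown in the original snippet, so a hypothetical stand-in class and path are used purely to illustrate what generate_packets expects; the function also relies on the module-level logger and numpy import shown below.

import logging
import numpy as np
import torch
from transformers import GPT2LMHeadModel

logging.basicConfig(level=logging.INFO)
logger = logging.getLogger(__name__)

class PacketTokenizerStub:
    """Hypothetical stand-in for the project's packet tokenizer."""
    tokens_to_ids = {"DNS": 3, "NTP": 4, "TCP": 5}  # assumed protocol start-token ids
    eos_token_id = 1  # assumed id
    pad_token_id = 0  # assumed id

    def __len__(self):
        return 267  # assumed vocabulary size

    def batch_decode_packets(self, ids):
        # The real tokenizer maps generated token ids back to packet feature rows.
        return ids.numpy().astype(float)

model = GPT2LMHeadModel.from_pretrained("path/to/packet-model")  # hypothetical checkpoint
flows = generate_packets("DNS", 2500, model, PacketTokenizerStub(), device="cpu")
print(flows.shape)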
Example #4
def gen(tokenizer_tgt: Tokenizer,
        model: GPT2LMHeadModel,
        device,
        prompt=None,
        n=10,
        tokenizer_eng=None,
        token_id_map=[],
        cfg={}):
    input_ids = None
    if prompt is not None and prompt.strip() != '':
        prompt = prompt.strip()
        if isinstance(tokenizer_tgt, Tokenizer):
            # Tokenizer.encode returns an Encoding object; prepend the BOS id manually.
            ids = [model.config.bos_token_id] + tokenizer_tgt.encode(prompt, None).ids
        else:
            ids = tokenizer_tgt.encode(prompt)
        input_ids = torch.LongTensor(ids).unsqueeze(0).to(device)

    # Generate in batches of at most 5 sequences until n have been produced.
    remaining = n
    while remaining > 0:
        m = min(5, remaining)
        remaining -= m

        batch_ids = model.generate(input_ids=input_ids,
                                   num_return_sequences=m,
                                   max_length=200,
                                   do_sample=True,
                                   top_k=10,
                                   top_p=0.9,
                                   temperature=2.0,
                                   repetition_penalty=10.0,
                                   num_beams=10,
                                   pad_token_id=cfg['pad_token_id'],
                                   bos_token_id=cfg['bos_token_id'],
                                   eos_token_id=cfg['eos_token_id'],
                                   no_repeat_ngram_size=4)

        for i in range(m):
            ids_tgt = batch_ids[i].flatten().tolist()
            txt_tgt = tokenizer_tgt.decode(ids_tgt,
                                           skip_special_tokens=True).strip()
            if tokenizer_eng is not None:
                # Map target-vocabulary ids to the English tokenizer's ids,
                # skipping token ids 1 and 2.
                ids_eng = [token_id_map[tid] for tid in ids_tgt if tid not in (1, 2)]
                txt_eng = tokenizer_eng.decode(
                    ids_eng, skip_special_tokens=True).strip()
                yield txt_tgt, txt_eng
                continue
            yield txt_tgt
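A usage sketch for gen above. The checkpoint and tokenizer paths are hypothetical, and it assumes the fine-tuned model's config defines the pad/bos/eos ids that gen reads from cfg.

import torch
from tokenizers import Tokenizer
from transformers import GPT2LMHeadModel

device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model = GPT2LMHeadModel.from_pretrained("path/to/finetuned-gpt2").to(device)  # hypothetical path
tokenizer_tgt = Tokenizer.from_file("path/to/tokenizer.json")  # hypothetical path

cfg = {
    "pad_token_id": model.config.pad_token_id,
    "bos_token_id": model.config.bos_token_id,
    "eos_token_id": model.config.eos_token_id,
}

for text in gen(tokenizer_tgt, model, device, prompt="Once upon a time", n=5, cfg=cfg):
    print(text)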
Example #5
def generate_sentences(model: GPT2LMHeadModel,
                       tokenizer: GPT2Tokenizer,
                       top_k: int = 50,
                       top_p: float = 0.95,
                       max_length: int = 512,
                       num_return_sequences: int = 1,
                       prompt_tokens: torch.Tensor = None) -> List[str]:
    if prompt_tokens is None:
        # Fall back to a single random token id as the prompt (shape [1, 1]).
        prompt_tokens = torch.tensor(random.randint(1, 30000))[None, None]

    # Truncate long prompts so there is room to generate within max_length * 2.
    if prompt_tokens.shape[1] > max_length:
        prompt_tokens = prompt_tokens[:, 0:max_length]

    sample_outputs = model.generate(input_ids=prompt_tokens,
                                    do_sample=True,
                                    top_k=top_k,
                                    top_p=top_p,
                                    max_length=max_length * 2,
                                    num_return_sequences=num_return_sequences)

    sentence_list = []
    for sample_output in sample_outputs:
        input_sentence = tokenizer.decode(
            prompt_tokens[0],
            skip_special_tokens=True,
        )
        generated_sentence = tokenizer.decode(
            sample_output[prompt_tokens.shape[1]:],
            skip_special_tokens=True,
        )

        sentence = (f"\n\n{' # ' * 32}\n\n"
                    f"\n\n{' # ' * 32}\n\n"
                    f"INPUT:\n\n{input_sentence}"
                    f"\n\n{' # ' * 32}\n\n"
                    f"GENERATION:\n\n{generated_sentence}")
        sentence_list.append(sentence)

    return sentence_list
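A usage sketch with the stock gpt2 checkpoint and an arbitrary prompt; the function's own module is assumed to import random and typing.List.

import torch
from transformers import GPT2LMHeadModel, GPT2Tokenizer

tokenizer = GPT2Tokenizer.from_pretrained("gpt2")
model = GPT2LMHeadModel.from_pretrained("gpt2")

prompt_tokens = torch.tensor([tokenizer.encode("The quick brown fox")])
for sentence in generate_sentences(model, tokenizer, max_length=64,
                                   num_return_sequences=2,
                                   prompt_tokens=prompt_tokens):
    print(sentence)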