Example #1
0
    def test_pretrained_generate_use_cache_equality(self):
        """Greedy generation must produce identical text with and without caching.

        Fix: in the original, the two output variables were named opposite to
        the ``use_cache`` flag each was generated with (``output_ids_with_cache``
        came from ``use_cache=False``). The equality assertion is symmetric, so
        the test still passed, but the names were misleading; they now match
        the flags.
        """
        model = ReformerModelWithLMHead.from_pretrained("google/reformer-crime-and-punishment").to(torch_device)
        tokenizer = ReformerTokenizer.from_pretrained("google/reformer-crime-and-punishment")
        model.eval()
        input_ids = tokenizer.encode("A few months later", return_tensors="pt").to(torch_device)
        # Same prompt, same settings — only the caching flag differs.
        output_ids_without_cache = model.generate(input_ids, max_length=130, num_hashes=8, use_cache=False)
        output_ids_with_cache = model.generate(input_ids, max_length=130, num_hashes=8, use_cache=True)

        output_without_cache = tokenizer.decode(output_ids_without_cache[0])
        output_with_cache = tokenizer.decode(output_ids_with_cache[0])

        self.assertEqual(output_with_cache, output_without_cache)
    def test_pretrained_generate_crime_and_punish(self):
        """Beam search from the pretrained checkpoint must reproduce a known
        reference continuation (deterministic: do_sample=False)."""
        tokenizer = ReformerTokenizer.from_pretrained("google/reformer-crime-and-punishment")
        model = ReformerModelWithLMHead.from_pretrained("google/reformer-crime-and-punishment").to(torch_device)
        model.eval()

        prompt = tokenizer.encode("A few months later", return_tensors="pt").to(torch_device)
        generated = model.generate(
            prompt, max_length=50, num_beams=4, early_stopping=True, do_sample=False, num_hashes=8
        )
        decoded = tokenizer.decode(generated[0])
        expected = "A few months later state expression in his ideas, at the first entrance. He was positively for an inst"
        self.assertEqual(decoded, expected)
Example #3
0
 def test_model_from_pretrained(self):
     """Smoke test: the first archived checkpoint loads into a non-None model."""
     first_checkpoint = REFORMER_PRETRAINED_MODEL_ARCHIVE_LIST[:1]
     for checkpoint_name in first_checkpoint:
         loaded = ReformerModelWithLMHead.from_pretrained(checkpoint_name)
         self.assertIsNotNone(loaded)
Example #4
0
    # Decoding
    def decode(outputs_ids):
        """Turn Reformer-enwik8 token-id sequences back into strings.

        Accepts either a tensor or a nested list of int ids. Each id is a
        character code shifted up by 2; ids below 2 decode to the empty
        string. Returns one decoded string per input sequence.
        """
        if torch.is_tensor(outputs_ids):
            outputs_ids = outputs_ids.tolist()
        texts = []
        for sequence in outputs_ids:
            pieces = (chr(tok - 2) if tok > 1 else "" for tok in sequence)
            texts.append("".join(pieces))
        return texts

    # Scratch/exploration script for the Reformer enwik8 character model.
    # NOTE(review): relies on an `encode` helper defined elsewhere in this
    # file (not visible here) — presumably the inverse of `decode` above;
    # confirm against its definition.
    from transformers import ReformerModelWithLMHead, ReformerForMaskedLM
    # transformers.ReformerModel - raw hidden states
    # ReformerForMaskedLM - UGH THIS IS WHAT I WANT
    # ReformerModelWithLMHead - next token prediction ONLY
    model = ReformerModelWithLMHead.from_pretrained("google/reformer-enwik8")
    # Sample a continuation of the prompt (non-deterministic: do_sample=True).
    encoded, attention_masks = encode(
        ["In 1965, Brooks left IBM to found the Department of"])
    x = model.generate(encoded, do_sample=True, max_length=150)
    d = decode(x)

    # Re-encode the same prompt; `d` above is otherwise unused from here on.
    input_ids, attention_masks = encode(
        ["In 1965, Brooks left IBM to found the Department of"])
    #i,a = input_ids.to("cuda"), attention_masks.to("cuda")

    # Mask out three positions of the attention mask by index — presumably to
    # probe the model's fill-in behavior at those characters; TODO confirm
    # the indices correspond to intended characters of the sentence.
    sentence = "The quick brown fox jumps over the lazy dog."
    input_ids, attention_masks = encode([sentence])
    attention_masks[0, 37] = attention_masks[0, 19] = attention_masks[0,
                                                                      27] = 0
    i, a = input_ids, attention_masks
    # NOTE(review): this statement is truncated in the source — the
    # `model.forward(...)` call is cut off mid-argument-list.
    f = model.forward(input_ids=i,
Example #5
0
from pydantic import BaseModel, Field
from transformers import ReformerModelWithLMHead, ReformerTokenizer

# Pretrained Reformer checkpoint (Crime and Punishment). Loaded once at module
# import time — this hits the network/disk cache on first import.
tokenizer = ReformerTokenizer.from_pretrained('google/reformer-crime-and-punishment')
model = ReformerModelWithLMHead.from_pretrained('google/reformer-crime-and-punishment')


##
# Reformer text generator (crime-and-punishment checkpoint): samples a
# continuation of `text` with the top-p / top-k settings supplied below.
def mk_crime_punish(text, length, how_many, top_p, top_k, do_sample):
    try:
        input_ids = tokenizer.encode(text, return_tensors='pt')

        min_length = len(input_ids.tolist()[0])
        length += min_length

        length = length if length > 0 else 1
        top_k = top_k if top_k > 0 else 10
        top_p = top_p if top_p > 0 else 0.5

        # model generating
        sample_outputs = model.generate(input_ids, pad_token_id=50256,
                                        do_sample=do_sample,
                                        max_length=length,
                                        top_p=top_p,
                                        top_k=top_k,
                                        num_return_sequences=how_many)

        result = dict()