def create_and_check_reformer_model_fp16_generate(self, config, input_ids, input_mask):
    model = ReformerModelWithLMHead(config=config)
    model.to(torch_device)
    model.half()
    model.eval()
    output = model.generate(input_ids, attention_mask=input_mask, do_sample=False)
    self.parent.assertFalse(torch.isnan(output).any().item())
def create_and_check_reformer_model_fp16_generate(self, config, input_ids, input_mask, choice_labels):
    config.is_decoder = True
    config.lsh_num_chunks_after = 0
    model = ReformerModelWithLMHead(config=config)
    model.to(torch_device)
    model.half()
    model.eval()
    # only use last 10 inputs for generation
    output = model.generate(input_ids[:, -10:], attention_mask=input_mask, do_sample=False)
    self.parent.assertFalse(torch.isnan(output).any().item())
def convert_trax_checkpoint_to_pytorch(trax_model_pkl_path, config_file, pytorch_dump_path):
    # Initialise PyTorch model
    config = ReformerConfig.from_json_file(config_file)
    print("Building PyTorch model from configuration: {}".format(str(config)))
    model = ReformerModelWithLMHead(config)

    with open(trax_model_pkl_path, "rb") as f:
        model_weights = pickle.load(f)["weights"]

    set_model_weights_in_torch(model_weights, model, config.hidden_size)

    # Save pytorch-model
    print("Save PyTorch model to {}".format(pytorch_dump_path))
    torch.save(model.state_dict(), pytorch_dump_path)
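# A sketch of how such a conversion function is typically wired to a CLI. The
# flag names below simply mirror the function parameters; they are assumptions,
# not necessarily the original script's interface:
if __name__ == "__main__":
    import argparse

    parser = argparse.ArgumentParser()
    parser.add_argument("--trax_model_pkl_path", type=str, required=True, help="Path to the pickled trax checkpoint.")
    parser.add_argument("--config_file", type=str, required=True, help="Path to the ReformerConfig json file.")
    parser.add_argument("--pytorch_dump_path", type=str, required=True, help="Where to save the PyTorch state dict.")
    args = parser.parse_args()
    convert_trax_checkpoint_to_pytorch(args.trax_model_pkl_path, args.config_file, args.pytorch_dump_path)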
def test_lm_model_forward(self):
    config = self._get_basic_config_and_input()
    config["attn_layers"] = ["local", "lsh", "local", "lsh", "local", "lsh"]
    config["num_buckets"] = [2, 4]
    config["is_decoder"] = False
    torch.manual_seed(0)
    model = ReformerModelWithLMHead(ReformerConfig(**config)).to(torch_device)
    model.eval()
    input_ids, attn_mask = self._get_input_ids_and_mask()
    hidden_states = model(input_ids=input_ids, attention_mask=attn_mask)[0]
    output_slice = hidden_states[1, -1, :5]
    expected_output_slice = torch.tensor(
        [0.0324, -0.0121, 0.0615, 0.0031, -0.0297],
        dtype=torch.float,
        device=torch_device,
    )
    self.assertTrue(torch.allclose(output_slice, expected_output_slice, atol=1e-3))
def get_reformer(vocab_size=77, n_layer=12, n_embd=768, n_head=12, n_positions=512,
                 local_window_size=50, num_buckets=None, num_hashes=1):
    attn_layers = ["local", "local", "lsh", "local", "local", "local",
                   "lsh", "local", "local", "local", "lsh", "local"]
    # attn_layers = ["local", "lsh", "local", "lsh", "local", "lsh",
    #                "local", "lsh", "local", "lsh", "local", "lsh"]
    config = ReformerConfig(
        hash_seed=None,
        attn_layers=attn_layers[:n_layer],
        # attention_head_size=128,
        hidden_size=n_embd,
        max_position_embeddings=350,
        feed_forward_size=3072,
        vocab_size=vocab_size,
        is_decoder=True,
        axial_pos_embds_dim=[256, 512],  # must sum to hidden_size
        axial_pos_shape=[14, 25],  # 14 * 25 = 350 = max_position_embeddings
        num_hashes=num_hashes,
        num_buckets=num_buckets,
        local_attn_chunk_length=local_window_size,
        lsh_attn_chunk_length=local_window_size,
        num_attention_heads=n_head,
        # lsh_attention_probs_dropout_prob=0.1,
        # local_attention_probs_dropout_prob=0.1,
        # hidden_dropout_prob=0.1,
        chunk_size_feed_forward=0,
        chunk_size_lm_head=0,
        eos_token_id=2,
        hidden_act="relu",
    )
    return ReformerModelWithLMHead(config=config)
def main():
    # let's use > 0.5M tokens per sample
    padded_sequence_length = 2 ** 19

    # reduce dataset to one example
    dataset = prepare_dataset(padded_sequence_length)

    # the non_padded_sequence_length defines the max shift for our data collator
    non_padded_sequence_length = padded_sequence_length - sum(dataset["attention_mask"][0])

    # use a special data collator that randomly shifts the input_ids
    data_collator = ReformerCollator(non_padded_sequence_length)

    # create reformer config and init model
    config = create_reformer_config()
    model = ReformerModelWithLMHead(config)

    # create training params
    training_args = get_training_args()

    # create the trainer
    trainer = Trainer(
        model=model,
        args=training_args,
        compute_metrics=compute_metrics,
        data_collator=data_collator,
        train_dataset=dataset,
        eval_dataset=dataset,
    )

    # train
    trainer.train()
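# `ReformerCollator` is referenced above but not shown. A minimal sketch of what
# such a collator might look like, assuming the dataset holds a single padded
# example that gets rolled by a random offset so each training step starts at a
# different position. The class layout is an assumption, not the original code:
import torch

class ReformerCollator:
    def __init__(self, max_roll_length):
        self.max_roll_length = max_roll_length

    def __call__(self, features):
        # sample a random shift in [0, max_roll_length)
        shift = torch.randint(self.max_roll_length, (1,)).item()
        input_ids = torch.roll(torch.as_tensor(features[0]["input_ids"]), shift).unsqueeze(0)
        attention_mask = torch.roll(torch.as_tensor(features[0]["attention_mask"]), shift).unsqueeze(0)
        # causal LM training: the labels are the (rolled) inputs themselves
        return {"input_ids": input_ids, "attention_mask": attention_mask, "labels": input_ids}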
def test_pretrained_generate_use_cache_equality(self):
    model = ReformerModelWithLMHead.from_pretrained("google/reformer-crime-and-punishment").to(torch_device)
    tokenizer = ReformerTokenizer.from_pretrained("google/reformer-crime-and-punishment")
    model.eval()
    input_ids = tokenizer.encode("A few months later", return_tensors="pt").to(torch_device)
    output_ids_with_cache = model.generate(input_ids, max_length=130, num_hashes=8, use_cache=True)
    output_ids_without_cache = model.generate(input_ids, max_length=130, num_hashes=8, use_cache=False)
    output_with_cache = tokenizer.decode(output_ids_with_cache[0])
    output_without_cache = tokenizer.decode(output_ids_without_cache[0])
    self.assertEqual(output_with_cache, output_without_cache)
def test_lsh_lm_model_grad(self):
    config = self._get_basic_config_and_input()
    config["attn_layers"] = ["lsh", "lsh", "lsh", "lsh"]
    config["hidden_dropout_prob"] = 0.0
    config["lsh_attention_probs_dropout_prob"] = 0.0
    config["num_buckets"] = [2, 4]
    config["num_hashes"] = 6
    torch.manual_seed(0)
    model = ReformerModelWithLMHead(ReformerConfig(**config)).to(torch_device)
    model.train()
    model.zero_grad()
    input_ids, _ = self._get_input_ids_and_mask()
    loss = model(input_ids=input_ids, labels=input_ids)[0]
    self.assertTrue(torch.allclose(loss, torch.tensor(5.7819, dtype=torch.float, device=torch_device), atol=1e-3))
    loss.backward()

    # check last grads to cover all probable errors
    grad_slice_word = model.reformer.embeddings.word_embeddings.weight.grad[0, :5]
    expected_grad_slice_word = torch.tensor(
        [2.6357e-05, 4.3358e-04, -8.4985e-04, 1.0094e-04, 3.8954e-04],
        dtype=torch.float,
        device=torch_device,
    )
    grad_slice_position_factor_1 = model.reformer.embeddings.position_embeddings.weights[0].grad[1, 0, -5:]
    expected_grad_slice_pos_fac_1 = torch.tensor(
        [-0.0984, 0.6283, 0.4282, 1.2960, 0.6897],
        dtype=torch.float,
        device=torch_device,
    )
    grad_slice_position_factor_2 = model.reformer.embeddings.position_embeddings.weights[1].grad[0, 1, :5]
    expected_grad_slice_pos_fac_2 = torch.tensor(
        [0.4626, -0.0231, -0.0172, 0.1081, 0.3805],
        dtype=torch.float,
        device=torch_device,
    )
    self.assertTrue(torch.allclose(grad_slice_word, expected_grad_slice_word, atol=1e-3))
    self.assertTrue(torch.allclose(grad_slice_position_factor_1, expected_grad_slice_pos_fac_1, atol=1e-3))
    self.assertTrue(torch.allclose(grad_slice_position_factor_2, expected_grad_slice_pos_fac_2, atol=1e-3))
def create_and_check_past_buckets_states(self, config, input_ids, input_mask, choice_labels):
    config.is_decoder = True
    config.lsh_num_chunks_before = 1
    config.lsh_num_chunks_after = 0
    model = ReformerModelWithLMHead(config=config)
    model.to(torch_device)
    model.eval()
    input_ids_first = input_ids[:, :-1]
    input_ids_second = input_ids[:, -1:]

    # return saved cache
    past_buckets_states = model(input_ids_first, use_cache=True)["past_buckets_states"]

    # calculate last output with and without cache
    outputs_with_cache = model(input_ids_second, past_buckets_states=past_buckets_states, use_cache=True)["logits"]
    outputs_without_cache = model(input_ids)["logits"][:, -1]

    # select random slice idx
    random_slice_idx = torch.randint(outputs_without_cache.shape[-1], (1, 1), device=torch_device).item()

    # outputs should be similar within range
    self.parent.assertTrue(
        torch.allclose(
            outputs_with_cache[:, 0, random_slice_idx], outputs_without_cache[:, random_slice_idx], atol=1e-2
        )
    )
def create_and_check_reformer_model_with_lm_backward(self, config, input_ids, input_mask, choice_labels):
    model = ReformerModelWithLMHead(config=config)
    model.to(torch_device)
    model.eval()
    loss = model(input_ids, attention_mask=input_mask, labels=input_ids)[0]
    loss.backward()
def test_local_lm_model_grad(self):
    config = self._get_basic_config_and_input()
    config["attn_layers"] = ["local", "local", "local", "local"]
    config["hidden_dropout_prob"] = 0.0
    config["local_attention_probs_dropout_prob"] = 0.0
    torch.manual_seed(0)
    model = ReformerModelWithLMHead(ReformerConfig(**config)).to(torch_device)
    model.train()
    model.zero_grad()
    input_ids, _ = self._get_input_ids_and_mask()
    loss = model(input_ids=input_ids, labels=input_ids)[0]
    self.assertTrue(torch.allclose(loss, torch.tensor(5.7786, dtype=torch.float, device=torch_device), atol=1e-3))
    loss.backward()

    # check last grads to cover all probable errors
    grad_slice_word = model.reformer.embeddings.word_embeddings.weight.grad[0, :5]
    expected_grad_slice_word = torch.tensor(
        [-0.0005, 0.0001, 0.0002, 0.0003, 0.0006],
        dtype=torch.float,
        device=torch_device,
    )
    grad_slice_position_factor_1 = model.reformer.embeddings.position_embeddings.weights[0].grad[1, 0, -5:]
    expected_grad_slice_pos_fac_1 = torch.tensor(
        [0.0037, -1.3793, -1.0231, -1.5230, -2.5306],
        dtype=torch.float,
        device=torch_device,
    )
    grad_slice_position_factor_2 = model.reformer.embeddings.position_embeddings.weights[1].grad[0, 1, :5]
    expected_grad_slice_pos_fac_2 = torch.tensor(
        [-1.3165, 0.5168, 0.7785, 1.0811, -0.9830],
        dtype=torch.float,
        device=torch_device,
    )
    self.assertTrue(torch.allclose(grad_slice_word, expected_grad_slice_word, atol=1e-3))
    self.assertTrue(torch.allclose(grad_slice_position_factor_1, expected_grad_slice_pos_fac_1, atol=1e-3))
    self.assertTrue(torch.allclose(grad_slice_position_factor_2, expected_grad_slice_pos_fac_2, atol=1e-3))
def create_and_check_reformer_with_lm(self, config, input_ids, input_mask, choice_labels):
    config.lsh_num_chunks_after = 0
    config.is_decoder = True
    model = ReformerModelWithLMHead(config=config)
    model.to(torch_device)
    model.eval()
    result = model(input_ids, attention_mask=input_mask, labels=input_ids)
    self.parent.assertEqual(result.logits.shape, (self.batch_size, self.seq_length, self.vocab_size))
def create_and_check_reformer_model_generate(self, config, input_ids, input_mask, choice_labels):
    config.is_decoder = True
    config.lsh_num_chunks_after = 0
    config.bos_token_id = 0
    config.eos_token_id = None
    config.max_length = 20
    model = ReformerModelWithLMHead(config=config)
    model.to(torch_device)
    model.eval()
    # without input_ids, generate() starts from bos_token_id
    output = model.generate()
    self.parent.assertIsNotNone(output)
def test_pretrained_generate_crime_and_punish(self):
    model = ReformerModelWithLMHead.from_pretrained("google/reformer-crime-and-punishment").to(torch_device)
    tokenizer = ReformerTokenizer.from_pretrained("google/reformer-crime-and-punishment")
    model.eval()
    input_ids = tokenizer.encode("A few months later", return_tensors="pt").to(torch_device)
    output_ids = model.generate(
        input_ids, max_length=50, num_beams=4, early_stopping=True, do_sample=False, num_hashes=8
    )
    output_text = tokenizer.decode(output_ids[0])
    self.assertEqual(
        output_text,
        "A few months later state expression in his ideas, at the first entrance. He was positively for an inst",
    )
def create_and_check_reformer_no_chunking(self, config, input_ids, input_mask, choice_labels):
    # force chunk length to be bigger than input_ids
    config.lsh_attn_chunk_length = 2 * input_ids.shape[-1]
    config.local_attn_chunk_length = 2 * input_ids.shape[-1]
    model = ReformerModelWithLMHead(config=config)
    model.to(torch_device)
    model.eval()
    output_logits = model(input_ids, attention_mask=input_mask)[0]
    self.parent.assertTrue(output_logits.shape[1] == input_ids.shape[-1])
def create_and_check_reformer_with_lm(self, config, input_ids, input_mask, choice_labels):
    model = ReformerModelWithLMHead(config=config)
    model.to(torch_device)
    model.eval()
    loss, prediction_scores = model(input_ids, attention_mask=input_mask, labels=input_ids)
    result = {
        "loss": loss,
        "prediction_scores": prediction_scores,
    }
    self.parent.assertListEqual(
        list(result["prediction_scores"].size()),
        [self.batch_size, self.seq_length, self.vocab_size],
    )
    self.check_loss_output(result)
def __init__(self, text_processor: TextProcessor, config: ReformerConfig = None, size: int = 1):
    """
    :param size: config size: 1 small, 2 medium, 3 base.
    """
    super(ReformerLM, self).__init__()
    self.text_processor: TextProcessor = text_processor

    if config is not None:
        self.config = config
    else:
        config_func = _small_config if size == 1 else (_base_config if size == 3 else _medium_config)
        self.config = config_func(
            vocab_size=text_processor.tokenizer.get_vocab_size(),
            pad_token_id=text_processor.pad_token_id(),
            eos_token_id=text_processor.sep_token_id(),
        )
        self.config = ReformerConfig(**self.config)

    reformer = ReformerModelWithLMHead(self.config)
    self.lm_head: ReformerOnlyLMHead = reformer.lm_head
    self.encoder: ReformerModel = reformer.reformer
def test_model_from_pretrained(self):
    for model_name in REFORMER_PRETRAINED_MODEL_ARCHIVE_LIST[:1]:
        model = ReformerModelWithLMHead.from_pretrained(model_name)
        self.assertIsNotNone(model)
def create_and_check_reformer_feed_backward_chunking(self, config, input_ids, input_mask, choice_labels):
    if not self.is_training:
        return

    # disable dropout
    config.hidden_dropout_prob = 0
    config.local_attention_probs_dropout_prob = 0
    config.lsh_attention_probs_dropout_prob = 0

    torch.manual_seed(0)
    model = ReformerModelWithLMHead(config=config)
    model.to(torch_device)
    model.train()
    model.zero_grad()
    loss_no_chunk, output_no_chunk = model(input_ids, labels=input_ids, attention_mask=input_mask)[:2]
    loss_no_chunk.backward()
    grad_slice_word_no_chunk = model.reformer.embeddings.word_embeddings.weight.grad[0, :5]
    grad_slice_position_factor_1_no_chunk = model.reformer.embeddings.position_embeddings.weights[0].grad[1, 0, -5:]
    grad_slice_position_factor_2_no_chunk = model.reformer.embeddings.position_embeddings.weights[1].grad[0, 1, :5]

    config.chunk_size_lm_head = 1
    config.chunk_size_feed_forward = 1

    torch.manual_seed(0)
    model = ReformerModelWithLMHead(config=config)
    model.to(torch_device)
    model.train()
    model.zero_grad()
    loss_chunk, output_chunk = model(input_ids, labels=input_ids, attention_mask=input_mask)[:2]
    loss_chunk.backward()
    grad_slice_word_chunk = model.reformer.embeddings.word_embeddings.weight.grad[0, :5]
    grad_slice_position_factor_1_chunk = model.reformer.embeddings.position_embeddings.weights[0].grad[1, 0, -5:]
    grad_slice_position_factor_2_chunk = model.reformer.embeddings.position_embeddings.weights[1].grad[0, 1, :5]

    self.parent.assertTrue(torch.allclose(loss_chunk, loss_no_chunk, atol=1e-3))
    self.parent.assertTrue(torch.allclose(grad_slice_word_no_chunk, grad_slice_word_chunk, atol=1e-3))
    self.parent.assertTrue(torch.allclose(grad_slice_position_factor_1_chunk, grad_slice_position_factor_1_no_chunk, atol=1e-3))
    self.parent.assertTrue(torch.allclose(grad_slice_position_factor_2_chunk, grad_slice_position_factor_2_no_chunk, atol=1e-3))
# Decoding
def decode(outputs_ids):
    decoded_outputs = []
    o = outputs_ids.tolist() if torch.is_tensor(outputs_ids) else outputs_ids
    for output_ids in o:
        # transform each id back to a char; IDs < 2 are simply transformed to ""
        decoded_outputs.append("".join([chr(x - 2) if x > 1 else "" for x in output_ids]))
    return decoded_outputs

from transformers import ReformerModelWithLMHead, ReformerForMaskedLM

# transformers.ReformerModel - raw hidden states
# ReformerForMaskedLM - UGH THIS IS WHAT I WANT
# ReformerModelWithLMHead - next token prediction ONLY
model = ReformerModelWithLMHead.from_pretrained("google/reformer-enwik8")

encoded, attention_masks = encode(["In 1965, Brooks left IBM to found the Department of"])
x = model.generate(encoded, do_sample=True, max_length=150)
d = decode(x)

input_ids, attention_masks = encode(["In 1965, Brooks left IBM to found the Department of"])
# i, a = input_ids.to("cuda"), attention_masks.to("cuda")

sentence = "The quick brown fox jumps over the lazy dog."
input_ids, attention_masks = encode([sentence])
attention_masks[0, 37] = attention_masks[0, 19] = attention_masks[0, 27] = 0
i, a = input_ids, attention_masks
f = model(input_ids=i, attention_mask=a)
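# The snippet above calls an `encode` helper that is not shown. A minimal sketch,
# mirroring the `decode` above and the char-level scheme used by
# google/reformer-enwik8 (a character maps to its byte value + 2, 0 is padding):
import torch

def encode(list_of_strings, pad_token_id=0):
    max_length = max([len(string) for string in list_of_strings])
    # start from all-padding tensors and fill in the encoded bytes
    attention_masks = torch.zeros((len(list_of_strings), max_length), dtype=torch.long)
    input_ids = torch.full((len(list_of_strings), max_length), pad_token_id, dtype=torch.long)
    for idx, string in enumerate(list_of_strings):
        # make sure string is in byte format
        if not isinstance(string, bytes):
            string = str.encode(string)
        input_ids[idx, : len(string)] = torch.tensor([x + 2 for x in string])
        attention_masks[idx, : len(string)] = 1
    return input_ids, attention_masks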
from pydantic import BaseModel, Field
from transformers import ReformerModelWithLMHead, ReformerTokenizer

tokenizer = ReformerTokenizer.from_pretrained('google/reformer-crime-and-punishment')
model = ReformerModelWithLMHead.from_pretrained('google/reformer-crime-and-punishment')


##
# Crime-and-Punishment text generator.
def mk_crime_punish(text, length, how_many, top_p, top_k, do_sample):
    try:
        input_ids = tokenizer.encode(text, return_tensors='pt')
        min_length = len(input_ids.tolist()[0])
        length += min_length
        length = length if length > 0 else 1
        top_k = top_k if top_k > 0 else 10
        top_p = top_p if top_p > 0 else 0.5

        # model generating
        sample_outputs = model.generate(input_ids,
                                        pad_token_id=50256,  # note: GPT-2's pad id, left over from the demo this was based on
                                        do_sample=do_sample,
                                        max_length=length,
                                        top_p=top_p,
                                        top_k=top_k,
                                        num_return_sequences=how_many)
        result = dict()
        # the original snippet is truncated here; a plausible completion:
        for idx, output in enumerate(sample_outputs):
            result[idx] = tokenizer.decode(output, skip_special_tokens=True)
        return result
    except Exception as err:
        # surface errors to the caller instead of crashing
        return {"error": str(err)}
from itertools import cycle
from collections import OrderedDict
import json

from torch.utils.data import DataLoader
from transformers import AdamW, ReformerConfig, ReformerModelWithLMHead

# SequenceDataset, tokenizer, BATCH_SIZE, and LEARNING_RATE are defined elsewhere in the original script
train_dataset = SequenceDataset.prepare_from_file("data/yeast/yeast_train.txt", tokenizer)
val_dataset = SequenceDataset.prepare_from_file("data/yeast/yeast_val.txt", tokenizer)
train_loader = cycle(DataLoader(train_dataset, batch_size=BATCH_SIZE))
val_loader = cycle(DataLoader(val_dataset, batch_size=BATCH_SIZE))

# configuration = ReformerConfig.from_pretrained("google/reformer-crime-and-punishment")
# configuration.axial_pos_shape = (64, 72)
# configuration.max_position_embeddings = SEQ_LEN
# configuration.vocab_size = tokenizer.vocab_size
# configuration.save_pretrained('model/config/')
configuration = ReformerConfig.from_pretrained('model/config/')
model = ReformerModelWithLMHead(configuration)
model.cuda()

NUM_BATCHES = len(train_dataset) // BATCH_SIZE
optimizer = AdamW(params=model.parameters(), lr=LEARNING_RATE)

all_training_loss = OrderedDict()
all_val_loss = OrderedDict()

for x in range(1):
    print(f"epoch {x}")
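    # the original snippet is truncated here; a plausible sketch of the inner
    # training step, assuming each batch is a dict with an "input_ids" tensor
    # (the batch layout of SequenceDataset is an assumption, not shown above):
    for i in range(NUM_BATCHES):
        model.train()
        batch = next(train_loader)
        input_ids = batch["input_ids"].cuda()
        # causal LM training: the labels are the inputs themselves
        outputs = model(input_ids=input_ids, labels=input_ids)
        loss = outputs[0]
        loss.backward()
        optimizer.step()
        optimizer.zero_grad()
        all_training_loss[i] = loss.item()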