def test_inference_masked_lm(self):
    model = LongformerForMaskedLM.from_pretrained("allenai/longformer-base-4096")
    model.to(torch_device)

    # 'Hello world! ' repeated 1000 times
    input_ids = torch.tensor(
        [[0] + [20920, 232, 328, 1437] * 1000 + [2]], dtype=torch.long, device=torch_device
    )  # long input

    loss, prediction_scores = model(input_ids, labels=input_ids)

    expected_loss = torch.tensor(0.0620, device=torch_device)
    expected_prediction_scores_sum = torch.tensor(-6.1599e08, device=torch_device)
    expected_prediction_scores_mean = torch.tensor(-3.0622, device=torch_device)
    input_ids = input_ids.to(torch_device)

    self.assertTrue(torch.allclose(loss, expected_loss, atol=1e-4))
    self.assertTrue(torch.allclose(prediction_scores.sum(), expected_prediction_scores_sum, atol=1e-4))
    self.assertTrue(torch.allclose(prediction_scores.mean(), expected_prediction_scores_mean, atol=1e-4))
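The integration test above runs the checkpoint on a long synthetic input and checks exact loss/logit statistics. For a quicker sanity check, here is a minimal fill-mask sketch with the same public checkpoint; the example sentence and the attribute-style `.logits` access (which assumes a transformers version whose models return `ModelOutput` objects) are illustrative assumptions, not part of the original test.

import torch
from transformers import LongformerForMaskedLM, LongformerTokenizer

# Illustrative fill-mask sketch with the same public checkpoint.
tokenizer = LongformerTokenizer.from_pretrained("allenai/longformer-base-4096")
model = LongformerForMaskedLM.from_pretrained("allenai/longformer-base-4096")
model.eval()

text = f"Hello {tokenizer.mask_token}! Hello world!"  # assumed example sentence
inputs = tokenizer(text, return_tensors="pt")

with torch.no_grad():
    logits = model(**inputs).logits

# Position of the <mask> token and its highest-scoring replacement.
mask_pos = (inputs["input_ids"][0] == tokenizer.mask_token_id).nonzero(as_tuple=True)[0]
predicted_id = logits[0, mask_pos].argmax(dim=-1)
print(tokenizer.decode(predicted_id))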
def create_and_check_longformer_for_masked_lm(
    self, config, input_ids, token_type_ids, input_mask, sequence_labels, token_labels, choice_labels
):
    model = LongformerForMaskedLM(config=config)
    model.to(torch_device)
    model.eval()
    result = model(input_ids, attention_mask=input_mask, token_type_ids=token_type_ids, labels=token_labels)
    self.parent.assertEqual(result.logits.shape, (self.batch_size, self.seq_length, self.vocab_size))
def create_and_check_longformer_for_masked_lm(
    self, config, input_ids, token_type_ids, input_mask, sequence_labels, token_labels, choice_labels
):
    model = LongformerForMaskedLM(config=config)
    model.to(torch_device)
    model.eval()
    loss, prediction_scores = model(
        input_ids, attention_mask=input_mask, token_type_ids=token_type_ids, masked_lm_labels=token_labels
    )
    result = {
        "loss": loss,
        "prediction_scores": prediction_scores,
    }
    self.parent.assertListEqual(
        list(result["prediction_scores"].size()), [self.batch_size, self.seq_length, self.vocab_size]
    )
    self.check_loss_output(result)
    '5.0',
    '--per_gpu_eval_batch_size', '2',
    '--per_gpu_train_batch_size', '1',  # 32GB gpu with fp32
    '--gradient_accumulation_steps', '32',
    # '--evaluate_during_training',  # removed to reduce training time
    '--do_train',
    '--do_eval',
])

train_fn = './Preprocessed_Data/Preproc0_clinical_sentences_all_with_number_train.txt'
val_fn = './Preprocessed_Data/Preproc0_clinical_sentences_all_with_number_val.txt'

# these are small files for testing
# train_fn = './Preprocessed_Data/test_clinical_sentences_all_with_number_train.txt'
# val_fn = './Preprocessed_Data/test_clinical_sentences_all_with_number_val.txt'

training_args.val_datapath = val_fn
training_args.train_datapath = train_fn

##################### use pretrained longformer from transformers
longformer_model = LongformerForMaskedLM.from_pretrained('allenai/longformer-base-4096')
longformer_tokenizer = LongformerTokenizer.from_pretrained('allenai/longformer-base-4096')

logger.info('Train and eval with Longformer pretrained ...')
pretrain_and_evaluate(
    training_args, longformer_model, longformer_tokenizer, eval_only=False, model_path=None
    # model_path=training_args.output_dir  # Local path to the model if the model to train has been instantiated from a local path.
)
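The `pretrain_and_evaluate` helper called above (and in the later snippets) is defined elsewhere in that script. A minimal sketch of what such a helper typically looks like, modeled on the common Longformer pretraining recipe, is shown below; it assumes an older transformers release (~3.x) in which `TextDataset` is available and `Trainer` still accepts `prediction_loss_only` and `train(model_path=...)`. The `train_only` flag passed in later snippets would be an additional argument along the same lines.

import math
from transformers import TextDataset, DataCollatorForLanguageModeling, Trainer

def pretrain_and_evaluate(args, model, tokenizer, eval_only, model_path):
    # Plain-text files are chunked into fixed-size blocks of token ids.
    val_dataset = TextDataset(tokenizer=tokenizer, file_path=args.val_datapath,
                              block_size=tokenizer.model_max_length)
    if eval_only:
        train_dataset = val_dataset
    else:
        train_dataset = TextDataset(tokenizer=tokenizer, file_path=args.train_datapath,
                                    block_size=tokenizer.model_max_length)

    # Standard MLM collator: 15% of tokens are masked for the loss.
    data_collator = DataCollatorForLanguageModeling(tokenizer=tokenizer, mlm=True, mlm_probability=0.15)
    trainer = Trainer(model=model, args=args, data_collator=data_collator,
                      train_dataset=train_dataset, eval_dataset=val_dataset,
                      prediction_loss_only=True)

    eval_loss = trainer.evaluate()['eval_loss']
    print(f'Initial eval BPC: {eval_loss / math.log(2)}')

    if not eval_only:
        trainer.train(model_path=model_path)
        trainer.save_model()
        eval_loss = trainer.evaluate()['eval_loss']
        print(f'Eval BPC after training: {eval_loss / math.log(2)}')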
assert max_pos > current_max_pos
new_pos_embed = roberta.bert.embeddings.position_embeddings.weight.new_empty(max_pos, embed_size)

# copy position embeddings over and over to initialize the new position embeddings
k = 2
step = current_max_pos - 2
while k < max_pos - 1:
    if k + step >= max_pos:
        new_pos_embed[k:] = roberta.bert.embeddings.position_embeddings.weight[2:(max_pos + 2 - k)]
    else:
        new_pos_embed[k:(k + step)] = roberta.bert.embeddings.position_embeddings.weight[2:]
    k += step
roberta.bert.embeddings.position_embeddings.weight.data = new_pos_embed
roberta.bert.embeddings.position_ids.data = torch.tensor([i for i in range(max_pos)]).reshape(1, max_pos)

# add global attention projections, initialized from the local ones
config.attention_window = [attention_window] * config.num_hidden_layers
for i in range(len(roberta.bert.encoder.layer)):
    roberta.bert.encoder.layer[i].attention.self.query_global = copy.deepcopy(roberta.bert.encoder.layer[i].attention.self.query)
    roberta.bert.encoder.layer[i].attention.self.key_global = copy.deepcopy(roberta.bert.encoder.layer[i].attention.self.key)
    roberta.bert.encoder.layer[i].attention.self.value_global = copy.deepcopy(roberta.bert.encoder.layer[i].attention.self.value)

lfm = LongformerForMaskedLM(config)
lfm.longformer.load_state_dict(roberta.bert.state_dict())
lfm.lm_head.dense.load_state_dict(roberta.cls.predictions.transform.dense.state_dict())
lfm.lm_head.layer_norm.load_state_dict(roberta.cls.predictions.transform.LayerNorm.state_dict())
lfm.lm_head.decoder.load_state_dict(roberta.cls.predictions.decoder.state_dict())
lfm.lm_head.bias = copy.deepcopy(roberta.cls.predictions.bias)

lfm.save_pretrained('PLMConfig/roberta-converted-lfm')
tokenizer.save_pretrained('PLMConfig/roberta-converted-lfm')
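The conversion above relies on objects defined earlier in that script and not shown here: a BERT-style masked-LM checkpoint (named `roberta` even though it exposes `.bert` and `.cls.predictions`), its tokenizer, a target `max_pos`, an `attention_window`, and a `LongformerConfig`. A minimal sketch of that setup follows; the checkpoint name and the size choices are assumptions for illustration, not values from the original script.

import copy
import torch
from transformers import BertForMaskedLM, BertTokenizerFast, LongformerConfig

# Assumed setup for the conversion: any BERT-style MLM checkpoint works in principle.
base_checkpoint = 'bert-base-uncased'   # assumption, not from the original script
max_pos = 4096                          # target maximum sequence length
attention_window = 512                  # local attention window per layer

roberta = BertForMaskedLM.from_pretrained(base_checkpoint)
tokenizer = BertTokenizerFast.from_pretrained(base_checkpoint, model_max_length=max_pos)

# Start the Longformer config from the base model's config, then stretch the position space.
config = LongformerConfig.from_pretrained(base_checkpoint)
config.max_position_embeddings = max_pos

current_max_pos, embed_size = roberta.bert.embeddings.position_embeddings.weight.shape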
    '1',  # 32GB gpu with fp32
    '--gradient_accumulation_steps', '4',
    # '--evaluate_during_training',  # removed to reduce training time
    '--do_train',
])

# train_fn = '/gpfs/scratch/xl3119/capstone/data/Preproc0_clinical_sentences_all_without_number_train_patients.txt'
# val_fn = '/gpfs/scratch/xl3119/capstone/data/Preproc0_clinical_sentences_all_without_number_val_patients.txt'

# these are small files for testing
train_fn = '/scratch/xl3119/capstone/data/sample/sample.txt'
val_fn = '/scratch/xl3119/capstone/data/sample/sample.txt'

training_args.train_datapath = train_fn
training_args.val_datapath = val_fn

##################### build longformer in transformers from a local config
init_config = LongformerConfig.from_json_file('config_files/longformer_base_4096/config.json')
mimic_tokenizer = BertTokenizer.from_pretrained('mimic_tokenizer')
word_embeddings = np.loadtxt(join('/scratch/xl3119/capstone/wd_emb', "word_embedding_matrix.txt"))
longformer_model = LongformerForMaskedLM(init_config)
longformer_model = use_embeddings_fasttext(longformer_model, word_embeddings)
# longformer_tokenizer = LongformerTokenizer.from_pretrained('allenai/longformer-base-4096')

logger.info('Train and eval with Longformer pretrained ...')
pretrain_and_evaluate(
    training_args, longformer_model, mimic_tokenizer, train_only=True, eval_only=False, model_path=None
    # model_path=training_args.output_dir  # Local path to the model if the model to train has been instantiated from a local path.
)
def main():
    parser = HfArgumentParser((ModelArguments, DataTrainingArguments, TrainingArguments))
    model_args, data_args, training_args = parser.parse_args_into_dataclasses()

    if data_args.eval_data_file is None and training_args.do_eval:
        raise ValueError(
            "Cannot do evaluation without an evaluation data file. Either supply a file to --eval_data_file "
            "or remove the --do_eval argument."
        )
    if (
        os.path.exists(training_args.output_dir)
        and os.listdir(training_args.output_dir)
        and training_args.do_train
        and not training_args.overwrite_output_dir
    ):
        raise ValueError(
            f"Output directory ({training_args.output_dir}) already exists and is not empty. Use --overwrite_output_dir to overcome."
        )

    logging.basicConfig(
        format="%(asctime)s - %(levelname)s - %(name)s - %(message)s",
        datefmt="%m/%d/%Y %H:%M:%S",
        level=logging.INFO if training_args.local_rank in [-1, 0] else logging.WARN,
    )
    logger.warning(
        "Process rank: %s, device: %s, n_gpu: %s, distributed training: %s, 16-bits training: %s",
        training_args.local_rank,
        training_args.device,
        training_args.n_gpu,
        bool(training_args.local_rank != -1),
        training_args.fp16,
    )
    logger.info("Training/evaluation parameters %s", training_args)

    set_seed(training_args.seed)

    model = LongformerForMaskedLM.from_pretrained('allenai/longformer-base-4096')
    tokenizer = LongformerTokenizer.from_pretrained('allenai/longformer-base-4096')
    tokenizer.add_tokens(['<doc-s>'], special_tokens=True)
    tokenizer.add_tokens(['</doc-s>'], special_tokens=True)

    data_args.block_size = 4096
    train_dataset = get_dataset(data_args, tokenizer=tokenizer, local_rank=training_args.local_rank)

    model.resize_token_embeddings(len(tokenizer))

    data_collator = DataCollatorForLanguageModeling(
        tokenizer=tokenizer,
        mlm=data_args.mlm,
        mlm_probability=data_args.mlm_probability,
        globalize_special_tokens=data_args.globalize_special_tokens,
    )

    trainer = Trainer(
        model=model,
        args=training_args,
        data_collator=data_collator,
        train_dataset=train_dataset,
        # eval_dataset=eval_dataset,
        prediction_loss_only=True,
    )

    model_path = (
        model_args.model_name_or_path
        if model_args.model_name_or_path is not None and os.path.isdir(model_args.model_name_or_path)
        else None
    )
    trainer.train(model_path=model_path)
    if trainer.is_world_master():
        tokenizer.save_pretrained(training_args.output_dir)

    results = {}
    logger.info("*** Evaluate ***")
    eval_output = trainer.evaluate()
    perplexity = math.exp(eval_output["eval_loss"])
    result = {"perplexity": perplexity}

    output_eval_file = os.path.join(training_args.output_dir, "eval_results_lm.txt")
    with open(output_eval_file, "w") as writer:
        logger.info("***** Eval results *****")
        for key in sorted(result.keys()):
            logger.info("  %s = %s", key, str(result[key]))
            writer.write("%s = %s\n" % (key, str(result[key])))
    results.update(result)

    return results
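The `ModelArguments` and `DataTrainingArguments` dataclasses parsed by `HfArgumentParser` above are defined elsewhere in that script. The sketch below reconstructs only the fields that `main()` actually reads; the defaults and help strings are assumptions. Note that `globalize_special_tokens` is not an argument of the stock `DataCollatorForLanguageModeling`, so the script evidently uses a customized collator that gives global attention to the added `<doc-s>` / `</doc-s>` tokens.

from dataclasses import dataclass, field
from typing import Optional

# Illustrative sketch of the argument dataclasses; field names follow the
# attributes used in main(), defaults are assumptions.
@dataclass
class ModelArguments:
    model_name_or_path: Optional[str] = field(
        default=None, metadata={"help": "Local checkpoint directory to resume from; None starts from allenai/longformer-base-4096."}
    )

@dataclass
class DataTrainingArguments:
    train_data_file: Optional[str] = field(default=None, metadata={"help": "Training text file."})
    eval_data_file: Optional[str] = field(default=None, metadata={"help": "Evaluation text file; required when --do_eval is set."})
    block_size: int = field(default=4096, metadata={"help": "Maximum sequence length after tokenization."})
    mlm: bool = field(default=True, metadata={"help": "Use masked-language-modeling loss."})
    mlm_probability: float = field(default=0.15, metadata={"help": "Fraction of tokens to mask."})
    globalize_special_tokens: bool = field(default=False, metadata={"help": "Give global attention to the added document-boundary tokens (custom collator option)."})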
# create model
config = LongformerConfig(
    attention_window=attention_window,
    sep_token_id=bpe_tokenizer.get_vocab()["</s>"],
    pad_token_id=bpe_tokenizer.get_vocab()["<pad>"],
    bos_token_id=bpe_tokenizer.get_vocab()["<s>"],
    eos_token_id=bpe_tokenizer.get_vocab()["</s>"],
    vocab_size=bpe_tokenizer.vocab_size,
    max_position_embeddings=max_len + 10,
    num_attention_heads=num_attention_heads,
    num_hidden_layers=num_hidden_layers,
    type_vocab_size=1,
)
model = LongformerForMaskedLM(config=config)
_pretty_print(f"Number of model parameters : {model.num_parameters():,}")

model_path = os.path.join(output_path, "lm")
training_args = TrainingArguments(
    output_dir=model_path,
    overwrite_output_dir=True,
    num_train_epochs=epochs,
    per_device_train_batch_size=batch_size,
    per_device_eval_batch_size=val_batch_size,
    evaluation_strategy="steps",
    logging_steps=eval_steps,
    eval_steps=eval_steps,
    save_total_limit=1,
    load_best_model_at_end=True,
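The snippet breaks off inside the `TrainingArguments` call. What typically follows such a setup is an MLM data collator, the tokenized datasets, and a `Trainer`; the continuation below is a hedged sketch of that wiring, in which `tokenizer`, `train_dataset`, and `val_dataset` are hypothetical names rather than variables from the original script.

from transformers import DataCollatorForLanguageModeling, Trainer

# Hypothetical continuation: collator, Trainer, training loop, final save.
data_collator = DataCollatorForLanguageModeling(
    tokenizer=tokenizer,          # assumed: a PreTrainedTokenizerFast wrapping bpe_tokenizer
    mlm=True,
    mlm_probability=0.15,
)

trainer = Trainer(
    model=model,
    args=training_args,
    data_collator=data_collator,
    train_dataset=train_dataset,  # assumed: tokenized training split
    eval_dataset=val_dataset,     # assumed: tokenized validation split
)

trainer.train()
trainer.save_model(model_path)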
    '--per_gpu_train_batch_size', '1',  # 32GB gpu with fp32
    '--gradient_accumulation_steps', '16',
    # '--evaluate_during_training',  # removed to reduce training time
    '--do_train',
    '--do_eval',
])

# train_fn = '/gpfs/scratch/xl3119/capstone/data/Preproc0_clinical_sentences_all_without_number_train_patients.txt'
# val_fn = '/gpfs/scratch/xl3119/capstone/data/Preproc0_clinical_sentences_all_without_number_val_patients.txt'

# these are small files for testing
train_fn = '/gpfs/scratch/xl3119/capstone/data/Preproc0_clinical_sentences_all_without_number_train_patients_token.txt'
val_fn = '/gpfs/scratch/xl3119/capstone/data/Preproc0_clinical_sentences_all_without_number_val_patients_token.txt'

training_args.train_datapath = train_fn
training_args.val_datapath = val_fn

##################### load the longformer trained above from the output directory
# init_config = LongformerConfig.from_json_file('config_files/longformer_base_4096/config.json')
mimic_tokenizer = BertTokenizer.from_pretrained('mimic_tokenizer')
# word_embeddings = np.loadtxt(join('/gpfs/scratch/xl3119/capstone/wd_emb', "word_embedding_matrix.txt"))
longformer_model = LongformerForMaskedLM.from_pretrained(training_args.output_dir)
# longformer_model = use_embeddings_fasttext(longformer_model, word_embeddings)
# longformer_tokenizer = LongformerTokenizer.from_pretrained('allenai/longformer-base-4096')

logger.info('Evaluate Longformer model with mimic tokenizer...')
pretrain_and_evaluate(
    training_args, longformer_model, mimic_tokenizer, train_only=False, eval_only=True, model_path=None
    # model_path=training_args.output_dir  # Local path to the model if the model to train has been instantiated from a local path.
)