def test_inference_masked_lm_long(self):
    model = LongformerForMaskedLM.from_pretrained("allenai/longformer-base-4096")
    model.to(torch_device)

    # 'Hello world! ' repeated 1000 times
    input_ids = torch.tensor(
        [[0] + [20920, 232, 328, 1437] * 1000 + [2]], dtype=torch.long, device=torch_device
    )  # long input
    input_ids = input_ids.to(torch_device)

    loss, prediction_scores = model(input_ids, labels=input_ids).to_tuple()

    expected_loss = torch.tensor(0.0074, device=torch_device)
    expected_prediction_scores_sum = torch.tensor(-6.1048e08, device=torch_device)
    expected_prediction_scores_mean = torch.tensor(-3.0348, device=torch_device)

    self.assertTrue(torch.allclose(loss, expected_loss, atol=1e-4))
    self.assertTrue(torch.allclose(prediction_scores.sum(), expected_prediction_scores_sum, atol=1e-4))
    self.assertTrue(torch.allclose(prediction_scores.mean(), expected_prediction_scores_mean, atol=1e-4))

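# The test above pins the MLM loss and logit statistics of a 4002-token input
# to hard-coded reference values. Below is a minimal, hedged sketch of using
# the same public checkpoint outside a test harness to fill a single <mask>
# token; the example sentence is illustrative and not taken from the test suite.
import torch
from transformers import LongformerForMaskedLM, LongformerTokenizer

tokenizer = LongformerTokenizer.from_pretrained("allenai/longformer-base-4096")
model = LongformerForMaskedLM.from_pretrained("allenai/longformer-base-4096")
model.eval()

text = "Longformer combines local and global attention to process <mask> documents."
inputs = tokenizer(text, return_tensors="pt")

with torch.no_grad():
    logits = model(**inputs).logits  # shape: (batch, seq_len, vocab_size)

# locate the <mask> position and print the five most likely replacement tokens
mask_positions = (inputs["input_ids"] == tokenizer.mask_token_id).nonzero(as_tuple=True)[1]
top_ids = logits[0, mask_positions[0]].topk(5).indices
print(tokenizer.convert_ids_to_tokens(top_ids.tolist()))
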
def __init__(self, config, gpu_list, *args, **params):
    super(Lawformer, self).__init__()
    # config = LongformerConfig.from_pretrained('/mnt/datadisk0/xcj/LegalBert/LegalBert/PLMConfig/roberta-converted-lfm')
    # self.LFM = LongformerForMaskedLM(config)
    self.LFM = LongformerForMaskedLM.from_pretrained(
        '/mnt/datadisk0/xcj/LegalBert/LegalBert/PLMConfig/roberta-converted-lfm'
    )

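# Hedged note on the constructor above: it loads a locally converted
# RoBERTa-to-Longformer checkpoint from a machine-specific path, which cannot
# be reproduced elsewhere. The released Lawformer weights are reportedly also
# published on the Hugging Face hub under the id 'thunlp/Lawformer' (treat
# that id as an assumption, not something confirmed by these snippets);
# loading from the hub would look the same, e.g.:
#
#     from transformers import AutoModel, AutoTokenizer
#     tokenizer = AutoTokenizer.from_pretrained("thunlp/Lawformer")
#     model = AutoModel.from_pretrained("thunlp/Lawformer")
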
def __init__(self, *args, **kwargs):
    super().__init__()
    self.save_hyperparameters()
    self.tokenizer = AutoTokenizer.from_pretrained(self.hparams.model_name)
    longformer = LongformerForMaskedLM.from_pretrained(
        self.hparams.model_name,
        num_hidden_layers=8,
        attention_window=[128] * 8,
    )
    self.encoder = longformer.longformer
    self.encoder.embeddings.word_embeddings.weight.requires_grad_(False)
    self.entity_detection = EntityDetectionFactor(
        self.hparams.max_length_span,
        self.hparams.dropout,
        mentions_filename=self.hparams.mentions_filename,
    )
    self.entity_linking = EntityLinkingLSTM(
        self.tokenizer.bos_token_id,
        self.tokenizer.pad_token_id,
        self.tokenizer.eos_token_id,
        self.encoder.embeddings.word_embeddings,
        longformer.lm_head,
        self.hparams.dropout,
    )
    self.micro_f1 = MicroF1()
    self.micro_prec = MicroPrecision()
    self.micro_rec = MicroRecall()
    self.macro_f1 = MacroF1()
    self.macro_prec = MacroPrecision()
    self.macro_rec = MacroRecall()
    self.ed_micro_f1 = MicroF1()
    self.ed_micro_prec = MicroPrecision()
    self.ed_micro_rec = MicroRecall()
    self.ed_macro_f1 = MacroF1()
    self.ed_macro_prec = MacroPrecision()
    self.ed_macro_rec = MacroRecall()

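# A small, hedged sketch of the config-override behaviour relied on above,
# using the public 'allenai/longformer-base-4096' checkpoint as a stand-in for
# self.hparams.model_name (an assumption). Keyword arguments passed to
# from_pretrained that match LongformerConfig fields override the stored
# config, so only the first 8 encoder layers are instantiated (weights for the
# remaining layers are simply not loaded) and each kept layer uses a local
# attention window of 128 tokens.
from transformers import LongformerForMaskedLM

longformer = LongformerForMaskedLM.from_pretrained(
    "allenai/longformer-base-4096",
    num_hidden_layers=8,
    attention_window=[128] * 8,
)
print(longformer.config.num_hidden_layers)  # 8
print(longformer.config.attention_window)   # [128, 128, 128, 128, 128, 128, 128, 128]

# The wrapped encoder (without the MLM head) is what the module above keeps:
encoder = longformer.longformer
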
def get_output_maskedLM(self, sample_text):
    model_maskedmodel = LongformerForMaskedLM.from_pretrained('allenai/longformer-base-4096')
    # SAMPLE_TEXT = ' '.join(['Hello world! '] * 1000)  # long input document
    input_ids = torch.tensor(self.tokenizer.encode(sample_text)).unsqueeze(0)  # batch of size 1
    attention_mask = None  # default is local attention everywhere, which is a good choice for MaskedLM
    # check ``LongformerModel.forward`` for more details on how to set `attention_mask`
    outputs = model_maskedmodel(input_ids,
                                attention_mask=attention_mask,
                                labels=input_ids,
                                output_hidden_states=True,
                                output_attentions=True)
    loss = outputs.loss
    prediction_logits = outputs.logits
    mean_last_hidden = torch.mean(outputs.hidden_states[-1], dim=1)
    return outputs, mean_last_hidden

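# Hedged sketch of the alternative hinted at in the comment above: in recent
# transformers releases, Longformer's forward also accepts a separate
# `global_attention_mask` (1 = global attention, 0 = local only). Putting
# global attention on the first (<s>) token is the usual choice for
# sequence-level tasks, while pure local attention is fine for masked-LM
# scoring. The checkpoint and example text here are illustrative only.
import torch
from transformers import LongformerForMaskedLM, LongformerTokenizer

tokenizer = LongformerTokenizer.from_pretrained("allenai/longformer-base-4096")
model = LongformerForMaskedLM.from_pretrained("allenai/longformer-base-4096")

input_ids = torch.tensor(tokenizer.encode("A long clinical note ...")).unsqueeze(0)
global_attention_mask = torch.zeros_like(input_ids)
global_attention_mask[:, 0] = 1  # global attention on the <s> token only

outputs = model(input_ids,
                global_attention_mask=global_attention_mask,
                labels=input_ids,
                output_hidden_states=True)
print(outputs.loss, outputs.hidden_states[-1].shape)
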
def test_inference_masked_lm(self):
    model = LongformerForMaskedLM.from_pretrained("longformer-base-4096")

    # 'Hello world! ' repeated 1000 times
    input_ids = torch.tensor([[0] + [20920, 232, 328, 1437] * 1000 + [2]])  # long input

    loss, prediction_scores = model(input_ids, masked_lm_labels=input_ids)

    expected_loss = torch.tensor(0.0620)
    expected_prediction_scores_sum = torch.tensor(-6.1599e08)
    expected_prediction_scores_mean = torch.tensor(-3.0622)

    self.assertTrue(torch.allclose(loss, expected_loss, atol=1e-4))
    self.assertTrue(torch.allclose(prediction_scores.sum(), expected_prediction_scores_sum, atol=1e-4))
    self.assertTrue(torch.allclose(prediction_scores.mean(), expected_prediction_scores_mean, atol=1e-4))

    '5.0',
    '--per_gpu_eval_batch_size', '2',
    '--per_gpu_train_batch_size', '1',  # 32GB GPU with fp32
    '--gradient_accumulation_steps', '32',
    # '--evaluate_during_training',  # removed to reduce training time
    '--do_train',
    '--do_eval',
])

train_fn = './Preprocessed_Data/Preproc0_clinical_sentences_all_with_number_train.txt'
val_fn = './Preprocessed_Data/Preproc0_clinical_sentences_all_with_number_val.txt'
# these are small files for testing
# train_fn = './Preprocessed_Data/test_clinical_sentences_all_with_number_train.txt'
# val_fn = './Preprocessed_Data/test_clinical_sentences_all_with_number_val.txt'

training_args.val_datapath = val_fn
training_args.train_datapath = train_fn

##################### use pretrained Longformer from transformers
longformer_model = LongformerForMaskedLM.from_pretrained('allenai/longformer-base-4096')
longformer_tokenizer = LongformerTokenizer.from_pretrained('allenai/longformer-base-4096')

logger.info('Train and eval with Longformer pretrained ...')
pretrain_and_evaluate(training_args,
                      longformer_model,
                      longformer_tokenizer,
                      eval_only=False,
                      model_path=None,
                      # model_path=training_args.output_dir  # local path to the model if the model to train has been instantiated from a local path
                      )

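# The fragment above starts in the middle of an argument list. Below is a
# hedged reconstruction of how such a list is typically consumed: the leading
# '5.0' is presumably the value of an earlier flag such as --num_train_epochs,
# and 'tmp/eval-output' is a placeholder output directory (both assumptions,
# not taken from the original script). HfArgumentParser can parse a list of
# CLI-style strings instead of sys.argv, which is convenient inside notebooks.
from transformers import HfArgumentParser, TrainingArguments

parser = HfArgumentParser(TrainingArguments)
training_args, = parser.parse_args_into_dataclasses([
    '--output_dir', 'tmp/eval-output',
    '--num_train_epochs', '5.0',
    '--per_gpu_eval_batch_size', '2',
    '--per_gpu_train_batch_size', '1',
    '--gradient_accumulation_steps', '32',
    '--do_train',
    '--do_eval',
])
print(training_args.per_gpu_train_batch_size, training_args.gradient_accumulation_steps)
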
def main():
    parser = HfArgumentParser((ModelArguments, DataTrainingArguments, TrainingArguments))
    model_args, data_args, training_args = parser.parse_args_into_dataclasses()

    if data_args.eval_data_file is None and training_args.do_eval:
        raise ValueError(
            "Cannot do evaluation without an evaluation data file. Either supply a file to --eval_data_file "
            "or remove the --do_eval argument."
        )
    if (os.path.exists(training_args.output_dir)
            and os.listdir(training_args.output_dir)
            and training_args.do_train
            and not training_args.overwrite_output_dir):
        raise ValueError(
            f"Output directory ({training_args.output_dir}) already exists and is not empty. Use --overwrite_output_dir to overcome."
        )

    logging.basicConfig(
        format="%(asctime)s - %(levelname)s - %(name)s - %(message)s",
        datefmt="%m/%d/%Y %H:%M:%S",
        level=logging.INFO if training_args.local_rank in [-1, 0] else logging.WARN,
    )
    logger.warning(
        "Process rank: %s, device: %s, n_gpu: %s, distributed training: %s, 16-bits training: %s",
        training_args.local_rank,
        training_args.device,
        training_args.n_gpu,
        bool(training_args.local_rank != -1),
        training_args.fp16,
    )
    logger.info("Training/evaluation parameters %s", training_args)

    set_seed(training_args.seed)

    model = LongformerForMaskedLM.from_pretrained('allenai/longformer-base-4096')
    tokenizer = LongformerTokenizer.from_pretrained('allenai/longformer-base-4096')
    tokenizer.add_tokens(['<doc-s>'], special_tokens=True)
    tokenizer.add_tokens(['</doc-s>'], special_tokens=True)

    data_args.block_size = 4096
    train_dataset = get_dataset(data_args, tokenizer=tokenizer, local_rank=training_args.local_rank)

    model.resize_token_embeddings(len(tokenizer))

    data_collator = DataCollatorForLanguageModeling(
        tokenizer=tokenizer,
        mlm=data_args.mlm,
        mlm_probability=data_args.mlm_probability,
        globalize_special_tokens=data_args.globalize_special_tokens,
    )

    trainer = Trainer(
        model=model,
        args=training_args,
        data_collator=data_collator,
        train_dataset=train_dataset,
        # eval_dataset=eval_dataset,
        prediction_loss_only=True,
    )

    model_path = (model_args.model_name_or_path
                  if model_args.model_name_or_path is not None
                  and os.path.isdir(model_args.model_name_or_path)
                  else None)
    trainer.train(model_path=model_path)

    if trainer.is_world_master():
        tokenizer.save_pretrained(training_args.output_dir)

    results = {}
    logger.info("*** Evaluate ***")
    eval_output = trainer.evaluate()
    perplexity = math.exp(eval_output["loss"])
    result = {"perplexity": perplexity}

    output_eval_file = os.path.join(training_args.output_dir, "eval_results_lm.txt")
    with open(output_eval_file, "w") as writer:
        logger.info("***** Eval results *****")
        for key in sorted(result.keys()):
            logger.info(" %s = %s", key, str(result[key]))
            writer.write("%s = %s\n" % (key, str(result[key])))

    results.update(result)
    return results

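# Self-contained sketch of the vocabulary-extension step used in main() above.
# The <doc-s> / </doc-s> markers are extra document-boundary tokens. Note that
# the `globalize_special_tokens` argument on the collator appears to belong to
# a customized DataCollatorForLanguageModeling in this project; it is not a
# stock transformers parameter (assumption based on the public API).
from transformers import LongformerForMaskedLM, LongformerTokenizer

tokenizer = LongformerTokenizer.from_pretrained('allenai/longformer-base-4096')
model = LongformerForMaskedLM.from_pretrained('allenai/longformer-base-4096')

old_vocab_size = len(tokenizer)
tokenizer.add_tokens(['<doc-s>'], special_tokens=True)
tokenizer.add_tokens(['</doc-s>'], special_tokens=True)

# resize_token_embeddings appends freshly initialized rows to the (tied)
# input/output embedding matrices; the new rows are then learned during MLM
# pretraining.
model.resize_token_embeddings(len(tokenizer))
assert model.get_input_embeddings().num_embeddings == old_vocab_size + 2
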
    '--per_gpu_train_batch_size', '1',  # 32GB GPU with fp32
    '--gradient_accumulation_steps', '16',
    # '--evaluate_during_training',  # removed to reduce training time
    '--do_train',
    '--do_eval',
])

# train_fn = '/gpfs/scratch/xl3119/capstone/data/Preproc0_clinical_sentences_all_without_number_train_patients.txt'
# val_fn = '/gpfs/scratch/xl3119/capstone/data/Preproc0_clinical_sentences_all_without_number_val_patients.txt'
# these are small files for testing
train_fn = '/gpfs/scratch/xl3119/capstone/data/Preproc0_clinical_sentences_all_without_number_train_patients_token.txt'
val_fn = '/gpfs/scratch/xl3119/capstone/data/Preproc0_clinical_sentences_all_without_number_val_patients_token.txt'

training_args.train_datapath = train_fn
training_args.val_datapath = val_fn

##################### use pretrained Longformer from transformers
# init_config = LongformerConfig.from_json_file('config_files/longformer_base_4096/config.json')
mimic_tokenizer = BertTokenizer.from_pretrained('mimic_tokenizer')
# word_embeddings = np.loadtxt(join('/gpfs/scratch/xl3119/capstone/wd_emb', "word_embedding_matrix.txt"))
longformer_model = LongformerForMaskedLM.from_pretrained(training_args.output_dir)
# longformer_model = use_embeddings_fasttext(longformer_model, word_embeddings)
# longformer_tokenizer = LongformerTokenizer.from_pretrained('allenai/longformer-base-4096')

logger.info('Evaluate Longformer model with mimic tokenizer...')
pretrain_and_evaluate(training_args,
                      longformer_model,
                      mimic_tokenizer,
                      train_only=False,
                      eval_only=True,
                      model_path=None)
# model_path=training_args.output_dir  # local path to the model if the model to train has been instantiated from a local path

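# pretrain_and_evaluate is a project-specific helper that never appears in
# these snippets. Below is a hedged sketch of what such a helper typically
# looks like when built from stock transformers components of the same era as
# this code (TextDataset, DataCollatorForLanguageModeling, Trainer); the real
# implementation, including the exact meaning of train_only, may differ.
import logging

from transformers import DataCollatorForLanguageModeling, TextDataset, Trainer

logger = logging.getLogger(__name__)


def pretrain_and_evaluate(args, model, tokenizer, train_only=False, eval_only=False, model_path=None):
    # masked-LM datasets built from the plain-text files referenced above
    val_dataset = TextDataset(tokenizer=tokenizer,
                              file_path=args.val_datapath,
                              block_size=tokenizer.model_max_length)
    if eval_only:
        train_dataset = val_dataset
    else:
        train_dataset = TextDataset(tokenizer=tokenizer,
                                    file_path=args.train_datapath,
                                    block_size=tokenizer.model_max_length)

    data_collator = DataCollatorForLanguageModeling(tokenizer=tokenizer,
                                                    mlm=True,
                                                    mlm_probability=0.15)
    trainer = Trainer(model=model,
                      args=args,
                      data_collator=data_collator,
                      train_dataset=train_dataset,
                      eval_dataset=val_dataset,
                      prediction_loss_only=True)

    if not train_only:
        eval_loss = trainer.evaluate()['eval_loss']
        logger.info('Eval loss: %s', eval_loss)

    if not eval_only:
        trainer.train(model_path=model_path)  # transformers 3.x-style resume argument
        trainer.save_model()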