    def test_inference_masked_lm_long(self):
        model = LongformerForMaskedLM.from_pretrained(
            "allenai/longformer-base-4096")
        model.to(torch_device)

        # 'Hello world! ' repeated 1000 times
        input_ids = torch.tensor([[0] + [20920, 232, 328, 1437] * 1000 + [2]],
                                 dtype=torch.long,
                                 device=torch_device)  # long input
        input_ids = input_ids.to(torch_device)

        loss, prediction_scores = model(input_ids, labels=input_ids).to_tuple()

        expected_loss = torch.tensor(0.0074, device=torch_device)
        expected_prediction_scores_sum = torch.tensor(-6.1048e08,
                                                      device=torch_device)
        expected_prediction_scores_mean = torch.tensor(-3.0348,
                                                       device=torch_device)

        self.assertTrue(torch.allclose(loss, expected_loss, atol=1e-4))
        self.assertTrue(
            torch.allclose(prediction_scores.sum(),
                           expected_prediction_scores_sum,
                           atol=1e-4))
        self.assertTrue(
            torch.allclose(prediction_scores.mean(),
                           expected_prediction_scores_mean,
                           atol=1e-4))
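A short illustrative sketch (not part of the test above): the hard-coded ids can be reproduced with the Longformer tokenizer, where 0 and 2 are the <s>/</s> special tokens and [20920, 232, 328, 1437] is assumed to be the byte-level BPE encoding of " Hello world! ".

from transformers import LongformerTokenizer

tokenizer = LongformerTokenizer.from_pretrained("allenai/longformer-base-4096")
# The leading space matters for the byte-level BPE; the ids below are illustrative.
body_ids = tokenizer.encode(" Hello world! ", add_special_tokens=False)
print(body_ids)  # expected (assumption): [20920, 232, 328, 1437]
full_ids = [tokenizer.bos_token_id] + body_ids * 1000 + [tokenizer.eos_token_id]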
Example #2
    def __init__(self, config, gpu_list, *args, **params):
        super(Lawformer, self).__init__()
        # config = LongformerConfig.from_pretrained('/mnt/datadisk0/xcj/LegalBert/LegalBert/PLMConfig/roberta-converted-lfm')
        # self.LFM = LongformerForMaskedLM(config)
        self.LFM = LongformerForMaskedLM.from_pretrained(
            '/mnt/datadisk0/xcj/LegalBert/LegalBert/PLMConfig/roberta-converted-lfm'
        )
Example #3
    def __init__(self, *args, **kwargs):
        super().__init__()
        self.save_hyperparameters()

        self.tokenizer = AutoTokenizer.from_pretrained(self.hparams.model_name)

        longformer = LongformerForMaskedLM.from_pretrained(
            self.hparams.model_name,
            num_hidden_layers=8,
            attention_window=[128] * 8,
        )

        self.encoder = longformer.longformer

        self.encoder.embeddings.word_embeddings.weight.requires_grad_(False)

        self.entity_detection = EntityDetectionFactor(
            self.hparams.max_length_span,
            self.hparams.dropout,
            mentions_filename=self.hparams.mentions_filename,
        )

        self.entity_linking = EntityLinkingLSTM(
            self.tokenizer.bos_token_id,
            self.tokenizer.pad_token_id,
            self.tokenizer.eos_token_id,
            self.encoder.embeddings.word_embeddings,
            longformer.lm_head,
            self.hparams.dropout,
        )

        self.micro_f1 = MicroF1()
        self.micro_prec = MicroPrecision()
        self.micro_rec = MicroRecall()

        self.macro_f1 = MacroF1()
        self.macro_prec = MacroPrecision()
        self.macro_rec = MacroRecall()

        self.ed_micro_f1 = MicroF1()
        self.ed_micro_prec = MicroPrecision()
        self.ed_micro_rec = MicroRecall()

        self.ed_macro_f1 = MacroF1()
        self.ed_macro_prec = MacroPrecision()
        self.ed_macro_rec = MacroRecall()
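A minimal check (illustrative; `module` is assumed to be an instance of the LightningModule above) that only the shared word-embedding matrix is frozen by `requires_grad_(False)` while the rest of the encoder stays trainable:

# `module` stands for an instance of the module defined above (assumption).
frozen = [n for n, p in module.encoder.named_parameters() if not p.requires_grad]
assert 'embeddings.word_embeddings.weight' in frozen
n_trainable = sum(p.numel() for p in module.encoder.parameters() if p.requires_grad)
print(f'{len(frozen)} frozen tensors, {n_trainable} trainable encoder parameters')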
Example #4
    def get_output_maskedLM(self, sample_text):

        model_maskedmodel = LongformerForMaskedLM.from_pretrained(
            'allenai/longformer-base-4096')
        #SAMPLE_TEXT = ' '.join(['Hello world! '] * 1000)  # long input document
        input_ids = torch.tensor(self.tokenizer.encode(sample_text)).unsqueeze(
            0)  # batch of size 1
        attention_mask = None  # default is local attention everywhere, which is a good choice for MaskedLM
        # check ``LongformerModel.forward`` for more details how to set `attention_mask`
        outputs = model_maskedmodel(input_ids,
                                    attention_mask=attention_mask,
                                    labels=input_ids,
                                    output_hidden_states=True,
                                    output_attentions=True)
        loss = outputs.loss
        prediction_logits = outputs.logits
        mean_last_hidden = torch.mean(outputs.hidden_states[-1], dim=1)
        return outputs, mean_last_hidden
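As the comment above notes, leaving `attention_mask=None` gives every token local attention only. A hedged, self-contained sketch of granting selected tokens global attention instead, using the `global_attention_mask` argument from the current transformers Longformer API (checkpoint and input text are illustrative):

import torch
from transformers import LongformerForMaskedLM, LongformerTokenizer

tokenizer = LongformerTokenizer.from_pretrained('allenai/longformer-base-4096')
model = LongformerForMaskedLM.from_pretrained('allenai/longformer-base-4096')

input_ids = torch.tensor(tokenizer.encode('Hello world! ' * 100)).unsqueeze(0)
global_attention_mask = torch.zeros_like(input_ids)
global_attention_mask[:, 0] = 1  # global attention on the <s> token only
outputs = model(input_ids,
                global_attention_mask=global_attention_mask,
                labels=input_ids)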
Example #5
    def test_inference_masked_lm(self):
        model = LongformerForMaskedLM.from_pretrained("longformer-base-4096")

        # 'Hello world! ' repeated 1000 times
        input_ids = torch.tensor([[0] + [20920, 232, 328, 1437] * 1000 + [2]
                                  ])  # long input

        loss, prediction_scores = model(input_ids, labels=input_ids).to_tuple()

        expected_loss = torch.tensor(0.0620)
        expected_prediction_scores_sum = torch.tensor(-6.1599e08)
        expected_prediction_scores_mean = torch.tensor(-3.0622)

        self.assertTrue(torch.allclose(loss, expected_loss, atol=1e-4))
        self.assertTrue(
            torch.allclose(prediction_scores.sum(),
                           expected_prediction_scores_sum,
                           atol=1e-4))
        self.assertTrue(
            torch.allclose(prediction_scores.mean(),
                           expected_prediction_scores_mean,
                           atol=1e-4))
Example #6
            '5.0',
            '--per_gpu_eval_batch_size',
            '2',
            '--per_gpu_train_batch_size',
            '1',  # 32GB gpu with fp32
            '--gradient_accumulation_steps',
            '32',
            #'--evaluate_during_training', # this is removed to reduce training time
            '--do_train',
            '--do_eval',
        ])
    train_fn = './Preprocessed_Data/Preproc0_clinical_sentences_all_with_number_train.txt'
    val_fn = './Preprocessed_Data/Preproc0_clinical_sentences_all_with_number_val.txt'
    # these are small files for testing
    #     train_fn = './Preprocessed_Data/test_clinical_sentences_all_with_number_train.txt'
    #     val_fn = './Preprocessed_Data/test_clinical_sentences_all_with_number_val.txt'
    training_args.val_datapath = val_fn
    training_args.train_datapath = train_fn

    ##################### use pretrained longformer from transformers
    longformer_model = LongformerForMaskedLM.from_pretrained(
        'allenai/longformer-base-4096')
    longformer_tokenizer = LongformerTokenizer.from_pretrained(
        'allenai/longformer-base-4096')

    logger.info('Train and eval with Longformer pretrained ...')
    pretrain_and_evaluate(training_args, longformer_model, longformer_tokenizer,
                          eval_only=False, model_path=None)
    # model_path=training_args.output_dir  # local path to the model if the model to train has been instantiated from a local path
Example #7
def main():
    parser = HfArgumentParser(
        (ModelArguments, DataTrainingArguments, TrainingArguments))
    model_args, data_args, training_args = parser.parse_args_into_dataclasses()

    if data_args.eval_data_file is None and training_args.do_eval:
        raise ValueError(
            "Cannot do evaluation without an evaluation data file. Either supply a file to --eval_data_file "
            "or remove the --do_eval argument.")

    if (os.path.exists(training_args.output_dir)
            and os.listdir(training_args.output_dir) and training_args.do_train
            and not training_args.overwrite_output_dir):
        raise ValueError(
            f"Output directory ({training_args.output_dir}) already exists and is not empty. Use --overwrite_output_dir to overcome."
        )

    logging.basicConfig(
        format="%(asctime)s - %(levelname)s - %(name)s -   %(message)s",
        datefmt="%m/%d/%Y %H:%M:%S",
        level=logging.INFO
        if training_args.local_rank in [-1, 0] else logging.WARN,
    )
    logger.warning(
        "Process rank: %s, device: %s, n_gpu: %s, distributed training: %s, 16-bits training: %s",
        training_args.local_rank,
        training_args.device,
        training_args.n_gpu,
        bool(training_args.local_rank != -1),
        training_args.fp16,
    )
    logger.info("Training/evaluation parameters %s", training_args)

    set_seed(training_args.seed)

    model = LongformerForMaskedLM.from_pretrained(
        'allenai/longformer-base-4096')
    tokenizer = LongformerTokenizer.from_pretrained(
        'allenai/longformer-base-4096')
    tokenizer.add_tokens(['<doc-s>'], special_tokens=True)
    tokenizer.add_tokens(['</doc-s>'], special_tokens=True)

    data_args.block_size = 4096

    train_dataset = get_dataset(data_args,
                                tokenizer=tokenizer,
                                local_rank=training_args.local_rank)
    model.resize_token_embeddings(len(tokenizer))
    data_collator = DataCollatorForLanguageModeling(
        tokenizer=tokenizer,
        mlm=data_args.mlm,
        mlm_probability=data_args.mlm_probability,
        globalize_special_tokens=data_args.globalize_special_tokens)

    trainer = Trainer(
        model=model,
        args=training_args,
        data_collator=data_collator,
        train_dataset=train_dataset,
        # eval_dataset=eval_dataset,
        prediction_loss_only=True,
    )

    model_path = (model_args.model_name_or_path
                  if model_args.model_name_or_path is not None
                  and os.path.isdir(model_args.model_name_or_path) else None)
    trainer.train(model_path=model_path)
    if trainer.is_world_master():
        tokenizer.save_pretrained(training_args.output_dir)

    results = {}
    logger.info("*** Evaluate ***")

    eval_output = trainer.evaluate()

    perplexity = math.exp(eval_output["loss"])
    result = {"perplexity": perplexity}

    output_eval_file = os.path.join(training_args.output_dir,
                                    "eval_results_lm.txt")
    with open(output_eval_file, "w") as writer:
        logger.info("***** Eval results *****")
        for key in sorted(result.keys()):
            logger.info("  %s = %s", key, str(result[key]))
            writer.write("%s = %s\n" % (key, str(result[key])))

    results.update(result)

    return results
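An illustrative sketch of the vocabulary bookkeeping done above: `add_tokens` grows the tokenizer, and `resize_token_embeddings` makes the model's input (and tied output) embedding matrix match it. The checkpoint name and printout are assumptions for illustration only:

from transformers import LongformerForMaskedLM, LongformerTokenizer

tokenizer = LongformerTokenizer.from_pretrained('allenai/longformer-base-4096')
model = LongformerForMaskedLM.from_pretrained('allenai/longformer-base-4096')

n_added = tokenizer.add_tokens(['<doc-s>', '</doc-s>'], special_tokens=True)
model.resize_token_embeddings(len(tokenizer))
# The embedding matrix now has one row per tokenizer entry, including the two new document markers.
print(n_added, model.get_input_embeddings().weight.shape)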
Example #8
    '--per_gpu_train_batch_size', '1',  # 32GB gpu with fp32
    '--gradient_accumulation_steps', '16',
    #'--evaluate_during_training', # this is removed to reduce training time
    '--do_train',
    '--do_eval',
    ])
    #train_fn = '/gpfs/scratch/xl3119/capstone/data/Preproc0_clinical_sentences_all_without_number_train_patients.txt'
    #val_fn = '/gpfs/scratch/xl3119/capstone/data/Preproc0_clinical_sentences_all_without_number_val_patients.txt'
    # these are small files for testing
    train_fn = '/gpfs/scratch/xl3119/capstone/data/Preproc0_clinical_sentences_all_without_number_train_patients_token.txt'
    val_fn = '/gpfs/scratch/xl3119/capstone/data/Preproc0_clinical_sentences_all_without_number_val_patients_token.txt'
    training_args.train_datapath = train_fn
    training_args.val_datapath = val_fn

##################### use pretrained longformer from transformers
    #init_config = LongformerConfig.from_json_file('config_files/longformer_base_4096/config.json')
    mimic_tokenizer = BertTokenizer.from_pretrained('mimic_tokenizer')
    #word_embeddings =  np.loadtxt(join('/gpfs/scratch/xl3119/capstone/wd_emb',"word_embedding_matrix.txt"))
    longformer_model = LongformerForMaskedLM.from_pretrained(training_args.output_dir)
    #longformer_model = use_embeddings_fasttext(longformer_model, word_embeddings)
    # longformer_tokenizer = LongformerTokenizer.from_pretrained('allenai/longformer-base-4096')

    logger.info('Evaluate Longformer model with mimic tokenizer...')
    pretrain_and_evaluate(training_args,
                          longformer_model,
                          mimic_tokenizer,
                          train_only=False,
                          eval_only=True,
                          model_path=None)
    # model_path=training_args.output_dir  # local path to the model if the model to train has been instantiated from a local path
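For reference, a minimal sketch (hypothetical path) of the save/reload round trip this snippet relies on: `from_pretrained` accepts any directory previously written by `save_pretrained`, which is how the checkpoint in `training_args.output_dir` can be loaded here.

from transformers import LongformerForMaskedLM

save_dir = './longformer_mimic_checkpoint'  # hypothetical path
longformer_model.save_pretrained(save_dir)  # writes config.json plus the model weights
reloaded = LongformerForMaskedLM.from_pretrained(save_dir)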