Example #1
    def test_inference_masked_lm(self):
        model = LongformerForMaskedLM.from_pretrained("allenai/longformer-base-4096")
        model.to(torch_device)

        # 'Hello world! ' repeated 1000 times
        input_ids = torch.tensor(
            [[0] + [20920, 232, 328, 1437] * 1000 + [2]], dtype=torch.long, device=torch_device
        )  # long input

        loss, prediction_scores = model(input_ids, labels=input_ids)
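        # passing labels makes the model compute the masked-LM loss alongside the prediction scores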

        expected_loss = torch.tensor(0.0620, device=torch_device)
        expected_prediction_scores_sum = torch.tensor(-6.1599e08, device=torch_device)
        expected_prediction_scores_mean = torch.tensor(-3.0622, device=torch_device)
        input_ids = input_ids.to(torch_device)

        self.assertTrue(torch.allclose(loss, expected_loss, atol=1e-4))
        self.assertTrue(torch.allclose(prediction_scores.sum(), expected_prediction_scores_sum, atol=1e-4))
        self.assertTrue(torch.allclose(prediction_scores.mean(), expected_prediction_scores_mean, atol=1e-4))
Example #2

    def create_and_check_longformer_for_masked_lm(
        self, config, input_ids, token_type_ids, input_mask, sequence_labels, token_labels, choice_labels
    ):
        model = LongformerForMaskedLM(config=config)
        model.to(torch_device)
        model.eval()
        result = model(
            input_ids, attention_mask=input_mask, token_type_ids=token_type_ids, labels=token_labels
        )
        self.parent.assertEqual(
            result.logits.shape, (self.batch_size, self.seq_length, self.vocab_size)
        )
Example #3

    def create_and_check_longformer_for_masked_lm(
        self, config, input_ids, token_type_ids, input_mask, sequence_labels, token_labels, choice_labels
    ):
        model = LongformerForMaskedLM(config=config)
        model.to(torch_device)
        model.eval()
        # `masked_lm_labels` was the keyword used by older transformers releases; newer versions use `labels`
        loss, prediction_scores = model(
            input_ids, attention_mask=input_mask, token_type_ids=token_type_ids, masked_lm_labels=token_labels
        )
        result = {
            "loss": loss,
            "prediction_scores": prediction_scores,
        }
        self.parent.assertListEqual(
            list(result["prediction_scores"].size()), [self.batch_size, self.seq_length, self.vocab_size]
        )
        self.check_loss_output(result)
Example #4

            '5.0',
            '--per_gpu_eval_batch_size',
            '2',
            '--per_gpu_train_batch_size',
            '1',  # 32GB gpu with fp32
            '--gradient_accumulation_steps',
            '32',
            #'--evaluate_during_training', # this is removed to reduce training time
            '--do_train',
            '--do_eval',
        ])
    train_fn = './Preprocessed_Data/Preproc0_clinical_sentences_all_with_number_train.txt'
    val_fn = './Preprocessed_Data/Preproc0_clinical_sentences_all_with_number_val.txt'
    # these are small files for testing
    #     train_fn = './Preprocessed_Data/test_clinical_sentences_all_with_number_train.txt'
    #     val_fn = './Preprocessed_Data/test_clinical_sentences_all_with_number_val.txt'
    training_args.val_datapath = val_fn
    training_args.train_datapath = train_fn

    ##################### use pretrained Longformer from transformers
    longformer_model = LongformerForMaskedLM.from_pretrained(
        'allenai/longformer-base-4096')
    longformer_tokenizer = LongformerTokenizer.from_pretrained(
        'allenai/longformer-base-4096')
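    # both model and tokenizer start from the public allenai/longformer-base-4096 checkpoint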

    logger.info('Train and eval with Longformer pretrained ...')
    pretrain_and_evaluate(training_args, longformer_model, longformer_tokenizer, eval_only=False, model_path=None)
    # model_path=training_args.output_dir  # local path to the model if it has been instantiated from a local path
Example #5
assert max_pos > current_max_pos

new_pos_embed = roberta.bert.embeddings.position_embeddings.weight.new_empty(max_pos, embed_size)
# copy position embeddings over and over to initialize the new position embeddings
k = 2
step = current_max_pos - 2
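# RoBERTa-style embeddings reserve positions 0 and 1 for padding/special use, so copying starts at index 2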
while k < max_pos - 1:
    if k + step >= max_pos:
        new_pos_embed[k:] = roberta.bert.embeddings.position_embeddings.weight[2:(max_pos + 2 - k)]
    else:
        new_pos_embed[k:(k + step)] = roberta.bert.embeddings.position_embeddings.weight[2:]
    k += step
roberta.bert.embeddings.position_embeddings.weight.data = new_pos_embed
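# also extend the position_ids buffer to cover the new maximum length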
roberta.bert.embeddings.position_ids.data = torch.arange(max_pos).reshape(1, max_pos)

# add global attention: every layer gets the same local attention window,
# and the global q/k/v projections start as copies of the local ones
config.attention_window = [attention_window] * config.num_hidden_layers
for i in range(len(roberta.bert.encoder.layer)):
    roberta.bert.encoder.layer[i].attention.self.query_global = copy.deepcopy(roberta.bert.encoder.layer[i].attention.self.query)
    roberta.bert.encoder.layer[i].attention.self.key_global = copy.deepcopy(roberta.bert.encoder.layer[i].attention.self.key)
    roberta.bert.encoder.layer[i].attention.self.value_global = copy.deepcopy(roberta.bert.encoder.layer[i].attention.self.value)

lfm = LongformerForMaskedLM(config)
lfm.longformer.load_state_dict(roberta.bert.state_dict())
lfm.lm_head.dense.load_state_dict(roberta.cls.predictions.transform.dense.state_dict())
lfm.lm_head.layer_norm.load_state_dict(roberta.cls.predictions.transform.LayerNorm.state_dict())
lfm.lm_head.decoder.load_state_dict(roberta.cls.predictions.decoder.state_dict())
lfm.lm_head.bias = copy.deepcopy(roberta.cls.predictions.bias)

lfm.save_pretrained('PLMConfig/roberta-converted-lfm')
tokenizer.save_pretrained('PLMConfig/roberta-converted-lfm')
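# the converted checkpoint can later be reloaded with
# LongformerForMaskedLM.from_pretrained('PLMConfig/roberta-converted-lfm')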
Example #6
            '1',  # 32GB gpu with fp32
            '--gradient_accumulation_steps',
            '4',
            #'--evaluate_during_training', # this is removed to reduce training time
            '--do_train',
        ])
    #train_fn = '/gpfs/scratch/xl3119/capstone/data/Preproc0_clinical_sentences_all_without_number_train_patients.txt'
    #val_fn = '/gpfs/scratch/xl3119/capstone/data/Preproc0_clinical_sentences_all_without_number_val_patients.txt'
    # these are small files for testing
    train_fn = '/scratch/xl3119/capstone/data/sample/sample.txt'
    val_fn = '/scratch/xl3119/capstone/data/sample/sample.txt'
    training_args.train_datapath = train_fn
    training_args.val_datapath = val_fn

    ##################### use pretrained Longformer from transformers
    init_config = LongformerConfig.from_json_file(
        'config_files/longformer_base_4096/config.json')
    mimic_tokenizer = BertTokenizer.from_pretrained('mimic_tokenizer')
    word_embeddings = np.loadtxt(
        join('/scratch/xl3119/capstone/wd_emb', "word_embedding_matrix.txt"))
    longformer_model = LongformerForMaskedLM(init_config)
    longformer_model = use_embeddings_fasttext(longformer_model,
                                               word_embeddings)
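    # use_embeddings_fasttext (defined elsewhere in this project) replaces the model's word embeddings
    # with the fastText matrix loaded above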
    # longformer_tokenizer = LongformerTokenizer.from_pretrained('allenai/longformer-base-4096')

    logger.info('Train and eval with Longformer pretrained ...')
    pretrain_and_evaluate(training_args, longformer_model, mimic_tokenizer, train_only=True, eval_only=False, model_path=None)
    # model_path=training_args.output_dir  # local path to the model if it has been instantiated from a local path
Example #7
def main():
    parser = HfArgumentParser(
        (ModelArguments, DataTrainingArguments, TrainingArguments))
    model_args, data_args, training_args = parser.parse_args_into_dataclasses()

    if data_args.eval_data_file is None and training_args.do_eval:
        raise ValueError(
            "Cannot do evaluation without an evaluation data file. Either supply a file to --eval_data_file "
            "or remove the --do_eval argument.")

    if (os.path.exists(training_args.output_dir)
            and os.listdir(training_args.output_dir) and training_args.do_train
            and not training_args.overwrite_output_dir):
        raise ValueError(
            f"Output directory ({training_args.output_dir}) already exists and is not empty. Use --overwrite_output_dir to overcome."
        )

    logging.basicConfig(
        format="%(asctime)s - %(levelname)s - %(name)s -   %(message)s",
        datefmt="%m/%d/%Y %H:%M:%S",
        level=logging.INFO
        if training_args.local_rank in [-1, 0] else logging.WARN,
    )
    logger.warning(
        "Process rank: %s, device: %s, n_gpu: %s, distributed training: %s, 16-bits training: %s",
        training_args.local_rank,
        training_args.device,
        training_args.n_gpu,
        bool(training_args.local_rank != -1),
        training_args.fp16,
    )
    logger.info("Training/evaluation parameters %s", training_args)

    set_seed(training_args.seed)

    model = LongformerForMaskedLM.from_pretrained(
        'allenai/longformer-base-4096')
    tokenizer = LongformerTokenizer.from_pretrained(
        'allenai/longformer-base-4096')
    tokenizer.add_tokens(['<doc-s>'], special_tokens=True)
    tokenizer.add_tokens(['</doc-s>'], special_tokens=True)
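    # the added tokens act as document-boundary markers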

    data_args.block_size = 4096

    train_dataset = get_dataset(data_args,
                                tokenizer=tokenizer,
                                local_rank=training_args.local_rank)
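    # grow the embedding matrix so the two newly added special tokens get embedding rows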
    model.resize_token_embeddings(len(tokenizer))
    data_collator = DataCollatorForLanguageModeling(
        tokenizer=tokenizer,
        mlm=data_args.mlm,
        mlm_probability=data_args.mlm_probability,
        globalize_special_tokens=data_args.globalize_special_tokens)
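    # globalize_special_tokens is not an argument of the stock transformers collator; it comes from this
    # project's fork, presumably to flag the added special tokens for Longformer's global attention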

    trainer = Trainer(
        model=model,
        args=training_args,
        data_collator=data_collator,
        train_dataset=train_dataset,
        # eval_dataset=eval_dataset,
        prediction_loss_only=True,
    )

    model_path = (model_args.model_name_or_path
                  if model_args.model_name_or_path is not None
                  and os.path.isdir(model_args.model_name_or_path) else None)
    trainer.train(model_path=model_path)
    if trainer.is_world_master():
        tokenizer.save_pretrained(training_args.output_dir)

    results = {}
    logger.info("*** Evaluate ***")

    eval_output = trainer.evaluate()

    perplexity = math.exp(eval_output["loss"])
    result = {"perplexity": perplexity}

    output_eval_file = os.path.join(training_args.output_dir,
                                    "eval_results_lm.txt")
    with open(output_eval_file, "w") as writer:
        logger.info("***** Eval results *****")
        for key in sorted(result.keys()):
            logger.info("  %s = %s", key, str(result[key]))
            writer.write("%s = %s\n" % (key, str(result[key])))

    results.update(result)

    return results
Example #8

    # create model
    config = LongformerConfig(
        attention_window=attention_window,
        sep_token_id=bpe_tokenizer.get_vocab()["</s>"],
        pad_token_id=bpe_tokenizer.get_vocab()["<pad>"],
        bos_token_id=bpe_tokenizer.get_vocab()["<s>"], 
        eos_token_id=bpe_tokenizer.get_vocab()["</s>"],
        vocab_size=bpe_tokenizer.vocab_size,
        max_position_embeddings=max_len+10,
        num_attention_heads=num_attention_heads,
        num_hidden_layers=num_hidden_layers,
        type_vocab_size=1
    )
    
    model = LongformerForMaskedLM(config=config)
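    # built from a fresh config, so the weights are randomly initialized rather than pretrained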

    _pretty_print(f"Number of model parameters : {model.num_parameters():,}")

    model_path = os.path.join(output_path, "lm")
    training_args = TrainingArguments(
        output_dir=model_path,
        overwrite_output_dir=True,
        num_train_epochs=epochs,
        per_device_train_batch_size=batch_size,
        per_device_eval_batch_size=val_batch_size,
        evaluation_strategy="steps",
        logging_steps=eval_steps,
        eval_steps=eval_steps,
        save_total_limit=1,
        load_best_model_at_end=True,
Example #9
    '--per_gpu_train_batch_size', '1',  # 32GB gpu with fp32
    '--gradient_accumulation_steps', '16',
    #'--evaluate_during_training', # this is removed to reduce training time
    '--do_train',
    '--do_eval',
    ])
    #train_fn = '/gpfs/scratch/xl3119/capstone/data/Preproc0_clinical_sentences_all_without_number_train_patients.txt'
    #val_fn = '/gpfs/scratch/xl3119/capstone/data/Preproc0_clinical_sentences_all_without_number_val_patients.txt'
    # these are small files for testing
    train_fn = '/gpfs/scratch/xl3119/capstone/data/Preproc0_clinical_sentences_all_without_number_train_patients_token.txt'
    val_fn = '/gpfs/scratch/xl3119/capstone/data/Preproc0_clinical_sentences_all_without_number_val_patients_token.txt'
    training_args.train_datapath = train_fn
    training_args.val_datapath = val_fn

##################### use pretrained Longformer from transformers
    #init_config = LongformerConfig.from_json_file('config_files/longformer_base_4096/config.json')
    mimic_tokenizer = BertTokenizer.from_pretrained('mimic_tokenizer')
    #word_embeddings =  np.loadtxt(join('/gpfs/scratch/xl3119/capstone/wd_emb',"word_embedding_matrix.txt"))
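    # reload the checkpoint saved in output_dir; this run only evaluates it (eval_only=True below)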
    longformer_model = LongformerForMaskedLM.from_pretrained(training_args.output_dir)
    #longformer_model = use_embeddings_fasttext(longformer_model, word_embeddings)
    # longformer_tokenizer = LongformerTokenizer.from_pretrained('allenai/longformer-base-4096')

    logger.info('Evaluate Longformer model with mimic tokenizer...')
    pretrain_and_evaluate(training_args,
                          longformer_model,
                          mimic_tokenizer,
                          train_only=False,
                          eval_only=True,
                          model_path=None)
    # model_path=training_args.output_dir  # local path to the model if it has been instantiated from a local path