Example #1
    def create_and_check_for_masked_lm(
        self, config, input_ids, token_type_ids, input_mask, sequence_labels, token_labels, choice_labels
    ):
        model = RobertaForMaskedLM(config=config)
        model.to(torch_device)
        model.eval()
        result = model(input_ids, attention_mask=input_mask, token_type_ids=token_type_ids, labels=token_labels)
        self.parent.assertEqual(result.logits.shape, (self.batch_size, self.seq_length, self.vocab_size))
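Example #1 follows the Transformers model-tester pattern: self.parent is the enclosing test case and the input tensors come from a helper that builds a tiny random config and inputs. A self-contained version of the same shape check looks roughly like the sketch below (the small config values and random inputs are illustrative assumptions, not taken from the source):

import torch
from transformers import RobertaConfig, RobertaForMaskedLM

# tiny illustrative config; the real tester uses similarly small values
config = RobertaConfig(vocab_size=100, hidden_size=32, num_hidden_layers=2,
                       num_attention_heads=2, intermediate_size=64)
model = RobertaForMaskedLM(config=config).eval()

batch_size, seq_length = 2, 7
input_ids = torch.randint(0, config.vocab_size, (batch_size, seq_length))
labels = torch.randint(0, config.vocab_size, (batch_size, seq_length))

with torch.no_grad():
    result = model(input_ids, labels=labels)
assert result.logits.shape == (batch_size, seq_length, config.vocab_size)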
Example #2
    def create_and_check_roberta_for_masked_lm(self, config, input_ids, token_type_ids, input_mask,
                                               sequence_labels, token_labels, choice_labels):
        model = RobertaForMaskedLM(config=config)
        model.to(torch_device)
        model.eval()
        loss, prediction_scores = model(input_ids, attention_mask=input_mask,
                                        token_type_ids=token_type_ids, masked_lm_labels=token_labels)
        result = {
            "loss": loss,
            "prediction_scores": prediction_scores,
        }
        self.parent.assertListEqual(
            list(result["prediction_scores"].size()),
            [self.batch_size, self.seq_length, self.vocab_size])
        self.check_loss_output(result)
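Example #2 targets an older transformers release: the forward pass returns a plain tuple and the masked-LM targets are passed as masked_lm_labels. In current releases the argument is named labels and the model returns a MaskedLMOutput, so the equivalent call (a sketch reusing the tester's variables) would be:

outputs = model(input_ids, attention_mask=input_mask,
                token_type_ids=token_type_ids, labels=token_labels)
loss, prediction_scores = outputs.loss, outputs.logits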
Example #3
import argparse
import logging

import torch
from torch.utils.data import TensorDataset
from transformers import AutoConfig, AutoTokenizer, RobertaForMaskedLM

# set_seed, text_to_ids_tensor, mask_tokens, and train are helper functions
# defined elsewhere in the same script.

def main():
    parser = argparse.ArgumentParser()
    # model arguments
    parser.add_argument("--model_type", default='roberta', type=str)
    parser.add_argument("--model_name_or_path", default='roberta-base', type=str)
    # data arguments
    parser.add_argument("--output_dir", default="./output", type=str)
    parser.add_argument("--train_data_file", default=None, type=str)
    parser.add_argument("--eval_data_file", default=None, type=str)
    parser.add_argument("--mlm_probability", default=0.15, type=float)
    parser.add_argument("--block_size", default=-1, type=int)
    # training arguments
    parser.add_argument("--per_gpu_train_batch_size", default=8, type=int)
    parser.add_argument("--per_gpu_eval_batch_size", default=8, type=int)
    parser.add_argument("--gradient_accumulation_steps", type=int, default=1)
    parser.add_argument("--learning_rate", default=5e-5, type=float)
    parser.add_argument("--weight_decay", default=0.0, type=float)
    parser.add_argument("--adam_beta1", default=0.9, type=float)
    parser.add_argument("--adam_beta2", default=0.999, type=float)
    parser.add_argument("--adam_epsilon", default=1e-8, type=float)
    parser.add_argument("--max_grad_norm", default=1.0, type=float)
    parser.add_argument("--num_train_epochs", default=1.0, type=float)
    # If > 0: total number of training steps to perform (overrides num_train_epochs).
    parser.add_argument("--max_steps", default=-1, type=int)
    # Linear warmup over warmup_steps.
    parser.add_argument("--warmup_steps", default=0, type=int)
    # Log every X update steps.
    parser.add_argument("--logging_steps", type=int, default=-1)
    # Save a checkpoint every X update steps.
    parser.add_argument("--save_steps", type=int, default=-1)
    parser.add_argument("--seed", type=int, default=42)
    # Drop the last incomplete batch if it is not divisible by the batch size.
    parser.add_argument("--dataloader_drop_last", action="store_true")
    parser.add_argument("--device", type=str, default='cuda')
    args, _ = parser.parse_known_args()

    logger = logging.getLogger(__name__)
    set_seed(args)

    # load config and tokenizer
    config = AutoConfig.from_pretrained(args.model_name_or_path)
    tokenizer = AutoTokenizer.from_pretrained(args.model_name_or_path)

    # build RobertaForMaskedLM from the config (randomly initialized weights; use
    # RobertaForMaskedLM.from_pretrained(args.model_name_or_path) to start from the released checkpoint)
    model = RobertaForMaskedLM(config=config)
    # or resume training from a saved checkpoint:
    # model.load_state_dict(torch.load("../input/biomedical-questionanswer/roberta-base-pretrain-pubmed8252.bin"))
    model.to(args.device)

    # load the data and train the model
    print("loading the PubMed abstract text and building the dataset")
    file_path = '../input/biomedical-questionanswer/abstract.txt'
    input_ids = text_to_ids_tensor(file_path, tokenizer)
    input_ids, labels = mask_tokens(input_ids, tokenizer, args)
    train_dataset = TensorDataset(input_ids, labels)
    print("starting training")
    global_step, tr_loss = train(train_dataset, model, tokenizer, args)

    # save the pretrained weights
    print("saving the model")
    output_path = "roberta-base-pretrain-pubmed.bin"
    torch.save(model.state_dict(), output_path)
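The text_to_ids_tensor, mask_tokens, and train helpers used above are defined elsewhere in the script. mask_tokens typically implements the standard masked-LM corruption scheme from the original BERT recipe: select tokens with probability mlm_probability, replace 80% of them with the mask token and 10% with a random token, leave 10% unchanged, and set the labels of unselected positions to -100 so the loss ignores them. A sketch along those lines, assuming the same signature as the call above:

def mask_tokens(inputs, tokenizer, args):
    labels = inputs.clone()
    # sample positions to mask with probability args.mlm_probability,
    # never masking special tokens
    probability_matrix = torch.full(labels.shape, args.mlm_probability)
    special_tokens_mask = [
        tokenizer.get_special_tokens_mask(val, already_has_special_tokens=True)
        for val in labels.tolist()
    ]
    probability_matrix.masked_fill_(torch.tensor(special_tokens_mask, dtype=torch.bool), value=0.0)
    masked_indices = torch.bernoulli(probability_matrix).bool()
    labels[~masked_indices] = -100  # only compute loss on masked tokens

    # 80% of the selected tokens become the mask token
    indices_replaced = torch.bernoulli(torch.full(labels.shape, 0.8)).bool() & masked_indices
    inputs[indices_replaced] = tokenizer.convert_tokens_to_ids(tokenizer.mask_token)

    # 10% become a random token; the remaining 10% stay unchanged
    indices_random = torch.bernoulli(torch.full(labels.shape, 0.5)).bool() & masked_indices & ~indices_replaced
    random_words = torch.randint(len(tokenizer), labels.shape, dtype=torch.long)
    inputs[indices_random] = random_words[indices_random]

    return inputs, labels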
Example #4
config = RobertaConfig(
    hidden_size=Config['embedding_dim'],
    num_attention_heads=Config['attention_heads'],
    num_hidden_layers=Config['encoder_layers'],
    intermediate_size=Config['intermediate_size'],
    type_vocab_size=Config['type_vocab_size'])

if Config['last_ckpt_path'] is None or Config['last_ckpt_path'] == '':
    model = RobertaForMaskedLM(config)
else:
    model = RobertaForMaskedLM.from_pretrained(Config['last_ckpt_path'],
                                               config=config)

device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
# and move our model over to the selected device
model.to(device)

model.train()  # activate training mode
optim = torch.optim.AdamW(model.parameters(), lr=Config['learning_rate'])

dt_str = train_start_datetime.strftime("D%Y_%m_%d_T%H_%M_%S")
model_folder = os.path.join(Config['model_path'], dt_str)
ckpt_path = os.path.join(model_folder, 'checkpoints')
config_path = os.path.join(model_folder, 'train_config.json')
results_path = os.path.join(model_folder, 'results.json')

start_epoch = Config['start_epoch']
end_epoch = start_epoch + Config['num_epochs']
results = {}

for epoch in range(start_epoch, end_epoch):
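    # Sketch of a typical epoch body for this setup (the loader DataLoader over the
    # tokenized corpus, the loss bookkeeping, and the checkpoint naming below are
    # assumptions, not part of the source):
    epoch_loss = 0.0
    for batch in loader:
        optim.zero_grad()
        input_ids = batch['input_ids'].to(device)
        attention_mask = batch['attention_mask'].to(device)
        labels = batch['labels'].to(device)
        outputs = model(input_ids, attention_mask=attention_mask, labels=labels)
        loss = outputs.loss
        loss.backward()
        optim.step()
        epoch_loss += loss.item()
    results[epoch] = epoch_loss / max(len(loader), 1)
    save_dir = os.path.join(ckpt_path, f'epoch_{epoch}')
    os.makedirs(save_dir, exist_ok=True)
    model.save_pretrained(save_dir)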