def create_and_check_albert_for_multiple_choice(
    self, config, input_ids, token_type_ids, input_mask,
    sequence_labels, token_labels, choice_labels
):
    config.num_choices = self.num_choices
    model = AlbertForMultipleChoice(config=config)
    model.to(torch_device)
    model.eval()
    multiple_choice_inputs_ids = input_ids.unsqueeze(1).expand(-1, self.num_choices, -1).contiguous()
    multiple_choice_token_type_ids = token_type_ids.unsqueeze(1).expand(-1, self.num_choices, -1).contiguous()
    multiple_choice_input_mask = input_mask.unsqueeze(1).expand(-1, self.num_choices, -1).contiguous()
    loss, logits = model(
        multiple_choice_inputs_ids,
        attention_mask=multiple_choice_input_mask,
        token_type_ids=multiple_choice_token_type_ids,
        labels=choice_labels,
    )
    result = {
        "loss": loss,
        "logits": logits,
    }
    self.parent.assertListEqual(list(result["logits"].size()), [self.batch_size, self.num_choices])
    self.check_loss_output(result)

 def create_and_check_for_multiple_choice(
     self, config, input_ids, token_type_ids, input_mask, sequence_labels, token_labels, choice_labels
 ):
     config.num_choices = self.num_choices
     model = AlbertForMultipleChoice(config=config)
     model.to(torch_device)
     model.eval()
     multiple_choice_inputs_ids = input_ids.unsqueeze(1).expand(-1, self.num_choices, -1).contiguous()
     multiple_choice_token_type_ids = token_type_ids.unsqueeze(1).expand(-1, self.num_choices, -1).contiguous()
     multiple_choice_input_mask = input_mask.unsqueeze(1).expand(-1, self.num_choices, -1).contiguous()
     result = model(
         multiple_choice_inputs_ids,
         attention_mask=multiple_choice_input_mask,
         token_type_ids=multiple_choice_token_type_ids,
         labels=choice_labels,
     )
     self.parent.assertEqual(result.logits.shape, (self.batch_size, self.num_choices))
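
Both helpers above exercise the same input contract: AlbertForMultipleChoice takes input_ids, attention_mask and token_type_ids of shape (batch_size, num_choices, seq_len) and labels of shape (batch_size,) holding the index of the correct choice, and returns logits of shape (batch_size, num_choices). A minimal standalone sketch of that contract against a public checkpoint (the checkpoint name and the toy prompt are illustrative, not taken from the tests):

import torch
from transformers import AlbertForMultipleChoice, AlbertTokenizer

tokenizer = AlbertTokenizer.from_pretrained("albert-base-v2")
model = AlbertForMultipleChoice.from_pretrained("albert-base-v2")
model.eval()

prompt = "The cat sat on the"
choices = ["mat.", "moon."]

# encode the prompt against every choice, then add the batch dimension:
# each tensor ends up with shape (1, num_choices, seq_len)
encoding = tokenizer([prompt] * len(choices), choices, return_tensors="pt", padding=True)
inputs = {k: v.unsqueeze(0) for k, v in encoding.items()}
labels = torch.tensor([0])  # index of the correct choice, shape (batch_size,)

with torch.no_grad():
    outputs = model(**inputs, labels=labels)
print(outputs.loss, outputs.logits.shape)  # logits: (batch_size, num_choices)
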
Example #3
    def __init__(
            self,
            pretrained_model: str = 'albert-large-uncased',
            learning_rate: float = 2e-5,
            gradient_accumulation_steps: int = 1,
            num_train_epochs: float = 3.0,
            train_batch_size: int = 32,
            warmup_proportion: float = 0.1,
            attention_window: int = 128,
            train_all: bool = False,
            use_bert_adam: bool = True,
            use_longformer: bool = False,
    ):
        super().__init__()
        self.config = AlbertConfig.from_pretrained(pretrained_model, num_choices=4)
        self.model = AlbertForMultipleChoice.from_pretrained(pretrained_model, config=self.config)

        if not train_all:
            # freeze the ALBERT backbone and fine-tune only the pooler
            # (AlbertForMultipleChoice exposes its encoder as `.albert`)
            for param in self.model.albert.parameters():
                param.requires_grad = False
            for param in self.model.albert.pooler.parameters():
                param.requires_grad = True
            # for param in self.model.bert.encoder.layer[15:24].parameters():
            #     param.requires_grad = True
            # for param in self.model.bert.encoder.layer[15].output.parameters():
            #     param.requires_grad = True

        if use_longformer:
            current_max_pos, embed_size = self.model.bert.embeddings.position_embeddings.weight.shape
            max_pos = 512
            self.config.max_position_embeddings = max_pos
            assert max_pos >= current_max_pos
            # allocate a larger position embedding matrix
            new_pos_embed = self.model.bert.embeddings.position_embeddings.weight.new_empty(max_pos, embed_size)
            print(new_pos_embed.shape)
            print(self.model.bert.embeddings.position_embeddings)
            # copy position embeddings over and over to initialize the new position embeddings
            k = 0
            step = current_max_pos
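            # this wrap-around copy assumes current_max_pos divides max_pos evenly;
            # otherwise the final slice assignment fails with a shape mismatch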
            while k < max_pos - 1:
                new_pos_embed[k:(k + step)] = self.model.bert.embeddings.position_embeddings.weight
                k += step
            print(new_pos_embed.shape)
            self.model.bert.embeddings.position_embeddings.weight.data = new_pos_embed
            self.model.bert.embeddings.position_ids.data = torch.tensor([i for i in range(max_pos)]).reshape(1, max_pos)
            # model.bert.embeddings.position_ids = torch.from_numpy(
            #     tf.range(new_pos_embed.shape[0], dtype=tf.int32).numpy()[tf.newaxis, :])
            # model.bert.embeddings.position_embeddings = torch.nn.Embedding.from_pretrained(new_pos_embed)

            # replace the `modeling_bert.BertSelfAttention` object with `LongformerSelfAttention`
            self.config.attention_window = [attention_window] * self.config.num_hidden_layers
            for i, layer in enumerate(self.model.bert.encoder.layer):
                longformer_self_attn = BertLongAttention(self.config, layer_id=i)
                longformer_self_attn.query = layer.attention.self.query
                longformer_self_attn.key = layer.attention.self.key
                longformer_self_attn.value = layer.attention.self.value
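                # the global-attention projections below start out sharing the local query/key/value weights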

                longformer_self_attn.query_global = layer.attention.self.query
                longformer_self_attn.key_global = layer.attention.self.key
                longformer_self_attn.value_global = layer.attention.self.value

                layer.attention.self = longformer_self_attn
            # self.config.attention_window = [attention_window] * self.config.num_hidden_layers
            # for i, layer in enumerate(self.model.bert.encoder.layer):
            #     # replace the `modeling_bert.BertSelfAttention` object with `LongformerSelfAttention`
            #     layer.attention.self = BertLongSelfAttention(self.config, layer_id=i)

        # print model layers and config
        print(self.config)
        for name, params in self.model.named_parameters():
            print('-->name:', name, '-->grad_require:', params.requires_grad)

        self.learning_rate = learning_rate
        self.gradient_accumulation_steps = gradient_accumulation_steps
        self.num_train_epochs = num_train_epochs
        self.train_batch_size = train_batch_size
        self.warmup_proportion = warmup_proportion
        self.use_bert_adam = use_bert_adam

        self.warmup_steps = 0
        self.total_steps = 0
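
    # A minimal illustrative sketch (not part of the original example) of one way
    # the hyperparameters stored above could be consumed in a LightningModule's
    # configure_optimizers(). It assumes self.total_steps has been filled in
    # elsewhere from the dataloader length, epochs and accumulation steps, and
    # that get_linear_schedule_with_warmup is imported from transformers.
    def configure_optimizers(self):
        # only optimize the parameters left trainable above
        optimizer = torch.optim.AdamW(
            filter(lambda p: p.requires_grad, self.model.parameters()),
            lr=self.learning_rate,
        )
        self.warmup_steps = int(self.warmup_proportion * self.total_steps)
        scheduler = get_linear_schedule_with_warmup(
            optimizer,
            num_warmup_steps=self.warmup_steps,
            num_training_steps=self.total_steps,
        )
        # step the learning-rate schedule every optimizer step, not every epoch
        return [optimizer], [{"scheduler": scheduler, "interval": "step"}]
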
def main(args):
    if not os.path.isdir('CMDs'):
        os.mkdir('CMDs')
    with open('CMDs/train.cmd', 'a') as f:
        f.write(' '.join(sys.argv) + '\n')
        f.write('--------------------------------\n')

    # Set the seed value all over the place to make this reproducible.
    seed_val = args.seed
    random.seed(seed_val)
    np.random.seed(seed_val)
    torch.manual_seed(seed_val)
    torch.cuda.manual_seed_all(seed_val)
    # Choose device
    device = get_default_device()

    with open(args.train_data_path) as f:
        train_data = json.load(f)

    albert_xxlarge = "albert-xxlarge-v2"
    tokenizer = AlbertTokenizer.from_pretrained(albert_xxlarge,
                                                do_lower_case=True)

    labels = []
    input_ids = []
    token_type_ids = []
    count = 0

    for item in train_data:
        context = item["context"]
        question = item["question"]
        lab = item["label"]
        labels.append(lab)
        four_inp_ids = []
        four_tok_type_ids = []
        for ans in item["answers"]:
            combo = context + " [SEP] " + question + " " + ans
            inp_ids = tokenizer.encode(combo)
            # 3 is the [SEP] token for ALBERT
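            # segment ids: 0 for [CLS] + context up to and including that first [SEP], 1 for the question + answer tokens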
            tok_type_ids = [
                0 if i <= inp_ids.index(3) else 1 for i in range(len(inp_ids))
            ]
            four_inp_ids.append(inp_ids)
            four_tok_type_ids.append(tok_type_ids)
        four_inp_ids = pad_sequences(four_inp_ids,
                                     maxlen=MAXLEN,
                                     dtype="long",
                                     value=0,
                                     truncating="post",
                                     padding="post")
        four_tok_type_ids = pad_sequences(four_tok_type_ids,
                                          maxlen=MAXLEN,
                                          dtype="long",
                                          value=0,
                                          truncating="post",
                                          padding="post")
        input_ids.append(four_inp_ids)
        token_type_ids.append(four_tok_type_ids)
    # Create attention masks
    attention_masks = []
    for sen in input_ids:
        sen_attention_masks = []
        for opt in sen:
            att_mask = [int(token_id > 0) for token_id in opt]
            sen_attention_masks.append(att_mask)
        attention_masks.append(sen_attention_masks)
    # Convert to torch tensors
    labels = torch.tensor(labels)
    labels = labels.long().to(device)
    input_ids = torch.tensor(input_ids)
    input_ids = input_ids.long().to(device)
    token_type_ids = torch.tensor(token_type_ids)
    token_type_ids = token_type_ids.long().to(device)
    attention_masks = torch.tensor(attention_masks)
    attention_masks = attention_masks.long().to(device)

    # Create the DataLoader for training set.
    train_data = TensorDataset(input_ids, token_type_ids, attention_masks,
                               labels)
    train_sampler = RandomSampler(train_data)
    train_dataloader = DataLoader(train_data,
                                  sampler=train_sampler,
                                  batch_size=args.batch_size)
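    # each batch yields input_ids, token_type_ids and attention masks of shape
    # (batch_size, 4, MAXLEN), plus labels of shape (batch_size,)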

    model = AlbertForMultipleChoice.from_pretrained(albert_xxlarge).to(device)

    optimizer = AdamW(model.parameters(),
                      lr=args.learning_rate,
                      eps=args.adam_epsilon
                      # weight_decay = 0.01
                      )

    loss_values = []

    # Total number of training steps is number of batches * number of epochs.
    total_steps = len(train_dataloader) * args.n_epochs
    # Create the learning rate scheduler.
    scheduler = get_linear_schedule_with_warmup(optimizer,
                                                num_warmup_steps=0.1 * total_steps,
                                                num_training_steps=total_steps)

    for epoch in range(args.n_epochs):
        # Perform one full pass over the training set.
        print("")
        print('======== Epoch {:} / {:} ========'.format(
            epoch + 1, args.n_epochs))
        print('Training...')
        # Measure how long the training epoch takes.
        t0 = time.time()
        # Reset the total loss for this epoch.
        total_loss = 0
        model.train()
        model.zero_grad()
        # For each batch of training data...
        for step, batch in enumerate(train_dataloader):
            # Progress update every 40 batches.
            if step % 40 == 0 and not step == 0:
                # Calculate elapsed time in minutes.
                elapsed = format_time(time.time() - t0)
                # Report progress.
                print('  Batch {:>5,}  of  {:>5,}.    Elapsed: {:}.'.format(
                    step, len(train_dataloader), elapsed))
            b_input_ids = batch[0].to(device)
            b_tok_typ_ids = batch[1].to(device)
            b_att_msks = batch[2].to(device)
            b_labs = batch[3].to(device)
            model.zero_grad()
            outputs = model(input_ids=b_input_ids,
                            attention_mask=b_att_msks,
                            token_type_ids=b_tok_typ_ids,
                            labels=b_labs)
            loss = outputs[0]
            total_loss += loss.item()
            print(loss.item())
            optimizer.zero_grad()
            loss.backward()
            # Clip the norm of the gradients to 1.0.
            # This is to help prevent the "exploding gradients" problem.
            # torch.nn.utils.clip_grad_norm_(model.parameters(), 1.0)
            # if (step+1) % accumulation_steps == 0:
            # Update parameters and take a step using the computed gradient.
            # The optimizer dictates the "update rule"--how the parameters are
            # modified based on their gradients, the learning rate, etc.
            optimizer.step()
            # Update the learning rate.
            scheduler.step()
            # model.zero_grad()
        # Calculate the average loss over the training data.
        avg_train_loss = total_loss / len(train_dataloader)

        # Store the loss value for plotting the learning curve.
        loss_values.append(avg_train_loss)

        print("")
        print("  Average training loss: {0:.2f}".format(avg_train_loss))
        print("  Training epoch took: {:}".format(format_time(time.time() -
                                                              t0)))

    # Save the model to a file
    file_path = args.save_path + 'albert_QA_MC_seed' + str(args.seed) + '.pt'
    torch.save(model, file_path)
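
Because the script pickles the whole model with torch.save, inference only needs torch.load plus the same preprocessing used above. A rough sketch (the helper name predict_choice is illustrative; it reuses the script's AlbertTokenizer settings, pad_sequences and MAXLEN, and expects an item in the training JSON format):

def predict_choice(model_path, item, device):
    """Return the index of the highest-scoring answer for one item."""
    tokenizer = AlbertTokenizer.from_pretrained("albert-xxlarge-v2", do_lower_case=True)
    model = torch.load(model_path, map_location=device)
    model.eval()

    inp_ids, tok_type_ids = [], []
    for ans in item["answers"]:
        ids = tokenizer.encode(item["context"] + " [SEP] " + item["question"] + " " + ans)
        inp_ids.append(ids)
        tok_type_ids.append([0 if j <= ids.index(3) else 1 for j in range(len(ids))])
    inp_ids = pad_sequences(inp_ids, maxlen=MAXLEN, dtype="long", value=0,
                            truncating="post", padding="post")
    tok_type_ids = pad_sequences(tok_type_ids, maxlen=MAXLEN, dtype="long", value=0,
                                 truncating="post", padding="post")
    att_msks = (inp_ids > 0).astype("int64")

    with torch.no_grad():
        logits = model(input_ids=torch.tensor(inp_ids).long().unsqueeze(0).to(device),
                       token_type_ids=torch.tensor(tok_type_ids).long().unsqueeze(0).to(device),
                       attention_mask=torch.tensor(att_msks).long().unsqueeze(0).to(device))[0]
    return int(torch.argmax(logits, dim=-1).item())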