Example #1
def train(model, train_loader, experiment, hyperparams):
    print("Training...")
    optimizer = torch.optim.Adam(model.parameters(),
                                 lr=hyperparams["learning_rate"])
    loss = nn.CrossEntropyLoss(ignore_index=0)

    with experiment.train():
        for epoch in range(0, hyperparams["num_epochs"]):
            for (inputs, labels) in tqdm(train_loader):
                inputs = inputs.to(device)
                labels = labels.to(device)

                attention_mask = gen_attention_mask(inputs)
                input_ids, attention_mask = pad_to_window_size(
                    inputs, attention_mask,
                    longformer_config.attention_window[0],
                    tokenizer.pad_token_id)

                predictions = model(
                    input_ids.to(device),
                    attention_mask=attention_mask.to(device))[0]

                labels = torch.flatten(labels)

                batch_loss = loss(predictions, labels.long())
                print(" Loss:", batch_loss.item())

                optimizer.zero_grad()
                batch_loss.backward()
                optimizer.step()
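
Every example on this page calls pad_to_window_size from the Longformer code base to round the sequence length up before running the model. A minimal, non-authoritative sketch of what such a helper does with the arguments used above (the real implementation may interpret the window argument differently, for example by doubling a one-sided window size): it pads the token ids with the pad token and extends the attention mask with zeros.

import torch.nn.functional as F

def pad_to_window_size_sketch(input_ids, attention_mask, window, pad_token_id):
    """Hypothetical sketch of the padding helper used throughout these examples."""
    # Pad the sequence length up to the next multiple of the attention window so the
    # sliding-chunks self-attention can split the sequence into equal-sized chunks.
    seq_len = input_ids.size(1)
    padding_len = (window - seq_len % window) % window
    input_ids = F.pad(input_ids, (0, padding_len), value=pad_token_id)  # pad with <pad> tokens
    attention_mask = F.pad(attention_mask, (0, padding_len), value=0)   # 0 = no attention on padding
    return input_ids, attention_mask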
Example #2
def evaluate(epoch):
    print("***** Running evaluating *****")
    print("  Num examples = {}".format(len(eval_features)))
    print("  Batch size = {}".format(args.eval_batch_size))

    eval_input_ids = torch.tensor([f.input_ids for f in eval_features],
                                  dtype=torch.long)
    eval_input_mask = torch.tensor([f.input_mask for f in eval_features],
                                   dtype=torch.long)
    eval_segment_ids = torch.tensor([f.segment_ids for f in eval_features],
                                    dtype=torch.long)
    eval_label_ids = torch.tensor([f.label_id for f in eval_features],
                                  dtype=torch.long)

    eval_data = TensorDataset(eval_input_ids, eval_input_mask,
                              eval_segment_ids, eval_label_ids)
    eval_sampler = SequentialSampler(eval_data)
    eval_dataloader = DataLoader(eval_data,
                                 sampler=eval_sampler,
                                 batch_size=args.eval_batch_size)
    model.eval()
    eval_loss = 0
    step = 0
    predict_all = np.array([], dtype=int)
    labels_all = np.array([], dtype=int)
    loss_fnt = CrossEntropyLoss()
    for input_ids, input_mask, segment_ids, label_ids in tqdm(
            eval_dataloader, desc='Evaluation'):
        step += 1
        input_ids, attention_mask = pad_to_window_size(input_ids, input_mask,
                                                       512,
                                                       tokenizer.pad_token_id)
        input_ids = input_ids.cuda()
        attention_mask = attention_mask.cuda()
        label_ids = label_ids.cuda()
        with torch.no_grad():
            logits = model(input_ids=input_ids, attention_mask=attention_mask)
            loss = loss_fnt(logits, label_ids)
        eval_loss += loss.mean().item()  # accumulate this batch's loss

        labels = label_ids.data.cpu().numpy()
        predic = torch.max(logits.data, 1)[1].cpu().numpy()
        labels_all = np.append(labels_all, labels)
        predict_all = np.append(predict_all, predic)

    # loss, recall, precision
    eval_loss = eval_loss / step
    eval_accuracy = accuracy_score(labels_all, predict_all)
    eval_recall = recall_score(labels_all, predict_all)
    eval_precision = precision_score(labels_all, predict_all)
    eval_f1_score = f1_score(labels_all, predict_all)

    s = 'epoch:{}, eval_loss: {}, eval_precision: {}, eval_accuracy:{}, eval_recall:{}, eval_f1_score:{}'.format(
        epoch, eval_loss, eval_precision, eval_accuracy, eval_recall,
        eval_f1_score)
    print(s)
    s += '\n'
    with open(args.save_log_file, 'a+') as f:
        f.write(s)
    return eval_loss, eval_accuracy
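
A note on the metrics above: recall_score, precision_score and f1_score are used with scikit-learn's default binary averaging, so this snippet assumes a two-class label set. If the task were multi-class, an explicit averaging mode would have to be passed, for example:

# Only needed for a multi-class label set; 'macro' averaging is one possible choice.
from sklearn.metrics import f1_score, precision_score, recall_score

eval_recall = recall_score(labels_all, predict_all, average='macro')
eval_precision = precision_score(labels_all, predict_all, average='macro')
eval_f1_score = f1_score(labels_all, predict_all, average='macro')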
Example #3
    def forward(self, input_ids, attention_mask, segment_ids, start_positions, end_positions):
        question_end_index = self._get_question_end_index(input_ids)
        # Each batch is one document, and each row of the batch is a chunk of the document.
        # Make sure all rows have the same question length.
        assert (question_end_index[0].float() == question_end_index.float().mean()).item()

        # local attention everywhere
        attention_mask = torch.ones(input_ids.shape, dtype=torch.long, device=input_ids.device)
        # global attention for the question tokens
        attention_mask[:, :question_end_index.item()] = 2

        # the sliding_chunks implementation of self-attention requires that seqlen is a multiple of the window size
        input_ids, attention_mask = pad_to_window_size(
            input_ids, attention_mask, self.args.attention_window, self.tokenizer.pad_token_id)

        sequence_output = self.model(
                input_ids,
                attention_mask=attention_mask)[0]

        # The pretrained TriviaQA model wasn't trained with padding, so remove padding tokens
        # before computing loss and decoding.
        padding_len = input_ids[0].eq(self.tokenizer.pad_token_id).sum()
        if padding_len > 0:
            sequence_output = sequence_output[:, :-padding_len]

        logits = self.qa_outputs(sequence_output)
        start_logits, end_logits = logits.split(1, dim=-1)
        start_logits = start_logits.squeeze(-1)
        end_logits = end_logits.squeeze(-1)

        outputs = (start_logits, end_logits,)
        if start_positions is not None and end_positions is not None:
            # If we are on multi-GPU, split adds a dimension
            if len(start_positions.size()) > 1:
                start_positions = start_positions.squeeze(-1)
            if len(end_positions.size()) > 1:
                end_positions = end_positions.squeeze(-1)

            if not self.args.regular_softmax_loss:
                # loss function suggested in section 2.2 here https://arxiv.org/pdf/1710.10723.pdf
                # NOTE: this returns sum of losses, not mean, so loss won't be normalized across different batch sizes
                # but batch size is always 1, so this is not a problem
                start_loss = self.or_softmax_cross_entropy_loss_one_doc(start_logits, start_positions, ignore_index=-1)
                end_loss = self.or_softmax_cross_entropy_loss_one_doc(end_logits, end_positions, ignore_index=-1)
            else:
                loss_fct = torch.nn.CrossEntropyLoss(ignore_index=-1)
                start_positions = start_positions[:, 0:1]
                end_positions = end_positions[:, 0:1]
                start_loss = loss_fct(start_logits, start_positions[:, 0])
                end_loss = loss_fct(end_logits, end_positions[:, 0])

            total_loss = (start_loss + end_loss) / 2
            outputs = (total_loss,) + outputs

        return outputs  # (loss), start_logits, end_logits, (hidden_states), (attentions)
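
The helper or_softmax_cross_entropy_loss_one_doc used above is not shown on this page. A rough sketch of the idea from section 2.2 of the cited paper, with hypothetical naming and without whatever edge-case handling the repository's version adds: marginalize the softmax over every gold start (or end) position in the document instead of a single one.

import torch

def multi_span_cross_entropy_sketch(logits, targets, ignore_index=-1):
    # logits:  (num_chunks, seq_len) start or end scores for one document
    # targets: (num_chunks, num_answers) gold token indices, padded with ignore_index
    mask = targets == ignore_index
    gathered = logits.gather(dim=-1, index=targets.clamp(min=0))
    gathered = gathered.masked_fill(mask, float('-inf'))   # discard padded targets
    log_score = torch.logsumexp(gathered.view(-1), dim=0)  # numerator: every gold position
    log_norm = torch.logsumexp(logits.view(-1), dim=0)     # denominator: every position
    return log_norm - log_score  # negative log of the marginalized probability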
Example #4
    def _prepare_input(self, input_ids):
        attention_mask = torch.ones(input_ids.shape, dtype=torch.long, device=input_ids.device)
        attention_mask[input_ids == self.tokenizer.pad_token_id] = 0
        if isinstance(self.model, LongformerEncoderDecoderForConditionalGeneration):
            attention_mask[:, 0] = 2  # global attention on one token for all model params to be used, which is important for gradient checkpointing to work
            if self.args.attention_mode == 'sliding_chunks':
                half_padding_mod = self.model.config.attention_window[0]
            elif self.args.attention_mode == 'sliding_chunks_no_overlap':
                half_padding_mod = self.model.config.attention_window[0] / 2
            else:
                raise NotImplementedError
            input_ids, attention_mask = pad_to_window_size(  # ideally, should be moved inside the LongformerModel
                input_ids, attention_mask, half_padding_mod, self.tokenizer.pad_token_id)
        return input_ids, attention_mask
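
In the method above, the only difference between the two attention modes is the modulus handed to pad_to_window_size: the full attention window for 'sliding_chunks' and half of it for 'sliding_chunks_no_overlap'. A hypothetical illustration of the effect, assuming a window of 512 and the padding rule sketched under Example #1:

def padded_len(seq_len, modulus):
    # hypothetical helper: length after rounding seq_len up to the next multiple of modulus
    return seq_len + (modulus - seq_len % modulus) % modulus

print(padded_len(600, 512))       # sliding_chunks            -> 1024
print(padded_len(600, 512 // 2))  # sliding_chunks_no_overlap -> 768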
Example #5
    def forward(self, input_ids, attention_mask, labels=None):
        input_ids, attention_mask = pad_to_window_size(
            input_ids, attention_mask, self.model_config.attention_window[0],
            self.tokenizer.pad_token_id)
        attention_mask[:, 0] = 2  # global attention for the first token
        # use the model's built-in (BERT-style) pooler output
        output = self.model(input_ids, attention_mask=attention_mask)[1]
        # pool the entire sequence into one vector (CLS token)
        # output = output[:, 0, :]
        logits = self.classifier(output)

        loss = None
        if labels is not None:
            loss_fct = CrossEntropyLoss()

            loss = loss_fct(logits.view(-1, self.hparams.num_labels),
                            labels.view(-1))

        return logits, loss
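
A hypothetical call site for this classification forward (the module and optimizer names are assumptions, not from the original code), showing how the returned (logits, loss) pair would typically be consumed:

# Hypothetical usage; `classifier_module` and `optimizer` are assumed names.
logits, loss = classifier_module(input_ids, attention_mask, labels=labels)
if loss is not None:
    optimizer.zero_grad()
    loss.backward()
    optimizer.step()
preds = logits.argmax(dim=-1)  # predicted class per example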
Example #6
    def test_something(self):
        config = LongformerConfig.from_pretrained(self.model_dir)
        # choose the attention mode 'n2', 'tvm' or 'sliding_chunks'
        # 'n2': for regular n2 attention
        # 'tvm': a custom CUDA kernel implementation of our sliding window attention
        # 'sliding_chunks': a PyTorch implementation of our sliding window attention
        config.attention_mode = 'sliding_chunks'

        model = Longformer.from_pretrained(self.model_dir, config=config)
        tokenizer = RobertaTokenizer.from_pretrained('roberta-base')
        tokenizer.model_max_length = model.config.max_position_embeddings

        SAMPLE_TEXT = ' '.join(['Hello world! '] * 1000)  # long input document

        input_ids = torch.tensor(tokenizer.encode(SAMPLE_TEXT)).unsqueeze(
            0)  # batch of size 1

        # TVM code doesn't work on CPU. Uncomment this if `config.attention_mode = 'tvm'`
        # model = model.cuda(); input_ids = input_ids.cuda()

        # Attention mask values -- 0: no attention, 1: local attention, 2: global attention
        attention_mask = torch.ones(
            input_ids.shape, dtype=torch.long,
            device=input_ids.device)  # initialize to local attention
        attention_mask[:, [1, 4, 21]] = 2  # Set global attention based on the task. For example,
        # classification: the <s> token
        # QA: question tokens

        # padding seqlen to the nearest multiple of 512. Needed for the 'sliding_chunks' attention
        input_ids, attention_mask = pad_to_window_size(
            input_ids, attention_mask, config.attention_window[0],
            tokenizer.pad_token_id)

        output = model(input_ids, attention_mask=attention_mask)[0]

        # could have done more here....
        self.assertIsNotNone(output)
Example #7
    def forward(self, input_ids, attention_mask, segment_ids, start_positions,
                end_positions, answer_token_ids):

        batch_size = input_ids.shape[0]
        input_ids = input_ids.view(
            batch_size * (self.current_interaction_num + 1), -1)
        attention_mask = attention_mask.view(
            batch_size * (self.current_interaction_num + 1), -1)
        question_end_index = self._get_question_end_index(input_ids)
        # Each batch is one document, and each row of the batch is a chunk of the document.
        # Make sure all rows have the same question length.
        # assert (question_end_index[0].float() == question_end_index.float().mean()).item()
        # local attention everywhere, global attention on question
        tri = torch.tril(torch.ones([input_ids.shape[1], input_ids.shape[1]],
                                    dtype=torch.long,
                                    device=input_ids.device),
                         diagonal=-1)
        attention_mask = tri[question_end_index] + 1

        # the sliding_chunks implementation of self-attention requires that seqlen is a multiple of the window size
        input_ids, attention_mask = pad_to_window_size(
            input_ids, attention_mask, self.args.attention_window,
            self.tokenizer.pad_token_id)
        sequence_output = self.model.forward(input_ids,
                                             attention_mask=attention_mask)[0]
        sequence_output = sequence_output.view(
            batch_size, self.current_interaction_num + 1,
            sequence_output.shape[1], -1)
        p = (0, 0, 0, 0, 0,
             self.max_num_of_interactions - self.current_interaction_num)
        sequence_output = torch.nn.functional.pad(sequence_output,
                                                  p).permute(0, 2, 3, 1)
        weighted_sum = self.learned_weighted_sum(sequence_output)
        weighted_sum.squeeze_(-1)
        logits = self.qa_outputs(weighted_sum)
        start_logits, end_logits = logits.split(1, dim=-1)
        start_logits = start_logits.squeeze(-1)
        end_logits = end_logits.squeeze(-1)

        outputs = (
            start_logits,
            end_logits,
        )
        if start_positions is not None and end_positions is not None:
            # If we are on multi-GPU, split adds a dimension
            if len(start_positions.size()) > 1:
                start_positions = start_positions.squeeze(-1)
            if len(end_positions.size()) > 1:
                end_positions = end_positions.squeeze(-1)
            # NOTE: this model predicts start and end index in the *original* question + context encoding.
            if not self.args.regular_softmax_loss:
                # loss function suggested in section 2.2 here https://arxiv.org/pdf/1710.10723.pdf
                # NOTE: this returns sum of losses, not mean, so loss won't be normalized across different batch sizes
                # but batch size is always 1, so this is not a problem
                start_loss = self.or_softmax_cross_entropy_loss_one_doc(
                    start_logits, start_positions, ignore_index=-1)
                end_loss = self.or_softmax_cross_entropy_loss_one_doc(
                    end_logits, end_positions, ignore_index=-1)
            else:
                loss_fct = torch.nn.CrossEntropyLoss(ignore_index=-1)
                start_positions = start_positions[:, 0:1]
                end_positions = end_positions[:, 0:1]
                start_loss = loss_fct(start_logits, start_positions[:, 0])
                end_loss = loss_fct(end_logits, end_positions[:, 0])

            total_loss = (start_loss + end_loss) / 2
            outputs = (total_loss, ) + outputs
        return outputs  # (loss), start_logits, end_logits, (hidden_states), (attentions)
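
The torch.tril indexing above builds, in a single step, a mask with global attention (2) on the question tokens of each row and local attention (1) everywhere else. A small hypothetical illustration:

import torch

seq_len = 6
question_end_index = torch.tensor([3, 2])  # hypothetical question lengths for two rows
tri = torch.tril(torch.ones(seq_len, seq_len, dtype=torch.long), diagonal=-1)
attention_mask = tri[question_end_index] + 1
print(attention_mask)
# tensor([[2, 2, 2, 1, 1, 1],
#         [2, 2, 1, 1, 1, 1]])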
Example #8
config.attention_mode = 'tvm'
#config.attention_mode = 'sliding_chunks'

model = Longformer.from_pretrained('downloads/longformer-base-4096/', config=config)
tokenizer = RobertaTokenizer.from_pretrained('roberta-base')
tokenizer.model_max_length = model.config.max_position_embeddings

SAMPLE_TEXT = ' '.join(['Hello world! '] * 1000)  # long input document

input_ids = torch.tensor(tokenizer.encode(SAMPLE_TEXT)).unsqueeze(0)  # batch of size 1

# TVM code doesn't work on CPU, so move the model and inputs to the GPU (needed because `config.attention_mode = 'tvm'` above)
model = model.cuda()
input_ids = input_ids.cuda()

# Attention mask values -- 0: no attention, 1: local attention, 2: global attention
attention_mask = torch.ones(input_ids.shape, dtype=torch.long, device=input_ids.device) # initialize to local attention
attention_mask[:, [1, 4, 21]] = 2  # Set global attention based on the task. For example,
                                   # classification: the <s> token
                                   # QA: question tokens

# padding seqlen to the nearest multiple of 512. Needed for the 'sliding_chunks' attention
input_ids, attention_mask = pad_to_window_size(
        input_ids, attention_mask, config.attention_window[0], tokenizer.pad_token_id)

output = model(input_ids, attention_mask=attention_mask)[0]

print(output.shape)
for ele in output:
    print(ele[:100])
    break
Example #9
        all_input_ids = torch.tensor([f.input_ids for f in train_features],
                                     dtype=torch.long)
        all_input_mask = torch.tensor([f.input_mask for f in train_features],
                                      dtype=torch.long)
        all_segment_ids = torch.tensor([f.segment_ids for f in train_features],
                                       dtype=torch.long)
        all_label_ids = torch.tensor([f.label_id for f in train_features],
                                     dtype=torch.long)
        train_data = TensorDataset(all_input_ids, all_input_mask,
                                   all_segment_ids, all_label_ids)

        model.train()
        loss_fnt = CrossEntropyLoss()
        for epoch in range(args.num_train_epochs):
            train_dataloader = DataLoader(train_data,
                                          shuffle=True,
                                          batch_size=args.train_batch_size)
            for step, batch in enumerate(train_dataloader):
                start_time = time.time()
                input_ids, input_mask, segment_ids, labels_ids = batch
                input_ids, attention_mask = pad_to_window_size(
                    input_ids, input_mask, 512, tokenizer.pad_token_id)
                input_ids = input_ids.cuda()
                attention_mask = attention_mask.cuda()
                labels_ids = labels_ids.cuda()
                logits = model(input_ids=input_ids,
                               attention_mask=attention_mask)
                loss = loss_fnt(logits, labels_ids)
                if args.gradient_accumulation_steps > 1:
                    loss = loss / args.gradient_accumulation_steps
                print(
                    'longformer-model*****epoch:{}, step:{}, loss:{:10f}, time_cost:{:10f}'
                    .format(epoch, step, loss,
                            time.time() - start_time))
                if args.fp16:
                    with amp.scale_loss(loss, optimizer) as scaled_loss:
                        scaled_loss.backward()
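
The snippet ends inside the fp16 branch. A minimal sketch of how such a training step typically finishes with apex amp and gradient accumulation; this continuation is an assumption, not part of the original example:

# Hypothetical continuation (assumed, not from the original snippet).
if args.fp16:
    with amp.scale_loss(loss, optimizer) as scaled_loss:
        scaled_loss.backward()
else:
    loss.backward()
if (step + 1) % args.gradient_accumulation_steps == 0:
    optimizer.step()       # apply the accumulated gradients
    optimizer.zero_grad()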