def train(model, train_loader, experiment, hyperparams):
    print("Training...")
    optimizer = torch.optim.Adam(model.parameters(), lr=hyperparams["learning_rate"])
    loss = nn.CrossEntropyLoss(ignore_index=0)
    with experiment.train():
        for epoch in range(hyperparams["num_epochs"]):
            for inputs, labels in tqdm(train_loader):
                inputs = inputs.to(device)
                labels = labels.to(device)
                attention_mask = gen_attention_mask(inputs)
                input_ids, attention_mask = pad_to_window_size(
                    inputs, attention_mask,
                    longformer_config.attention_window[0], tokenizer.pad_token_id)
                predictions = model(
                    input_ids.to(device),
                    attention_mask=attention_mask.to(device))[0]
                labels = torch.flatten(labels)
                l = loss(predictions, labels.long())
                print("  Loss:", l)
                optimizer.zero_grad()
                l.backward()
                optimizer.step()
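train() calls a gen_attention_mask helper that is not shown in this snippet. A minimal sketch of what such a helper presumably does, given how the mask is used here (0 = no attention on padding, 1 = local attention, 2 = global attention on the first token); the body below is an assumption, not the original code:

# Hypothetical helper -- not part of the original snippet. Assumes `tokenizer` is in scope.
def gen_attention_mask(inputs):
    # 1 = local attention for real tokens, 0 = no attention for padding tokens
    mask = (inputs != tokenizer.pad_token_id).long()
    # 2 = global attention on the first (<s>/CLS) token, as is typical for classification
    mask[:, 0] = 2
    return mask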
def evaluate(epoch):
    print("***** Running evaluation *****")
    print("  Num examples = {}".format(len(eval_features)))
    print("  Batch size = {}".format(args.eval_batch_size))
    eval_input_ids = torch.tensor([f.input_ids for f in eval_features], dtype=torch.long)
    eval_input_mask = torch.tensor([f.input_mask for f in eval_features], dtype=torch.long)
    eval_segment_ids = torch.tensor([f.segment_ids for f in eval_features], dtype=torch.long)
    eval_label_ids = torch.tensor([f.label_id for f in eval_features], dtype=torch.long)
    eval_data = TensorDataset(eval_input_ids, eval_input_mask, eval_segment_ids, eval_label_ids)
    eval_sampler = SequentialSampler(eval_data)
    eval_dataloader = DataLoader(eval_data, sampler=eval_sampler, batch_size=args.eval_batch_size)

    model.eval()
    eval_loss = 0
    step = 0
    predict_all = np.array([], dtype=int)
    labels_all = np.array([], dtype=int)
    loss_fnt = CrossEntropyLoss()
    for input_ids, input_mask, segment_ids, label_ids in tqdm(eval_dataloader, desc='Evaluation'):
        step += 1
        input_ids, attention_mask = pad_to_window_size(
            input_ids, input_mask, 512, tokenizer.pad_token_id)
        input_ids = input_ids.cuda()
        attention_mask = attention_mask.cuda()
        label_ids = label_ids.cuda()
        with torch.no_grad():
            logits = model(input_ids=input_ids, attention_mask=attention_mask)
            loss = loss_fnt(logits, label_ids)
        eval_loss += loss.mean().item()  # accumulate the loss of each batch
        labels = label_ids.data.cpu().numpy()
        predic = torch.max(logits.data, 1)[1].cpu().numpy()
        labels_all = np.append(labels_all, labels)
        predict_all = np.append(predict_all, predic)

    # loss, accuracy, recall, precision, F1
    eval_loss = eval_loss / step
    eval_accuracy = accuracy_score(labels_all, predict_all)
    eval_recall = recall_score(labels_all, predict_all)
    eval_precision = precision_score(labels_all, predict_all)
    eval_f1_score = f1_score(labels_all, predict_all)
    s = 'epoch: {}, eval_loss: {}, eval_precision: {}, eval_accuracy: {}, eval_recall: {}, eval_f1_score: {}'.format(
        epoch, eval_loss, eval_precision, eval_accuracy, eval_recall, eval_f1_score)
    print(s)
    s += '\n'
    with open(args.save_log_file, 'a+') as f:
        f.write(s)
    return eval_loss, eval_accuracy
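Note that recall_score, precision_score, and f1_score are called with scikit-learn's default average='binary', so the metrics above assume a two-class task. If the labels were multi-class (an assumption about this setup, not stated in the snippet), an explicit averaging mode would be needed, for example:

# Only needed for more than two classes; 'macro' is one common choice.
eval_recall = recall_score(labels_all, predict_all, average='macro')
eval_precision = precision_score(labels_all, predict_all, average='macro')
eval_f1_score = f1_score(labels_all, predict_all, average='macro')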
def forward(self, input_ids, attention_mask, segment_ids, start_positions, end_positions):
    question_end_index = self._get_question_end_index(input_ids)
    # Each batch is one document, and each row of the batch is a chunk of the document.
    # Make sure all rows have the same question length.
    assert (question_end_index[0].float() == question_end_index.float().mean()).item()

    # local attention everywhere
    attention_mask = torch.ones(input_ids.shape, dtype=torch.long, device=input_ids.device)
    # global attention for the question tokens
    attention_mask[:, :question_end_index.item()] = 2

    # the sliding_chunks implementation of self-attention requires that seqlen is a multiple of the window size
    input_ids, attention_mask = pad_to_window_size(
        input_ids, attention_mask, self.args.attention_window, self.tokenizer.pad_token_id)

    sequence_output = self.model(input_ids, attention_mask=attention_mask)[0]

    # The pretrained TriviaQA model wasn't trained with padding, so remove padding tokens
    # before computing loss and decoding.
    padding_len = input_ids[0].eq(self.tokenizer.pad_token_id).sum()
    if padding_len > 0:
        sequence_output = sequence_output[:, :-padding_len]

    logits = self.qa_outputs(sequence_output)
    start_logits, end_logits = logits.split(1, dim=-1)
    start_logits = start_logits.squeeze(-1)
    end_logits = end_logits.squeeze(-1)

    outputs = (start_logits, end_logits,)
    if start_positions is not None and end_positions is not None:
        # If we are on multi-GPU, squeeze the extra dimension
        if len(start_positions.size()) > 1:
            start_positions = start_positions.squeeze(-1)
        if len(end_positions.size()) > 1:
            end_positions = end_positions.squeeze(-1)

        if not self.args.regular_softmax_loss:
            # loss function suggested in section 2.2 of https://arxiv.org/pdf/1710.10723.pdf
            # NOTE: this returns the sum of losses, not the mean, so the loss isn't normalized
            # across different batch sizes, but batch size is always 1, so this is not a problem
            start_loss = self.or_softmax_cross_entropy_loss_one_doc(start_logits, start_positions, ignore_index=-1)
            end_loss = self.or_softmax_cross_entropy_loss_one_doc(end_logits, end_positions, ignore_index=-1)
        else:
            loss_fct = torch.nn.CrossEntropyLoss(ignore_index=-1)
            start_positions = start_positions[:, 0:1]
            end_positions = end_positions[:, 0:1]
            start_loss = loss_fct(start_logits, start_positions[:, 0])
            end_loss = loss_fct(end_logits, end_positions[:, 0])

        total_loss = (start_loss + end_loss) / 2
        outputs = (total_loss,) + outputs

    return outputs  # (loss), start_logits, end_logits, (hidden_states), (attentions)
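or_softmax_cross_entropy_loss_one_doc is not shown in this snippet. A simplified sketch of the multi-span ("or") softmax loss it refers to, which marginalizes over all gold answer positions across the document's chunks, -log(sum over gold positions of exp(logit) / sum over all positions of exp(logit)); this is an illustrative reconstruction, not the original method:

# Simplified sketch (not the original code) of the "or" softmax loss from section 2.2 of
# https://arxiv.org/pdf/1710.10723.pdf. `logits` is (num_chunks, seqlen), `target` is
# (num_chunks, max_num_answers) with ignore_index for unused answer slots.
def or_softmax_cross_entropy_loss_one_doc(logits, target, ignore_index=-1):
    target_mask = target == ignore_index
    gathered = logits.gather(dim=-1, index=target.clamp(min=0))
    gathered[target_mask] = float('-inf')  # exp(-inf) = 0 drops padded answer slots
    log_score = torch.logsumexp(gathered.view(1, -1), dim=-1)  # numerator: gold positions
    log_norm = torch.logsumexp(logits.view(1, -1), dim=-1)     # denominator: all positions
    loss = -(log_score - log_norm)
    return loss[~torch.isinf(loss)].sum()  # sum, not mean; batch size is 1 anyway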
def _prepare_input(self, input_ids):
    attention_mask = torch.ones(input_ids.shape, dtype=torch.long, device=input_ids.device)
    attention_mask[input_ids == self.tokenizer.pad_token_id] = 0
    if isinstance(self.model, LongformerEncoderDecoderForConditionalGeneration):
        # global attention on one token so that all model params are used,
        # which is important for gradient checkpointing to work
        attention_mask[:, 0] = 2
        if self.args.attention_mode == 'sliding_chunks':
            half_padding_mod = self.model.config.attention_window[0]
        elif self.args.attention_mode == 'sliding_chunks_no_overlap':
            half_padding_mod = self.model.config.attention_window[0] / 2
        else:
            raise NotImplementedError
        input_ids, attention_mask = pad_to_window_size(  # ideally, should be moved inside the LongformerModel
            input_ids, attention_mask, half_padding_mod, self.tokenizer.pad_token_id)
    return input_ids, attention_mask
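For reference, every snippet here imports pad_to_window_size from the Longformer codebase. A simplified sketch of what it does, consistent with how it is called above (one-sided window size and pad token id): pad both tensors on the right until the sequence length is a multiple of the two-sided window, with no attention on the padding. The body below is an approximation of the real helper, not a copy of it:

import torch.nn.functional as F

# Simplified sketch of the helper these snippets import; the actual implementation
# lives in the Longformer codebase (sliding_chunks module).
def pad_to_window_size(input_ids, attention_mask, one_sided_window_size, pad_token_id):
    w = int(2 * one_sided_window_size)
    padding_len = (w - input_ids.size(1) % w) % w
    input_ids = F.pad(input_ids, (0, padding_len), value=pad_token_id)
    attention_mask = F.pad(attention_mask, (0, padding_len), value=0)  # no attention on padding
    return input_ids, attention_mask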
def forward(self, input_ids, attention_mask, labels=None):
    input_ids, attention_mask = pad_to_window_size(
        input_ids, attention_mask,
        self.model_config.attention_window[0], self.tokenizer.pad_token_id)
    attention_mask[:, 0] = 2  # global attention for the first token

    # use BERT's built-in pooler output
    output = self.model(input_ids, attention_mask=attention_mask)[1]
    # alternatively, pool the entire sequence into one vector (the CLS token)
    # output = output[:, 0, :]
    logits = self.classifier(output)

    loss = None
    if labels is not None:
        loss_fct = CrossEntropyLoss()
        loss = loss_fct(logits.view(-1, self.hparams.num_labels), labels.view(-1))
    return logits, loss
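A minimal usage sketch for this classification forward. Purely illustrative: the module name `clf`, its tokenizer attribute, and the example label are assumptions, not taken from the original snippet:

# Hypothetical usage -- `clf` is assumed to be the module exposing the forward() above.
enc = clf.tokenizer("a very long document ...", return_tensors='pt',
                    truncation=True, max_length=4096)
logits, loss = clf(enc['input_ids'], enc['attention_mask'],
                   labels=torch.tensor([1]))  # one example, class index 1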
def test_something(self):
    config = LongformerConfig.from_pretrained(self.model_dir)
    # choose the attention mode: 'n2', 'tvm' or 'sliding_chunks'
    # 'n2': regular n^2 attention
    # 'tvm': a custom CUDA kernel implementation of our sliding window attention
    # 'sliding_chunks': a PyTorch implementation of our sliding window attention
    config.attention_mode = 'sliding_chunks'

    model = Longformer.from_pretrained(self.model_dir, config=config)
    tokenizer = RobertaTokenizer.from_pretrained('roberta-base')
    tokenizer.model_max_length = model.config.max_position_embeddings

    SAMPLE_TEXT = ' '.join(['Hello world! '] * 1000)  # long input document
    input_ids = torch.tensor(tokenizer.encode(SAMPLE_TEXT)).unsqueeze(0)  # batch of size 1

    # TVM code doesn't work on CPU. Uncomment this if `config.attention_mode = 'tvm'`
    # model = model.cuda(); input_ids = input_ids.cuda()

    # Attention mask values -- 0: no attention, 1: local attention, 2: global attention
    attention_mask = torch.ones(input_ids.shape, dtype=torch.long,
                                device=input_ids.device)  # initialize to local attention
    attention_mask[:, [1, 4, 21]] = 2  # Set global attention based on the task. For example,
                                       # classification: the <s> token
                                       # QA: question tokens

    # pad seqlen to the nearest multiple of 512. Needed for the 'sliding_chunks' attention
    input_ids, attention_mask = pad_to_window_size(
        input_ids, attention_mask, config.attention_window[0], tokenizer.pad_token_id)

    output = model(input_ids, attention_mask=attention_mask)[0]
    # could have done more here....
    self.assertIsNotNone(output)
def forward(self, input_ids, attention_mask, segment_ids, start_positions, end_positions, answer_token_ids):
    batch_size = input_ids.shape[0]
    input_ids = input_ids.view(batch_size * (self.current_interaction_num + 1), -1)
    attention_mask = attention_mask.view(batch_size * (self.current_interaction_num + 1), -1)

    question_end_index = self._get_question_end_index(input_ids)
    # Each batch is one document, and each row of the batch is a chunk of the document.
    # Make sure all rows have the same question length.
    # assert (question_end_index[0].float() == question_end_index.float().mean()).item()

    # local attention everywhere, global attention on the question tokens
    tri = torch.tril(torch.ones([input_ids.shape[1], input_ids.shape[1]],
                                dtype=torch.long, device=input_ids.device), diagonal=-1)
    attention_mask = tri[question_end_index] + 1

    # the sliding_chunks implementation of self-attention requires that seqlen is a multiple of the window size
    input_ids, attention_mask = pad_to_window_size(
        input_ids, attention_mask, self.args.attention_window, self.tokenizer.pad_token_id)

    sequence_output = self.model.forward(input_ids, attention_mask=attention_mask)[0]
    sequence_output = sequence_output.view(
        batch_size, self.current_interaction_num + 1, sequence_output.shape[1], -1)
    p = (0, 0, 0, 0, 0, self.max_num_of_interactions - self.current_interaction_num)
    sequence_output = torch.nn.functional.pad(sequence_output, p).permute(0, 2, 3, 1)
    weighted_sum = self.learned_weighted_sum(sequence_output)
    weighted_sum.squeeze_(-1)

    logits = self.qa_outputs(weighted_sum)
    start_logits, end_logits = logits.split(1, dim=-1)
    start_logits = start_logits.squeeze(-1)
    end_logits = end_logits.squeeze(-1)

    outputs = (start_logits, end_logits,)
    if start_positions is not None and end_positions is not None:
        # If we are on multi-GPU, squeeze the extra dimension
        if len(start_positions.size()) > 1:
            start_positions = start_positions.squeeze(-1)
        if len(end_positions.size()) > 1:
            end_positions = end_positions.squeeze(-1)

        # NOTE: this model predicts start and end indices in the *original* question + context encoding.
        if not self.args.regular_softmax_loss:
            # loss function suggested in section 2.2 of https://arxiv.org/pdf/1710.10723.pdf
            # NOTE: this returns the sum of losses, not the mean, so the loss isn't normalized
            # across different batch sizes, but batch size is always 1, so this is not a problem
            start_loss = self.or_softmax_cross_entropy_loss_one_doc(start_logits, start_positions, ignore_index=-1)
            end_loss = self.or_softmax_cross_entropy_loss_one_doc(end_logits, end_positions, ignore_index=-1)
        else:
            loss_fct = torch.nn.CrossEntropyLoss(ignore_index=-1)
            start_positions = start_positions[:, 0:1]
            end_positions = end_positions[:, 0:1]
            start_loss = loss_fct(start_logits, start_positions[:, 0])
            end_loss = loss_fct(end_logits, end_positions[:, 0])

        total_loss = (start_loss + end_loss) / 2
        outputs = (total_loss,) + outputs

    return outputs  # (loss), start_logits, end_logits, (hidden_states), (attentions)
config.attention_mode = 'tvm'
# config.attention_mode = 'sliding_chunks'
model = Longformer.from_pretrained('downloads/longformer-base-4096/', config=config)
tokenizer = RobertaTokenizer.from_pretrained('roberta-base')
tokenizer.model_max_length = model.config.max_position_embeddings

SAMPLE_TEXT = ' '.join(['Hello world! '] * 1000)  # long input document
input_ids = torch.tensor(tokenizer.encode(SAMPLE_TEXT)).unsqueeze(0)  # batch of size 1

# TVM code doesn't work on CPU, so move the model and inputs to the GPU for `config.attention_mode = 'tvm'`
model = model.cuda()
input_ids = input_ids.cuda()

# Attention mask values -- 0: no attention, 1: local attention, 2: global attention
attention_mask = torch.ones(input_ids.shape, dtype=torch.long,
                            device=input_ids.device)  # initialize to local attention
attention_mask[:, [1, 4, 21]] = 2  # Set global attention based on the task. For example,
                                   # classification: the <s> token
                                   # QA: question tokens

# pad seqlen to the nearest multiple of 512. Needed for the 'sliding_chunks' attention
input_ids, attention_mask = pad_to_window_size(
    input_ids, attention_mask, config.attention_window[0], tokenizer.pad_token_id)

output = model(input_ids, attention_mask=attention_mask)[0]
print(output.shape)
for ele in output:
    print(ele[:100])
    break
                               dtype=torch.long)
all_label_ids = torch.tensor([f.label_id for f in train_features], dtype=torch.long)
train_data = TensorDataset(all_input_ids, all_input_mask, all_segment_ids, all_label_ids)

model.train()
loss_fnt = CrossEntropyLoss()
for epoch in range(args.num_train_epochs):
    train_dataloader = DataLoader(train_data, shuffle=True, batch_size=args.train_batch_size)
    for step, batch in enumerate(train_dataloader):
        start_time = time.time()
        input_ids, input_mask, segment_ids, labels_ids = batch
        input_ids, attention_mask = pad_to_window_size(
            input_ids, input_mask, 512, tokenizer.pad_token_id)
        input_ids = input_ids.cuda()
        attention_mask = attention_mask.cuda()
        labels_ids = labels_ids.cuda()

        logits = model(input_ids=input_ids, attention_mask=attention_mask)
        loss = loss_fnt(logits, labels_ids)
        if args.gradient_accumulation_steps > 1:
            loss = loss / args.gradient_accumulation_steps
        print('longformer-model*****epoch:{}, step:{}, loss:{:10f}, time_cost:{:10f}'
              .format(epoch, step, loss, time.time() - start_time))

        if args.fp16:
            with amp.scale_loss(loss, optimizer) as scaled_loss:
                scaled_loss.backward()
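The snippet is cut off after the fp16 backward call. A hedged sketch of how a loop like this typically finishes the step (the fp16 branch is repeated for context; the non-fp16 branch and the accumulated optimizer step are assumptions, not taken from the original):

        # Hypothetical continuation -- typical pattern, not from the original snippet.
        if args.fp16:
            with amp.scale_loss(loss, optimizer) as scaled_loss:
                scaled_loss.backward()
        else:
            loss.backward()
        if (step + 1) % args.gradient_accumulation_steps == 0:
            optimizer.step()
            optimizer.zero_grad()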