def merge_source(left_pad, move_eos_to_beginning=False):
    """Collate the per-sample ``'source'`` dicts into one batched dict.

    Reads ``samples``, ``pad_idx`` and ``eos_idx`` from the enclosing scope.

    Args:
        left_pad: forwarded unchanged to the collate helpers.
        move_eos_to_beginning: forwarded unchanged to the collate helpers.

    Returns:
        Dict with the keys ``'nodes'``, ``'labels'``, ``'indices'`` and
        ``'length'`` holding the batched values.
    """
    assert samples[0]['source'] is not None

    # Regroup: list of per-sample dicts -> dict of per-field lists.
    src = {
        k: [dic['source'][k] for dic in samples]
        for k in samples[0]['source']
    }
    nodes = src['nodes']
    labels = src['labels']
    indices = src['indices']
    length = src['length']

    nodes = data_utils.collate_tokens(
        nodes, pad_idx, eos_idx, left_pad, move_eos_to_beginning)
    labels = data_utils.collate_tokens(
        labels, pad_idx, eos_idx, left_pad, move_eos_to_beginning)
    # Indices are collated with 0 in the pad and eos argument positions.
    indices = dptree2seq_collate_indices(
        indices, 0, 0, left_pad, move_eos_to_beginning)
    # Out-of-place unsqueeze: the in-place variant permanently added a
    # dimension to the tensors stored in ``samples`` on every call.
    length = torch.cat([x.unsqueeze(0) for x in length], 0)

    return {
        'nodes': nodes,
        'labels': labels,
        'indices': indices,
        'length': length,
    }
def evaluate_aspect(self):
    """Return the mean combined loss over the ``'val'`` aspect split.

    The split is walked in slices of ``LIL_BATCH_SIZE``; for every slice the
    seq2seq and sequence-labelling losses from ``_get_both_loss`` are added
    and the per-slice sums are averaged.

    Raises:
        ZeroDivisionError: if the validation split is empty.
    """
    assert 'val' in self._aspect_dataset

    # NOTE(review): the mode is set to 'train' right before eval(); confirm
    # that this is the intended mode for validation.
    self._bart.set_mode('train')
    self._bart.eval()

    val_data = self._aspect_dataset['val']
    batch_losses = []
    for start in range(0, len(val_data), LIL_BATCH_SIZE):
        chunk = val_data[start:start + LIL_BATCH_SIZE]

        lengths = torch.tensor([len(ex.src_tokens) for ex in chunk])
        sources = collate_tokens(
            [ex.src_tokens for ex in chunk],
            pad_idx=self._bart.dictionary.pad())
        targets = collate_tokens(
            [ex.tgt_tokens for ex in chunk],
            pad_idx=self._bart.dictionary.pad())
        # Labels use their own padding index, not the dictionary's.
        label_ids = collate_tokens(
            [ex.tgt_labels for ex in chunk],
            pad_idx=self._bart.pad_label_index)

        with torch.no_grad():
            seq2seq_loss, seqlab_loss = self._get_both_loss(
                src_lengths=lengths,
                src_tokens=sources,
                tgt_tokens=targets,
                tgt_labels=label_ids)

        batch_losses.append((seq2seq_loss + seqlab_loss).item())

    return sum(batch_losses) / len(batch_losses)
def evaluate(self):
    """Return the mean loss over the ``'dev'`` split.

    The model is placed on a single GPU split and switched to eval mode,
    then the split is processed in slices of ``LIL_BATCH_SIZE`` with label
    smoothing disabled (``epsilon=0.``).

    Raises:
        ZeroDivisionError: if the dev split is empty.
    """
    assert 'dev' in self._dataset

    self._model.split_to_gpus(n_gpus=1)
    self._model.eval()

    dev_data = self._dataset['dev']
    batch_losses = []
    for start in trange(0, len(dev_data), LIL_BATCH_SIZE,
                        desc='Evaluating on Dev Set'):
        chunk = dev_data[start:start + LIL_BATCH_SIZE]

        lengths = torch.tensor([len(ex.src_tokens) for ex in chunk])
        sources = collate_tokens(
            [ex.src_tokens for ex in chunk],
            pad_idx=self._model.dictionary.pad())
        targets = collate_tokens(
            [ex.tgt_tokens for ex in chunk],
            pad_idx=self._model.dictionary.pad())

        with torch.no_grad():
            loss = self._get_label_smoothed_nll_loss(
                src_lengths=lengths,
                src_tokens=sources,
                tgt_tokens=targets,
                epsilon=0.)

        batch_losses.append(loss.item())

    return sum(batch_losses) / len(batch_losses)
def collate(self, samples):
    """Collate speech-recognition samples into a batch dict.

    Args:
        samples: sequence of dicts with the keys ``"id"``, ``"source"`` and
            ``"target"``.

    Returns:
        ``{}`` for an empty ``samples``; otherwise a dict with ``"id"``,
        ``"ntokens"``, ``"net_input"``, ``"target"`` and ``"nsentences"``.
    """
    if len(samples) == 0:
        return {}

    sample_ids = torch.LongTensor([s["id"] for s in samples])
    source = data_utils.collate_tokens(
        [s["source"] for s in samples],
        self.pad_index,
        eos_idx=self.eos_index,
    )
    target = data_utils.collate_tokens(
        [s["target"] for s in samples],
        self.pad_index,
        eos_idx=self.eos_index,
    )
    prev_output_tokens = data_utils.collate_tokens(
        [s["target"] for s in samples],
        self.pad_index,
        self.eos_index,
        left_pad=False,
        move_eos_to_beginning=True,
    )

    # NOTE(review): the lengths are read from the rows of the already
    # collated batch, so every entry equals the padded width rather than the
    # per-sample length - confirm this is what the model expects.
    src_lengths = torch.LongTensor([row.size(0) for row in source])

    return {
        "id": sample_ids,
        "ntokens": sum(len(s["target"]) for s in samples),
        "net_input": {
            "src_tokens": source,
            "src_lengths": src_lengths,
            "prev_output_tokens": prev_output_tokens,
        },
        "target": target,
        "nsentences": len(samples),
    }
def _collate_target(
    self, samples: List[SpeechToSpeechDatasetItem]
) -> Tuple[torch.Tensor, torch.Tensor, torch.Tensor]:
    """Collate the targets of ``samples``.

    Two modes, selected by ``self.target_is_code``:

    * code targets: ``target`` is the collated ``x.target`` values, while
      ``prev_output_tokens`` and ``target_lengths`` are derived from the
      targets after ``self.pack_units``;
    * otherwise: targets are collated with ``_collate_frames`` and
      ``prev_output_tokens`` is the target shifted one step along dim 1
      with an all-zero first frame.

    Returns:
        ``(target, prev_output_tokens, target_lengths)``.
    """
    if self.target_is_code:
        target = fairseq_data_utils.collate_tokens(
            [x.target for x in samples],
            self.tgt_dict.pad(),
            self.tgt_dict.eos(),
            left_pad=False,
            move_eos_to_beginning=False,
        )
        # convert stacked units to a single id
        pack_targets = [self.pack_units(x.target) for x in samples]
        prev_output_tokens = fairseq_data_utils.collate_tokens(
            pack_targets,
            self.tgt_dict.pad(),
            self.tgt_dict.eos(),
            left_pad=False,
            move_eos_to_beginning=True,
        )
        target_lengths = torch.tensor(
            [x.size(0) for x in pack_targets], dtype=torch.long
        )
    else:
        target = _collate_frames(
            [x.target for x in samples], is_audio_input=False
        )
        bsz, _, d = target.size()
        # Shift right by one step: prepend a zero frame, drop the last one.
        first_frame = target.new_full((bsz, 1, d), 0.0)
        prev_output_tokens = torch.cat(
            (first_frame, target[:, :-1, :]), dim=1
        )
        target_lengths = torch.tensor(
            [x.target.size(0) for x in samples], dtype=torch.long
        )

    return target, prev_output_tokens, target_lengths
def collater(self, samples: List[torch.Tensor]) -> Dict:
    """Collate token tensors into a target-side batch dict.

    Args:
        samples: list of token tensors; ``size(0)`` of each is used as its
            length.

    Returns:
        Dict with ``"prev_output_tokens"``, ``"target"``,
        ``"target_lengths"`` and ``"ntokens"``.
    """
    pad = self.dict.pad()
    eos = self.dict.eos()

    out = fairseq_data_utils.collate_tokens(
        samples,
        pad,
        eos,
        left_pad=False,
        move_eos_to_beginning=False,
    ).long()
    # Same tokens, collated with move_eos_to_beginning=True.
    prev_out = fairseq_data_utils.collate_tokens(
        samples,
        pad,
        eos,
        left_pad=False,
        move_eos_to_beginning=True,
    ).long()

    sizes = [t.size(0) for t in samples]
    return {
        "prev_output_tokens": prev_out,
        "target": out,
        "target_lengths": torch.tensor(sizes, dtype=torch.long),
        "ntokens": sum(sizes),
    }
def _prepare_batch_for_alignment(self, sample, hypothesis):
    """Expand a batch to ``beam_size`` rows per example and collate beams.

    Args:
        sample: batch dict; ``sample["net_input"]`` must provide
            ``"src_tokens"`` and ``"src_lengths"``.
        hypothesis: iterable of per-example beam lists, each beam a dict
            with a ``"tokens"`` entry.

    Returns:
        ``(src_tokens, src_lengths, prev_output_tokens, tgt_tokens)``.
    """
    net_input = sample["net_input"]
    beams = self.beam_size

    src_tokens = net_input["src_tokens"]
    bsz = src_tokens.shape[0]
    rows = bsz * beams

    # Repeat every source row once per beam, then flatten batch x beam.
    src_tokens = (
        src_tokens[:, None, :]
        .expand(-1, beams, -1)
        .contiguous()
        .view(rows, -1)
    )
    src_lengths = (
        net_input["src_lengths"][:, None]
        .expand(-1, beams)
        .contiguous()
        .view(rows)
    )

    # Flatten beams example by example; used for both collations below.
    beam_tokens = [beam["tokens"] for example in hypothesis for beam in example]
    prev_output_tokens = data_utils.collate_tokens(
        beam_tokens,
        self.pad,
        self.eos,
        self.left_pad_target,
        move_eos_to_beginning=True,
    )
    tgt_tokens = data_utils.collate_tokens(
        beam_tokens,
        self.pad,
        self.eos,
        self.left_pad_target,
        move_eos_to_beginning=False,
    )
    return src_tokens, src_lengths, prev_output_tokens, tgt_tokens
def inference_step(self,
                   model: nn.Module,
                   previous_level_tokens: List[List[int]],
                   previous_level_heads: List[List[int]],
                   expansion_sampling: Callable[[torch.Tensor], torch.LongTensor] = None,
                   given_next_level_expansions: List[List[int]] = None,
                   ) -> Tuple[torch.Tensor, torch.Tensor, torch.LongTensor]:
    """Run ``model`` on one level of batched inputs.

    :param model: module called with the assembled keyword inputs.
    :param previous_level_tokens: per-example token id lists.
    :param previous_level_heads: per-example head position lists; ``-1``
        denotes the root node.
    :param expansion_sampling: passed through to the model unchanged.
    :param given_next_level_expansions: optional per-example expansion id
        lists; collated and moved to ``self.device`` when provided.
    :return: Tuple of token logits, expansion logits and expansion ids, as
        returned by ``model``.
    """

    def pad_batch(sequences):
        # Right-padded collation with no EOS handling.
        return collate_tokens(sequences, self.pad_idx, eos_idx=None, left_pad=False)

    prev_tokens = pad_batch([torch.LongTensor(p) for p in previous_level_tokens])

    # Index -1 is used for the root node. Shifting by pad_idx + 2 maps the
    # original -1 to pad_idx + 1.
    head_pos_shift = self.pad_idx + 2
    head_positions = pad_batch(
        [torch.LongTensor(heads) + head_pos_shift for heads in previous_level_heads])

    dependency_masks = [heads2causality_mask(heads) for heads in previous_level_heads]
    # Each mask is transposed and inverted (1 - m) before collation.
    prev_causality_masks = collate_masks(
        [1 - torch.ByteTensor(m).permute(1, 0) for m in dependency_masks],
        self.pad_idx)

    if given_next_level_expansions is None:
        next_level_expansions = None
    else:
        next_level_expansions = pad_batch(
            [torch.LongTensor(n) for n in given_next_level_expansions]
        ).to(self.device)

    net_input = {
        KEY_PREV_LEVEL_TOKENS: prev_tokens.to(self.device),
        KEY_CAUSALITY_MASK: prev_causality_masks.to(self.device),
        KEY_HEAD_POSITIONS: head_positions.to(self.device),
        KEY_NEXT_LEVEL_EXPANS: next_level_expansions,
        'expansion_sampling': expansion_sampling,
    }
    token_logits, expansion_logits, expansion_ids = model(**net_input)
    return token_logits, expansion_logits, expansion_ids
def collater(self, samples):
    """Merge a list of samples to form a mini-batch.

    Each sample is a dict with ``'id'``, ``'source_imgs'``,
    ``'source_texts'`` and ``'target'``. Image features are zero-padded
    along dim 0 to the longest sample; text and target tokens are collated
    with the vocabulary's pad/eos indices.

    Returns:
        ``{}`` for an empty ``samples``, otherwise the batch dict.
    """
    if len(samples) == 0:
        return {}

    num_sentences = len(samples)
    source_imgs = [s['source_imgs'] for s in samples]
    source_texts = [s['source_texts'] for s in samples]
    targets = [s['target'] for s in samples]

    indices = torch.tensor([s['id'] for s in samples], dtype=torch.long)
    # Kept as a plain Python list, exactly as the batch dict exposes it.
    source_lengths = [len(text) for text in source_texts]
    target_ntokens = sum(len(t) for t in targets)

    # Zero-pad every image sequence up to the longest one in the batch.
    max_sent = max(imgs.size(0) for imgs in source_imgs)
    pad_imgs = torch.zeros(
        [num_sentences, max_sent, self.img_dataset.dim], dtype=torch.float)
    for row, imgs in enumerate(source_imgs):
        pad_imgs[row, :imgs.size(0)] = imgs

    pad, eos = self.vocab_dict.pad(), self.vocab_dict.eos()
    source_texts_batch = data_utils.collate_tokens(
        source_texts, pad_idx=pad, eos_idx=eos, move_eos_to_beginning=False)
    target_batch = data_utils.collate_tokens(
        targets, pad_idx=pad, eos_idx=eos, move_eos_to_beginning=False)
    prev_target_batch = data_utils.collate_tokens(
        targets, pad_idx=pad, eos_idx=eos, move_eos_to_beginning=True)

    return {
        'id': indices,
        'net_input': {
            'src_tokens': source_texts_batch,
            'src_imgs': pad_imgs,
            'src_lengths': source_lengths,
            'prev_output_tokens': prev_target_batch,
        },
        'target': target_batch,
        'ntokens': target_ntokens,
        'nsentences': num_sentences,
    }
def collater(self, samples):
    """Merge captioning samples into a mini-batch.

    Each sample is a dict with ``'id'``, ``'source_features'``,
    ``'source_locations'`` and ``'target'``. Source lengths and the target
    token count are looked up in ``self.img_ds.sizes`` / ``self.cap_ds.sizes``
    by sample id.

    Returns:
        ``None`` for an empty ``samples``, otherwise the batch dict.
    """
    num_sentences = len(samples)

    # FIXME: workaround for edge case in parallel processing
    # (framework passes empty samples list
    # to collater under certain conditions)
    if num_sentences == 0:
        return None

    sample_ids = [s['id'] for s in samples]
    target_samples = [s['target'] for s in samples]
    source_lengths = [self.img_ds.sizes[i] for i in sample_ids]
    target_ntokens = sum(self.cap_ds.sizes[i] for i in sample_ids)

    indices = torch.tensor(sample_ids, dtype=torch.long)

    # The image dataset collates (features, locations) pairs itself.
    feature_location_pairs = [
        (s['source_features'], s['source_locations']) for s in samples
    ]
    source_feature_batch, source_location_batch = \
        self.img_ds.collater(feature_location_pairs)

    pad, eos = self.cap_dict.pad(), self.cap_dict.eos()
    target_batch = data_utils.collate_tokens(
        target_samples, pad_idx=pad, eos_idx=eos, move_eos_to_beginning=False)
    rotate_batch = data_utils.collate_tokens(
        target_samples, pad_idx=pad, eos_idx=eos, move_eos_to_beginning=True)

    return {
        'id': indices,
        'net_input': {
            'src_tokens': source_feature_batch,
            'src_locations': source_location_batch,
            'src_lengths': source_lengths,
            'prev_output_tokens': rotate_batch,
        },
        'target': target_batch,
        'ntokens': target_ntokens,
        'nsentences': num_sentences,
    }
def collater(
    self, samples: List[SpeechToTextDatasetItem], return_order: bool = False
) -> Dict:
    """Collate speech-to-text items, sorted by descending frame count.

    Args:
        samples: items exposing ``index``, ``source`` and ``target``.
        return_order: when true, the sort permutation is added to the
            result under ``"order"``.

    Returns:
        ``{}`` for an empty ``samples``, otherwise the batch dict. The
        target-side entries stay ``None`` when ``self.tgt_texts`` is
        ``None``.
    """
    if len(samples) == 0:
        return {}

    def reorder(tensor):
        # Apply the descending-length permutation along the batch dim.
        return tensor.index_select(0, order)

    frames = _collate_frames(
        [x.source for x in samples], self.cfg.use_audio_input
    )
    # sort samples by descending number of frames
    frame_counts = torch.tensor(
        [x.source.size()[0] for x in samples], dtype=torch.long
    )
    n_frames, order = frame_counts.sort(descending=True)

    indices = reorder(
        torch.tensor([x.index for x in samples], dtype=torch.long)
    )
    frames = reorder(frames)

    target = target_lengths = prev_output_tokens = ntokens = None
    if self.tgt_texts is not None:
        raw_targets = [x.target for x in samples]
        pad, eos = self.tgt_dict.pad(), self.tgt_dict.eos()

        target = reorder(
            fairseq_data_utils.collate_tokens(
                raw_targets,
                pad,
                eos,
                left_pad=False,
                move_eos_to_beginning=False,
            )
        )
        target_lengths = reorder(
            torch.tensor(
                [t.size()[0] for t in raw_targets], dtype=torch.long
            )
        )
        prev_output_tokens = reorder(
            fairseq_data_utils.collate_tokens(
                raw_targets,
                pad,
                eos,
                left_pad=False,
                move_eos_to_beginning=True,
            )
        )
        ntokens = sum(t.size()[0] for t in raw_targets)

    out = {
        "id": indices,
        "net_input": {
            "src_tokens": frames,
            "src_lengths": n_frames,
            "prev_output_tokens": prev_output_tokens,
        },
        "target": target,
        "target_lengths": target_lengths,
        "ntokens": ntokens,
        "nsentences": len(samples),
    }
    if return_order:
        out["order"] = order
    return out
def fair_seq_no_neutral_sent_pair_classification(sentence_pairs, model, gpu_available, logger):
    """Classify 'correct' and 'incorrect' sentence pairs for every key.

    Args:
        sentence_pairs: mapping from key to a dict with ``'correct'`` and
            ``'incorrect'`` lists of ``(sentence_a, sentence_b)`` pairs.
        model: object providing ``cuda``, ``eval``, ``encode`` and
            ``predict``.
        gpu_available: when truthy, ``model.cuda()`` is called first.
        logger: logger used for progress messages.

    Returns:
        Mapping from key to ``{'correct': {...}, 'incorrect': {...}}``,
        each holding the predicted ``'label_list'`` and its
        ``'avg_accuracy'``.

    Raises:
        ZeroDivisionError: if a pair list is empty.
    """
    if gpu_available:
        model.cuda()
        logger.info("successfully moved model to gpu")
    model.eval()

    # (bucket, label index that counts as a hit). Per the original comments:
    # correct pairs have true label entailment (1), incorrect pairs have
    # true label contradiction (0).
    buckets = (('correct', 1), ('incorrect', 0))

    avg_responses = {}
    for counter, (key, corr_incorr_pair) in enumerate(sentence_pairs.items(), start=1):
        per_key = {}
        for bucket, hit_label in buckets:
            batch = collate_tokens(
                [model.encode(pair[0], pair[1]) for pair in corr_incorr_pair[bucket]],
                pad_idx=1)
            logprobs = model.predict('sentence_classification_head', batch)
            result_list = logprobs.argmax(dim=1).tolist()
            per_key[bucket] = {
                'label_list': result_list,
                'avg_accuracy': result_list.count(hit_label) / len(result_list),
            }
        avg_responses[key] = per_key

        # NOTE(review): the message says 10 but the interval is 240 keys.
        if counter % 240 == 0:
            logger.info("finished 10 more")

    return avg_responses
def collater(
    self, samples: List[Tuple[int, torch.Tensor, torch.Tensor]]
) -> Dict:
    """Collate ``(index, source, target)`` triples, longest source first.

    Returns:
        ``{}`` for an empty ``samples``, otherwise the batch dict. The
        target-side entries stay ``None`` when ``self.tgt_texts`` is
        ``None``.
    """
    if len(samples) == 0:
        return {}

    sample_ids = [i for i, _, _ in samples]
    sources = [s for _, s, _ in samples]

    frames = _collate_frames(sources, self.data_cfg.use_audio_input)
    # sort samples by descending number of frames
    frame_counts = torch.tensor(
        [s.size(0) for s in sources], dtype=torch.long
    )
    n_frames, order = frame_counts.sort(descending=True)

    indices = torch.tensor(sample_ids, dtype=torch.long).index_select(0, order)
    frames = frames.index_select(0, order)

    target = target_lengths = prev_output_tokens = ntokens = None
    if self.tgt_texts is not None:
        targets = [t for _, _, t in samples]
        pad, eos = self.tgt_dict.pad(), self.tgt_dict.eos()

        target = fairseq_data_utils.collate_tokens(
            targets,
            pad,
            eos,
            left_pad=False,
            move_eos_to_beginning=False,
        ).index_select(0, order)
        target_lengths = torch.tensor(
            [t.size(0) for t in targets], dtype=torch.long
        ).index_select(0, order)
        prev_output_tokens = fairseq_data_utils.collate_tokens(
            targets,
            pad,
            eos,
            left_pad=False,
            move_eos_to_beginning=True,
        ).index_select(0, order)
        ntokens = sum(t.size(0) for t in targets)

    return {
        "id": indices,
        "net_input": {
            "src_tokens": frames,
            "src_lengths": n_frames,
            "prev_output_tokens": prev_output_tokens,
        },
        "target": target,
        "target_lengths": target_lengths,
        "ntokens": ntokens,
        "nsentences": len(samples),
    }
def train_epoch(self, batch_size, label_smooth_epsilon):
    """Train for one pass over the shuffled ``'train'`` split.

    Every ``batch_size`` slice is processed in sub-batches of
    ``LIL_BATCH_SIZE`` whose gradients are accumulated before a single
    optimizer / scheduler step. Sub-batches with a NaN loss are reported and
    skipped for the backward pass.

    Args:
        batch_size: number of examples per optimizer step.
        label_smooth_epsilon: passed to ``_get_label_smoothed_nll_loss`` as
            ``epsilon``.
    """
    assert 'train' in self._dataset

    random.shuffle(self._dataset['train'])

    # Running total kept as a plain float: accumulating the loss tensor
    # itself would keep every sub-batch's autograd history reachable for the
    # whole epoch.
    print_train_loss = 0.0
    num = 0
    for i in trange(0, len(self._dataset['train']), batch_size,
                    desc='BART Training'):
        self._model.split_to_gpus(n_gpus=min(2, torch.cuda.device_count()))
        self._model.train()

        batch = self._dataset['train'][i:i + batch_size]

        self._optimizer.zero_grad()

        for j in range(0, len(batch), LIL_BATCH_SIZE):
            lil_batch = batch[j:j + LIL_BATCH_SIZE]

            src_lengths = torch.tensor(
                [len(t.src_tokens) for t in lil_batch])
            src_tokens = collate_tokens(
                [t.src_tokens for t in lil_batch],
                pad_idx=self._model.dictionary.pad())
            tgt_tokens = collate_tokens(
                [t.tgt_tokens for t in lil_batch],
                pad_idx=self._model.dictionary.pad())

            loss = self._get_label_smoothed_nll_loss(
                src_lengths=src_lengths,
                src_tokens=src_tokens,
                tgt_tokens=tgt_tokens,
                epsilon=label_smooth_epsilon)
            num += 1

            # Weight by the sub-batch share so the accumulated gradient
            # matches one step over the full batch.
            loss = loss * len(lil_batch) / batch_size
            print_train_loss += loss.item() * batch_size

            if torch.isnan(loss):
                print('warning: nan loss')
                print(f'tgt_text: {lil_batch[0].tgt_text}')
            else:
                loss.backward()

        self._optimizer.step()
        self._lr_scheduler.step()
        self._global_step += 1
        if self._global_step % self._eval_steps == 0:
            self.gen_log()

    # Nothing to report (and nothing to divide by) for an empty split.
    if num:
        print("Training loss:", print_train_loss / num)
def default_collater(target_dict, samples, dataset=None):
    """Collate image / label samples into a batch dict.

    ``None`` entries in ``samples`` are replaced by randomly drawn, non-None
    items of ``dataset`` so that the batch keeps its size. The caller's
    ``samples`` list is never modified.

    Args:
        target_dict: dictionary providing ``pad()`` and ``eos()``.
        samples: list of sample dicts (``'id'``, ``'tfm_img'``,
            ``'label_ids'``) or ``None`` placeholders.
        dataset: optional indexable dataset used to replace ``None``
            entries.

    Returns:
        ``None`` when ``samples`` is empty, or when it contains ``None``
        entries and no (non-empty) ``dataset`` is available; otherwise the
        batch dict.
    """
    if not samples:
        return None

    if any(sample is None for sample in samples):
        if not dataset:
            return None
        len_batch = len(samples)
        # Filter into a new list first; the original loop appended to the
        # caller's list before rebinding the name.
        samples = [sample for sample in samples if sample is not None]
        # NOTE(review): this never terminates if every dataset item is None.
        while len(samples) < len_batch:
            replacement = dataset[random.choice(range(len(dataset)))]
            if replacement is not None:
                samples.append(replacement)

    indices = []
    imgs = []  # bs, c, h , w
    target_samples = []
    target_ntokens = 0

    for sample in samples:
        indices.append(sample['id'])
        imgs.append(sample['tfm_img'])
        target_samples.append(sample['label_ids'].long())
        target_ntokens += len(sample['label_ids'])

    num_sentences = len(samples)

    target_batch = data_utils.collate_tokens(
        target_samples,
        pad_idx=target_dict.pad(),
        eos_idx=target_dict.eos(),
        move_eos_to_beginning=False)
    rotate_batch = data_utils.collate_tokens(
        target_samples,
        pad_idx=target_dict.pad(),
        eos_idx=target_dict.eos(),
        move_eos_to_beginning=True)

    indices = torch.tensor(indices, dtype=torch.long)
    imgs = torch.stack(imgs, dim=0)

    return {
        'id': indices,
        'net_input': {
            'imgs': imgs,
            'prev_output_tokens': rotate_batch
        },
        'ntokens': target_ntokens,
        'nsentences': num_sentences,
        'target': target_batch
    }
def get_dummy_input(T=100, D=80, B=5, K=100):
    """Build a random ``forward`` input dict.

    Args:
        T: max sequence length.
        D: feature vector dimension.
        B: batch size.
        K: target dimension size (exclusive upper bound of token ids).

    Returns:
        Dict with ``"src_tokens"``, ``"src_lengths"`` and
        ``"prev_output_tokens"``.
    """
    # The (B, T, D) layout is just a convention; it can be overridden by
    # writing a custom _prepare_forward_input function.
    feature = torch.randn(B, T, D)

    raw_lengths = torch.from_numpy(
        np.random.randint(low=1, high=T, size=B, dtype=np.int64))
    raw_lengths[0] = T  # make sure the maximum length matches

    token_rows = []
    for b in range(B):
        # Token count is at most the source length of the same row.
        n_tokens = np.random.randint(low=1, high=raw_lengths[b].item() + 1)
        ids = np.random.randint(low=0, high=K, size=n_tokens, dtype=np.int64)
        token_rows.append(torch.from_numpy(ids))

    prev_output_tokens = fairseq_data_utils.collate_tokens(
        token_rows,
        pad_idx=1,
        eos_idx=2,
        left_pad=False,
        move_eos_to_beginning=False,
    )

    # Only the features and lengths are reordered by descending length.
    src_lengths, sorted_order = raw_lengths.sort(descending=True)
    return {
        "src_tokens": feature.index_select(0, sorted_order),
        "src_lengths": src_lengths,
        "prev_output_tokens": prev_output_tokens,
    }
def merge(key, left_pad):
    """Collate field ``key`` of every sample into one padded batch.

    Reads ``samples``, ``pad_idx`` and ``eos_idx`` from the enclosing scope.
    """
    values = [s[key] for s in samples]
    return data_utils.collate_tokens(values, pad_idx, eos_idx, left_pad)
def collater_seq_label(self, targets, pad):
    """Collate label sequences, right-padded with ``pad``.

    Returns:
        ``(targets, lengths, ntokens)``: the collated batch, a LongTensor of
        the per-sequence lengths, and their total as a Python number.
    """
    lengths = torch.LongTensor([len(seq) for seq in targets])
    ntokens = lengths.sum().item()
    collated = data_utils.collate_tokens(targets, pad_idx=pad, left_pad=False)
    return collated, lengths, ntokens
def _collate_fn(self, items: List[List[torch.Tensor]]):
    """Turn one pre-batched ``(id, sequences)`` item into a batch dict.

    fairseq's own batching is not used here, so exactly one item is
    expected. The target is the input shifted one position along dim 1,
    with the target dictionary's pad appended as the last column.
    """
    assert len(items) == 1

    batch_id, sequences = items[0]
    # Collated batch has shape B x T (the last batch may be shorter than T).
    item = data_utils.collate_tokens(sequences, pad_idx=self.source_dictionary.pad())
    # Unpacking also requires the collated batch to be exactly 2-D.
    num_rows, _ = item.size()

    # shift item one position over and append a padding token for the target
    target = torch.nn.functional.pad(
        item[:, 1:], (0, 1, 0, 0), value=self.target_dictionary.pad()
    )

    # fairseq expects batches to have the following structure
    return {
        "id": torch.tensor([batch_id] * num_rows),
        "net_input": {
            "src_tokens": item,
        },
        "target": target,
        "nsentences": num_rows,
        "ntokens": item.numel(),
    }
def get_roberta_preds(claim, evidences):
    """Predict an MNLI label for ``claim`` paired with each evidence.

    Uses the module-level ``roberta`` model. The predictions are printed
    and returned.

    Returns:
        List of ``'contradiction'`` / ``'neutral'`` / ``'entailment'``
        strings, one per evidence.
    """
    label_names = {
        0: 'contradiction',
        1: 'neutral',
        2: 'entailment'
    }

    encoded_pairs = [roberta.encode(claim, evidence) for evidence in evidences]
    batch = collate_tokens(encoded_pairs, pad_idx=1)

    logprobs = roberta.predict('mnli', batch)
    preds = [label_names[i] for i in logprobs.argmax(dim=1).tolist()]
    print(preds)
    return preds
def extract_batch(self, sentence_string_batch):
    """Extract word-level RoBERTa features for a batch of sentences.

    Returns:
        Dict with three parallel lists (one entry per sentence):
        ``"word_features"``, ``"wordpieces_roberta"`` and
        ``"word2piece_scattered_indices"``.
    """
    src_wordpieces = []
    src_word2piece = []
    for sentence in sentence_string_batch:
        word2piece = get_wordpiece_to_word_map(sentence, self.roberta.bpe)
        # Encoded sentence, cut to at most 512 wordpieces.
        wordpieces = self.roberta.encode(sentence)[:512]
        src_wordpieces.append(copy.deepcopy(wordpieces))
        src_word2piece.append(copy.deepcopy(word2piece))

    collated = collate_tokens(src_wordpieces, pad_idx=1)
    batch_features = self.extract_features(collated).detach().cpu()

    word_features_out = []
    scattered_indices_out = []
    for row, (word2piece, wordpieces) in enumerate(zip(src_word2piece, src_wordpieces)):
        # Drop the first position and everything from the sentence's last
        # wordpiece onwards (which also removes batch padding).
        sentence_features = batch_features[row][1:len(wordpieces) - 1]
        word_features = get_average_embeddings(
            sentence_features.unsqueeze(0), word2piece)
        word_features_out.append(word_features[0])
        scattered_indices_out.append(
            get_scatter_indices(word2piece, reverse=True))

    return {
        "word_features": word_features_out,
        "wordpieces_roberta": list(src_wordpieces),
        "word2piece_scattered_indices": scattered_indices_out,
    }
def forward(self, src_tokens, src_lengths, prev_output_tokens):
    """Feed a decoder sub-layer output of the NMT model to the prober.

    Args:
        src_tokens: passed to the NMT encoder.
        src_lengths: passed to the NMT encoder.
        prev_output_tokens: pair ``(prev_output_tokens, target)``.

    Returns:
        The output of ``self.prober``.
    """
    args = self.args
    all_sub_layers = (args.prob_self_attn or args.prob_ed_attn
                      or args.prob_ed_norm or args.prob_ffn)
    prev_output_tokens, target = prev_output_tokens

    # The NMT model is only run for its activations: eval mode, no grads.
    self.nmt_model.eval()
    with torch.no_grad():
        encoder_out = self.nmt_model.encoder(src_tokens, src_lengths)
        decoder_out = self.nmt_model.decoder(
            prev_output_tokens, encoder_out, all_sub_layers=all_sub_layers)

    # prob sub_layers: the first enabled flag wins; otherwise fall back to
    # the inner states (indexed by prob_layer, not prob_layer - 1).
    extras = decoder_out[1]
    sub_layer_keys = (
        ('prob_self_attn', 'self_attn'),
        ('prob_ed_attn', 'ed_attn'),
        ('prob_ed_norm', 'ed_norm'),
        ('prob_ffn', 'ffn'),
    )
    for flag, key in sub_layer_keys:
        if getattr(args, flag):
            nmt_out = extras[key][args.prob_layer - 1]
            break
    else:
        nmt_out = extras['inner_states'][args.prob_layer]

    # Positions equal to index 1 are treated as padding.
    encoder_padding_mask = prev_output_tokens.eq(1)
    if not encoder_padding_mask.any():
        encoder_padding_mask = None

    prob_input = {
        'encoder_out': nmt_out,
        'encoder_padding_mask': encoder_padding_mask,
    }
    # Strip index-1 entries from each target row, then re-collate.
    unpadded_targets = [s[s.ne(1)] for s in target]
    prob_prev_output_tokens = collate_tokens(unpadded_targets, 1, 2, False, True)
    return self.prober(prob_prev_output_tokens, prob_input)
def get_MRC_answer(fname, context, all_qas, ori_qas, options):
    """Answer every multiple-choice question against ``context``.

    For each question, all candidate "question + option" strings are scored
    with the module-level ``model``; the highest-scoring candidate is chosen
    and its attention over words is extracted. The collected results are
    handed to ``extract_word_file``.

    Args:
        fname: file name forwarded to ``extract_word_file`` and used in the
            response.
        context: passage text.
        all_qas: list of candidate lists, one list per question.
        ori_qas: forwarded to ``extract_word_file``.
        options: unused in this function.

    Returns:
        Two-element list: the concatenated answer letters (``'A'``,
        ``'B'``, ...) prefixed with the answer label, and ``prefix + fname``.
    """
    ans = ""
    answers = []
    confidents = []
    docs = []
    weights = []
    for qas in all_qas:
        ts = [model.encode(qa, context) for qa in qas]
        batch = collate_tokens(ts, pad_idx=1)

        logits, last_attn = model.predict(
            'sentence_classification_head', batch, return_logits=True)
        # Explicit dim: relying on softmax's implicit dimension choice is
        # deprecated. dim=-1 matches the implicit choice for 0-D, 1-D and
        # 2-D inputs.
        logits = torch.nn.functional.softmax(logits.squeeze(), dim=-1)
        print(last_attn.shape)
        logits = np.asarray(logits.tolist()).flatten()
        print(logits)

        answer = np.argmax(logits)
        confident = logits[answer]

        attn_row = last_attn[answer, 0, :]
        print(torch.max(attn_row))
        print(torch.sum(attn_row))
        toks, attns = model.extract_attention_to_words(
            qas[answer], context, attn_row.squeeze())
        attns = attns.tolist()

        letter = chr(ord('A') + answer)
        ans += letter
        answers.append(letter)
        confidents.append(confident)
        docs.append(toks)
        weights.append(attns)

    extract_word_file(docs, weights, ori_qas, answers, confidents, fname)
    response = ['答案:' + ans, '{}{}'.format(prefix, fname)]
    return response
def collater(self, samples):
    """Zero-pad feature sequences into one batch with a padding mask.

    Args:
        samples: list of dicts with ``"id"`` and ``"features"`` (and
            ``"target"`` when ``self.labels`` is non-empty).

    Returns:
        ``{}`` for an empty ``samples``; otherwise a dict with ``"id"``,
        ``"net_input"`` (``"features"`` and ``"padding_mask"``, where
        ``True`` marks padded positions) and, if labels are present,
        ``"target"``.
    """
    if len(samples) == 0:
        return {}

    features = [s["features"] for s in samples]
    sizes = [len(f) for f in features]
    longest = max(sizes)

    template = features[0]
    collated_features = template.new_zeros(
        len(features), longest, template.size(-1))
    padding_mask = torch.BoolTensor(collated_features.shape[:-1]).fill_(False)
    for row, (feat, size) in enumerate(zip(features, sizes)):
        collated_features[row, :size] = feat
        padding_mask[row, size:] = True

    res = {
        "id": torch.LongTensor([s["id"] for s in samples]),
        "net_input": {
            "features": collated_features,
            "padding_mask": padding_mask
        },
    }

    if len(self.labels) > 0:
        res["target"] = data_utils.collate_tokens(
            [s["target"] for s in samples],
            pad_idx=self.label_dict.pad(),
            left_pad=False,
        )

    return res
def test_bart_large_mnli(self):
    """Check single and batched MNLI predictions of ``bart.large.mnli``."""
    with contextlib.redirect_stdout(StringIO()):
        # Download BART already finetuned for MNLI
        bart = fb_hub.load('bart.large.mnli')
        bart.eval()  # disable dropout for evaluation

        contradiction_pair = [
            'BART is a seq2seq model.',
            'BART is not sequence to sequence.'
        ]
        entailment_pair = [
            'BART is denoising autoencoder.',
            'BART is version of autoencoder.'
        ]

        # Encode a pair of sentences and make a prediction
        tokens = bart.encode(contradiction_pair[0], contradiction_pair[1])
        prediction = bart.predict('mnli', tokens).argmax().item()
        self.assertEqual(prediction, 0)  # contradiction

        # Encode another pair of sentences
        tokens = bart.encode(entailment_pair[0], entailment_pair[1])
        prediction = bart.predict('mnli', tokens).argmax().item()
        self.assertEqual(prediction, 2)  # entailment

        # Test batched prediction
        from fairseq.data.data_utils import collate_tokens
        batch_of_pairs = [contradiction_pair, entailment_pair]
        batch = collate_tokens(
            [bart.encode(pair[0], pair[1]) for pair in batch_of_pairs],
            pad_idx=1)
        logprobs = bart.predict('mnli', batch)
        self.assertEqual(logprobs.argmax(dim=1).tolist(), [0, 2])
def eval_one_example():
    """Score one hard-coded multiple-choice example and print the timing.

    Each "question + option" string is encoded together with the passage
    using the module-level ``roberta`` model; the flattened logits and the
    elapsed time are printed. Nothing is returned or asserted.
    """
    context = 'The Sunset Pasta Cruise to Emerald Bay Saturday evening, September 25, 2010 You will cruise to Emerald Bay at Sunset, one of the most beautiful places in the world while dining on a Pasta Buffet and listening to live light dance music. Buses will pick up Sunset Pasta Cruise diners from the main entrance to the Horizon Casino Resort at: 4:40pm and 5:05pm on Saturday and take you the 1.5 miles to Ski Run Marina for boarding. Boarding is at Ski Run Marina at 5:15 p.m. (with departure at 5:45 p.m.), located in South Lake Tahoe. The cost for the cruise, pasta buffet, live music, and the 2.5-hour cruise to Emerald Bay is $55 (normally $75). The cost for children between 3-11 is $41 and under 3 is free. Must register the under 3 as well for the coast guard count. The Sunset Pasta Cruise will be limited to 200 guests. Large parties will be seated first to insure seating together. Pick up your Sunset Pasta Cruise tickets at the Expo at the Horizon Casino Resort before 3 p.m. on Saturday. Those unclaimed will be sold to those on the waiting list at that time. At approximately 5:45 pm any extra spaces will be sold to passengers on the dock. Children who require a seat must have a ticket as well. Closest lodging to the Pasta Cruise is: Super 8, Lakeland Village. Please note that our sponsor , the Riva Grill, is on the Lake close to the boarding area for the Tahoe Queen. A great gathering place to meet or to have dinner. Call Riva Grill (530) 542-2600 for lunch or dinner reservations while you are visiting Lake Tahoe.'
    qas = [
        'When will the cruise to Emerald Bay end? At about 7:00 pm.',
        'When will the cruise to Emerald Bay end? At about 8:20 pm.',
        'When will the cruise to Emerald Bay end? At about 9:20 pm.',
        'When will the cruise to Emerald Bay end? On Sunday morning.'
    ]

    started = time.time()
    # Index of the expected option; recorded for reference, not asserted.
    ans = 1

    encoded = [roberta.encode(qa, context) for qa in qas]
    batch = collate_tokens(encoded, pad_idx=1)
    raw_logits = roberta.predict(
        'sentence_classification_head', batch, return_logits=True).tolist()
    logits = np.asarray(raw_logits).flatten()
    print(logits)

    finished = time.time()
    print("Time cost: {}s".format(finished - started))
def merge(key, left_pad, move_eos_to_beginning=False, pad_to_length=None):
    """Collate field ``key`` of every sample into one padded batch.

    Reads ``samples``, ``pad_idx``, ``eos_idx`` and ``pad_to_multiple`` from
    the enclosing scope; the remaining arguments are forwarded to
    ``data_utils.collate_tokens`` unchanged.
    """
    values = [s[key] for s in samples]
    return data_utils.collate_tokens(
        values,
        pad_idx,
        eos_idx,
        left_pad,
        move_eos_to_beginning,
        pad_to_length=pad_to_length,
        pad_to_multiple=pad_to_multiple,
    )
def processTextInput(self, text):
    """Generate source tokens from text input.

    The text passes through the optional pre-tokenizer and BPE tokenizer,
    is encoded with the source dictionary (EOS appended, no new symbols
    added) and wrapped as a batch of one.

    Returns:
        Sample dict with a ``"net_input"`` entry; moved to CUDA when
        ``self.use_cuda`` is set.
    """
    for tokenizer in (self.pre_tokenizer, self.bpe_tokenizer):
        if tokenizer is not None:
            text = tokenizer.encode(text)

    token_ids = self.src_dict.encode_line(
        text, add_if_not_exist=False, append_eos=True
    ).long()
    src_tokens = fairseq_data_utils.collate_tokens(
        [token_ids],
        self.src_dict.pad(),
        self.src_dict.eos(),
        left_pad=False,
        move_eos_to_beginning=False,
    )

    sample = {
        "net_input": {
            "src_tokens": src_tokens,
            "src_lengths": torch.tensor([src_tokens.size(1)], dtype=torch.long),
            "prev_output_tokens": None,
        }
    }
    if self.use_cuda:
        return utils.move_to_cuda(sample)
    return sample
def collater(self, samples, input_shapes=None):
    """Collate ``samples`` with this dataset's pad index and padding side.

    ``input_shapes`` is forwarded to ``data_utils.collate_tokens`` unchanged.
    """
    batch = data_utils.collate_tokens(
        samples,
        self.pad_idx,
        left_pad=self.left_pad,
        input_shapes=input_shapes,
    )
    return batch
def collater(self, samples: List[SpeechToTextJointDatasetItem]) -> Dict:
    """Extend the parent batch with source-text tokens and language tags.

    The parent collater is asked for its sort order so that the additional
    tensors can be arranged the same way.

    Returns:
        ``{}`` when the parent returns an empty batch, otherwise the batch
        dict (without the ``"order"`` entry).
    """
    s2t_out = super().collater(samples, return_order=True)
    if s2t_out == {}:
        return s2t_out

    net_input = s2t_out["net_input"]
    order = s2t_out["order"]

    if self.src_texts is not None and self.src_dict is not None:
        text_tokens = [x.src_txt_tokens for x in samples]
        collated_text = fairseq_data_utils.collate_tokens(
            text_tokens,
            self.src_dict.pad(),
            self.src_dict.eos(),
            left_pad=False,
            move_eos_to_beginning=False,
        )
        text_lengths = torch.tensor(
            [t.size()[0] for t in text_tokens], dtype=torch.long
        )
        net_input["src_txt_tokens"] = collated_text.index_select(0, order)
        net_input["src_txt_lengths"] = text_lengths.index_select(0, order)

    if self.tgt_texts is not None and samples[0].tgt_lang_tag is not None:
        # Row i of the sorted batch comes from samples[order[i]]; its first
        # decoder input token is overwritten with that sample's language tag.
        for row, sample_idx in enumerate(order.tolist()):
            net_input["prev_output_tokens"][row][0] = samples[sample_idx].tgt_lang_tag

    return {
        "id": s2t_out["id"],
        "net_input": net_input,
        "target": s2t_out["target"],
        "target_lengths": s2t_out["target_lengths"],
        "ntokens": s2t_out["ntokens"],
        "nsentences": len(samples),
    }