def greedy_predict(self, entries, wombat_object=None, maxlen=2000):
    nl = []
    wd_tokens = []
    for entry in entries:
        wd_tokens.append(entry["question_arg"])
        nl.append(self.source2idx(entry["question_arg"]))

    self.seq2seq.eval()
    with torch.no_grad():
        nl_pad_ids, nl_lens = seqPAD.pad_sequences(nl, pad_tok=self.pad_id, nlevels=1)
        nl_tensor = Data2tensor.idx2tensor(nl_pad_ids, dtype=torch.long, device=self.device)
        nl_len_tensor = Data2tensor.idx2tensor(nl_lens, dtype=torch.long, device=self.device)
        # wombat_tensor = [batch, nl_len, emb_dim]
        wombat_tensor = torch.zeros(nl_tensor.shape + (self.args.swd_dim,), dtype=torch.float32, device=self.device)
        wombat_idx = (nl_tensor == self.unk_id).nonzero()
        if wombat_object is not None:
            for i, j in wombat_idx.tolist():
                # look up the raw word at batch position i, token position j
                wombat_emb = wombat_object.get(wd_tokens[i][j])
                if wombat_emb is not None:
                    wombat_tensor[i, j] = torch.from_numpy(wombat_emb)
        pred_outputs, acc_prob = self.seq2seq.greedy_predict(nl_tensor, nl_len_tensor,
                                                             maxlen=maxlen, wombat_tensor=wombat_tensor)
        if self.args.tokenize_type != "bpe":
            predict_words = self.tokenizer.decode_batch(pred_outputs.tolist(), self.tokenizer.i2tw, 2)
            predict_words = [words if EOT not in words else words[: words.index(EOT) + 1]
                             for words in predict_words]
        else:
            predict_words = self.tokenizer.decode_batch(pred_outputs.tolist())
            predict_words = [words[0: words.find(EOT)].split() for words in predict_words]
        # predict_prob = acc_prob.prod(dim=-1).tolist()
        predict_prob = acc_prob.squeeze().tolist()
    for i, entry in enumerate(entries):
        entry['model_result'] = " ".join(predict_words[i])
        entry['pred_prob'] = predict_prob[i]
    return entries

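# The loop above patches embeddings for out-of-vocabulary tokens: it finds every
# position whose id equals unk_id and, if an external lexicon ("wombat_object")
# supplies a vector for the raw word at that position, writes it into
# wombat_tensor. A minimal, self-contained sketch of that pattern follows; the toy
# ids and the plain dict standing in for the Wombat lexicon are assumptions made
# purely for illustration.
import numpy as np
import torch

unk_id = 1
toy_tokens = [["the", "quokka", "sleeps"]]                    # raw words, batch of 1
toy_ids = torch.tensor([[5, unk_id, 7]], dtype=torch.long)    # "quokka" mapped to UNK
emb_dim = 4
oov_lexicon = {"quokka": np.ones(emb_dim, dtype=np.float32)}  # stand-in for wombat_object

oov_tensor = torch.zeros(toy_ids.shape + (emb_dim,), dtype=torch.float32)
for i, j in (toy_ids == unk_id).nonzero().tolist():
    vec = oov_lexicon.get(toy_tokens[i][j])
    if vec is not None:
        oov_tensor[i, j] = torch.from_numpy(vec)
# oov_tensor[0, 1] now holds the lexicon vector; every other position stays zero.
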
def predict_batch(self, entries, wombat_object=None):
    nl = []
    wd_tokens = []
    for entry in entries:
        input_tokens = entry["input_tokens"]
        ids = self.source2idx(input_tokens)
        nl.append(ids)
        if self.args.tokenize_type != "bpe":
            entry['input_list'] = self.tokenizer.process_nl(input_tokens)
        else:
            entry['input_list'] = self.tokenizer.encode(input_tokens, add_special_tokens=False).tokens
        wd_tokens.append(entry['input_list'])

    self.classifier.eval()
    with torch.no_grad():
        nl_pad_ids, nl_lens = seqPAD.pad_sequences(nl, pad_tok=self.pad_id, nlevels=1)
        nl_tensor = Data2tensor.idx2tensor(nl_pad_ids, dtype=torch.long, device=self.device)
        nl_len_tensor = Data2tensor.idx2tensor(nl_lens, dtype=torch.long, device=self.device)
        # wombat_tensor = [batch, nl_len, emb_dim]
        wombat_tensor = torch.zeros(nl_tensor.shape + (self.args.swd_dim,), dtype=torch.float32, device=self.device)
        wombat_idx = (nl_tensor == self.unk_id).nonzero()
        if wombat_object is not None:
            for i, j in wombat_idx.tolist():
                word_to_lookup = wd_tokens[i][j]
                print('Looking up Wombat for:', word_to_lookup)
                wombat_emb = wombat_object.get(word_to_lookup)
                if wombat_emb is not None:
                    print('Found Wombat embedding for:', word_to_lookup)
                    wombat_tensor[i, j] = torch.from_numpy(wombat_emb)
        de_score = self.classifier(nl_tensor, nl_len_tensor, wombat_tensor=wombat_tensor)
        label_mask = nl_tensor > 0
        output_prob, output_idx = self.classifier.inference(de_score)
        # output_idx = de_score.max(-1)[1]
        predict_words = Tokenizer.decode_batch(output_idx.squeeze(-1).tolist(), self.tokenizer.i2tw, 1)
        # predict_prob = acc_prob.prod(dim=-1).tolist()
        predict_prob = output_prob.squeeze(-1).tolist()
    for i, entry in enumerate(entries):
        # entry["pred_pair"] = list(zip(entry["input_review"], predict_words[i]))
        entry['pred_sequence'] = predict_words[i]
        entry['prob_sequence'] = predict_prob[i]
    return entries

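# classifier.inference above returns, per position, the best label id together with
# a score for it; the commented-out alternative takes an argmax over the raw scores,
# and applying a softmax first also yields a probability for each choice. A
# self-contained sketch of that pattern (the random scores are stand-ins for
# illustration only, not the project's actual inference code):
import torch

de_score = torch.randn(2, 5, 3)              # [batch, seq_len, num_labels]
probs = torch.softmax(de_score, dim=-1)
output_prob, output_idx = probs.max(dim=-1)  # both [batch, seq_len]
# output_idx.tolist() can then be mapped back to label strings via an i2tw table.
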
def beam_predict(self, entries, bw=2, topk=2, wombat_object=None, maxlen=2000):
    nl = []
    wd_tokens = []
    for entry in entries:
        wd_tokens.append(entry["question_arg"])
        nl.append(self.source2idx(entry["question_arg"]))

    self.seq2seq.eval()
    with torch.no_grad():
        nl_pad_ids, nl_lens = seqPAD.pad_sequences(nl, pad_tok=self.pad_id, nlevels=1)
        nl_tensor = Data2tensor.idx2tensor(nl_pad_ids, dtype=torch.long, device=self.device)
        nl_len_tensor = Data2tensor.idx2tensor(nl_lens, dtype=torch.long, device=self.device)
        # wombat_tensor = [batch, nl_len, emb_dim]
        wombat_tensor = torch.zeros(nl_tensor.shape + (self.args.swd_dim,), dtype=torch.float32, device=self.device)
        wombat_idx = (nl_tensor == self.unk_id).nonzero()
        if wombat_object is not None:
            for i, j in wombat_idx.tolist():
                # look up the raw word at batch position i, token position j
                wombat_emb = wombat_object.get(wd_tokens[i][j])
                if wombat_emb is not None:
                    wombat_tensor[i, j] = torch.from_numpy(wombat_emb)
        pred_outputs, predict_prob = self.seq2seq.beam_predict(nl_tensor, nl_len_tensor, minlen=1, maxlen=maxlen,
                                                               bw=bw, n_best=topk, wombat_tensor=wombat_tensor)
        if self.args.tokenize_type != "bpe":
            predict_words = self.tokenizer.decode_batch(pred_outputs, self.tokenizer.i2tw, 3)
            predict_words = [words if EOT not in words else words[: words.index(EOT) + 1]
                             for words in predict_words]
            predict_words = [[" ".join(words) for words in topk_outputs] for topk_outputs in predict_words]
        else:
            predict_words = [self.tokenizer.decode_batch(topk_outputs) for topk_outputs in pred_outputs]
            predict_words = [[words[0: words.find(EOT)] for words in topk_outputs]
                             for topk_outputs in predict_words]
    for i, entry in enumerate(entries):
        entry['model_result'] = predict_words[i][0]
        entry['pred_prob'] = predict_prob[i][0]
        entry['decoded_batch'] = list(zip(predict_words[i], predict_prob[i]))
    return entries

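# Both greedy_predict and beam_predict cut their decoded output at the end-of-target
# marker EOT: token lists are truncated right after the marker, while BPE strings are
# truncated right before it. A self-contained sketch of those two truncations; the
# "<eot>" string below is a made-up stand-in for the real EOT constant.
EOT_MARK = "<eot>"

tokens = ["select", "name", EOT_MARK, "pad", "pad"]
tokens = tokens if EOT_MARK not in tokens else tokens[: tokens.index(EOT_MARK) + 1]
# -> ["select", "name", "<eot>"]

text = "select name " + EOT_MARK + " pad pad"
words = text[0: text.find(EOT_MARK)].split()
# -> ["select", "name"]
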
def predict_batch(self, entries, wombat_object=None, return_probability=False):
    nl = []
    wd_tokens = []
    for entry in entries:
        input_tokens = entry["input_tokens"]
        ids = self.source2idx(input_tokens)
        nl.append(ids)
        if self.args.tokenize_type != "bpe":
            entry['input_list'] = self.tokenizer.process_nl(input_tokens)
        else:
            entry['input_list'] = self.tokenizer.encode(input_tokens, add_special_tokens=False).tokens
        wd_tokens.append(entry['input_list'])

    self.labeler.eval()
    with torch.no_grad():
        nl_pad_ids, nl_lens = seqPAD.pad_sequences(nl, pad_tok=self.pad_id, nlevels=1)
        nl_tensor = Data2tensor.idx2tensor(nl_pad_ids, dtype=torch.long, device=self.device)
        nl_len_tensor = Data2tensor.idx2tensor(nl_lens, dtype=torch.long, device=self.device)
        # wombat_tensor = [batch, nl_len, emb_dim]
        wombat_tensor = torch.zeros(nl_tensor.shape + (self.args.swd_dim,), dtype=torch.float32, device=self.device)
        wombat_idx = (nl_tensor == self.unk_id).nonzero()
        if wombat_object is not None:
            for i, j in wombat_idx.tolist():
                word_to_lookup = wd_tokens[i][j]
                print('Looking up Wombat for:', word_to_lookup)
                wombat_emb = wombat_object.get(word_to_lookup)
                if wombat_emb is not None:
                    print('Found Wombat embedding for:', word_to_lookup)
                    wombat_tensor[i, j] = torch.from_numpy(wombat_emb)
        de_score = self.labeler(nl_tensor, nl_len_tensor, wombat_tensor=wombat_tensor)
        label_mask = nl_tensor > 0

        if return_probability is False:
            output_prob, output_idx = self.labeler.inference(de_score, label_mask)
            if self.args.use_crf:
                predict_words = Tokenizer.decode_batch(output_idx, self.tokenizer.i2tw, 2)
                # predict_words = [words[:length] for words, length in zip(predict_words, label_mask.sum(dim=1).tolist())]
                predict_prob = list(output_prob)
            else:
                # output_idx = de_score.max(-1)[1]
                predict_words = Tokenizer.decode_batch(output_idx.squeeze(-1).tolist(), self.tokenizer.i2tw, 2)
                # trim every padded sequence back to its true (unpadded) length
                predict_words = [words[:length]
                                 for words, length in zip(predict_words, label_mask.sum(dim=1).tolist())]
                # predict_prob = acc_prob.prod(dim=-1).tolist()
                predict_prob = [probs[:length]
                                for probs, length in zip(output_prob.squeeze(-1).tolist(),
                                                         label_mask.sum(dim=1).tolist())]
            for i, entry in enumerate(entries):
                # entry["pred_pair"] = list(zip(entry["input_review"], predict_words[i]))
                entry['pred_sequence'] = predict_words[i]
                entry['prob_sequence'] = predict_prob[i]
                entities_list = NER_metrics.absa_extractor(entry["input_list"], predict_words[i],
                                                           None if self.args.use_crf else predict_prob[i])
                entry["entities"] = []
                if len(entities_list) > 0:
                    for entity, senti, _, prob in entities_list:
                        # entry["entities"].append((entity, senti, prob))
                        entry["entities"].append({"aspect": entity,
                                                  "polarity": senti,
                                                  "probability": prob})
            return entries
        else:
            label_prob = torch.softmax(de_score.squeeze(), dim=-1)
            return [{self.tokenizer.i2tw[ind]: prob for ind, prob in enumerate(prob_i)}
                    for prob_i in label_prob.tolist()]

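# predict_batch above relies on label_mask (the non-pad positions of nl_tensor) to
# cut every padded prediction back to its true length before extracting entities.
# A small self-contained sketch of that trick; the ids and BIO labels below are
# made up purely for illustration:
import torch

pad_id = 0
ids = torch.tensor([[4, 9, 3, 0, 0],
                    [7, 2, 0, 0, 0]])                  # two padded id sequences
preds = [["B-ASP", "I-ASP", "O", "O", "O"],
         ["O", "B-ASP", "O", "O", "O"]]                # decoded labels, still padded
mask = ids > pad_id                                    # True on real tokens
lengths = mask.sum(dim=1).tolist()                     # [3, 2]
trimmed = [seq[:n] for seq, n in zip(preds, lengths)]
# trimmed == [["B-ASP", "I-ASP", "O"], ["O", "B-ASP"]]
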
tg2ids = vocab.lst2idx(vocab_words=vocab.tw2i, unk_words=False, sos=True, eos=True)
train_data = JSON(filename, source2idx=nl2ids, target2idx=tg2ids)
# train_data = Csvfile(filename)

data_idx = []
batch = 8
for d in Vocab.minibatches(train_data, batch):
    data_idx.append(d)
    nl, target = list(zip(*d))
    nl_pad_ids, nl_lens = seqPAD.pad_sequences(nl, pad_tok=vocab.sw2i[PAD], nlevels=1)
    nl_tensor = Data2tensor.idx2tensor(nl_pad_ids, dtype=torch.long, device=device)
    nl_len_tensor = Data2tensor.idx2tensor(nl_lens, dtype=torch.long, device=device)
    lb_pad_ids, lb_lens = seqPAD.pad_sequences(target, pad_tok=vocab.tw2i[PAD], nlevels=1)
    lb_tensor = Data2tensor.idx2tensor(lb_pad_ids, dtype=torch.long, device=device)
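# The loop above leans on seqPAD.pad_sequences and Data2tensor.idx2tensor to turn a
# ragged minibatch of id lists into a [batch, max_len] tensor plus a length vector.
# Below is a self-contained approximation of that step in plain PyTorch; pad_to_tensor
# is a sketch written for illustration, not the project's own helper.
import torch

def pad_to_tensor(seqs, pad_id, device="cpu"):
    # pad every sequence to the longest one and keep the original lengths
    lengths = [len(s) for s in seqs]
    max_len = max(lengths)
    padded = [s + [pad_id] * (max_len - len(s)) for s in seqs]
    return (torch.tensor(padded, dtype=torch.long, device=device),
            torch.tensor(lengths, dtype=torch.long, device=device))

batch_ids = [[2, 15, 8, 3], [2, 44, 3]]   # e.g. <s> ... </s> id sequences
nl_tensor, nl_len_tensor = pad_to_tensor(batch_ids, pad_id=0)
# nl_tensor     -> tensor([[ 2, 15,  8,  3], [ 2, 44,  3,  0]])
# nl_len_tensor -> tensor([4, 3])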