Example #1
    def predict_batch(self, entries, wombat_object=None):
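        """Run the classifier over raw entries: pad token ids, patch UNK
        positions with Wombat embeddings when available, and attach the
        predicted label sequence and its probabilities to each entry."""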
        nl = []
        wd_tokens = []
        for entry in entries:
            input_tokens = entry["input_tokens"]
            ids = self.source2idx(input_tokens)
            nl.append(ids)
            if self.args.tokenize_type != "bpe":
                entry['input_list'] = self.tokenizer.process_nl(input_tokens)
            else:
                entry['input_list'] = self.tokenizer.encode(
                    input_tokens, add_special_tokens=False).tokens
            wd_tokens.append(entry['input_list'])
        self.classifier.eval()
        with torch.no_grad():
            nl_pad_ids, nl_lens = seqPAD.pad_sequences(nl,
                                                       pad_tok=self.pad_id,
                                                       nlevels=1)
            nl_tensor = Data2tensor.idx2tensor(nl_pad_ids,
                                               dtype=torch.long,
                                               device=self.device)
            nl_len_tensor = Data2tensor.idx2tensor(nl_lens,
                                                   dtype=torch.long,
                                                   device=self.device)
            # wombat_tensor = [batch, nl_len, emb_dim]
            wombat_tensor = torch.zeros(nl_tensor.shape +
                                        (self.args.swd_dim, ),
                                        dtype=torch.float32,
                                        device=self.device)
            wombat_idx = (nl_tensor == self.unk_id).nonzero()
            if wombat_object is not None:
                for i, j in wombat_idx.tolist():
                    word_to_lookup = wd_tokens[i][j]
                    print('Looking up Wombat for:', word_to_lookup)
                    wombat_emb = wombat_object.get(word_to_lookup)
                    if wombat_emb is not None:
                        print('Found Wombat embedding for:', word_to_lookup)
                        wombat_tensor[i, j] = torch.from_numpy(wombat_emb)
            de_score = self.classifier(nl_tensor,
                                       nl_len_tensor,
                                       wombat_tensor=wombat_tensor)
            label_mask = nl_tensor > 0
            output_prob, output_idx = self.classifier.inference(de_score)
            # output_idx = de_score.max(-1)[1]
            predict_words = Tokenizer.decode_batch(
                output_idx.squeeze(-1).tolist(), self.tokenizer.i2tw, 1)
            # predict_prob = acc_prob.prod(dim=-1).tolist()
            predict_prob = output_prob.squeeze(-1).tolist()

        for i, entry in enumerate(entries):
            # entry["pred_pair"] = list(zip(entry["input_review"], predict_words[i]))
            entry['pred_sequence'] = predict_words[i]
            entry['prob_sequence'] = predict_prob[i]
        return entries
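The UNK-patching step above can be shown in isolation. A minimal, self-contained sketch (the ids, dimensions, and replacement vector below are made up for illustration):

import torch

unk_id, emb_dim = 1, 4
nl_tensor = torch.tensor([[5, 1, 7], [1, 2, 3]])           # token ids; 1 == UNK
# One zero vector per token position: [batch, seq_len, emb_dim]
wombat_tensor = torch.zeros(nl_tensor.shape + (emb_dim,))
# nonzero() yields the (row, column) coordinates of every UNK position
for i, j in (nl_tensor == unk_id).nonzero().tolist():
    wombat_tensor[i, j] = torch.ones(emb_dim)              # stand-in for a looked-up embedding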
Example #2
    def inference(self, rv_text):
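        """Generate continuations for a review text with the wrapped language
        model and return (prompt, generated_text, probability) triples."""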
        prompt_text = LM.prepare_entry(rv_text)
        encoded_prompt = self.input2tensor(prompt_text)

        length = (self.args.length if self.lm.args.model_type == "t5"
                  else self.args.length + len(encoded_prompt[0]))
        output_sequences, probs = self.lm.model.generate(
            input_ids=encoded_prompt,
            max_length=length,
            temperature=self.args.temperature,
            top_k=self.args.k,
            top_p=self.args.p,
            repetition_penalty=self.args.repetition_penalty,
            num_beams=self.args.num_beams,
            do_sample=self.args.do_sample,
            num_return_sequences=self.args.num_return_sequences,
            bos_token_id=self.bos_token_id,
            # pad_token_id=self.pad_token_id,
            eos_token_id=self.eos_token_id,
        )

        # Remove the batch dimension when returning multiple sequences
        if len(output_sequences.shape) > 2:
            output_sequences.squeeze_()

        generated_sequences = []

        for generated_sequence_idx, generated_sequence in enumerate(output_sequences):
            # print("=== GENERATED SEQUENCE {} ===".format(generated_sequence_idx + 1))
            generated_sequence = generated_sequence.tolist()

            # Decode text
            text = Tokenizer.decode_batch(generated_sequence, self.lm.tokenizer.i2tw, level=1)
            text = " ".join(text)
            # text = self.lm.tokenizer.decode(generated_sequence, clean_up_tokenization_spaces=True,
            #                                 skip_special_tokens=True)

            # Remove all text after the stop token
            # gen_text = text[: text.find(self.args.stop_token) if self.args.stop_token else None]
            eos = self.lm.tokenizer.eos_token
            cut = text.find(eos)
            gen_text = text[: cut + len(eos) if cut != -1 else None]

            if self.lm.args.model_type != "t5":
                gen_text = gen_text[len(self.lm.tokenizer.decode(encoded_prompt[0],
                                                                 clean_up_tokenization_spaces=True,
                                                                 skip_special_tokens=True)):]
            # Add the prompt at the beginning of the sequence. Remove the excess text that was used for pre-processing
            total_sequence = (prompt_text, gen_text, probs[generated_sequence_idx])
            generated_sequences.append(total_sequence)
            # print("".join(total_sequence))
        return generated_sequences, probs
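The stop-token truncation above follows a common slicing idiom: keep the text through the first eos occurrence, or the whole string when none was generated. A self-contained sketch with a made-up eos marker:

# Slicing with an upper bound of None keeps the full string when no marker is found.
eos = "</s>"
for text in ["great phone </s> trailing junk", "no marker here"]:
    cut = text.find(eos)
    print(repr(text[: cut + len(eos) if cut != -1 else None]))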
Example #3
    def evaluate_batch(self, eva_data, num_eva):
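        """Evaluate the labeler on an evaluation stream: accumulate NLL loss,
        decode reference and predicted label sequences, and return the average
        loss, classification metrics, and decoding speed (tokens/second)."""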
        start = time.time()
        self.labeler.eval()
        nl_tokens = []
        reference = []
        candidate = []
        predict_probs = []
        dev_loss = []
        total_tokens = 0
        eva_iterdataset = IterDataset(
            eva_data,
            source2idx=self.source2idx,
            target2idx=self.target2idx,
            num_lines=num_eva,
            bpe=self.args.tokenize_type == "bpe")
        eva_dataloader = DataLoader(eva_iterdataset,
                                    pin_memory=True,
                                    batch_size=self.args.batch_size,
                                    collate_fn=self.collate_fn)
        with torch.no_grad():
            for i, d in enumerate(eva_dataloader):
                # nl, target = list(zip(*d))
                d = tuple(t.to(self.device) for t in d)
                nl_tensor, lb_tensor = d
                nl_len_tensor = (nl_tensor != self.pad_id).sum(dim=1)

                de_score = self.labeler(nl_tensor, nl_len_tensor)
                label_mask = nl_tensor != self.pad_id
                # TODO: can move NLL into seq2seq for multigpu
                total_loss = self.labeler.NLL_loss(de_score, lb_tensor,
                                                   label_mask)

                dev_loss.append(total_loss.item())
                total_tokens += label_mask.sum().item()
                output_prob, output_idx = self.labeler.inference(
                    de_score, label_mask)

                label_words = Tokenizer.decode_batch(lb_tensor.tolist(),
                                                     self.tokenizer.i2tw, 2)
                label_words = [
                    words[:i]
                    for words, i in zip(label_words,
                                        label_mask.sum(dim=1).tolist())
                ]
                # reference = [[w1, ..., EOT], ..., [w1, ..., EOT]]
                reference.extend(label_words)

                if self.args.use_crf:
                    predict_words = Tokenizer.decode_batch(
                        output_idx, self.tokenizer.i2tw, 2)
                    # predict_words = [words[:i] for words, i in zip(predict_words, label_mask.sum(dim=1).tolist())]
                    predict_probs += output_prob
                else:
                    # output_idx = de_score.max(-1)[1]
                    predict_words = Tokenizer.decode_batch(
                        output_idx.squeeze(-1).tolist(), self.tokenizer.i2tw,
                        2)
                    predict_words = [
                        words[:i]
                        for words, i in zip(predict_words,
                                            label_mask.sum(dim=1).tolist())
                    ]
                    # predict_prob = acc_prob.prod(dim=-1).tolist()
                    predict_probs += [
                        words[:i] for words, i in zip(
                            output_prob.squeeze(-1).tolist(),
                            label_mask.sum(dim=1).tolist())
                    ]
                # if sum([len(k) for k in predict_words]) != 0:
                # candidate = [[w1, ..., EOT], ..., [w1, ..., EOT]]
                candidate.extend(predict_words)

                if self.args.tokenize_type != "bpe":
                    nl_token = self.tokenizer.decode_batch(
                        nl_tensor.tolist(), self.tokenizer.i2sw, 2)
                    nl_token = [
                        words[:i]
                        if EOT not in words else words[:words.index(EOT)]
                        for words, i in zip(nl_token, (nl_tensor > 0).sum(
                            dim=1).tolist())
                    ]
                else:
                    nl_token = self.tokenizer.decode_batch(nl_tensor.tolist())
                    # nl_token = [enc_words.tokens for enc_words in self.tokenizer.encode_batch(nl_token)]
                    nl_token = [
                        (words[:words.find(EOT)] if EOT in words else words).split()
                        for words in nl_token
                    ]
                nl_tokens.extend(nl_token)
                del nl_tensor, nl_len_tensor, lb_tensor, de_score, label_mask
                # gc.collect()
                # torch.cuda.empty_cache()

        if len(candidate) != 0 and len(reference) != 0:
            assert len(candidate) == len(reference)
            # Randomly sample one pair
            rand_idx = random.randint(0, len(reference) - 1)
            print("\nRANDOMLY sampling: ")
            print("\t- An Input Sequence: ", " ".join(nl_tokens[rand_idx]))
            print("\t- A LABEL query: ", " ".join(reference[rand_idx]))
            print("\t- A PREDICTED query: ", " ".join(candidate[rand_idx]))
            print("\t- A PREDICTED prob: ", predict_probs[rand_idx], "\n\n")
            metrics = Labeler_model.class_metrics(reference, candidate)
        else:
            metrics = [0., 0., 0., 0., 0.]

        elapsed = time.time() - start
        speed = total_tokens / elapsed
        return sum(dev_loss) / len(dev_loss), metrics, speed
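The mask-based truncation used throughout evaluate_batch reduces to a few lines. A self-contained sketch with made-up ids and tags:

import torch

pad_id = 0
lb_tensor = torch.tensor([[4, 5, 0], [6, 7, 8]])   # padded label ids
label_mask = lb_tensor != pad_id
decoded = [["B-ASP", "O", "<pad>"], ["O", "I-ASP", "O"]]
# Cut each decoded row back to its true (non-pad) length
decoded = [words[:i] for words, i in zip(decoded, label_mask.sum(dim=1).tolist())]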
Example #4
    def predict_batch(self,
                      entries,
                      wombat_object=None,
                      return_probability=False):
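        """Label raw entries with the sequence labeler. By default, attach the
        predicted tag sequence, its probabilities, and the extracted
        (aspect, polarity) entities to each entry; when return_probability is
        True, return per-class probability distributions instead."""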
        nl = []
        wd_tokens = []
        for entry in entries:
            input_tokens = entry["input_tokens"]
            ids = self.source2idx(input_tokens)
            nl.append(ids)
            if self.args.tokenize_type != "bpe":
                entry['input_list'] = self.tokenizer.process_nl(input_tokens)
            else:
                entry['input_list'] = self.tokenizer.encode(
                    input_tokens, add_special_tokens=False).tokens
            wd_tokens.append(entry['input_list'])
        self.labeler.eval()
        with torch.no_grad():
            nl_pad_ids, nl_lens = seqPAD.pad_sequences(nl,
                                                       pad_tok=self.pad_id,
                                                       nlevels=1)
            nl_tensor = Data2tensor.idx2tensor(nl_pad_ids,
                                               dtype=torch.long,
                                               device=self.device)
            nl_len_tensor = Data2tensor.idx2tensor(nl_lens,
                                                   dtype=torch.long,
                                                   device=self.device)
            # wombat_tensor = [batch, nl_len, emb_dim]
            wombat_tensor = torch.zeros(nl_tensor.shape +
                                        (self.args.swd_dim, ),
                                        dtype=torch.float32,
                                        device=self.device)
            wombat_idx = (nl_tensor == self.unk_id).nonzero()
            if wombat_object is not None:
                for i, j in wombat_idx.tolist():
                    word_to_lookup = wd_tokens[i][j]
                    print('Looking up Wombat for:', word_to_lookup)
                    wombat_emb = wombat_object.get(word_to_lookup)
                    if wombat_emb is not None:
                        print('Found Wombat embedding for:', word_to_lookup)
                        wombat_tensor[i, j] = torch.from_numpy(wombat_emb)
            de_score = self.labeler(nl_tensor,
                                    nl_len_tensor,
                                    wombat_tensor=wombat_tensor)
            label_mask = nl_tensor > 0
            if not return_probability:
                output_prob, output_idx = self.labeler.inference(
                    de_score, label_mask)
                if self.args.use_crf:
                    predict_words = Tokenizer.decode_batch(
                        output_idx, self.tokenizer.i2tw, 2)
                    # predict_words = [words[:i] for words, i in zip(predict_words, label_mask.sum(dim=1).tolist())]
                    predict_prob = list(output_prob)
                else:
                    # output_idx = de_score.max(-1)[1]
                    predict_words = Tokenizer.decode_batch(
                        output_idx.squeeze(-1).tolist(), self.tokenizer.i2tw,
                        2)
                    predict_words = [
                        words[:i]
                        for words, i in zip(predict_words,
                                            label_mask.sum(dim=1).tolist())
                    ]
                    # predict_prob = acc_prob.prod(dim=-1).tolist()
                    predict_prob = [
                        words[:i] for words, i in zip(
                            output_prob.squeeze(-1).tolist(),
                            label_mask.sum(dim=1).tolist())
                    ]

                for i, entry in enumerate(entries):
                    # entry["pred_pair"] = list(zip(entry["input_review"], predict_words[i]))
                    entry['pred_sequence'] = predict_words[i]
                    entry['prob_sequence'] = predict_prob[i]
                    entities_list = NER_metrics.absa_extractor(
                        entry["input_list"], predict_words[i],
                        None if self.args.use_crf else predict_prob[i])
                    entry["entities"] = []
                    if len(entities_list) > 0:
                        for entity, senti, _, prob in entities_list:
                            # entry["entities"].append((entity, senti, prob))
                            entry["entities"].append({
                                "aspect": entity,
                                "polarity": senti,
                                "probability": prob
                            })

                return entries
            else:
                label_prob = torch.softmax(de_score.squeeze(), dim=-1)
                return [{
                    self.tokenizer.i2tw[ind]: prob
                    for ind, prob in enumerate(prob_i)
                } for prob_i in label_prob.tolist()]
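The return_probability branch maps softmaxed scores onto label names. A self-contained sketch (the label map and scores are invented for illustration):

import torch

i2tw = {0: "negative", 1: "neutral", 2: "positive"}
de_score = torch.tensor([[0.2, 1.5, -0.3], [2.0, 0.1, 0.4]])
label_prob = torch.softmax(de_score, dim=-1)
# One {label: probability} dict per input
dists = [{i2tw[ind]: prob for ind, prob in enumerate(prob_i)}
         for prob_i in label_prob.tolist()]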
Example #5
    if use_selfatt:
        # use a maximum length 5 times larger than the input length
        nlemb_HPs = [sw_size, 50, None, 0.5, True, True, 1000]
        # nn_mode, ninp, nhid, nlayers, nhead, dropout, activation, norm, his_mask
        enc_HPs = ["self_attention", 50, 200, 6, 10, 0.5, "relu", None, False]
    else:
        nlemb_HPs = [sw_size, 50, None, 0.5, True, True]
        enc_HPs = ["lstm", 50, 200, 2, False, 0.5]
    classifier = Classifier(nlemb_HPs, enc_HPs, drop_rate=0.5, num_labels=len(lb2id_dict))
    de_score = classifier(nl_tensor, nl_len_tensor)
    output_idx = de_score.max(-1)[1]
    de_loss = classifier.NLL_loss(de_score, lb_tensor)

    reference = []
    candidate = []
    label_words = Tokenizer.decode_batch(lb_tensor.squeeze().tolist(), id2lb_dict, 1)
    predict_words = Tokenizer.decode_batch(output_idx.tolist(), id2lb_dict, 1)
    if tokenize_type != "bpe":
        nl_token = tokenizer.decode_batch(nl_tensor.tolist(), tokenizer.i2sw, 2)
        nl_token = [words[:i] if EOT not in words else words[: words.index(EOT)]
                    for words, i in zip(nl_token, (nl_tensor > 0).sum(dim=1).tolist())]
    else:
        nl_token = tokenizer.decode_batch(nl_tensor.tolist())
        # nl_token = [enc_words.tokens for enc_words in self.args.vocab.encode_batch(nl_token)]
        nl_token = [(words[:words.find(EOT)] if EOT in words else words).split()
                    for words in nl_token]
    # reference = [[w1, ..., EOT], ..., [w1, ..., EOT]]
    reference.extend(label_words)
    candidate.extend(predict_words)
    # test inference
    label_prob, label_pred = classifier.inference(de_score)
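Both the classifier and the seq2seq examples decode greedily by taking the argmax over the label dimension. A self-contained sketch with random scores:

import torch

de_score = torch.randn(2, 5, 3)    # [batch, seq_len, num_labels]
output_idx = de_score.max(-1)[1]   # same as de_score.argmax(-1); shape [2, 5]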
Example #6
    #     fn_dim += hidden_dim
    #
    # hidden2tag = nn.Linear(fn_dim, len(vocab.tw2i))
    seq2seq = EncDec(nlemb_HPs, enc_HPs, dec_HPs, drop_rate=fn_dropout, num_labels=tw_size, enc_att=enc_att)
    nl_len_tensor = (nl_tensor > pad_id).sum(dim=1)
    random_force = random.random() < teacher_forcing_ratio
    # print("\nMODEL INPUTs: ", nl_tensor.shape, "\n")
    de_score = seq2seq(nl_tensor, nl_len_tensor, lb_tensor, random_force)
    olb_tensor = lb_tensor[:, 1:]
    label_mask = olb_tensor > 0

    total_loss = seq2seq.NLL_loss(de_score[label_mask], olb_tensor[label_mask]).mean()

    output_idx = de_score.max(-1)[1]
    if tokenize_type != "bpe":
        label_words = vocab.decode_batch(olb_tensor.tolist(), vocab.i2tw, 2)
        label_words = [words[:i] if EOT not in words else words[: words.index(EOT)]
                       for words, i in zip(label_words, label_mask.sum(dim=1).tolist())]

        predict_words = vocab.decode_batch(output_idx.tolist(), vocab.i2tw, 2)
        predict_words = [words[:i] if EOT not in words else words[: words.index(EOT)]
                         for words, i in zip(predict_words, label_mask.sum(dim=1).tolist())]

        nl_token = vocab.decode_batch(nl_tensor.tolist(), vocab.i2sw, 2)
        nl_token = [words[:i] if EOT not in words else words[: words.index(EOT)]
                    for words, i in zip(nl_token, (nl_tensor > 0).sum(dim=1).tolist())]
    else:
        label_words = vocab.decode_batch(olb_tensor.tolist())
        # label_words = [enc_words.tokens for enc_words in self.args.vocab.encode_batch(label_words)]
        label_words = [(words[:words.find(EOT)] if EOT in words else words).split()
                       for words in label_words]