Example #1
def evaluate(self, line, aspect):
    line = str(line)
    aspect = str(aspect)
    # Split the sentence around the "$T$" aspect placeholder.
    text_left, _, text_right = [
        s.lower().strip() for s in line.partition("$T$")
    ]
    aspect = aspect.lower().strip()
    text_raw_indices = self.tokenizer.text_to_sequence(
        text_left + " " + aspect + " " + text_right)
    aspect_indices = self.tokenizer.text_to_sequence(aspect)
    aspect_len = np.sum(aspect_indices != 0)
    # BERT sentence-pair input: "[CLS] sentence [SEP] aspect [SEP]".
    text_bert_indices = self.tokenizer.text_to_sequence(
        '[CLS] ' + text_left + " " + aspect + " " + text_right +
        ' [SEP] ' + aspect + " [SEP]")
    # Segment ids: 0 over the sentence (plus [CLS] and [SEP]),
    # 1 over the trailing aspect (plus its [SEP]).
    bert_segments_ids = np.asarray(
        [0] * (np.sum(text_raw_indices != 0) + 2) +
        [1] * (aspect_len + 1))
    bert_segments_ids = pad_and_truncate(bert_segments_ids,
                                         self.tokenizer.max_seq_len)
    # Build a batch of one and pass the two inputs the model expects.
    text_bert_indices = torch.tensor([text_bert_indices],
                                     dtype=torch.int64).to(self.opt.device)
    bert_segments_ids = torch.tensor([bert_segments_ids],
                                     dtype=torch.int64).to(self.opt.device)
    t_outputs = self.model([text_bert_indices, bert_segments_ids])
    t_probs = F.softmax(t_outputs, dim=-1).cpu().detach().numpy()
    return t_probs
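This example (and the ones below) relies on a pad_and_truncate helper that the listing does not show. The sketch below is a minimal guess at such a helper, assuming post-padding and post-truncation to a fixed maxlen with 0 as the padding value; the signature and defaults are assumptions, not taken from the listing.

    import numpy as np

    def pad_and_truncate(sequence, maxlen, dtype='int64',
                         padding='post', truncating='post', value=0):
        # Assumed helper: fixed-length output pre-filled with the pad value.
        x = (np.ones(maxlen) * value).astype(dtype)
        # Drop tokens beyond maxlen, from the front ('pre') or back ('post').
        trunc = sequence[-maxlen:] if truncating == 'pre' else sequence[:maxlen]
        trunc = np.asarray(trunc, dtype=dtype)
        # Place the kept tokens at the start ('post') or at the end ('pre').
        if padding == 'post':
            x[:len(trunc)] = trunc
        else:
            x[-len(trunc):] = trunc
        return x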
Example #2
    def evaluate(self, text, aspect):
        aspect = aspect.lower().strip()
        text_left, _, text_right = [
            s.strip() for s in text.lower().partition(aspect)
        ]

        text_indices = self.tokenizer.text_to_sequence(text_left + " " +
                                                       aspect + " " +
                                                       text_right)
        context_indices = self.tokenizer.text_to_sequence(text_left + " " +
                                                          text_right)
        left_indices = self.tokenizer.text_to_sequence(text_left)
        left_with_aspect_indices = self.tokenizer.text_to_sequence(text_left +
                                                                   " " +
                                                                   aspect)
        # The right context is reversed so recurrent models can read it
        # toward the aspect term.
        right_indices = self.tokenizer.text_to_sequence(text_right,
                                                        reverse=True)
        right_with_aspect_indices = self.tokenizer.text_to_sequence(
            aspect + " " + text_right, reverse=True)
        aspect_indices = self.tokenizer.text_to_sequence(aspect)
        left_len = np.sum(left_indices != 0)
        aspect_len = np.sum(aspect_indices != 0)
        # Inclusive [start, end] token positions of the aspect in the text.
        aspect_boundary = np.asarray([left_len, left_len + aspect_len - 1],
                                     dtype=np.int64)

        text_len = np.sum(text_indices != 0)
        # BERT sentence-pair input: "[CLS] sentence [SEP] aspect [SEP]".
        concat_bert_indices = self.tokenizer.text_to_sequence('[CLS] ' +
                                                              text_left + " " +
                                                              aspect + " " +
                                                              text_right +
                                                              ' [SEP] ' +
                                                              aspect +
                                                              " [SEP]")
        # Segment ids: 0 over the sentence (+ [CLS]/[SEP]), 1 over the aspect (+ [SEP]).
        concat_segments_indices = [0] * (text_len + 2) + [1] * (aspect_len + 1)
        concat_segments_indices = pad_and_truncate(concat_segments_indices,
                                                   self.tokenizer.max_seq_len)

        text_bert_indices = self.tokenizer.text_to_sequence("[CLS] " +
                                                            text_left + " " +
                                                            aspect + " " +
                                                            text_right +
                                                            " [SEP]")
        aspect_bert_indices = self.tokenizer.text_to_sequence("[CLS] " +
                                                              aspect +
                                                              " [SEP]")

        data = {
            'concat_bert_indices': concat_bert_indices,
            'concat_segments_indices': concat_segments_indices,
            'text_bert_indices': text_bert_indices,
            'aspect_bert_indices': aspect_bert_indices,
            'text_indices': text_indices,
            'context_indices': context_indices,
            'left_indices': left_indices,
            'left_with_aspect_indices': left_with_aspect_indices,
            'right_indices': right_indices,
            'right_with_aspect_indices': right_with_aspect_indices,
            'aspect_indices': aspect_indices,
            'aspect_boundary': aspect_boundary
        }

        # Keep only the inputs the configured model consumes, as a batch of one.
        t_inputs = [
            torch.tensor([data[col]], device=self.opt.device)
            for col in self.opt.inputs_cols
        ]
        t_outputs = self.model(t_inputs)
        # detach() keeps numpy() safe even if the caller did not disable
        # gradient tracking.
        t_probs = F.softmax(t_outputs, dim=-1).detach().cpu().numpy()

        return t_probs
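Every *_indices array in the example above comes from the tokenizer's text_to_sequence method, which the listing also omits. Below is a plausible sketch, assuming a plain word-to-index vocabulary (self.word2idx) built at training time and the pad_and_truncate helper sketched earlier; the attribute names and defaults are assumptions. For the BERT inputs ("[CLS] ... [SEP]"), a real implementation would map wordpieces through a pretrained vocabulary instead of whitespace tokens.

    def text_to_sequence(self, text, reverse=False,
                         padding='post', truncating='post'):
        # Assumed method on the tokenizer class.
        words = text.lower().split()
        # Map out-of-vocabulary words to a dedicated unknown index.
        unknown_idx = len(self.word2idx) + 1
        sequence = [self.word2idx.get(w, unknown_idx) for w in words]
        if not sequence:
            sequence = [0]
        # reverse=True lets callers feed the right context backwards.
        if reverse:
            sequence = sequence[::-1]
        return pad_and_truncate(sequence, self.max_seq_len,
                                padding=padding, truncating=truncating)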
Example #3
        key_entity = []
        for e in entities:
            content = title
            index = content.find(e)
            # Crop the title to a window around the entity so the BERT
            # input fits within max_seq_len, leaving room for the entity
            # and the three special tokens.
            if index + (tokenizer.max_seq_len // 2) < len(content):
                content = content[:index + (tokenizer.max_seq_len // 2)]
            if len(content) > tokenizer.max_seq_len:
                content = content[-(tokenizer.max_seq_len - len(e) - 3):]

            text_raw_indices = tokenizer.text_to_sequence(content)
            aspect_indices = tokenizer.text_to_sequence(e)
            aspect_len = np.sum(aspect_indices != 0)
            text_bert_indices = tokenizer.text_to_sequence('[CLS] ' + content + ' [SEP] ' + e + ' [SEP]')
            bert_segments_ids = np.asarray([0] * (np.sum(text_raw_indices != 0) + 2) + [1] * (aspect_len + 1))
            bert_segments_ids = pad_and_truncate(bert_segments_ids, tokenizer.max_seq_len)

            text_bert_indices = torch.tensor([text_bert_indices], dtype=torch.int64).to(opt.device)
            bert_segments_ids = torch.tensor([bert_segments_ids], dtype=torch.int64).to(opt.device)

            inputs = [text_bert_indices, bert_segments_ids]
            outputs = model(inputs)
            t_probs = F.softmax(outputs, dim=-1).detach().cpu().numpy()
            # A predicted class of 1 marks the entity as a key entity.
            sentiment = t_probs.argmax(axis=-1).item()
            if sentiment == 1:
                key_entity.append(e)

        # Remove inferred key entities that are substrings of other entities.
        final_res = []
        for e1 in key_entity:
            flag = 0