Example #1
    def test_pack_to_length_text_higher(self):
        tokenized_text = ["i", "am", "going", "to", "write", "tests"]
        length = 3
        tokenized_text_padded = pack_to_length(
            tokenized_text=tokenized_text,
            max_length=length,
            pad_token="<PAD>",
            add_start_end_token=False,
        )
        assert tokenized_text_padded == ["i", "am", "going"]
Example #2
    def test_pack_to_length_text_equal_with_start_end_token(self):
        tokenized_text = ["i", "am", "going", "to", "write", "tests"]
        length = 6
        tokenized_text_padded = pack_to_length(
            tokenized_text=tokenized_text,
            max_length=length,
            pad_token="<PAD>",
            add_start_end_token=True,
            start_token="<SOS>",
            end_token="<EOS>",
        )
        assert tokenized_text_padded == ["<SOS>", "i", "am", "going", "to", "<EOS>"]
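Taken together, the two tests pin down the truncation behaviour: over-length input is cut to max_length, and when start and end tokens are added the content is first trimmed to max_length - 2 so the packed sequence still has exactly max_length tokens. A minimal sketch consistent with these tests (not the library's actual implementation) could look like this:

def pack_to_length_sketch(
    tokenized_text,
    max_length,
    pad_token="<PAD>",
    add_start_end_token=False,
    start_token="<SOS>",
    end_token="<EOS>",
):
    # Reserve two slots for the start and end tokens if they are requested.
    content_length = max_length - 2 if add_start_end_token else max_length
    # Truncate over-length input, then right-pad short input with pad_token.
    packed = list(tokenized_text[:content_length])
    packed = packed + [pad_token] * (content_length - len(packed))
    if add_start_end_token:
        packed = [start_token] + packed + [end_token]
    return packed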
Example #3
def setup_bow_elmo_encoder(request):
    layer_aggregation = request.param
    instances = [
        "I like to eat carrot", "I like to go out on long drives in a car"
    ]
    padded_instances = []
    for instance in instances:
        padded_inst = pack_to_length(tokenized_text=instance.split(),
                                     max_length=10)
        padded_instances.append(" ".join(padded_inst))
    iter_dict = {"instance": padded_instances}
    bow_elmo_embedder = BowElmoEmbedder(layer_aggregation=layer_aggregation)
    return bow_elmo_embedder, iter_dict
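Since the helper reads request.param, it is presumably registered as a parametrized pytest fixture. A hedged sketch of that wiring and of a test that consumes it; the aggregation names and the embedder call are assumptions, not taken from the snippet above:

import pytest

# Hypothetical aggregation names; the real test module defines its own list.
@pytest.fixture(params=["sum", "average", "last", "first"])
def setup_bow_elmo_encoder(request):
    ...  # body as shown in Example #3


def test_bow_elmo_embedder(setup_bow_elmo_encoder):
    bow_elmo_embedder, iter_dict = setup_bow_elmo_encoder
    # Assumed call: the embedder consumes the iter_dict built by the fixture.
    embedding = bow_elmo_embedder(iter_dict)
    assert embedding is not None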
Example #4
    def get_iter_dict(self, line: str, label: Optional[List[str]] = None):
        word_instance = self.word_tokenizer.tokenize(line)
        len_instance = len(word_instance)
        classnames2idx = ScienceIEDataset.get_classname2idx()
        idx2classname = {idx: classname for classname, idx in classnames2idx.items()}

        padded_word_instance = pack_to_length(
            tokenized_text=word_instance,
            max_length=self.max_instance_length,
            pad_token=self.word_vocab.pad_token,
            add_start_end_token=self.word_add_start_end_token,
            start_token=self.word_vocab.start_token,
            end_token=self.word_vocab.end_token,
        )
        tokens = self.word_numericalizer.numericalize_instance(padded_word_instance)
        tokens = torch.LongTensor(tokens)
        len_tokens = torch.LongTensor([len_instance])

        character_tokens = []
        # 1. For every word we get characters in the word
        # 2. Pad the characters to max_char_length
        # 3. Convert them into numbers
        # 4. Add them to character_tokens
        for word in padded_word_instance:
            char_instance = self.char_tokenizer.tokenize(word)
            padded_character_instance = pack_to_length(
                tokenized_text=char_instance,
                max_length=self.max_char_length,
                pad_token=" ",
                add_start_end_token=False,
            )
            padded_character_tokens = self.char_numericalizer.numericalize_instance(
                padded_character_instance
            )
            character_tokens.append(padded_character_tokens)
        character_tokens = torch.LongTensor(character_tokens)

        instance_dict = {
            "tokens": tokens,
            "len_tokens": len_tokens,
            "instance": " ".join(padded_word_instance),
            "raw_instance": " ".join(word_instance),
            "char_tokens": character_tokens,
        }

        if label is not None:
            assert len_instance == len(label)
            task_labels = []
            process_labels = []
            material_labels = []

            for string in label:
                task_label, process_label, material_label = string.split(":")
                task_labels.append(task_label)
                process_labels.append(process_label)
                material_labels.append(material_label)

            assert len_instance == len(task_labels)
            assert len_instance == len(process_labels)
            assert len_instance == len(material_labels)

            padded_task_labels = pack_to_length(
                tokenized_text=task_labels,
                max_length=self.max_instance_length,
                pad_token="padding-Task",
                add_start_end_token=self.word_add_start_end_token,
                start_token="starting-Task",
                end_token="ending-Task",
            )

            padded_process_labels = pack_to_length(
                tokenized_text=process_labels,
                max_length=self.max_instance_length,
                pad_token="padding-Process",
                add_start_end_token=self.word_add_start_end_token,
                start_token="starting-Process",
                end_token="ending-Process",
            )

            padded_material_labels = pack_to_length(
                tokenized_text=material_labels,
                max_length=self.max_instance_length,
                pad_token="padding-Material",
                add_start_end_token=self.word_add_start_end_token,
                start_token="starting-Material",
                end_token="ending-Material",
            )
            assert (
                len(padded_task_labels)
                == len(padded_process_labels)
                == len(padded_material_labels)
            )
            padded_task_labels = [classnames2idx[label] for label in padded_task_labels]

            # Ugly offsetting because we are using continuous class numbers across all
            # entity types, but the ScienceIE dataset requires 0
            padded_process_labels = [
                classnames2idx[label] for label in padded_process_labels
            ]
            padded_material_labels = [
                classnames2idx[label] for label in padded_material_labels
            ]

            mask_task_label = [
                1 if idx2classname[class_idx] in self.ignore_labels else 0
                for class_idx in padded_task_labels
            ]
            mask_process_label = [
                1 if idx2classname[class_idx] in self.ignore_labels else 0
                for class_idx in padded_process_labels
            ]
            mask_material_label = [
                1 if idx2classname[class_idx] in self.ignore_labels else 0
                for class_idx in padded_material_labels
            ]

            task_label = torch.LongTensor(padded_task_labels)
            process_label = torch.LongTensor(padded_process_labels)
            material_label = torch.LongTensor(padded_material_labels)
            mask_task_label = torch.ByteTensor(mask_task_label)
            mask_process_label = torch.ByteTensor(mask_process_label)
            mask_material_label = torch.ByteTensor(mask_material_label)

            label = torch.cat([task_label, process_label, material_label], dim=0)
            label_mask = torch.cat(
                [mask_task_label, mask_process_label, mask_material_label], dim=0
            )
            instance_dict["label"] = label
            instance_dict["label_mask"] = label_mask

        return instance_dict
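The returned "label" entry concatenates the task, process, and material label sequences along dimension 0, so a consumer has to split it back into three chunks of length max_instance_length. A minimal sketch of that split; the helper name is chosen here for illustration:

import torch


def split_science_ie_labels(label: torch.Tensor):
    # label has shape [3 * max_instance_length]; chunk it back into the
    # per-entity-type sequences in the order they were concatenated.
    task_label, process_label, material_label = torch.chunk(label, chunks=3, dim=0)
    return task_label, process_label, material_label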
Example #5
    def get_iter_dict(self, line: str, label: Optional[List[str]] = None):
        word_instance = self.word_tokenizer.tokenize(line)
        len_instance = len(word_instance)
        classnames2idx = ParscitDataset.get_classname2idx()
        idx2classname = {idx: classname for classname, idx in classnames2idx.items()}

        if self.instance_preprocessor is not None:
            word_instance = self.instance_preprocessor.lowercase(word_instance)

        padded_word_instance = pack_to_length(
            tokenized_text=word_instance,
            max_length=self.max_instance_length,
            pad_token=self.word_vocab.pad_token,
            add_start_end_token=self.word_add_start_end_token,
            start_token=self.word_vocab.start_token,
            end_token=self.word_vocab.end_token,
        )
        tokens = self.word_numericalizer.numericalize_instance(padded_word_instance)
        tokens = torch.LongTensor(tokens)
        len_tokens = torch.LongTensor([len_instance])

        character_tokens = []
        # 1. For every word we get characters in the word
        # 2. Pad the characters to max_char_length
        # 3. Convert them into numbers
        # 4. Add them to character_tokens
        for word in padded_word_instance:
            char_instance = self.char_tokenizer.tokenize(word)
            padded_character_instance = pack_to_length(
                tokenized_text=char_instance,
                max_length=self.max_char_length,
                pad_token=" ",
                add_start_end_token=False,
            )
            padded_character_tokens = self.char_numericalizer.numericalize_instance(
                padded_character_instance
            )
            character_tokens.append(padded_character_tokens)
        character_tokens = torch.LongTensor(character_tokens)

        instance_dict = {
            "tokens": tokens,
            "len_tokens": len_tokens,
            "instance": " ".join(padded_word_instance),
            "raw_instance": " ".join(word_instance),
            "char_tokens": character_tokens,
        }

        if label is not None:
            assert len_instance == len(label)
            padded_labels = pack_to_length(
                tokenized_text=label,
                max_length=self.max_instance_length,
                pad_token="padding",
                add_start_end_token=self.word_add_start_end_token,
                start_token="starting",
                end_token="ending",
            )
            padded_labels = [classnames2idx[label] for label in padded_labels]
            labels_mask = []
            for class_idx in padded_labels:
                if idx2classname[class_idx] in self.ignored_labels:
                    labels_mask.append(1)
                else:
                    labels_mask.append(0)
            label = torch.LongTensor(padded_labels)
            labels_mask = torch.ByteTensor(labels_mask)

            instance_dict["label"] = label
            instance_dict["label_mask"] = labels_mask

        return instance_dict
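"label_mask" flags positions whose class is in self.ignored_labels (the padding, starting, and ending tags), so they can be excluded downstream. Purely as an illustration, and not the loss this library actually uses, a minimal masked per-token cross-entropy could look like this:

import torch
import torch.nn.functional as F


def masked_token_loss(logits, label, label_mask):
    # logits: [seq_len, num_classes], label: [seq_len],
    # label_mask: [seq_len] with 1 marking positions to ignore.
    per_token = F.cross_entropy(logits, label, reduction="none")
    keep = (label_mask == 0).float()
    return (per_token * keep).sum() / keep.sum().clamp(min=1.0)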
Example #6
    def get_iter_dict(
        self,
        lines: Union[List[str], str],
        labels: Optional[Union[str, List[str]]] = None,
    ):
        if isinstance(lines, str):
            lines = [lines]

        word_instances = self.word_tokenizer.tokenize_batch(lines)
        len_instances = [len(instance) for instance in word_instances]
        classnames2idx = SectLabelDataset.get_classname2idx()

        padded_instances = []
        for word_instance in word_instances:
            padded_instance = pack_to_length(
                tokenized_text=word_instance,
                max_length=self.max_instance_length,
                pad_token=self.word_vocab.pad_token,
                add_start_end_token=True,
                start_token=self.word_vocab.start_token,
                end_token=self.word_vocab.end_token,
            )
            padded_instances.append(padded_instance)

        tokens = self.word_numericalizer.numericalize_batch_instances(
            padded_instances)
        tokens = torch.LongTensor(tokens)
        tokens = tokens.squeeze(0)

        instances = []
        for instance in padded_instances:
            instances.append(" ".join(instance))

        raw_instances = []
        for instance in word_instances:
            raw_instances.append(" ".join(instance))

        len_tokens = torch.LongTensor(len_instances)

        # for a single line, unwrap the one-element lists so "instance" and "raw_instance" are plain strings

        if len(instances) == 1:
            instances = instances[0]
            raw_instances = raw_instances[0]

        instance_dict = {
            "tokens": tokens,
            "len_tokens": len_tokens,
            "instance": instances,
            "raw_instance": raw_instances,
        }

        if labels is not None:
            if isinstance(labels, str):
                labels = [labels]

            labels = [classnames2idx[label] for label in labels]
            label = torch.LongTensor(labels)
            instance_dict["label"] = label

        return instance_dict
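Because a single string is wrapped into a one-element batch and the tokens tensor is then squeezed, the returned shapes differ between single-line and multi-line calls. A hedged usage sketch, assuming an already constructed SectLabelDataset named dataset:

# Hypothetical usage; `dataset` is an initialized SectLabelDataset.
single = dataset.get_iter_dict("related work")
batch = dataset.get_iter_dict(["introduction", "related work"])

single["tokens"].dim()   # 1 -> shape [max_instance_length]
batch["tokens"].dim()    # 2 -> shape [2, max_instance_length]
single["instance"]       # a single space-joined string
batch["instance"]        # a list of two space-joined strings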
Example #7
    def forward(self, iter_dict: Dict[str, Any]) -> torch.Tensor:
        """

        Parameters
        ----------
        iter_dict : Dict[str, Any]
            It expects "raw_instance" to be present in the iter_dict.
            "raw_instance" is the instance that has not been padded.

        Returns
        -------
        torch.Tensor
            The BERT embeddings for all the words in the instances.
            The size of the returned embedding is ``[batch_size, num_time_steps, emb_dim]``

        """

        # tokenize every text string in the batch with the BERT wordpiece tokenizer
        x = iter_dict["raw_instance"]
        tokenized_text = list(map(self.bert_tokenizer.tokenize, x))
        lengths = [len(tokens) for tokens in tokenized_text]
        max_len = max(lengths)

        # pad the tokenized text to a maximum length
        padded_tokenized_text = []
        for tokens in tokenized_text:
            padded_tokens = pack_to_length(
                tokenized_text=tokens,
                max_length=max_len,
                pad_token="[PAD]",
                add_start_end_token=True,
                start_token="[CLS]",
                end_token="[SEP]",
            )
            padded_tokenized_text.append(padded_tokens)

        # convert them to ids based on bert vocab
        indexed_tokens = list(
            map(self.bert_tokenizer.convert_tokens_to_ids,
                padded_tokenized_text))
        segment_ids = list(
            map(lambda tokens_list: [0] * len(tokens_list), indexed_tokens))

        tokens_tensor = torch.tensor(indexed_tokens)
        segment_tensor = torch.tensor(segment_ids)

        tokens_tensor = tokens_tensor.to(self.device)
        segment_tensor = segment_tensor.to(self.device)

        with torch.no_grad():
            encoded_layers, _ = self.model(tokens_tensor, segment_tensor)

        if "base" in self.bert_type:
            assert len(encoded_layers) == 12
        elif "large" in self.bert_type:
            assert len(encoded_layers) == 24

        # num_bert_layers, batch_size, sequence_length, bert_hidden_dimension
        all_layers = torch.stack(encoded_layers, dim=0)

        if self.aggregation_type == "sum":
            sum_layers = torch.sum(all_layers, dim=0)
            return sum_layers

        elif self.aggregation_type == "average":
            average_layers = torch.mean(all_layers, dim=0)
            return average_layers
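The model call above passes no attention mask, so the [PAD] positions take part in self-attention. If that is not intended, a mask can be derived from the padded ids. A hedged sketch that would sit inside the same method, assuming the pytorch-pretrained-bert style API that the surrounding code already uses:

        # Sketch only: build an attention mask that zeroes out [PAD] positions.
        pad_id = self.bert_tokenizer.convert_tokens_to_ids(["[PAD]"])[0]
        attention_mask = (tokens_tensor != pad_id).long().to(self.device)
        with torch.no_grad():
            encoded_layers, _ = self.model(
                tokens_tensor, segment_tensor, attention_mask=attention_mask
            )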