Example #1
    def test_alternative_dtypes(self):
        shape = [3, 4, 5, 6]
        array = numpy.zeros(shape)

        # Setting dtype to numpy.int64 should produce a torch.LongTensor when field is converted to
        # a tensor
        array_field1 = TensorField(array, dtype=numpy.int64)
        returned_tensor1 = array_field1.as_tensor(
            array_field1.get_padding_lengths())
        assert returned_tensor1.dtype == torch.int64

        # Setting dtype to numpy.uint8 should produce a torch.ByteTensor when field is converted to
        # a tensor
        array_field2 = TensorField(array, dtype=numpy.uint8)
        returned_tensor2 = array_field2.as_tensor(
            array_field2.get_padding_lengths())
        assert returned_tensor2.dtype == torch.uint8

        # Padding should not affect dtype
        padding_lengths = {
            "dimension_" + str(i): 10
            for i, _ in enumerate(shape)
        }
        padded_tensor = array_field2.as_tensor(padding_lengths)
        assert padded_tensor.dtype == torch.uint8

        # Empty fields should have the same dtype
        empty_field = array_field2.empty_field()
        assert empty_field.tensor.dtype == array_field2.tensor.dtype
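The dtype expectations in this test mirror how torch.from_numpy preserves NumPy dtypes. A minimal standalone sketch of that correspondence (independent of TensorField; it only assumes NumPy and PyTorch are available):

import numpy
import torch

# torch.from_numpy keeps the NumPy dtype, which is the behaviour the assertions above rely on.
assert torch.from_numpy(numpy.zeros(3, dtype=numpy.int64)).dtype == torch.int64
assert torch.from_numpy(numpy.zeros(3, dtype=numpy.uint8)).dtype == torch.uint8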
Example #2
 def test_get_padding_lengths_correctly_returns_ordered_shape(self):
     shape = [3, 4, 5, 6]
     array = numpy.zeros(shape)
     array_field = TensorField(array)
     lengths = array_field.get_padding_lengths()
     for i in range(len(lengths)):
         assert lengths["dimension_{}".format(i)] == shape[i]
Example #3
 def test_eq(self):
     array1 = TensorField(numpy.asarray([1, 1, 1]))
     array2 = TensorField(numpy.asarray([[1, 1, 1], [1, 1, 1]]))
     array3 = TensorField(numpy.asarray([1, 1, 2]))
     array4 = TensorField(numpy.asarray([1, 1, 1]))
     assert array1 != array2
     assert array1 != array3
     assert array1 == array4
Example #4
 def test_human_readable_repr(self):
     array = TensorField(numpy.asarray([1.0, 1, 1]))
     ans = {
         "shape": [3],
         "element_mean": 1.0,
         "element_std": 0.0,
         "type": "float64",
     }
     assert array.human_readable_repr() == ans
Example #5
    def test_as_tensor_handles_larger_padding_dimensions(self):
        shape = [3, 4]
        array = numpy.ones(shape)
        array_field = TensorField(array)

        padded_tensor = (array_field.as_tensor({
            "dimension_0": 5,
            "dimension_1": 6
        }).detach().cpu().numpy())
        numpy.testing.assert_array_equal(padded_tensor[:3, :4], array)
        numpy.testing.assert_array_equal(padded_tensor[3:, 4:], 0.0)
Example #6
    def test_padding_handles_list_fields_with_padding_values(self):
        array1 = TensorField(numpy.ones([2, 3]), padding_value=-1)
        array2 = TensorField(numpy.ones([1, 5]), padding_value=-1)
        empty_array = array1.empty_field()
        list_field = ListField([array1, array2, empty_array])

        returned_tensor = (list_field.as_tensor(
            list_field.get_padding_lengths()).detach().cpu().numpy())
        correct_tensor = numpy.array([
            [[1.0, 1.0, 1.0, -1.0, -1.0], [1.0, 1.0, 1.0, -1.0, -1.0]],
            [[1.0, 1.0, 1.0, 1.0, 1.0], [-1.0, -1.0, -1.0, -1.0, -1.0]],
            [[-1.0, -1.0, -1.0, -1.0, -1.0], [-1.0, -1.0, -1.0, -1.0, -1.0]],
        ])
        numpy.testing.assert_array_equal(returned_tensor, correct_tensor)
Example #7
    def text_to_instance(self,
                         word1: str,
                         word2: str,
                         score: Optional[float] = None) -> Instance:
        fields: Dict[str, Field] = {}

        def add_special_tokens(tokens):
            # add special token
            if self._add_special_symbols:
                tokens.insert(0, Token(copy.deepcopy(self._start_symbol)))
                tokens.append(Token(copy.deepcopy(self._end_symbol)))
            return tokens

        word1 = self._tokenizer.tokenize(word1)
        word2 = self._tokenizer.tokenize(word2)

        if self._combine_input_fields:
            # this will be required for encoder type architecture
            raise NotImplementedError
        else:
            word1_tokens = add_special_tokens(word1)
            word2_tokens = add_special_tokens(word2)
            fields['word1'] = TextField(word1_tokens, self._token_indexer)
            fields['word2'] = TextField(word2_tokens, self._token_indexer)

        if score is not None:
            score = np.array(float(score))
            score = score.astype('double')
            fields['score'] = TensorField(score)
        return Instance(fields)
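A quick sanity check of the score conversion above (a standalone sketch; only NumPy is assumed): wrapping a Python float this way yields a 0-d float64 array, so the resulting TensorField holds a double-precision scalar.

import numpy as np

# A float score becomes a 0-d double array before being wrapped in a TensorField.
score = np.array(float("3.5")).astype("double")
assert score.dtype == np.float64 and score.shape == ()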
Example #8
    def text_to_instance(self, index: int, field_type: str):  # type: ignore
        field = TextField(
            [Token(t) for t in ["The", "number", "is",
                                str(index), "."]],
            token_indexers={"words": SingleIdTokenIndexer("words")},
        )

        return Instance({
            "text": field,
            "label": LabelField(index, skip_indexing=True),
            "flag": FlagField(23),
            "index": IndexField(index % self.batch_size, field),
            "metadata": MetadataField({"some_key": "This will not be logged as a histogram."}),
            "adjacency": AdjacencyField([(0, 1), (1, 2)], field),
            "multilabel": MultiLabelField(["l1", "l2"]),
            "span": SpanField(2, 3, field),
            "tensor": TensorField(torch.randn(2, 3)),
        })
Example #9
def test_get_inverse_hvp_lissa():
    vs = [torch.tensor([1.0, 1.0])]
    # create a fake model
    vocab = Vocabulary()
    params = torch.tensor([1, 2]).float()
    model = DummyBilinearModelForTestingIF(vocab, params)
    used_params = list(model.parameters())

    # create a fake instance: just a matrix
    A = torch.tensor([[1.0, 2.0], [3.0, 4.0]])
    fake_instance = Instance({"tensors": TensorField(A)})

    # wrap fake instance into dataloader
    lissa_data_loader = SimpleDataLoader([fake_instance],
                                         batch_size=1,
                                         batches_per_epoch=1)

    inverse_hvp = get_inverse_hvp_lissa(
        vs=vs,
        model=model,
        used_params=used_params,
        lissa_data_loader=lissa_data_loader,
        damping=0.0,
        num_samples=1,
        scale=1.0,
    )
    # I tried increasing the recursion depth to actually approximate the inverse Hessian-vector
    # product, but I suspect that, with such an extremely small number of data points, the
    # algorithm doesn't work well on this toy example.
    ans = torch.tensor([-1.5, -4.5])
    assert torch.equal(inverse_hvp, ans)
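The expected value can be reproduced by hand under two assumptions about the toy setup (assumptions, not the library's documented behaviour): the dummy bilinear model's loss is 0.5 * theta^T A theta, so its Hessian is the symmetric part of A, and the single-batch data loader allows only one LiSSA recursion step h1 = v + (I - H) h0 with h0 = v, giving 2v - Hv.

import torch

A = torch.tensor([[1.0, 2.0], [3.0, 4.0]])
v = torch.tensor([1.0, 1.0])
H = 0.5 * (A + A.T)       # assumed Hessian of a 0.5 * theta^T A theta loss
estimate = 2 * v - H @ v  # one LiSSA step starting from h0 = v
assert torch.allclose(estimate, torch.tensor([-1.5, -4.5]))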
Example #10
    def text_to_instance(self, sentence: str, label: str = None):
        texts = [t.text for t in self.tokenizer.tokenize(sentence)]
        e1_start_position = texts.index(ENT)
        e1_end_position = list_rindex(texts, ENT)

        e2_start_position = texts.index(ENT2)
        e2_end_position = list_rindex(texts, ENT2)

        tokens = [Token(t) for t in texts]
        text_field = TextField(tokens, token_indexers=self.token_indexers)

        fields = {
            "word_ids": text_field,
            "entity1_span": SpanField(e1_start_position, e1_end_position, text_field),
            "entity2_span": SpanField(e2_start_position, e2_end_position, text_field),
            "input_sentence": MetadataField(sentence),
        }

        if label is not None:
            fields["label"] = LabelField(label)

        if self.use_entity_feature:
            fields["entity_ids"] = TensorField(
                np.array([self.head_entity_id, self.tail_entity_id]))

        return Instance(fields)
Example #11
 def text_to_instance(self,
                      index: int,
                      source: str,
                      target: str = None) -> Instance:  # type: ignore
     fields: Dict[str, Field] = {}
     fields["source"] = TextField(self.tokenizer.tokenize(source))
     fields["index"] = MetadataField(index)  # type: ignore
     # It's important to have tests that use a tensor field since sending tensors
     # between processes has a lot of pitfalls.
     fields["tensor"] = TensorField(torch.tensor([1, 2, 3]))
     if target is not None:
         fields["target"] = TextField(self.tokenizer.tokenize(target))
     return Instance(fields)  # type: ignore
Example #12
 def add_masked_fields(
     self,
     tokens: List[Token],
     fields: Dict[str, Field]
 ):
     masked = [True if random() < 0.15 else False for _ in range(len(tokens))]
     if not any(x for x in masked):
         masked[randint(0, len(masked) - 1)] = True
     masked_tokens = [t if not masked[i] else Token("[MASK]") for i, t in enumerate(tokens)]
     masked_positions = [i for i in range(len(masked)) if masked[i]]
     fields["masked_text"] = TextField(masked_tokens, self._token_indexers)
     fields["masked_positions"] = TensorField(torch.tensor(masked_positions))
     fields["true_masked_ids"] = TextField([t for i, t in enumerate(tokens) if masked[i]], self._token_indexers)
Example #13
 def test_sanity_check_callback(self):
     model_with_bias = FakeModelForTestingNormalizationBiasVerification(
         use_bias=True)
     inst = Instance({"x": TensorField(torch.rand(3, 1, 4))})
     data_loader = SimpleDataLoader([inst, inst], 2)
     trainer = GradientDescentTrainer(
         model_with_bias,
         self.optimizer,
         data_loader,
         num_epochs=1,
         serialization_dir=self.TEST_DIR,
         callbacks=[SanityChecksCallback(serialization_dir=self.TEST_DIR)],
     )
     with pytest.raises(SanityCheckError):
         trainer.train()
Example #14
    def test_human_readable_repr(self):
        words_field = TextField([Token("hello")], {})
        label_field = LabelField(1, skip_indexing=True)
        instance1 = Instance({"words": words_field, "labels": label_field})

        assert type(instance1.human_readable_dict()) is dict
        assert instance1.human_readable_dict() == {"words": ["hello"], "labels": 1}

        instance1_human_readable_dict = instance1.human_readable_dict()
        array = TensorField(numpy.asarray([1.0, 1, 1]))
        array_human_readable_dict = {
            "shape": [3],
            "element_mean": 1.0,
            "element_std": 0,
            "type": "float64",
        }
        instance2 = Instance({"words": words_field, "labels": label_field, "tensor": array})
        instance1_human_readable_dict["tensor"] = array_human_readable_dict
        assert instance1_human_readable_dict == instance2.human_readable_dict()
Example #15
    def test_sanity_check_default(self):
        model_with_bias = FakeModelForTestingNormalizationBiasVerification(use_bias=True)
        inst = Instance({"x": TensorField(torch.rand(3, 1, 4))})
        data_loader = SimpleDataLoader([inst, inst], 2)
        trainer = GradientDescentTrainer.from_partial_objects(
            model_with_bias,
            serialization_dir=self.TEST_DIR,
            data_loader=data_loader,
            num_epochs=1,
        )
        with pytest.raises(SanityCheckError):
            trainer.train()

        trainer = GradientDescentTrainer.from_partial_objects(
            model_with_bias,
            serialization_dir=self.TEST_DIR,
            data_loader=data_loader,
            num_epochs=1,
            run_sanity_checks=False,
        )

        # Check is not run, so no failure.
        trainer.train()
Example #16
    def text_to_instance(self, sentence: str, labels: List[str] = None):
        texts = [t.text for t in self.tokenizer.tokenize(sentence)]

        ent_start_position = texts.index(ENT)
        ent_end_position = list_rindex(texts, ENT)

        tokens = [Token(t) for t in texts]
        text_field = TextField(tokens, token_indexers=self.token_indexers)
        fields = {
            "word_ids": text_field,
            "entity_span": SpanField(ent_start_position, ent_end_position, text_field),
            "input_sentence": MetadataField(sentence),
        }

        if labels is not None:
            fields["labels"] = MultiLabelField(labels)

        if self.use_entity_feature:
            fields["entity_ids"] = TensorField(np.array([self.entity_id]))

        return Instance(fields)
Example #17
    def _read(self, file_path: str):
        file_path = cached_path(file_path, extract_archive=True)
        files_in_split = set()
        i = 0
        with open(file_path, "r") as f:
            for i, line in enumerate(f):
                if self.max_instances is not None and i * 5 >= self.max_instances:
                    break
                files_in_split.add(line.rstrip("\n"))

        caption_dicts = []
        for filename in sorted(os.listdir(self.data_dir)):
            if filename.split(".")[0] in files_in_split:
                full_file_path = os.path.join(self.data_dir, filename)
                caption_dicts.append(get_caption_data(full_file_path))

        processed_images: Iterable[Optional[Tuple[Tensor, Tensor,
                                                  Optional[Tensor],
                                                  Optional[Tensor]]]]
        filenames = [
            f"{caption_dict['image_id']}.jpg" for caption_dict in caption_dicts
        ]
        try:
            processed_images = self._process_image_paths(
                self.images[filename]
                for filename in tqdm.tqdm(filenames, desc="Processing images"))
        except KeyError as e:
            missing_id = e.args[0]
            raise KeyError(
                missing_id,
                f"We could not find an image with the id {missing_id}. "
                "Because of the size of the image datasets, we don't download them automatically. "
                "Please go to https://shannon.cs.illinois.edu/DenotationGraph/, download the datasets you need, "
                "and set the image_dir parameter to point to your download location. This dataset "
                "reader does not care about the exact directory structure. It finds the images "
                "wherever they are.",
            )

        features_list = []
        averaged_features_list = []
        coordinates_list = []
        masks_list = []
        for features, coords, _, _ in processed_images:
            features_list.append(TensorField(features))
            averaged_features_list.append(torch.mean(features, dim=0))
            coordinates_list.append(TensorField(coords))
            masks_list.append(
                ArrayField(
                    features.new_ones((features.shape[0], ), dtype=torch.bool),
                    padding_value=False,
                    dtype=torch.bool,
                ))

        # Validation instances are a 1000-way multiple choice,
        # one for each image in the validation set.
        if self.is_evaluation:
            for image_index in range(len(caption_dicts)):
                caption_dict = caption_dicts[image_index]
                for caption_index in range(len(caption_dict["captions"])):
                    instance = self.text_to_instance(
                        caption_dicts=caption_dicts,
                        image_index=image_index,
                        caption_index=caption_index,
                        features_list=features_list,
                        coordinates_list=coordinates_list,
                        masks_list=masks_list,
                        label=image_index,
                    )

                    if instance is not None:
                        yield instance
        else:
            # Shape: (num_images, image_dimension)
            averaged_features = torch.stack(averaged_features_list, dim=0)
            del averaged_features_list

            # Shape: (num_images, num_captions_per_image = 5, caption_dimension)
            caption_tensor = self.get_caption_features(caption_dicts)

            for image_index, caption_dict in enumerate(caption_dicts):
                for caption_index in range(len(caption_dict["captions"])):
                    hard_negative_features, hard_negative_coordinates = self.get_hard_negatives(
                        image_index,
                        caption_index,
                        caption_dicts,
                        averaged_features,
                        features_list,
                        coordinates_list,
                        caption_tensor,
                    )

                    instance = self.text_to_instance(
                        caption_dicts=caption_dicts,
                        image_index=image_index,
                        caption_index=caption_index,
                        features_list=features_list,
                        coordinates_list=coordinates_list,
                        masks_list=masks_list,
                        hard_negative_features=hard_negative_features,
                        hard_negative_coordinates=hard_negative_coordinates,
                    )

                    if instance is not None:
                        yield instance
Example #18
 def _read(self, file_path: str):
     for i in range(10):
         yield Instance({"x": TensorField(torch.tensor([i]))})
Example #19
 def test_len_works_with_scalar(self):
     array = TensorField(numpy.asarray(42))
     assert len(array) == 1
Example #20
    def data_to_instance(self, words: List[str], labels: List[str],
                         sentence_boundaries: List[int], doc_index: str):
        if self.tokenizer is None:
            tokens = [[Token(w)] for w in words]
        else:
            tokens = [self.tokenizer.tokenize(w) for w in words]
        subwords = [sw for token in tokens for sw in token]

        subword2token = list(
            itertools.chain(*[[i] * len(token)
                              for i, token in enumerate(tokens)]))
        token2subword = [0] + list(
            itertools.accumulate(len(token) for token in tokens))
        subword_start_positions = frozenset(token2subword)
        subword_sentence_boundaries = [
            sum(len(token) for token in tokens[:p])
            for p in sentence_boundaries
        ]

        # extract entities from IOB tags
        # we need to pass sentence by sentence
        entities: List[Entity] = []
        for s, e in zip(sentence_boundaries[:-1], sentence_boundaries[1:]):
            for ent in Entities([labels[s:e]], scheme=IOB1).entities[0]:
                ent.start += s
                ent.end += s
                entities.append(ent)

        span_to_entity_label: Dict[Tuple[int, int], str] = dict()
        for ent in entities:
            subword_start = token2subword[ent.start]
            subword_end = token2subword[ent.end]
            span_to_entity_label[(subword_start, subword_end)] = ent.tag

        # split data according to sentence boundaries
        for n in range(len(subword_sentence_boundaries) - 1):
            # process (sub) words
            doc_sent_start, doc_sent_end = subword_sentence_boundaries[n:n + 2]
            assert doc_sent_end - doc_sent_start < self.max_num_subwords

            left_length = doc_sent_start
            right_length = len(subwords) - doc_sent_end
            sentence_length = doc_sent_end - doc_sent_start
            half_context_length = int(
                (self.max_num_subwords - sentence_length) / 2)

            if left_length < right_length:
                left_context_length = min(left_length, half_context_length)
                right_context_length = min(
                    right_length, self.max_num_subwords - left_context_length -
                    sentence_length)
            else:
                right_context_length = min(right_length, half_context_length)
                left_context_length = min(
                    left_length, self.max_num_subwords - right_context_length -
                    sentence_length)

            doc_offset = doc_sent_start - left_context_length
            word_ids = subwords[doc_offset:doc_sent_end + right_context_length]

            if isinstance(self.tokenizer, PretrainedTransformerTokenizer):
                word_ids = self.tokenizer.add_special_tokens(word_ids)

            # process entities
            entity_start_positions = []
            entity_end_positions = []
            entity_ids = []
            entity_position_ids = []
            original_entity_spans = []
            labels = []

            for entity_start in range(left_context_length,
                                      left_context_length + sentence_length):
                doc_entity_start = entity_start + doc_offset
                if doc_entity_start not in subword_start_positions:
                    continue
                for entity_end in range(
                        entity_start + 1,
                        left_context_length + sentence_length + 1):
                    doc_entity_end = entity_end + doc_offset
                    if doc_entity_end not in subword_start_positions:
                        continue

                    if entity_end - entity_start > self.max_mention_length:
                        continue

                    entity_start_positions.append(entity_start + 1)
                    entity_end_positions.append(entity_end)
                    entity_ids.append(self.entity_id)

                    position_ids = list(range(entity_start + 1,
                                              entity_end + 1))
                    position_ids += [-1] * (self.max_mention_length -
                                            entity_end + entity_start)
                    entity_position_ids.append(position_ids)

                    original_entity_spans.append(
                        (subword2token[doc_entity_start],
                         subword2token[doc_entity_end - 1] + 1))
                    labels.append(
                        span_to_entity_label.pop(
                            (doc_entity_start, doc_entity_end), NON_ENTITY))

            # split instances
            split_size = math.ceil(len(entity_ids) / self.max_entity_length)
            for i in range(split_size):
                entity_size = math.ceil(len(entity_ids) / split_size)
                start = i * entity_size
                end = start + entity_size
                fields = {
                    "word_ids": TextField(word_ids, token_indexers=self.token_indexers),
                    "entity_start_positions": TensorField(np.array(entity_start_positions[start:end])),
                    "entity_end_positions": TensorField(np.array(entity_end_positions[start:end])),
                    "original_entity_spans": TensorField(
                        np.array(original_entity_spans[start:end]), padding_value=-1
                    ),
                    "labels": ListField([LabelField(l) for l in labels[start:end]]),
                    "doc_id": MetadataField(doc_index),
                    "input_words": MetadataField(words),
                }

                if self.use_entity_feature:
                    fields.update({
                        "entity_ids": TensorField(np.array(entity_ids[start:end]), padding_value=0),
                        "entity_position_ids": TensorField(np.array(entity_position_ids[start:end])),
                    })

                yield Instance(fields)

        assert len(span_to_entity_label) == 0
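To make the context-window balancing above concrete, here is a small worked example with illustrative numbers (they are not the reader's defaults): the shorter context side is kept whole and the remaining budget goes to the longer side.

max_num_subwords = 10  # illustrative value, not the reader's default
left_length, right_length, sentence_length = 2, 8, 4

half_context_length = int((max_num_subwords - sentence_length) / 2)  # 3
if left_length < right_length:
    left_context_length = min(left_length, half_context_length)  # 2
    right_context_length = min(
        right_length, max_num_subwords - left_context_length - sentence_length
    )  # 4
assert (left_context_length, right_context_length) == (2, 4)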
Example #21
    def text_to_instance(
        self,
        source_string: str,
        target_string: str = None,
        weight: float = None,
    ) -> Instance:  # type: ignore
        """
        Turn raw source string and target string into an `Instance`.

        # Parameters

        source_string : `str`, required

        target_string : `str`, optional (default = `None`)

        weight : `float`, optional (default = `None`)
            An optional weight to assign to this instance when calculating the loss in
            [CopyNetSeq2Seq.forward()](../../models/copynet_seq2seq/#forward.parameters).

        # Returns

        `Instance`
            See the above for a description of the fields that the instance will contain.
        """

        tokenized_source = self._source_tokenizer.tokenize(source_string)
        if not tokenized_source:
            # If the tokenized source is empty, it will cause issues downstream.
            raise ValueError(
                f"source tokenizer produced no tokens from source '{source_string}'"
            )
        source_field = TextField(tokenized_source)

        # For each token in the source sentence, we keep track of the matching token
        # in the target sentence (which will be the OOV symbol if there is no match).
        source_to_target_field = NamespaceSwappingField(
            tokenized_source, self._target_namespace)

        meta_fields = {"source_tokens": [x.text for x in tokenized_source]}
        fields_dict = {
            "source_tokens": source_field,
            "source_to_target": source_to_target_field
        }

        if target_string is not None:
            tokenized_target = self._target_tokenizer.tokenize(target_string)
            tokenized_target.insert(0, Token(START_SYMBOL))
            tokenized_target.append(Token(END_SYMBOL))
            target_field = TextField(tokenized_target)

            fields_dict["target_tokens"] = target_field
            meta_fields["target_tokens"] = [
                y.text for y in tokenized_target[1:-1]
            ]
            source_and_target_token_ids = self._tokens_to_ids(tokenized_source + tokenized_target)
            source_token_ids = source_and_target_token_ids[: len(tokenized_source)]
            fields_dict["source_token_ids"] = TensorField(torch.tensor(source_token_ids))
            target_token_ids = source_and_target_token_ids[len(tokenized_source) :]
            fields_dict["target_token_ids"] = TensorField(torch.tensor(target_token_ids))
        else:
            source_token_ids = self._tokens_to_ids(tokenized_source)
            fields_dict["source_token_ids"] = TensorField(
                torch.tensor(source_token_ids))

        fields_dict["metadata"] = MetadataField(meta_fields)

        if weight is not None:
            fields_dict["weight"] = TensorField(
                torch.tensor(float(weight), dtype=torch.float))

        return Instance(fields_dict)
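The reason source and target token ids are computed from the concatenated token list is that matching source and target tokens receive the same id. A hedged sketch of what such a shared-id mapping could look like (illustrative only; the real _tokens_to_ids may differ):

from typing import Dict, List

def tokens_to_ids(texts: List[str]) -> List[int]:
    # assign each distinct token text a small integer, shared across source and target
    ids: Dict[str, int] = {}
    return [ids.setdefault(text.lower(), len(ids)) for text in texts]

assert tokens_to_ids(["the", "cat", "the"]) == [0, 1, 0]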
Example #22
 def test_printing_doesnt_crash(self):
     array = TensorField(numpy.ones([2, 3]), padding_value=-1)
     print(array)
Example #23
    def _read(self, file_path: str):
        # if `file_path` is a URL, redirect to the cache
        file_path = cached_path(file_path)

        logger.info("Reading file at %s", file_path)

        # load data from csv file
        dataset = pd.read_csv(file_path, dtype=str, keep_default_na=False)

        logger.info("Reading the dataset")

        # for testing
        normalize_logits = False

        # for error catching
        flag = True
        count_total = 0
        count_mismatch = 0
        random_logits = False
        # generates an iterable of Instances
        generator_tqdm = Tqdm.tqdm(dataset.iterrows())
        for i, datapoint in generator_tqdm:
            # for i, datapoint in dataset.iterrows():
            count_total += 1
            try:
                # context
                paragraph = datapoint.at["context_text"]
                tokenized_paragraph = self._tokenizer.tokenize(paragraph)

                # query
                question_text = datapoint.at["question_text"].strip().replace(
                    "\n", "")

                # because we are using squad 1.1
                is_impossible = False

                # answer
                answer_texts = [datapoint.at["answer_text"]]
                span_starts = [int(datapoint.at["start_position_character"])]
                span_ends = [
                    start + len(answer)
                    for start, answer in zip(span_starts, answer_texts)
                ]

                # id
                additional_metadata = {"id": datapoint.at["qas_id"]}

                # create Instance (without teacher logits)
                instance = self.text_to_instance(
                    question_text,
                    paragraph,
                    is_impossible=is_impossible,
                    char_spans=zip(span_starts, span_ends),
                    answer_texts=answer_texts,
                    passage_tokens=tokenized_paragraph,
                    additional_metadata=additional_metadata,
                )

                # teacher logits
                span_start_teacher_logits = np.fromstring(
                    datapoint.at["start_logits"].replace("\n", "").strip("[]"),
                    sep=" ")
                span_end_teacher_logits = np.fromstring(
                    datapoint.at["end_logits"].replace("\n", "").strip("[]"),
                    sep=" ")

                if normalize_logits:
                    span_start_teacher_logits = span_start_teacher_logits / (
                        1e-6 + np.max(np.abs(span_start_teacher_logits)))
                    span_end_teacher_logits = span_end_teacher_logits / (
                        1e-6 + np.max(np.abs(span_end_teacher_logits)))
                # for testing
                if random_logits:
                    span_start_teacher_logits = np.random.random(
                        span_start_teacher_logits.shape)
                    span_end_teacher_logits = np.random.random(
                        span_end_teacher_logits.shape)

                # add teacher logits to Instance
                instance.add_field(
                    "span_start_teacher_logits",
                    TensorField(
                        torch.tensor(span_start_teacher_logits,
                                     dtype=torch.float32)))
                instance.add_field(
                    "span_end_teacher_logits",
                    TensorField(
                        torch.tensor(span_end_teacher_logits,
                                     dtype=torch.float32)))

                # check to make sure length of teacher logits is correct
                assert len(span_start_teacher_logits) == len(
                    tokenized_paragraph)
                assert len(span_end_teacher_logits) == len(tokenized_paragraph)

            except AssertionError:
                # if the length of the teacher logits is incorrect, save information about the problematic data point
                count_mismatch += 1
                if flag:
                    with open("mismatch_errors.csv", "w") as fp:
                        flag = False
                        writer = csv.writer(fp)
                        writer.writerow([
                            "id", "text", "len tokenized_paragraph",
                            "len span_start_teacher_logits",
                            "len span_end_teacher_logits"
                        ])
                        writer.writerow([
                            additional_metadata["id"], paragraph,
                            len(tokenized_paragraph),
                            len(span_start_teacher_logits),
                            len(span_end_teacher_logits)
                        ])
                else:
                    with open("mismatch_errors.csv", "a") as fp:
                        writer = csv.writer(fp)
                        writer.writerow([
                            additional_metadata["id"], paragraph,
                            len(tokenized_paragraph),
                            len(span_start_teacher_logits),
                            len(span_end_teacher_logits)
                        ])

                # exclude problematic data point from training
                instance = None

            except:
                # something else went wrong; exclude the data point from training
                print("ERROR! skipped datapoint:", i, datapoint.at["qas_id"],
                      answer_texts, span_starts)
                instance = None

            # if nothing went wrong, yield the instance that was generated
            if instance is not None:
                yield instance

        if not flag:
            print("Number of logit length mismatches (data points skipped): ",
                  count_mismatch, "/", count_total)
Example #24
    def text_to_instance(  # type: ignore
        self,
        caption_dicts: List[Dict[str, Any]],
        image_index: int,
        caption_index: int,
        features_list: List[TensorField] = [],
        coordinates_list: List[TensorField] = [],
        masks_list: List[TensorField] = [],
        hard_negative_features: Optional[Tensor] = None,
        hard_negative_coordinates: Optional[Tensor] = None,
        label: int = 0,
    ):
        if self.is_evaluation:
            caption_fields = [
                TextField(
                    self._tokenizer.tokenize(
                        caption_dicts[image_index]["captions"][caption_index]),
                    None,
                )
            ] * len(caption_dicts)

            return Instance({
                "caption": ListField(caption_fields),
                "box_features": ListField(features_list),
                "box_coordinates": ListField(coordinates_list),
                "box_mask": ListField(masks_list),
                "label": LabelField(label, skip_indexing=True),
            })

        else:
            # 1. Correct answer
            caption_field = TextField(
                self._tokenizer.tokenize(
                    caption_dicts[image_index]["captions"][caption_index]),
                None,
            )
            caption_fields = [caption_field]
            features = [features_list[image_index]]
            coords = [coordinates_list[image_index]]
            masks = [masks_list[image_index]]

            # 2. Correct image, random wrong caption
            random_image_index = randint(0, len(caption_dicts) - 2)
            if random_image_index == image_index:
                random_image_index += 1
            random_caption_index = randint(0, 4)

            caption_fields.append(
                TextField(
                    self._tokenizer.tokenize(
                        caption_dicts[random_image_index]["captions"]
                        [random_caption_index]),
                    None,
                ))
            features.append(features_list[image_index])
            coords.append(coordinates_list[image_index])
            masks.append(masks_list[image_index])

            # 3. Random wrong image, correct caption
            wrong_image_index = randint(0, len(features_list) - 2)
            if wrong_image_index == image_index:
                wrong_image_index += 1

            caption_fields.append(caption_field)
            features.append(features_list[wrong_image_index])
            coords.append(coordinates_list[wrong_image_index])
            masks.append(masks_list[wrong_image_index])

            # 4. Hard negative image, correct caption
            caption_fields.append(caption_field)
            features.append(TensorField(hard_negative_features))
            coords.append(TensorField(hard_negative_coordinates))
            masks.append(
                ArrayField(
                    hard_negative_features.new_ones(
                        (hard_negative_features.shape[0], ),
                        dtype=torch.bool,
                    ),
                    padding_value=False,
                    dtype=torch.bool,
                ))

            return Instance({
                "caption": ListField(caption_fields),
                "box_features": ListField(features),
                "box_coordinates": ListField(coords),
                "box_mask": ListField(masks),
                "label": LabelField(label, skip_indexing=True),
            })
Example #25
 def test_as_tensor_works_with_scalar(self):
     array = TensorField(numpy.asarray(42))
     returned_tensor = array.as_tensor(array.get_padding_lengths())
     current_tensor = numpy.asarray(42)
     numpy.testing.assert_array_equal(returned_tensor, current_tensor)
Example #26
 def test_as_tensor_with_scalar_keeps_dtype(self):
     array = TensorField(numpy.asarray(42, dtype=numpy.float32))
     returned_tensor = array.as_tensor(array.get_padding_lengths())
     assert returned_tensor.dtype == torch.float32