def test_label_field_can_index_with_vocab(self):
        vocab = Vocabulary()
        vocab.add_token_to_namespace("entailment", namespace="labels")
        vocab.add_token_to_namespace("contradiction", namespace="labels")
        vocab.add_token_to_namespace("neutral", namespace="labels")

        label = LabelField("entailment")
        label.index(vocab)
        tensor = label.as_tensor(label.get_padding_lengths()).data.cpu().numpy()
        numpy.testing.assert_array_almost_equal(tensor, numpy.array([0]))

    def test_label_field_can_index_with_vocab(self):
        vocab = Vocabulary()
        vocab.add_token_to_namespace("entailment", namespace="labels")
        vocab.add_token_to_namespace("contradiction", namespace="labels")
        vocab.add_token_to_namespace("neutral", namespace="labels")

        label = LabelField("entailment")
        label.index(vocab)
        tensor = label.as_tensor(label.get_padding_lengths())
        assert tensor.item() == 0
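
# A minimal sketch (not from the original tests): the same round trip with a custom
# label namespace. AllenNLP namespaces ending in "labels" are non-padded by default,
# so the first token added gets id 0.
from allennlp.data import Vocabulary
from allennlp.data.fields import LabelField

vocab = Vocabulary()
vocab.add_token_to_namespace("NP", namespace="span_labels")
vocab.add_token_to_namespace("VP", namespace="span_labels")
label = LabelField("VP", label_namespace="span_labels")
label.index(vocab)
assert label.as_tensor(label.get_padding_lengths()).item() == 1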
Example #3
 def test_label_field_empty_field_works(self):
     label = LabelField("test")
     empty_label = label.empty_field()
     assert empty_label.label == -1
Example #4
 def test_label_field_raises_with_non_integer_labels_and_no_indexing(self):
     with pytest.raises(ConfigurationError):
         _ = LabelField("non integer field", skip_indexing=True)
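
# For contrast, a small sketch (not from the original tests): integer labels with
# skip_indexing=True bypass the vocabulary entirely.
from allennlp.data.fields import LabelField

label = LabelField(2, skip_indexing=True)
assert label.as_tensor(label.get_padding_lengths()).item() == 2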
Example #5
 def test_pad_returns_one_hot_array(self):
     label = LabelField(5, num_labels=10)
     array = label.as_array(label.get_padding_lengths())
     numpy.testing.assert_array_almost_equal(
         array, numpy.array([0, 0, 0, 0, 0, 1, 0, 0, 0, 0]))
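
# The num_labels / as_array one-hot behavior above comes from an old AllenNLP
# release; later versions return a scalar index from as_tensor and leave one-hot
# encoding to the caller. A sketch of the equivalent with torch:
import torch.nn.functional as F
from allennlp.data.fields import LabelField

label = LabelField(5, skip_indexing=True)
index = label.as_tensor(label.get_padding_lengths())  # tensor(5)
one_hot = F.one_hot(index, num_classes=10)            # [0, 0, 0, 0, 0, 1, 0, 0, 0, 0]
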
    def text_to_instance(
            self,  # type: ignore
            premises: Union[List[str], List[List[str]]],
            choices: List[str],
            coverage: List[List[float]],
            label: int = None,
            question: str = None) -> Instance:
        number_of_choices = len(choices)
        if isinstance(premises[0], str):
            premises = [premises] * number_of_choices

        # create an empty dictionary to store the input
        fields: Dict[str, Field] = {}

        all_links = []
        all_link_token_ids = []

        if len(coverage) != len(choices):
            logger.error("the dimensions of coverage and choices did not match")
            raise ValueError("coverage and choices must have the same length")

        max_len = 0
        max_premises = 0
        for arr, p in zip(coverage, premises):
            if len(arr) != len(p):
                logger.error("the dimensions of coverage and premises did not match")
                raise ValueError("coverage and premises must have the same length")
            max_premises = max(max_premises, len(p))
            max_len = max(max_len, max(len(a) for a in arr))

        # padding
        np_coverage = np.zeros([len(coverage), max_premises, max_len])
        for c_idx in range(len(coverage)):
            for p_idx in range(len(coverage[c_idx])):
                np_coverage[
                    c_idx, p_idx,
                    0:len(coverage[c_idx][p_idx])] = coverage[c_idx][p_idx]

        fields['coverage'] = ArrayField(np_coverage)

        for premise, hypothesis in zip(premises, choices):

            # Two major keys:
            #   ph: [CLS] all_premises [SEP] hypothesis [SEP]
            # with two different segment ids; all premise sentences are joined.
            links_segment_2d = []
            links_2d = []

            for i in range(0, len(premise)):
                tokenized_links_field = []
                type_ids_of_links = []
                for j in range(0, len(premise)):
                    if i == j:
                        continue
                    else:
                        if question is None:
                            pp_tokens, pp_token_type_ids = self.bert_features_from_qa(
                                question=premise[i],
                                answer=hypothesis,
                                context=premise[j])
                        else:
                            pp_tokens, pp_token_type_ids = self.bert_features_from_qa(
                                question=question,
                                context2=premise[j],
                                answer=hypothesis,
                                context=premise[i])
                        pp_tokens_field = TextField(pp_tokens,
                                                    self._token_indexers)
                        tokenized_links_field.append(pp_tokens_field)
                        type_ids_of_links.append(
                            SequenceLabelField(pp_token_type_ids,
                                               pp_tokens_field))
                links_2d.append(ListField(tokenized_links_field))
                links_segment_2d.append(ListField(type_ids_of_links))

            if len(premise) >= 2:
                all_links.append(ListField(links_2d))
                all_link_token_ids.append(ListField(links_segment_2d))
            else:
                # add an empty list field
                empty_tokens_field = [TextField([], self._token_indexers)]
                empty_type_ids_of_links = [
                    SequenceLabelField([], empty_tokens_field[0])
                ]
                # ListField expects a list of fields, so wrap the inner ListField in a list
                all_links.append(ListField([ListField(empty_tokens_field)]))
                all_link_token_ids.append(
                    ListField([ListField(empty_type_ids_of_links)]))

        if label is not None:
            fields['label'] = LabelField(label, skip_indexing=True)

        fields['links_tokens'] = ListField(all_links)
        fields['links_token_type_ids'] = ListField(all_link_token_ids)

        return Instance(fields)
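
# A toy sketch (not from the original reader) of how the nested ListField structure
# built above pads and batches; assumes AllenNLP 1.x+, where Batch is exported from
# allennlp.data, and the exact tensor key nesting varies by version.
from allennlp.data import Batch, Instance, Vocabulary
from allennlp.data.tokenizers import Token
from allennlp.data.fields import ListField, TextField
from allennlp.data.token_indexers import SingleIdTokenIndexer

indexers = {"tokens": SingleIdTokenIndexer()}

def sent(*words):
    return TextField([Token(w) for w in words], indexers)

inner = ListField([sent("a", "b"), sent("c")])  # links for one premise
instance = Instance({"links_tokens": ListField([ListField([inner])])})
vocab = Vocabulary.from_instances([instance])
instance.index_fields(vocab)
print(Batch([instance]).as_tensor_dict()["links_tokens"])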
Example #7
    def make_marginal_drop_instance(question_tokens: List[Token],
                                    passage_tokens: List[Token],
                                    number_tokens: List[Token],
                                    number_indices: List[int],
                                    token_indexers: Dict[str, TokenIndexer],
                                    passage_text: str,
                                    answer_info: Dict[str, Any] = None,
                                    additional_metadata: Dict[str, Any] = None) -> Instance:
        additional_metadata = additional_metadata or {}
        fields: Dict[str, Field] = {}
        passage_offsets = [(token.idx, token.idx + len(token.text)) for token in passage_tokens]
        question_offsets = [(token.idx, token.idx + len(token.text)) for token in question_tokens]

        # This is separate so we can reference it later with a known type.
        passage_field = TextField(passage_tokens, token_indexers)
        question_field = TextField(question_tokens, token_indexers)
        fields["passage"] = passage_field
        fields["question"] = question_field
        number_index_fields: List[Field] = [IndexField(index, passage_field) for index in number_indices]
        fields["number_indices"] = ListField(number_index_fields)
        # This field is not actually required by the model; it is only used to create the
        # `answer_as_plus_minus_combinations` field, which is a `SequenceLabelField`.
        # We cannot use the `number_indices` field for that, because its `ListField` will
        # not be empty when we want to create a new empty field, which would lead to an error.
        numbers_in_passage_field = TextField(number_tokens, token_indexers)
        metadata = {"original_passage": passage_text,
                    "passage_token_offsets": passage_offsets,
                    "question_token_offsets": question_offsets,
                    "question_tokens": [token.text for token in question_tokens],
                    "passage_tokens": [token.text for token in passage_tokens],
                    "number_tokens": [token.text for token in number_tokens],
                    "number_indices": number_indices}
        if answer_info:
            metadata["answer_texts"] = answer_info["answer_texts"]

            passage_span_fields: List[Field] = \
                [SpanField(span[0], span[1], passage_field) for span in answer_info["answer_passage_spans"]]
            if not passage_span_fields:
                passage_span_fields.append(SpanField(-1, -1, passage_field))
            fields["answer_as_passage_spans"] = ListField(passage_span_fields)

            question_span_fields: List[Field] = \
                [SpanField(span[0], span[1], question_field) for span in answer_info["answer_question_spans"]]
            if not question_span_fields:
                question_span_fields.append(SpanField(-1, -1, question_field))
            fields["answer_as_question_spans"] = ListField(question_span_fields)

            add_sub_signs_field: List[Field] = []
            for signs_for_one_add_sub_expression in answer_info["signs_for_add_sub_expressions"]:
                add_sub_signs_field.append(SequenceLabelField(signs_for_one_add_sub_expression,
                                                              numbers_in_passage_field))
            if not add_sub_signs_field:
                add_sub_signs_field.append(SequenceLabelField([0] * len(number_tokens),
                                                              numbers_in_passage_field))
            fields["answer_as_add_sub_expressions"] = ListField(add_sub_signs_field)

            count_fields: List[Field] = [LabelField(count_label, skip_indexing=True)
                                         for count_label in answer_info["counts"]]
            if not count_fields:
                count_fields.append(LabelField(-1, skip_indexing=True))
            fields["answer_as_counts"] = ListField(count_fields)

        metadata.update(additional_metadata)
        fields["metadata"] = MetadataField(metadata)
        return Instance(fields)
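
# The reader above keeps every ListField non-empty by inserting sentinel fields:
# SpanField(-1, -1, ...) and LabelField(-1, skip_indexing=True) both mean "no answer".
# A minimal sketch of that convention (token texts are made up):
from allennlp.data.tokenizers import Token
from allennlp.data.fields import ListField, SpanField, TextField
from allennlp.data.token_indexers import SingleIdTokenIndexer

passage = TextField([Token(w) for w in "costs rose by seven percent".split()],
                    {"tokens": SingleIdTokenIndexer()})
no_answer_spans = ListField([SpanField(-1, -1, passage)])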
Example #8
    def __getitem__(self, index):
        item = json.loads(self.items[index])
        instance_dict = {}
        dets2use, old_det_to_new_ind = self._get_dets_to_use(item)
        vcr_tokenizer = VCRTokenizer(old_det_to_new_ind, item['objects'], self.add_image_as_a_box)

        ################################## Q2A data processing ##################################

        with h5py.File(self.h5fn_answer, 'r') as h5:
            # each value is an (n, 768) array; keys look like 'ctx_answer0'..'ctx_answer3'
            # and 'answer_answer0'..'answer_answer3' (the rationale file uses
            # 'ctx_rationale*' and 'answer_rationale*' keys)
            grp_items_answer = {k: np.array(v, dtype=np.float16) for k, v in h5[str(index)].items()}

        if 'endingonly' not in self.embs_to_load:
            questions_answer_tokenized, question_answer_tags = zip(*[vcr_tokenizer(
                item['question'],
                grp_items_answer[f'ctx_answer{i}']
            ) for i in range(4)])
            instance_dict['question_answer'] = ListField(list(questions_answer_tokenized))
            instance_dict['question_answer_tags'] = ListField(list(question_answer_tags))

        answers_tokenized, answer_tags = zip(*[vcr_tokenizer(
            answer,
            grp_items_answer[f'answer_answer{i}']
        ) for i, answer in enumerate(item['answer_choices'])])

        instance_dict['answers'] = ListField(list(answers_tokenized))
        instance_dict['answer_tags'] = ListField(list(answer_tags))


        ################################## QA2R data processing ##################################
        with h5py.File(self.h5fn_rationale, 'r') as h5_rationale:
            grp_items_rationale = {k: np.array(v, dtype=np.float16) for k, v in h5_rationale[str(index)].items()}

        condition_key = self.conditioned_answer_choice if self.split == "test" else ""
        conditioned_label = item['answer_label'] if self.split != 'test' else self.conditioned_answer_choice
        question_rationale = item['question'] + item['answer_choices'][conditioned_label]

        if 'endingonly' not in self.embs_to_load:
            questions_rationale_tokenized, question_rationale_tags = zip(*[vcr_tokenizer(
                question_rationale,
                grp_items_rationale[f'ctx_rationale{condition_key}{i}']
            ) for i in range(4)])
            instance_dict['question_rationale'] = ListField(list(questions_rationale_tokenized))
            instance_dict['question_rationale_tags'] = ListField(list(question_rationale_tags))

        rationale_tokenized, rationale_tags = zip(*[vcr_tokenizer(
            rationale,
            grp_items_rationale[f'answer_rationale{condition_key}{i}']
        ) for i, rationale in enumerate(item['rationale_choices'])])

        instance_dict['rationales'] = ListField(list(rationale_tokenized))
        instance_dict['rationale_tags'] = ListField(list(rationale_tags))

        ################################## metadata processing ##################################
        if self.split != 'test':
            instance_dict['answer_label'] = LabelField(item['answer_label'], skip_indexing=True)
            instance_dict['rationale_label'] = LabelField(item['rationale_label'], skip_indexing=True)
        # instance_dict['metadata'] = MetadataField({'annot_id': item['annot_id'], 'ind': index, 'movie': item['movie'],
        #                                            'img_fn': item['img_fn'],
        #                                            'question_number': item['question_number']})

        ################################## image processing ##################################
        with h5py.File(self.h5fn_image, 'r') as h5_features:
            # pytorch 1.1
            img_id = item['img_id'].split('-')[-1]
            group_image = {k: np.array(v) for k, v in h5_features[img_id].items()}
            image_feature = group_image['features'][[0]+(dets2use+1).tolist()]
            tag_boxes = group_image['boxes']
        zeros = np.zeros((1,2048), dtype=np.float32)
        if self.add_image_as_a_box:
            image_feature = np.concatenate((zeros, image_feature), axis=0)
        else:
            image_feature = np.concatenate((zeros, image_feature[1:]), axis=0)
        instance_dict['image_features'] = ArrayField(image_feature, padding_value=0)

        ###################################################################
        # Load boxes.
        with open(os.path.join(VCR_IMAGES_DIR, item['metadata_fn']), 'r') as f:
            metadata = json.load(f)

        # Chop off the final dimension, that's the confidence
        boxes = np.array(metadata['boxes'])[dets2use, :-1]
        obj_labels = [self.coco_obj_to_ind[item['objects'][i]] for i in dets2use.tolist()]
        if self.add_image_as_a_box:
            boxes = np.row_stack((boxes[0], boxes))
            obj_labels = [self.coco_obj_to_ind['__background__']] + obj_labels
        # the first object is 0
        boxes = np.row_stack((boxes[0], boxes))
        obj_labels = [81] + obj_labels

        instance_dict['boxes'] = ArrayField(boxes, padding_value=-1)
        instance_dict['objects'] = ListField([LabelField(x, skip_indexing=True) for x in obj_labels])
        assert np.all((boxes[:, 0] >= 0.) & (boxes[:, 0] < boxes[:, 2]))

        instance = Instance(instance_dict)
        instance.index_fields(self.vocab)
        return instance
Example #9
 def get_abstract_slot_value_field(slot_name, get_abstracted_value):
     abst_slot_name = "abst-%s" % slot_name
     namespace = get_slot_label_namespace(abst_slot_name)
     abst_slot_value = get_abstracted_value(question_label["questionSlots"][slot_name])
     return LabelField(label = abst_slot_value, label_namespace = namespace)
 def test_instances_must_have_homogeneous_fields(self):
     instance1 = Instance({"tag": (LabelField(1, skip_indexing=True))})
     instance2 = Instance({"words": TextField([Token("hello")], {})})
     with pytest.raises(ConfigurationError):
         _ = Batch([instance1, instance2])
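
# For contrast, a sketch of the passing case (assuming AllenNLP 1.x+, where Batch is
# exported from allennlp.data): instances with the same field names batch fine.
from allennlp.data import Batch, Instance
from allennlp.data.fields import LabelField

instance1 = Instance({"tag": LabelField(1, skip_indexing=True)})
instance2 = Instance({"tag": LabelField(2, skip_indexing=True)})
batch = Batch([instance1, instance2])  # homogeneous fields, no ConfigurationError
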
    def text_to_instance(
            self,  # type: ignore
            premise: List[Tuple[str, float]],  # Important type information
            hypothesis: str,
            pid: str = None,
            label: str = None) -> Instance:

        fields: Dict[str, Field] = {}

        if self.shuffle_sentences:
            # Potential improvement: shuffle the input sentences. Consider disabling
            # this for the last several epochs.
            random.shuffle(premise)

        premise_prob_list = []
        premise_tokens_list = []

        for premise_sent, prob in premise:
            tokenized_cur_sent = self.bert_servant.tokenize(
                premise_sent, modify_from_corenlp=True)
            # cur_sent_ids = self.bert_servant.tokens_to_ids(tokenized_cur_sent)

            if self.max_l is not None:
                # truncate to max length (default 60)
                tokenized_cur_sent = tokenized_cur_sent[:self.max_l]

            premise_tokens_list.extend(tokenized_cur_sent)
            prob_value = np.ones(
                (len(tokenized_cur_sent), 1), dtype=np.float32) * prob
            premise_prob_list.append(prob_value)

        premise_prob = np.concatenate(premise_prob_list, axis=0)
        # premise_tokens_id_list = self.bert_servant.tokens_to_ids(premise_tokens_list)

        hypothesis_tokens_list = self.bert_servant.tokenize(
            hypothesis, modify_from_corenlp=True)

        # print("WTF!!!, p", len(premise_tokens_list))
        # print("WTF!!!, h", len(hypothesis_tokens_list))

        if self.max_l is not None:
            hypothesis_tokens_list = hypothesis_tokens_list[:self.max_l]

        hypothesis_prob = np.ones((len(hypothesis_tokens_list), 1),
                                  dtype=np.float32)

        assert len(premise_tokens_list) == len(premise_prob)
        assert len(hypothesis_tokens_list) == len(hypothesis_prob)

        paired_tokens_sequence = (['[CLS]'] + premise_tokens_list + ['[SEP]'] +
                                  hypothesis_tokens_list + ['[SEP]'])
        token_type_ids = ([0] * (2 + len(premise_tokens_list)) +
                          [1] * (1 + len(hypothesis_tokens_list)))

        paired_ids_seq = self.bert_servant.tokens_to_ids(
            paired_tokens_sequence)
        assert len(paired_ids_seq) == len(token_type_ids)
        fields['paired_sequence'] = BertIndexField(
            np.asarray(paired_ids_seq, dtype=np.int64))
        fields['paired_token_type_ids'] = BertIndexField(
            np.asarray(token_type_ids, dtype=np.int64))

        # end index is exclusive (important for later use)
        premise_span = (1, 1 + len(premise_tokens_list))
        hypothesis_span = (premise_span[1] + 1,
                           premise_span[1] + 1 + len(hypothesis_tokens_list))

        assert len(paired_ids_seq) == 1 + (premise_span[1] - premise_span[0]) + 1 + \
               (hypothesis_span[1] - hypothesis_span[0]) + 1

        fields['bert_premise_span'] = MetadataField(premise_span)
        fields['bert_hypothesis_span'] = MetadataField(hypothesis_span)

        fields['premise_probs'] = MetadataField(premise_prob)
        fields['hypothesis_probs'] = MetadataField(hypothesis_prob)

        if label:
            fields['label'] = LabelField(label, label_namespace='labels')

        if pid:
            fields['pid'] = IdField(pid)

        return Instance(fields)
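
# A quick check of the [CLS]/[SEP] layout and segment ids built above
# (token strings are illustrative):
premise_tokens = ['the', 'cat', 'sat']
hypothesis_tokens = ['a', 'cat']
seq = ['[CLS]'] + premise_tokens + ['[SEP]'] + hypothesis_tokens + ['[SEP]']
type_ids = [0] * (2 + len(premise_tokens)) + [1] * (1 + len(hypothesis_tokens))
assert len(seq) == len(type_ids)
# seq:      [CLS] the cat sat [SEP]  a  cat [SEP]
# type_ids:   0    0   0   0    0    1   1    1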
Example #12
    def text_to_instance(self,
                         content: str,
                         candidates: List[List[str]],
                         ground_truths: List[str] = None,
                         real_count: int = 1) -> Iterable[Instance]:
        splits = re.split(r'#idiom#', content)
        assert real_count + 1 == len(splits)
        assert real_count == len(candidates)
        split_tokens = [self.tokenizer.tokenize(item) for item in splits]
        for index, current_candidates in enumerate(candidates):
            before_part_tokens = [Token(token) for token in split_tokens[0]]
            for before_part in split_tokens[1:index + 1]:
                before_part_tokens += [Token('[UNK]')] + [
                    Token(token) for token in before_part
                ]
            after_part_tokens = [
                Token(token) for token in split_tokens[index + 1]
            ]
            for after_part in split_tokens[index + 2:]:
                after_part_tokens += [Token('[UNK]')] + [
                    Token(token) for token in after_part
                ]

            # mark the blank with a [MASK] token
            content_tokens = before_part_tokens + [Token('[MASK]')] + after_part_tokens

            # take at most max_seq_length tokens around the blank as input
            half_length = self.max_seq_length // 2
            if len(before_part_tokens) < half_length:
                start = 0
                end = min(
                    len(before_part_tokens) + 1 + len(after_part_tokens),
                    self.max_seq_length - 2)
            elif len(after_part_tokens) < half_length:
                end = len(before_part_tokens) + 1 + len(after_part_tokens)
                start = max(0, end - (self.max_seq_length - 2))
            else:
                start = len(before_part_tokens) + 3 - half_length
                end = len(before_part_tokens) + 1 + half_length

            content_tokens = content_tokens[start:end]

            # the passage containing the blank
            content_field = TextField(content_tokens,
                                      self.content_token_indexer)

            # position of the blank
            blank_index = content_tokens.index(Token("[MASK]"))
            blank_index_field = IndexField(blank_index, content_field)

            # candidate idioms
            candidate_tokens = [
                self.idiom_list.index(option) for option in current_candidates
            ]
            candidate_tokens = np.array(candidate_tokens)
            # np.long was removed from recent NumPy releases; np.int64 is equivalent here
            candidate_field = ArrayField(candidate_tokens, dtype=np.int64)

            fields = {
                "content": content_field,
                "blank_indices": blank_index_field,
                "candidates": candidate_field,
            }

            # metadata
            meta = {
                "content": ('[UNK]'.join(splits[:index + 1]) + "[MASK]" +
                            '[UNK]'.join(splits[index + 1:])),
                "candidates": current_candidates,
            }

            if ground_truths:
                label = current_candidates.index(ground_truths[index])
                fields["answer"] = LabelField(label, skip_indexing=True)
                meta["answer"] = ground_truths[index]

            fields["meta"] = MetadataField(meta)

            yield Instance(fields)
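
# A worked check of the truncation window above (numbers are made up; the window
# holds at most max_seq_length - 2 tokens, leaving room for [CLS] and [SEP]):
max_seq_length = 10
half_length = max_seq_length // 2

before, after = 20, 20            # long prefix and suffix: center on the [MASK]
start = before + 3 - half_length  # 18
end = before + 1 + half_length    # 26
assert end - start == max_seq_length - 2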
Example #13
    def text_to_instance(
        self,
        annotation_id: str,
        documents: Dict[str, List[str]],
        rationales: Dict[str, List[Tuple[int, int]]],
        query: str = None,
        label: str = None,
    ) -> Instance:  # type: ignore
        # pylint: disable=arguments-differ
        fields = {}

        tokens = []
        is_evidence = []

        document_to_span_map = {}

        for docid, docwords in documents.items():
            tokens += [Token(word) for word in docwords]
            document_to_span_map[docid] = (len(tokens) - len(docwords),
                                           len(tokens))

            tokens.append(Token("[SEP]"))

            rationale = [0] * len(docwords)
            if docid in rationales:
                for s, e in rationales[docid]:
                    for i in range(s, e):
                        rationale[i] = 1

            is_evidence += rationale + [1]

        if query is not None and not isinstance(query, list):
            query_words = query.split()
            tokens += [Token(word) for word in query_words]
            tokens.append(Token("[SEP]"))
            is_evidence += [1] * (len(query_words) + 1)

        always_keep_mask = [
            1 if t.text.upper() == "[SEP]" else 0 for t in tokens
        ]

        fields["document"] = TextField(tokens, self._token_indexers)
        fields["rationale"] = SequenceLabelField(
            is_evidence,
            sequence_field=fields["document"],
            label_namespace="evidence_labels")
        fields["kept_tokens"] = SequenceLabelField(
            always_keep_mask,
            sequence_field=fields["document"],
            label_namespace="kept_token_labels")

        metadata = {
            "annotation_id": annotation_id,
            "tokens": tokens,
            "document_to_span_map": document_to_span_map,
            "convert_tokens_to_instance": self.convert_tokens_to_instance,
        }

        fields["metadata"] = MetadataField(metadata)

        if label is not None:
            fields["label"] = LabelField(label, label_namespace="labels")

        return Instance(fields)
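
# A minimal sketch (not from the original reader) of the token/label alignment above:
# one evidence label per token, with [SEP] always marked as evidence.
from allennlp.data.tokenizers import Token
from allennlp.data.fields import SequenceLabelField, TextField
from allennlp.data.token_indexers import SingleIdTokenIndexer

tokens = [Token(w) for w in ["profits", "fell", "[SEP]"]]
document = TextField(tokens, {"tokens": SingleIdTokenIndexer()})
rationale = SequenceLabelField([0, 1, 1], document, label_namespace="evidence_labels")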
Example #14
 def text_to_instance(
         self,
         tokens: List[Token],
         spans: List[Tuple[str, Tuple[int, int]]],  # end ind is exclusive
         span_pairs: List[Tuple[str, str]],
         span_weights: List[float],
         task: str = None,
         span_labels: List[str] = None,
         span_pair_labels: List[str] = None,
         e2e: bool = False,
         **kwargs) -> Instance:
     text_field = TextField(tokens, token_indexers=self._token_indexers)
     # Spans must be ordered by the end index because we might need to
     # remove some of them during forward computation mainly because of
     # the length constraints introduced by models like BERT. In this case,
     # we hope all the removed spans are located at the end of the list.
     spans_ind = sorted(range(len(spans)),
                        key=lambda i: (spans[i][1][1], spans[i][1][0]))
     spans = [spans[i] for i in spans_ind]
     sid2ind = dict((s[0], i) for i, s in enumerate(spans))
     span_field = ListField([
         SpanField(sind, eind - 1, text_field, check_sentence=False)
         for sid, (sind, eind) in spans
     ])
     task = self._default_task if task is None else task
     task_field = LabelField(task, label_namespace='task_labels')
     if len(span_pairs) > 0:
         span_pair_field = ListField([
             ArrayField(
                 np.array([sid2ind[sid1], sid2ind[sid2]], dtype=np.int64),
                 padding_value=-1,  # the same as span field
                 dtype=np.int64) for sid1, sid2 in span_pairs
         ])
     else:
         span_pair_field = ListField([
             ArrayField(
                 np.array(
                     [-1, -1],
                     dtype=np.int64),  # use a padding sample as placeholder
                 padding_value=-1,
                 dtype=np.int64)
         ])
      assert len(spans) == len(span_weights), 'input and weights length inconsistent'
      # reorder to stay consistent with the sorted spans
      span_weights = [span_weights[i] for i in spans_ind]
     span_weights_field = ArrayField(np.array(span_weights,
                                              dtype=np.float32),
                                     padding_value=0,
                                     dtype=np.float32)
     fields: Dict[str, Field] = {
         'text': text_field,
         'spans': span_field,
         'task_labels': task_field,
         'span_pairs': span_pair_field,
         'span_weights': span_weights_field
     }
     if span_labels is not None:
         # TODO debug (consti label transformation)
         '''
         def consti_map(sp):
             if sp == 'NP' or sp == BratDoc.NEG_SPAN_LABEL:
                 return sp
             return 'S'
         if task == 'consti':
             span_labels = [consti_map(sp) for sp in span_labels]
         '''
          assert len(spans) == len(span_labels), 'input and label length inconsistent'
          # reorder to stay consistent with the sorted spans
          span_labels = [span_labels[i] for i in spans_ind]
         fields['span_labels'] = SequenceLabelField(
             span_labels,
             span_field,
             label_namespace='{}_span_labels'.format(task))
     if span_pair_labels is not None:
         if len(span_pairs) > 0:
             assert len(span_pairs) == len(
                 span_pair_labels), 'input and label length inconsistent'
             fields['span_pair_labels'] = SequenceLabelField(
                 span_pair_labels,
                 span_pair_field,
                 label_namespace='{}_span_pair_labels'.format(task))
         else:
             fields['span_pair_labels'] = SequenceLabelField(
                 [self.PADDING_LABEL],
                 span_pair_field,
                 label_namespace='{}_span_pair_labels'.format(task))
      # add metadata field
      # e2e is used in forward to decide whether to use end2end training/testing
     metadata_dict: Dict[str, Any] = {
         'task': task,
         'e2e': e2e,
         'max_span_width': self._max_span_width[task]
     }
     if 'brat_doc' in kwargs:
         metadata_dict['clusters'] = kwargs['brat_doc'].build_cluster(
             inclusive=True)
     metadata_dict.update(kwargs)
     fields['metadata'] = MetadataField(metadata_dict)
     return Instance(fields)
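
# Note the end-index convention above: the reader's input spans use exclusive ends,
# while SpanField stores inclusive ends, hence `eind - 1`. A sketch with the stock
# AllenNLP SpanField (which has no `check_sentence` argument):
from allennlp.data.tokenizers import Token
from allennlp.data.fields import SpanField, TextField
from allennlp.data.token_indexers import SingleIdTokenIndexer

text = TextField([Token(w) for w in "the quick brown fox".split()],
                 {"tokens": SingleIdTokenIndexer()})
span = SpanField(1, 3 - 1, text)  # exclusive-end (1, 3) covers "quick brown"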
Example #15
    def data_to_instance(self, words: List[str], labels: List[str],
                         sentence_boundaries: List[int], doc_index: str):
        if self.tokenizer is None:
            tokens = [[Token(w)] for w in words]
        else:
            tokens = [self.tokenizer.tokenize(w) for w in words]
        subwords = [sw for token in tokens for sw in token]

        subword2token = list(
            itertools.chain(*[[i] * len(token)
                              for i, token in enumerate(tokens)]))
        token2subword = [0] + list(
            itertools.accumulate(len(token) for token in tokens))
        subword_start_positions = frozenset(token2subword)
        subword_sentence_boundaries = [
            sum(len(token) for token in tokens[:p])
            for p in sentence_boundaries
        ]

        # extract entities from IOB tags
        # we need to pass sentence by sentence
        entities: List[Entity] = []
        for s, e in zip(sentence_boundaries[:-1], sentence_boundaries[1:]):
            for ent in Entities([labels[s:e]], scheme=IOB1).entities[0]:
                ent.start += s
                ent.end += s
                entities.append(ent)

        span_to_entity_label: Dict[Tuple[int, int], str] = dict()
        for ent in entities:
            subword_start = token2subword[ent.start]
            subword_end = token2subword[ent.end]
            span_to_entity_label[(subword_start, subword_end)] = ent.tag

        # split data according to sentence boundaries
        for n in range(len(subword_sentence_boundaries) - 1):
            # process (sub) words
            doc_sent_start, doc_sent_end = subword_sentence_boundaries[n:n + 2]
            assert doc_sent_end - doc_sent_start < self.max_num_subwords

            left_length = doc_sent_start
            right_length = len(subwords) - doc_sent_end
            sentence_length = doc_sent_end - doc_sent_start
            half_context_length = int(
                (self.max_num_subwords - sentence_length) / 2)

            if left_length < right_length:
                left_context_length = min(left_length, half_context_length)
                right_context_length = min(
                    right_length, self.max_num_subwords - left_context_length -
                    sentence_length)
            else:
                right_context_length = min(right_length, half_context_length)
                left_context_length = min(
                    left_length, self.max_num_subwords - right_context_length -
                    sentence_length)

            doc_offset = doc_sent_start - left_context_length
            word_ids = subwords[doc_offset:doc_sent_end + right_context_length]

            if isinstance(self.tokenizer, PretrainedTransformerTokenizer):
                word_ids = self.tokenizer.add_special_tokens(word_ids)

            # process entities
            entity_start_positions = []
            entity_end_positions = []
            entity_ids = []
            entity_position_ids = []
            original_entity_spans = []
            span_labels = []  # renamed so we don't shadow the `labels` argument

            for entity_start in range(left_context_length,
                                      left_context_length + sentence_length):
                doc_entity_start = entity_start + doc_offset
                if doc_entity_start not in subword_start_positions:
                    continue
                for entity_end in range(
                        entity_start + 1,
                        left_context_length + sentence_length + 1):
                    doc_entity_end = entity_end + doc_offset
                    if doc_entity_end not in subword_start_positions:
                        continue

                    if entity_end - entity_start > self.max_mention_length:
                        continue

                    entity_start_positions.append(entity_start + 1)
                    entity_end_positions.append(entity_end)
                    entity_ids.append(self.entity_id)

                    position_ids = list(range(entity_start + 1,
                                              entity_end + 1))
                    position_ids += [-1] * (self.max_mention_length -
                                            entity_end + entity_start)
                    entity_position_ids.append(position_ids)

                    original_entity_spans.append(
                        (subword2token[doc_entity_start],
                         subword2token[doc_entity_end - 1] + 1))
                    span_labels.append(
                        span_to_entity_label.pop(
                            (doc_entity_start, doc_entity_end), NON_ENTITY))

            # split instances
            split_size = math.ceil(len(entity_ids) / self.max_entity_length)
            for i in range(split_size):
                entity_size = math.ceil(len(entity_ids) / split_size)
                start = i * entity_size
                end = start + entity_size
                fields = {
                    "word_ids":
                    TextField(word_ids, token_indexers=self.token_indexers),
                    "entity_start_positions":
                    TensorField(np.array(entity_start_positions[start:end])),
                    "entity_end_positions":
                    TensorField(np.array(entity_end_positions[start:end])),
                    "original_entity_spans":
                    TensorField(np.array(original_entity_spans[start:end]),
                                padding_value=-1),
                    "labels":
                    ListField([LabelField(l) for l in labels[start:end]]),
                    "doc_id":
                    MetadataField(doc_index),
                    "input_words":
                    MetadataField(words),
                }

                if self.use_entity_feature:
                    fields.update({
                        "entity_ids":
                        TensorField(np.array(entity_ids[start:end]),
                                    padding_value=0),
                        "entity_position_ids":
                        TensorField(np.array(entity_position_ids[start:end])),
                    })

                yield Instance(fields)

        assert len(span_to_entity_label) == 0
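
# The subword bookkeeping above is easiest to see on a toy example
# (made-up tokenization, one sublist per word):
import itertools

tokens = [["Jo", "##hn"], ["Smith"], ["lives"]]
subword2token = list(itertools.chain(*[[i] * len(t) for i, t in enumerate(tokens)]))
token2subword = [0] + list(itertools.accumulate(len(t) for t in tokens))
assert subword2token == [0, 0, 1, 2]  # each subword -> its word index
assert token2subword == [0, 2, 3, 4]  # each word -> its first subword index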
Example #16
def get_clause_slot_field(slot_name: str, slot_value: str):
    clause_slot_name = "clause-%s" % slot_name
    namespace = get_slot_label_namespace(clause_slot_name)
    return LabelField(label = slot_value, label_namespace = namespace)
Example #17
def get_num_answers_field(question_label):
    return LabelField(label = len(question_label["answerJudgments"]), skip_indexing = True)
Example #18
    def predictions_to_labeled_instances(
            self, instance: Instance,
            outputs: Dict[str, numpy.ndarray]) -> List[Instance]:
        new_instance = deepcopy(instance)
        # For BiDAF
        if "best_span" in outputs:
            span_start_label = outputs["best_span"][0]
            span_end_label = outputs["best_span"][1]
            passage_field: SequenceField = new_instance[
                "passage"]  # type: ignore
            new_instance.add_field(
                "span_start", IndexField(int(span_start_label), passage_field))
            new_instance.add_field(
                "span_end", IndexField(int(span_end_label), passage_field))

        # For NAQANet model. It has the fields: answer_as_passage_spans, answer_as_question_spans,
        # answer_as_add_sub_expressions, answer_as_counts. We need labels for all.
        elif "answer" in outputs:
            answer_type = outputs["answer"]["answer_type"]

            # When the problem is a counting problem
            if answer_type == "count":
                field = ListField([
                    LabelField(int(outputs["answer"]["count"]),
                               skip_indexing=True)
                ])
                new_instance.add_field("answer_as_counts", field)

            # When the answer is in the passage
            elif answer_type == "passage_span":
                # TODO(mattg): Currently we only handle one predicted span.
                span = outputs["answer"]["spans"][0]

                # Convert character span indices into word span indices
                word_span_start = None
                word_span_end = None
                offsets = new_instance["metadata"].metadata[
                    "passage_token_offsets"]  # type: ignore
                for index, offset in enumerate(offsets):
                    if offset[0] == span[0]:
                        word_span_start = index
                    if offset[1] == span[1]:
                        word_span_end = index

                passage_field: SequenceField = new_instance[
                    "passage"]  # type: ignore
                field = ListField(
                    [SpanField(word_span_start, word_span_end, passage_field)])
                new_instance.add_field("answer_as_passage_spans", field)

            # When the answer is an arithmetic calculation
            elif answer_type == "arithmetic":
                # The different numbers in the passage that the model encounters
                sequence_labels = outputs["answer"]["numbers"]

                numbers_field: ListField = instance[
                    "number_indices"]  # type: ignore

                # The numbers in the passage are given signs, that's what we are labeling here.
                # Negative signs are given the class label 2 (for 0 and 1, the sign matches the
                # label).
                labels = []
                for label in sequence_labels:
                    if label["sign"] == -1:
                        labels.append(2)
                    else:
                        labels.append(label["sign"])
                # There's a dummy number added in the dataset reader to handle passages with no
                # numbers; it has a label of 0 (not included).
                labels.append(0)

                field = ListField([SequenceLabelField(labels, numbers_field)])
                new_instance.add_field("answer_as_add_sub_expressions", field)

            # When the answer is in the question
            elif answer_type == "question_span":
                span = outputs["answer"]["spans"][0]

                # Convert character span indices into word span indices
                word_span_start = None
                word_span_end = None
                question_offsets = new_instance[
                    "metadata"].metadata[  # type: ignore
                        "question_token_offsets"]
                for index, offset in enumerate(question_offsets):
                    if offset[0] == span[0]:
                        word_span_start = index
                    if offset[1] == span[1]:
                        word_span_end = index

                question_field: SequenceField = new_instance[
                    "question"]  # type: ignore
                field = ListField([
                    SpanField(word_span_start, word_span_end, question_field)
                ])
                new_instance.add_field("answer_as_question_spans", field)

        return [new_instance]
Example #19
def get_num_valids_field(question_label):
    return LabelField(label = len([aj for aj in question_label["answerJudgments"] if aj["isValid"]]), skip_indexing = True)
Example #20
 def text_to_instance(self, index: int) -> Instance:  # type: ignore
     return Instance({"index": LabelField(index, skip_indexing=True)})
Example #21
def get_num_invalids_field(question_label):
    return LabelField(label = get_num_invalids(question_label), skip_indexing = True)
Example #22
    def get_answer_fields(self,
                          **kwargs: Any) -> Tuple[Dict[str, Field], bool]:
        seq_tokens: List[Token] = kwargs['seq_tokens']
        seq_wordpieces: int = kwargs['seq_wordpieces']
        question_text_index_to_token_index: List[int] = kwargs['question_text_index_to_token_index']
        question_text: str = kwargs['question_text']
        passage_text_index_to_token_index: List[int] = kwargs['passage_text_index_to_token_index']
        passage_text: str = kwargs['passage_text']
        answer_texts: List[str] = kwargs['answer_texts']
        gold_indexes: Dict[str, List[int]] = kwargs.get(
            'gold_indexes', {'question': None, 'passage': None})

        fields: Dict[str, Field] = {}

        spans_dict = {}
        all_spans = []
        is_missing_answer = False
        for answer_text in answer_texts:
            answer_spans = []
            if not self._ignore_question:
                answer_spans += find_valid_spans(question_text, [answer_text], 
                                                question_text_index_to_token_index, 
                                                seq_tokens, seq_wordpieces, 
                                                gold_indexes['question'])
            answer_spans += find_valid_spans(passage_text, [answer_text], 
                                             passage_text_index_to_token_index, 
                                             seq_tokens, seq_wordpieces, 
                                             gold_indexes['passage'])
            if len(answer_spans) == 0:
                is_missing_answer = True
                continue
            spans_dict[answer_text] = answer_spans
            all_spans.extend(answer_spans)

        old_reader_behavior = kwargs['old_reader_behavior']
        if old_reader_behavior:
            answer_type = kwargs['answer_type']
            is_training = kwargs['is_training']
            if is_training:
                if answer_type in SPAN_ANSWER_TYPES:
                    if is_missing_answer:
                        all_spans = []

        if len(all_spans) > 0:
            has_answer = True

            fields['wordpiece_indices'] = self._get_wordpiece_indices_field(seq_wordpieces)

            no_answer_bios = self._get_empty_answer(seq_tokens)

            text_to_disjoint_bios: List[ListField] = []
            flexibility_count = 1
            for answer_text in answer_texts:
                spans = spans_dict.get(answer_text, [])
                if len(spans) == 0:
                    continue

                disjoint_bios: List[LabelsField] = []
                for span in spans:
                    bios = self._create_sequence_labels([span], len(seq_tokens))
                    disjoint_bios.append(LabelsField(bios))

                text_to_disjoint_bios.append(ListField(disjoint_bios))
                flexibility_count *= ((2**len(spans)) - 1)

            fields['answer_as_text_to_disjoint_bios'] = ListField(text_to_disjoint_bios)

            if flexibility_count < self._flexibility_threshold:
                # generate all non-empty span combinations for each text
                spans_combinations_dict = {}
                for key, spans in spans_dict.items():
                    all_combinations = []
                    for i in range(1, len(spans) + 1):
                        all_combinations += list(itertools.combinations(spans, i))
                    spans_combinations_dict[key] = all_combinations

                # calculate product between all the combinations per each text
                packed_gold_spans_list = itertools.product(*list(spans_combinations_dict.values()))
                bios_list: List[LabelsField] = []
                for packed_gold_spans in packed_gold_spans_list:
                    gold_spans = [s for sublist in packed_gold_spans for s in sublist]
                    bios = self._create_sequence_labels(gold_spans, len(seq_tokens))
                    bios_list.append(LabelsField(bios))

                fields['answer_as_list_of_bios'] = ListField(bios_list)
                fields['answer_as_text_to_disjoint_bios'] = ListField([ListField([no_answer_bios])])
            else:
                fields['answer_as_list_of_bios'] = ListField([no_answer_bios])

            bio_labels = self._create_sequence_labels(all_spans, len(seq_tokens))
            fields['span_bio_labels'] = LabelsField(bio_labels)

            fields['is_bio_mask'] = LabelField(1, skip_indexing=True)
        else:
            has_answer = False
            fields.update(self.get_empty_answer_fields(**kwargs))

        return fields, has_answer
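
# flexibility_count above is the number of ways to pick a non-empty subset of spans
# for every answer text; a toy check (spans are made up):
import itertools

spans_dict = {"a": [(0, 1), (5, 6)], "b": [(9, 9)]}
combos = {key: [c for n in range(1, len(spans) + 1)
                for c in itertools.combinations(spans, n)]
          for key, spans in spans_dict.items()}
packed = list(itertools.product(*combos.values()))
flexibility_count = 1
for spans in spans_dict.values():
    flexibility_count *= (2 ** len(spans)) - 1
assert len(packed) == flexibility_count == 3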
Example #23
 def get_slot_value_field(slot_name):
     slot_value = question_slots[slot_name]
     namespace = get_slot_label_namespace(slot_name)
     return LabelField(label = slot_value, label_namespace = namespace)
    def text_to_instance(self,  # type: ignore
                         item_id: Any,
                         question_text: str,
                         choice_text_list: List[str],
                         fact_text: str,
                         answer_span: List[str],
                         answer_relations: List[str],
                         answer_starts: List[int] = None,
                         answer_id: int = None,
                         prefetched_sentences: Dict[str, List[str]] = None,
                         prefetched_indices: str = None) -> Instance:
        fields: Dict[str, Field] = {}
        question_tokens = self._tokenizer.tokenize(question_text)
        fact_tokens = self._tokenizer.tokenize(fact_text)
        choices_tokens_list = [self._tokenizer.tokenize(x) for x in choice_text_list]
        choice_kb_fields = []
        selected_tuples = []
        for choice in choice_text_list:
            kb_fields = []

            if self._use_conceptnet and self._use_elastic_search:
                max_sents_per_source = int(self._max_tuples / 2)
            else:
                max_sents_per_source = self._max_tuples
            selected_hits = []
            if self._use_elastic_search:
                elastic_search_hits = self.get_elasticsearch_sentences(prefetched_sentences,
                                                                       prefetched_indices,
                                                                       answer_span, choice,
                                                                       question_text, fact_text,
                                                                       max_sents_per_source)
                selected_hits.extend(elastic_search_hits)

            if self._use_conceptnet:
                conceptnet_sentences = self.get_conceptnet_sentences(fact_text, answer_span, choice,
                                                                     max_sents_per_source)
                selected_hits.extend(conceptnet_sentences)
            # add a dummy entry to capture the embedding link
            if self._ignore_spans:
                fact_choice_sentence = fact_text + " || " + choice
                selected_hits.append(fact_choice_sentence)
            else:
                for answer in set(answer_span):
                    answer_choice_sentence = answer + " || " + choice
                    selected_hits.append(answer_choice_sentence)

            selected_tuples.append(selected_hits)
            for hit_text in selected_hits:
                kb_fields.append(TextField(self._tokenizer.tokenize(hit_text),
                                           self._token_indexers))

            choice_kb_fields.append(ListField(kb_fields))

        fields["choice_kb"] = ListField(choice_kb_fields)
        fields['fact'] = TextField(fact_tokens, self._token_indexers)

        if self._add_relation_labels:
            if answer_relations:
                relation_fields = []
                for relation in set(answer_relations):
                    relation_fields.append(LabelField(relation, label_namespace="relation_labels"))
                fields["relations"] = ListField(relation_fields)
                selected_relations = self.collate_relations(answer_relations)
                fields["relation_label"] = MultiLabelField(selected_relations, "relation_labels")
            else:
                fields["relations"] = ListField([LabelField(-1, label_namespace="relation_labels",
                                                            skip_indexing=True)])
                fields["relation_label"] = MultiLabelField([], "relation_labels")

        answer_fields = []
        answer_span_fields = []
        fact_offsets = [(token.idx, token.idx + len(token.text)) for token in fact_tokens]

        for idx, answer in enumerate(answer_span):
            answer_fields.append(TextField(self._tokenizer.tokenize(answer),
                                           self._token_indexers))
            if answer_starts:
                if len(answer_starts) <= idx:
                    raise ValueError("Only {} answer_starts in json. "
                                     "Expected {} in {}".format(len(answer_starts),
                                                                len(answer_span),
                                                                item_id))
                offset = answer_starts[idx]
            else:
                # str.find returns -1 when absent; str.index would raise instead
                offset = fact_text.find(answer)
                if offset == -1:
                    raise ValueError("Span: {} not found in fact: {}".format(answer, fact_text))

            tok_span, err = char_span_to_token_span(fact_offsets, (offset, offset + len(answer)))
            if err:
                logger.info("Could not find token spans for '{}' in '{}'."
                            "Best guess: {} in {} at {}".format(
                    answer, fact_text, [offset, offset + len(answer)], fact_offsets, tok_span))
            answer_span_fields.append(SpanField(tok_span[0], tok_span[1], fields['fact']))

        fields["answer_text"] = ListField(answer_fields)
        fields["answer_spans"] = ListField(answer_span_fields)
        fields['question'] = TextField(question_tokens, self._token_indexers)

        fields['choices_list'] = ListField(
            [TextField(x, self._token_indexers) for x in choices_tokens_list])
        if answer_id is not None:
            fields['answer_id'] = LabelField(answer_id, skip_indexing=True)

        metadata = {
            "id": item_id,
            "question_text": question_text,
            "fact_text": fact_text,
            "choice_text_list": choice_text_list,
            "question_tokens": [x.text for x in question_tokens],
            "fact_tokens": [x.text for x in fact_tokens],
            "choice_tokens_list": [[x.text for x in ct] for ct in choices_tokens_list],
            "answer_text": answer_span,
            "answer_start": answer_starts,
            "answer_span_fields": [(x.span_start, x.span_end) for x in answer_span_fields],
            "relations": answer_relations,
            "selected_tuples": selected_tuples
        }

        fields["metadata"] = MetadataField(metadata)

        return Instance(fields)
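
# A minimal sketch (not from the original reader) of the MultiLabelField used above:
# after indexing, as_tensor gives a multi-hot vector over the namespace.
from allennlp.data import Vocabulary
from allennlp.data.fields import MultiLabelField

vocab = Vocabulary()
vocab.add_token_to_namespace("causes", namespace="relation_labels")
vocab.add_token_to_namespace("part_of", namespace="relation_labels")
field = MultiLabelField(["causes"], label_namespace="relation_labels")
field.index(vocab)
print(field.as_tensor(field.get_padding_lengths()))  # tensor([1., 0.])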
Example #25
    def __getitem__(self, index):
        # if self.split == 'test':
        #     raise ValueError("blind test mode not supported quite yet")
        item = deepcopy(self.items[index])
        image_id = int(item['img_id'].split('-')[-1])
        anno_id = str(item['annot_id'].split('-')[-1])

        with h5py.File(self.tag_annot_path, 'r') as h5:
            # np.int was removed from recent NumPy releases; np.int64 is equivalent here
            tag_obj_indices = np.array(h5[str(image_id)]['obj_indices'],
                                       dtype=np.int64)

        with h5py.File(self.non_tag_annot_path, 'r') as h5:
            non_tag_obj_indices = np.array(h5[str(image_id)]['obj_indices'],
                                           dtype=np.int64)

        with h5py.File(self.tag_feature_path, 'r') as h5:
            tag_boxes = np.array(h5[str(anno_id)]['boxes'], dtype=np.float32)
            tag_features = np.zeros([4, tag_boxes.shape[0], 1024])
            for m in range(4):
                tag_features[m, :, :] = np.array(
                    h5[str(anno_id)]['features' + str(m)], dtype=np.float32)

        with h5py.File(self.non_tag_feature_path, 'r') as h5:
            non_tag_boxes = np.array(h5[str(anno_id)]['boxes'],
                                     dtype=np.float32)
            non_tag_features = np.zeros([4, non_tag_boxes.shape[0], 1024])
            for m in range(4):
                non_tag_features[m, :, :] = np.array(
                    h5[str(anno_id)]['features' + str(m)], dtype=np.float32)

        ###################################################################
        # Load questions and answers

        non_tag_question_annotid2detidx = self.non_tag_question_annotid2detidx[
            item['annot_id']]
        non_tag_answer_annotid2detidx = self.non_tag_answer_annotid2detidx[
            item['annot_id']]
        non_tag_rationale_annotid2detidx = self.non_tag_rationale_annotid2detidx[
            item['annot_id']]

        if self.mode == 'answer':
            question_annotid2detidx = non_tag_question_annotid2detidx
            answer_annotid2detidx = non_tag_answer_annotid2detidx
        else:
            conditioned_label = (item['answer_label'] if self.split != 'test'
                                 else self.conditioned_answer_choice)
            q_len = len(item['question'])
            question_annotid2detidx = {}
            for k, v in non_tag_question_annotid2detidx.items():
                question_annotid2detidx[k] = v
            for k, v in non_tag_answer_annotid2detidx[conditioned_label].items():
                question_annotid2detidx[k + q_len] = v
            answer_annotid2detidx = non_tag_rationale_annotid2detidx

        if self.mode == 'rationale':
            conditioned_label = (item['answer_label'] if self.split != 'test'
                                 else self.conditioned_answer_choice)
            item['question'] += item['answer_choices'][conditioned_label]

        with h5py.File(self.h5fn, 'r') as h5:
            grp_items = {
                k: np.array(v, dtype=np.float16)
                for k, v in h5[str(index)].items()
            }

        answer_choices = item['{}_choices'.format(self.mode)]
        dets2use, old_det_to_new_ind = self._get_dets_to_use(item)
        non_tag_dets2use, non_tag_old_det_to_new_ind = self._get_non_tag_det_to_use(
            question_annotid2detidx, answer_annotid2detidx, len(non_tag_boxes))

        if self.add_image_as_a_box:
            assert (len(dets2use) == np.max(old_det_to_new_ind))

        if self.add_image_as_a_box:
            non_tag_old_det_to_new_ind += 1

        # shift the non_tag detection idx, effectively as appending the non_tag detections to tag detections
        non_tag_old_det_to_new_ind[np.where(non_tag_old_det_to_new_ind)[0]] += len(dets2use)

        old_det_to_new_ind = old_det_to_new_ind.tolist()
        non_tag_old_det_to_new_ind = non_tag_old_det_to_new_ind.tolist()
        ###################################################################
        # Load in BERT. We'll get contextual representations of the context and the
        # answer choices. `grp_items` was already read from self.h5fn above, so we
        # reuse it rather than opening the file a second time.

        # We need to condition on the right answer choice here if we're doing
        # QA->R. At test time we always condition on `conditioned_answer_choice`.
        condition_key = (self.conditioned_answer_choice if self.split == "test"
                         and self.mode == "rationale" else "")

        instance_dict = {}
        if 'endingonly' not in self.embs_to_load:
            questions_tokenized, question_tags = zip(*[
                _my_fix_tokenization(
                    item['question'],
                    grp_items[f'ctx_{self.mode}{condition_key}{i}'],
                    old_det_to_new_ind,
                    item['objects'],
                    non_tag_old_det_to_new_ind,
                    question_annotid2detidx,
                    token_indexers=self.token_indexers,
                    pad_ind=0 if self.add_image_as_a_box else -1,
                ) for i in range(4)
            ])
            instance_dict['question'] = ListField(questions_tokenized)
            instance_dict['question_tags'] = ListField(question_tags)

        answers_tokenized, answer_tags = zip(*[
            _my_fix_tokenization(
                answer,
                grp_items[f'answer_{self.mode}{condition_key}{i}'],
                old_det_to_new_ind,
                item['objects'],
                non_tag_old_det_to_new_ind,
                answer_annotid2detidx[i],
                token_indexers=self.token_indexers,
                pad_ind=0 if self.add_image_as_a_box else -1,
            ) for i, answer in enumerate(answer_choices)
        ])

        instance_dict['answers'] = ListField(answers_tokenized)
        instance_dict['answer_tags'] = ListField(answer_tags)
        if self.split != 'test':
            instance_dict['label'] = LabelField(
                item['{}_label'.format(self.mode)], skip_indexing=True)
        instance_dict['metadata'] = MetadataField({
            'annot_id': item['annot_id'],
            'ind': index,
            'movie': item['movie'],
            'img_fn': item['img_fn'],
            'question_number': item['question_number'],
        })

        # Graph node features (node_tags is unpacked but unused below).
        node_tokenized, node_tags = zip(*[
            _fix_word(i,
                      index,
                      item['annot_id'],
                      self.h5fn_graph,
                      self.h5fn_word,
                      pad_ind=0) for i in range(4)
        ])
        instance_dict['node'] = ListField(node_tokenized)

        # Visual concept features, repeated once per answer choice
        # (visual_concept_tags is unused below).
        visual_concept_tokenized, visual_concept_tags = zip(*[
            _fix_visual_concept(item['visual_concept'],
                                item['visual_concept_num'],
                                self.h5fn_word,
                                pad_ind=0) for i in range(4)
        ])
        instance_dict['visual_concept'] = ListField(visual_concept_tokenized)

        # Graph adjacency matrices (adj_len is unused below).
        adj_result, adj_len = zip(*[
            _fix_adj(i, index, item['annot_id'], self.h5fn_graph, pad_ind=0)
            for i in range(4)
        ])
        instance_dict['adjacent'] = ListField(adj_result)

        ###################################################################
        # Load image now and rescale it. Might have to subtract the mean and whatnot here too.
        #image = load_image(os.path.join(VCR_IMAGES_DIR, item['img_fn']))
        #image, window, img_scale, padding = resize_image(image, random_pad=self.is_train)
        #image = to_tensor_and_normalize(image)
        #c, h, w = image.shape
        ###################################################################

        # Load the per-image metadata JSON. (Read here but not used below; the
        # boxes themselves come from the h5 files above.)
        with open(os.path.join(VCR_IMAGES_DIR, item['metadata_fn']), 'r') as f:
            metadata = json.load(f)

        # Prepend a dummy whole-image box to serve as the background detection
        # when the image itself is treated as a box.
        if self.add_image_as_a_box:
            tag_boxes = np.row_stack(([1, 1, 700, 700], tag_boxes))
        non_tag_boxes = non_tag_boxes[non_tag_dets2use]
        boxes = np.concatenate((tag_boxes, non_tag_boxes))

        if self.add_image_as_a_box:
            dets2use = dets2use + 1
            dets2use = np.insert(dets2use, 0, 0)

        tag_det_features = np.zeros([4, len(dets2use), 1024])
        non_tag_det_features = np.zeros([4, len(non_tag_dets2use), 1024])
        for z in range(4):
            tag_det_features[z, :, :] = tag_features[z][dets2use]
            non_tag_det_features[z, :, :] = non_tag_features[z][non_tag_dets2use]
        det_features = np.concatenate(
            (tag_det_features, non_tag_det_features), 1)
        instance_dict['det_features'] = ArrayField(det_features, padding_value=0)

        instance_dict['boxes'] = ArrayField(boxes, padding_value=-1)

        instance = Instance(instance_dict)
        instance.index_fields(self.vocab)
        # Image loading above is commented out, so None stands in for the image.
        return None, instance
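
# A hedged usage sketch, not part of the snippet above: turning the returned
# Instance into padded tensors. `dataset` and `vocab` are assumed to be an
# instance of the reader above and its Vocabulary (allennlp 0.x import path).
from allennlp.data.dataset import Batch

_, instance = dataset[0]  # hypothetical: __getitem__ returns (image, instance)
batch = Batch([instance])
batch.index_instances(vocab)
tensor_dict = batch.as_tensor_dict(batch.get_padding_lengths())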
Example #26
    def test_as_tensor_returns_integer_tensor(self):
        label = LabelField(5, skip_indexing=True)

        tensor = label.as_tensor(label.get_padding_lengths())
        assert tensor.item() == 5
Example #27
 def text_to_instance(self, string: str, label: int) -> Instance:
     fields: Dict[str, Field] = {}
     tokens = self._tokenizer.tokenize(string)
     fields['tokens'] = TextField(tokens, self._token_indexers)
     fields['label'] = LabelField(label, skip_indexing=True)
     return Instance(fields)
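
# A minimal sketch of what the method above builds, with assumed components
# (WordTokenizer and SingleIdTokenIndexer stand in for the real reader's
# self._tokenizer and self._token_indexers).
from allennlp.data import Instance
from allennlp.data.fields import LabelField, TextField
from allennlp.data.token_indexers import SingleIdTokenIndexer
from allennlp.data.tokenizers import WordTokenizer

tokens = WordTokenizer().tokenize("a short example sentence")
fields = {
    "tokens": TextField(tokens, {"tokens": SingleIdTokenIndexer()}),
    "label": LabelField(1, skip_indexing=True),
}
instance = Instance(fields)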
Example #28
 def test_label_field_raises_with_incorrect_label_type(self):
     with pytest.raises(ConfigurationError):
         _ = LabelField([], skip_indexing=False)
Example #29
    def text_to_instance(
        self,  # type: ignore
        sentence: str,
        structured_representations: List[List[List[JsonDict]]],
        labels: List[str] = None,
        target_sequences: List[List[str]] = None,
        identifier: str = None,
    ) -> Instance:
        """
        Parameters
        ----------
        sentence : ``str``
            The query sentence.
        structured_representations : ``List[List[List[JsonDict]]]``
            A list of Json representations of all the worlds. See expected format in this class' docstring.
        labels : ``List[str]`` (optional)
            List of string representations of the labels (true or false) corresponding to the
            ``structured_representations``. Not required while testing.
        target_sequences : ``List[List[str]]`` (optional)
            List of target action sequences for each element which lead to the correct denotation in
            worlds corresponding to the structured representations.
        identifier : ``str`` (optional)
            The identifier from the dataset if available.
        """
        worlds = []
        for structured_representation in structured_representations:
            boxes = {
                Box(object_list, box_id)
                for box_id, object_list in enumerate(structured_representation)
            }
            worlds.append(NlvrLanguage(boxes))
        tokenized_sentence = self._tokenizer.tokenize(sentence)
        sentence_field = TextField(tokenized_sentence, self._sentence_token_indexers)
        production_rule_fields: List[Field] = []
        instance_action_ids: Dict[str, int] = {}
        # TODO(pradeep): Assuming that possible actions are the same in all worlds. This may change
        # later.
        for production_rule in worlds[0].all_possible_productions():
            instance_action_ids[production_rule] = len(instance_action_ids)
            field = ProductionRuleField(production_rule, is_global_rule=True)
            production_rule_fields.append(field)
        action_field = ListField(production_rule_fields)
        worlds_field = ListField([MetadataField(world) for world in worlds])
        metadata: Dict[str, Any] = {"sentence_tokens": [x.text for x in tokenized_sentence]}
        fields: Dict[str, Field] = {
            "sentence": sentence_field,
            "worlds": worlds_field,
            "actions": action_field,
            "metadata": MetadataField(metadata),
        }
        if identifier is not None:
            fields["identifier"] = MetadataField(identifier)
        # Depending on the type of supervision used for training the parser, we may want either
        # target action sequences or an agenda in our instance. We check if target sequences are
        # provided, and include them if they are. If not, we'll get an agenda for the sentence, and
        # include that in the instance.
        if target_sequences:
            action_sequence_fields: List[Field] = []
            for target_sequence in target_sequences:
                index_fields = ListField(
                    [
                        IndexField(instance_action_ids[action], action_field)
                        for action in target_sequence
                    ]
                )
                action_sequence_fields.append(index_fields)
                # TODO(pradeep): Define a max length for this field.
            fields["target_action_sequences"] = ListField(action_sequence_fields)
        elif self._output_agendas:
            # TODO(pradeep): Assuming every world gives the same agenda for a sentence. This is true
            # now, but may change later too.
            agenda = worlds[0].get_agenda_for_sentence(sentence)
            assert agenda, "No agenda found for sentence: %s" % sentence
            # agenda_field contains indices into actions.
            agenda_field = ListField(
                [IndexField(instance_action_ids[action], action_field) for action in agenda]
            )
            fields["agenda"] = agenda_field
        if labels:
            labels_field = ListField(
                [LabelField(label, label_namespace="denotations") for label in labels]
            )
            fields["labels"] = labels_field

        return Instance(fields)
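
# A hedged sketch of calling the method above on one toy NLVR-style world:
# a single box holding one object. The attribute names and the `reader`
# variable are illustrative assumptions, not taken from the snippet.
toy_worlds = [      # one world ...
    [               # ... containing one box ...
        [           # ... containing one object
            {"color": "Black", "size": 10, "type": "triangle",
             "x_loc": 20, "y_loc": 30}
        ]
    ]
]
instance = reader.text_to_instance(
    sentence="there is exactly one black triangle",
    structured_representations=toy_worlds,
    labels=["true"],
)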
Example #30
 def test_printing_doesnt_crash(self):
     label = LabelField("label", label_namespace="namespace")
     print(label)
def process_split(split, indexers, pair_input, categorical):
    '''
    Convert a dataset split of tokenized sentences into AllenNLP ``Instance``s.

    Args:
        - split (list[list]): columns of the split: one or two token-list
          inputs, then labels, then (for numbered test sets) example indices
        - indexers (dict[str, TokenIndexer]): token indexers for the TextFields
        - pair_input (int): nonzero if the task has two input sentences
        - categorical (int): nonzero for classification labels, numeric otherwise

    Returns:
        list[Instance]: one Instance per example
    '''
    if pair_input:
        inputs1 = [
            TextField(list(map(Token, sent)), token_indexers=indexers)
            for sent in split[0]
        ]
        inputs2 = [
            TextField(list(map(Token, sent)), token_indexers=indexers)
            for sent in split[1]
        ]
        if categorical:
            labels = [
                LabelField(l, label_namespace="labels", skip_indexing=True)
                for l in split[2]
            ]
        else:
            # Use the label column explicitly; split[-1] would pick up the
            # index column on numbered (four-column) test splits.
            labels = [NumericField(l) for l in split[2]]

        if len(split) == 4:  # numbered test examples
            idxs = [
                LabelField(l, label_namespace="idxs", skip_indexing=True)
                for l in split[3]
            ]
            instances = [Instance({"input1": input1, "input2": input2, "label": label, "idx": idx})\
                          for (input1, input2, label, idx) in zip(inputs1, inputs2, labels, idxs)]

        else:
            instances = [Instance({"input1": input1, "input2": input2, "label": label}) for \
                            (input1, input2, label) in zip(inputs1, inputs2, labels)]

    else:
        inputs1 = [
            TextField(list(map(Token, sent)), token_indexers=indexers)
            for sent in split[0]
        ]
        if categorical:
            labels = [
                LabelField(l, label_namespace="labels", skip_indexing=True)
                for l in split[2]
            ]
        else:
            labels = [NumericField(l) for l in split[2]]

        if len(split) == 4:
            idxs = [
                LabelField(l, label_namespace="idxs", skip_indexing=True)
                for l in split[3]
            ]
            instances = [Instance({"input1": input1, "label": label, "idx": idx}) for \
                         (input1, label, idx) in zip(inputs1, labels, idxs)]
        else:
            instances = [
                Instance({
                    "input1": input1,
                    "label": label
                }) for (input1, label) in zip(inputs1, labels)
            ]
    return instances
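
# A hedged usage sketch for process_split on a toy categorical pair task;
# SingleIdTokenIndexer is an assumption (any AllenNLP token indexer works).
from allennlp.data.token_indexers import SingleIdTokenIndexer

toy_split = [
    [["a", "cat", "sat"], ["a", "dog", "ran"]],          # first sentences
    [["an", "animal", "sat"], ["an", "animal", "ran"]],  # second sentences
    [1, 0],                                              # integer class labels
]
toy_instances = process_split(toy_split, {"words": SingleIdTokenIndexer()},
                              pair_input=1, categorical=True)
assert len(toy_instances) == 2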
 def test_as_tensor_returns_integer_tensor(self):
     label = LabelField(5, skip_indexing=True)
     tensor = label.as_tensor(label.get_padding_lengths()).data.cpu().numpy()
     numpy.testing.assert_array_almost_equal(tensor, numpy.array([5]))
def make_reading_comprehension_instance(question_text: str,
                                        passage_text: str,
                                        answer_text: str,
                                        label: float,
                                        question_passage_tokens: List[Token],
                                        question_passage_offsets: List[Tuple[int, int]],
                                        token_indexers: Dict[str, TokenIndexer],
                                        id: str = None,
                                        pred_chains: List[Tuple[List, float]] = None,
                                        sp_facts_id: List[int] = None,
                                        article: Dict = None) -> Instance:
    """
    Converts a question, a passage, and an optional answer (or answers) to an ``Instance`` for use
    in a reading comprehension model.
    Creates an ``Instance`` with at least these fields: ``question`` and ``passage``, both
    ``TextFields``; and ``metadata``, a ``MetadataField``.  Additionally, if both ``answer_texts``
    and ``char_span_starts`` are given, the ``Instance`` has ``span_start`` and ``span_end``
    fields, which are both ``IndexFields``.
    Parameters
    ----------
    question_tokens : ``List[Token]``
        An already-tokenized question.
    question_passage_tokens : ``List[Token]``
        An already-tokenized passage that contains the answer to the given question.
    token_indexers : ``Dict[str, TokenIndexer]``
        Determines how the question and passage ``TextFields`` will be converted into tensors that
        get input to a model.  See :class:`TokenIndexer`.
    passage_text : ``str``
        The original passage text.  We need this so that we can recover the actual span from the
        original passage that the model predicts as the answer to the question.  This is used in
        official evaluation scripts.
    token_spans : ``List[Tuple[int, int]]``, optional
        Indices into ``passage_tokens`` to use as the answer to the question for training.  This is
        a list because there might be several possible correct answer spans in the passage.
        Currently, we just select the most frequent span in this list (i.e., SQuAD has multiple
        annotations on the dev set; this will select the span that the most annotators gave as
        correct).
    answer_texts : ``List[str]``, optional
        All valid answer strings for the given question.  In SQuAD, e.g., the training set has
        exactly one answer per question, but the dev and test sets have several.  TriviaQA has many
        possible answers, which are the aliases for the known correct entity.  This is put into the
        metadata for use with official evaluation scripts, but not used anywhere else.
    passage_dep_heads : ``List[int]``, optional
        The dependency parents for each token in the passage, zero-indexing.
    additional_metadata : ``Dict[str, Any]``, optional
        The constructed ``metadata`` field will by default contain ``original_passage``,
        ``token_offsets``, ``question_tokens``, ``passage_tokens``, and ``answer_texts`` keys.  If
        you want any other metadata to be associated with each instance, you can pass that in here.
        This dictionary will get added to the ``metadata`` dictionary we already construct.
    para_limit : ``int``, indicates the maximum length of a given article
    """
    fields: Dict[str, Field] = {}
    # This is separate so we can reference it later with a known type.
    question_passage_field = TextField(question_passage_tokens, token_indexers)

    fields['question_passage'] = question_passage_field
    fields['label'] = LabelField(label, skip_indexing=True)

    metadata = {
        'original_passage': passage_text,
        'token_offsets': question_passage_offsets,
        'question_text': question_text,
        'original_label_score': label,
        'id': id,
        'pred_chains': pred_chains,
        'sp_set_id': sp_facts_id,
        'original_article': article,
        'passage_tokens': [token.text for token in question_passage_tokens]
    }

    fields['metadata'] = MetadataField(metadata)
    return Instance(fields)
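
# A hedged usage sketch with toy values; SingleIdTokenIndexer is an
# assumption, and the label must be integer-valued because LabelField is
# built with skip_indexing=True.
from allennlp.data.token_indexers import SingleIdTokenIndexer
from allennlp.data.tokenizers import Token

toy_tokens = [Token(t) for t in "what color ? the sky is blue .".split()]
toy_offsets = [(i, i + len(t.text)) for i, t in enumerate(toy_tokens)]  # fake offsets
toy_instance = make_reading_comprehension_instance(
    question_text="what color ?",
    passage_text="the sky is blue .",
    answer_text="blue",
    label=1,
    question_passage_tokens=toy_tokens,
    question_passage_offsets=toy_offsets,
    token_indexers={"tokens": SingleIdTokenIndexer()},
)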