def make_reading_comprehension_instance_quac(question_list_tokens: List[List[Token]],
                                             passage_tokens: List[Token],
                                             token_indexers: Dict[str, TokenIndexer],
                                             passage_text: str,
                                             token_span_lists: List[List[Tuple[int, int]]] = None,
                                             yesno_list: List[int] = None,
                                             followup_list: List[int] = None,
                                             additional_metadata: Dict[str, Any] = None,
                                             num_context_answers: int = 0) -> Instance:
    """
    Converts a question, a passage, and an optional answer (or answers) to an ``Instance`` for use
    in a reading comprehension model.

    Creates an ``Instance`` with at least these fields: ``question`` and ``passage``, both
    ``TextFields``; and ``metadata``, a ``MetadataField``.  Additionally, if both ``answer_texts``
    and ``char_span_starts`` are given, the ``Instance`` has ``span_start`` and ``span_end``
    fields, which are both ``IndexFields``.

    Parameters
    ----------
    question_list_tokens : ``List[List[Token]]``
        An already-tokenized list of questions.  Each dialog has multiple questions.
    passage_tokens : ``List[Token]``
        An already-tokenized passage that contains the answer to the given question.
    token_indexers : ``Dict[str, TokenIndexer]``
        Determines how the question and passage ``TextFields`` will be converted into tensors that
        get input to a model.  See :class:`TokenIndexer`.
    passage_text : ``str``
        The original passage text.  We need this so that we can recover the actual span from the
        original passage that the model predicts as the answer to the question.  This is used in
        official evaluation scripts.
    token_span_lists : ``List[List[Tuple[int, int]]]``, optional
        Indices into ``passage_tokens`` to use as the answer to the question for training.  This is
        a list of lists, first because there are multiple questions per dialog, and second because
        there might be several possible correct answer spans in the passage.  Currently, we just
        select the last span in this list (i.e., QuAC has multiple annotations on the dev set;
        this will select the last span, which was given by the original annotator).
    yesno_list : ``List[int]``
        List of the affirmation bit for each question-answer pair.
    followup_list : ``List[int]``
        List of the continuation bit for each question-answer pair.
    num_context_answers : ``int``, optional
        How many previous answers to encode into the passage.
    additional_metadata : ``Dict[str, Any]``, optional
        The constructed ``metadata`` field will by default contain ``original_passage``,
        ``token_offsets``, ``question_tokens``, ``passage_tokens``, and ``answer_texts`` keys.  If
        you want any other metadata to be associated with each instance, you can pass that in here.
        This dictionary will get added to the ``metadata`` dictionary we already construct.
    """
    additional_metadata = additional_metadata or {}
    fields: Dict[str, Field] = {}
    passage_offsets = [(token.idx, token.idx + len(token.text)) for token in passage_tokens]
    # This is separate so we can reference it later with a known type.
    passage_field = TextField(passage_tokens, token_indexers)
    fields['passage'] = passage_field
    fields['question'] = ListField([TextField(q_tokens, token_indexers)
                                    for q_tokens in question_list_tokens])
    metadata = {'original_passage': passage_text,
                'token_offsets': passage_offsets,
                'question_tokens': [[token.text for token in question_tokens]
                                    for question_tokens in question_list_tokens],
                'passage_tokens': [token.text for token in passage_tokens]}
    p1_answer_marker_list: List[Field] = []
    p2_answer_marker_list: List[Field] = []
    p3_answer_marker_list: List[Field] = []

    def get_tag(i, i_name):
        # Generate a tag to mark a previous answer span in the passage.
        return "<{0:d}_{1:s}>".format(i, i_name)

    def mark_tag(span_start, span_end, passage_tags, prev_answer_distance):
        # A valid span has non-negative endpoints; the p*_span_* variables are
        # initialized to -1 before the first answer is seen.
        if span_start < 0 or span_end < 0:
            raise ValueError("Previous {0:d}th answer span should have been updated!"
                             .format(prev_answer_distance))
        # Modify "tags" to mark the previous answer span.
        if span_start == span_end:
            passage_tags[prev_answer_distance][span_start] = get_tag(prev_answer_distance, "")
        else:
            passage_tags[prev_answer_distance][span_start] = get_tag(prev_answer_distance, "start")
            passage_tags[prev_answer_distance][span_end] = get_tag(prev_answer_distance, "end")
            for passage_index in range(span_start + 1, span_end):
                passage_tags[prev_answer_distance][passage_index] = get_tag(prev_answer_distance, "in")

    if token_span_lists:
        span_start_list: List[Field] = []
        span_end_list: List[Field] = []
        p1_span_start, p1_span_end, p2_span_start = -1, -1, -1
        p2_span_end, p3_span_start, p3_span_end = -1, -1, -1
        # Loop over the answer spans for each question in the dialog.
        for question_index, answer_span_lists in enumerate(token_span_lists):
            span_start, span_end = answer_span_lists[-1]  # The last one is the original answer.
            span_start_list.append(IndexField(span_start, passage_field))
            span_end_list.append(IndexField(span_end, passage_field))
            prev_answer_marker_lists = [["O"] * len(passage_tokens),
                                        ["O"] * len(passage_tokens),
                                        ["O"] * len(passage_tokens),
                                        ["O"] * len(passage_tokens)]
            if question_index > 0 and num_context_answers > 0:
                mark_tag(p1_span_start, p1_span_end, prev_answer_marker_lists, 1)
                if question_index > 1 and num_context_answers > 1:
                    mark_tag(p2_span_start, p2_span_end, prev_answer_marker_lists, 2)
                    if question_index > 2 and num_context_answers > 2:
                        mark_tag(p3_span_start, p3_span_end, prev_answer_marker_lists, 3)
            p3_span_start = p2_span_start
            p3_span_end = p2_span_end
            p2_span_start = p1_span_start
            p2_span_end = p1_span_end
            p1_span_start = span_start
            p1_span_end = span_end
            if num_context_answers > 2:
                p3_answer_marker_list.append(SequenceLabelField(prev_answer_marker_lists[3],
                                                                passage_field,
                                                                label_namespace="answer_tags"))
            if num_context_answers > 1:
                p2_answer_marker_list.append(SequenceLabelField(prev_answer_marker_lists[2],
                                                                passage_field,
                                                                label_namespace="answer_tags"))
            if num_context_answers > 0:
                p1_answer_marker_list.append(SequenceLabelField(prev_answer_marker_lists[1],
                                                                passage_field,
                                                                label_namespace="answer_tags"))
        fields['span_start'] = ListField(span_start_list)
        fields['span_end'] = ListField(span_end_list)
        if num_context_answers > 0:
            fields['p1_answer_marker'] = ListField(p1_answer_marker_list)
            if num_context_answers > 1:
                fields['p2_answer_marker'] = ListField(p2_answer_marker_list)
                if num_context_answers > 2:
                    fields['p3_answer_marker'] = ListField(p3_answer_marker_list)
        fields['yesno_list'] = ListField([LabelField(yesno, label_namespace="yesno_labels")
                                          for yesno in yesno_list])
        fields['followup_list'] = ListField([LabelField(followup, label_namespace="followup_labels")
                                             for followup in followup_list])
    metadata.update(additional_metadata)
    fields['metadata'] = MetadataField(metadata)
    return Instance(fields)
def make_reading_comprehension_instance(self,
                                        question_tokens: List[Token],
                                        passage_tokens: List[Token],
                                        token_indexers: Dict[str, TokenIndexer],
                                        passage_text: str,
                                        token_spans: List[Tuple[int, int]] = None,
                                        answer_texts: List[str] = None,
                                        additional_metadata: Dict[str, Any] = None) -> Optional[Instance]:
    """
    Converts a question, a passage, and an optional answer (or answers) to an ``Instance`` for use
    in a reading comprehension model.

    Creates an ``Instance`` with at least these fields: ``question`` and ``passage``, both
    ``TextFields``; and ``metadata``, a ``MetadataField``.  Additionally, if both ``answer_texts``
    and ``char_span_starts`` are given, the ``Instance`` has ``span_start`` and ``span_end``
    fields, which are both ``IndexFields``.

    Parameters
    ----------
    question_tokens : ``List[Token]``
        An already-tokenized question.
    passage_tokens : ``List[Token]``
        An already-tokenized passage that contains the answer to the given question.
    token_indexers : ``Dict[str, TokenIndexer]``
        Determines how the question and passage ``TextFields`` will be converted into tensors that
        get input to a model.  See :class:`TokenIndexer`.
    passage_text : ``str``
        The original passage text.  We need this so that we can recover the actual span from the
        original passage that the model predicts as the answer to the question.  This is used in
        official evaluation scripts.
    token_spans : ``List[Tuple[int, int]]``, optional
        Indices into ``passage_tokens`` to use as the answer to the question for training.  This is
        a list because there might be several possible correct answer spans in the passage.
        Currently, we just select the first span in this list, assuming the caller has already
        sorted the spans by its preferred criterion (e.g., how many annotators gave that span).
    answer_texts : ``List[str]``, optional
        All valid answer strings for the given question.  In SQuAD, e.g., the training set has
        exactly one answer per question, but the dev and test sets have several.  TriviaQA has many
        possible answers, which are the aliases for the known correct entity.  This is put into the
        metadata for use with official evaluation scripts, but not used anywhere else.
    additional_metadata : ``Dict[str, Any]``, optional
        The constructed ``metadata`` field will by default contain ``original_passage``,
        ``token_offsets``, ``question_tokens``, ``passage_tokens``, and ``answer_texts`` keys.  If
        you want any other metadata to be associated with each instance, you can pass that in here.
        This dictionary will get added to the ``metadata`` dictionary we already construct.
    """
    additional_metadata = additional_metadata or {}
    fields: Dict[str, Field] = {}
    # This is separate so we can reference it later with a known type.
    passage_field = TextField(passage_tokens, token_indexers)
    fields['passage'] = passage_field
    fields['question'] = TextField(question_tokens, token_indexers)
    metadata = {
        'original_passage': passage_text,
        'question_tokens': [token.text for token in question_tokens],
        'passage_tokens': [token.text for token in passage_tokens],
    }
    if answer_texts:
        metadata['answer_texts'] = answer_texts

    if token_spans:
        metadata["token_spans"] = token_spans
        # Assume spans are sorted by some criterion; take the first one.
        span_start = token_spans[0][0]
        span_end = token_spans[0][1] - 1
        assert span_start <= span_end
        if span_end > len(passage_tokens) - 1:
            return None
        fields['span_start'] = IndexField(span_start, passage_field)
        fields['span_end'] = IndexField(span_end, passage_field)

    metadata.update(additional_metadata)
    fields['metadata'] = MetadataField(metadata)
    return Instance(fields)
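# Usage sketch for the single-span helper above. The body never touches ``self``,
# so for a quick smoke test ``None`` can stand in for it; the end-exclusive input
# convention is visible in the ``token_spans[0][1] - 1`` above. Values are illustrative.
from allennlp.data.token_indexers import SingleIdTokenIndexer
from allennlp.data.tokenizers import Token

squad_instance = make_reading_comprehension_instance(
    None,  # unused ``self``
    question_tokens=[Token(t) for t in "What color is snow ?".split()],
    passage_tokens=[Token(t) for t in "Snow is white .".split()],
    token_indexers={"tokens": SingleIdTokenIndexer()},
    passage_text="Snow is white .",
    token_spans=[(2, 3)],  # the token "white", end-exclusive
    answer_texts=["white"])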
def text_to_instance(self, line: str) -> Instance:  # type: ignore
    tokens = self._tokenizer.tokenize(line)
    return Instance({"line": TextField(tokens, self._token_indexers)})
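# Exercising the one-field method above in isolation. The tokenizer and indexer
# choices are assumptions, not from the source; SimpleNamespace stands in for the
# reader instance since only two attributes are needed.
from types import SimpleNamespace

from allennlp.data.token_indexers import SingleIdTokenIndexer
from allennlp.data.tokenizers import WordTokenizer

line_reader = SimpleNamespace(_tokenizer=WordTokenizer(),
                              _token_indexers={"tokens": SingleIdTokenIndexer()})
line_instance = text_to_instance(line_reader, "a simple line of text")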
def text_to_instance(self,
                     sentence: List[str],
                     ner_dict: Dict[Tuple[int, int], str],
                     relation_dict,
                     cluster_dict,
                     trigger_dict,
                     argument_dict,
                     doc_key: str,
                     dataset: str,
                     sentence_num: int,
                     groups: List[str],
                     start_ix: int,
                     end_ix: int):
    """
    TODO(dwadden) document me.
    """
    sentence = [self._normalize_word(word) for word in sentence]
    text_field = TextField([Token(word) for word in sentence], self._token_indexers)
    text_field_with_context = TextField([Token(word) for word in groups], self._token_indexers)

    # Put together the metadata.
    metadata = dict(sentence=sentence,
                    ner_dict=ner_dict,
                    relation_dict=relation_dict,
                    cluster_dict=cluster_dict,
                    trigger_dict=trigger_dict,
                    argument_dict=argument_dict,
                    doc_key=doc_key,
                    dataset=dataset,
                    groups=groups,
                    start_ix=start_ix,
                    end_ix=end_ix,
                    sentence_num=sentence_num)
    metadata_field = MetadataField(metadata)

    # Trigger labels. One label per token in the input. Note that every token and
    # span is looked up in the label dicts below, so the caller must supply
    # mappings that default to a null label (e.g. a defaultdict).
    token_trigger_labels = []
    for i in range(len(text_field)):
        token_trigger_labels.append(trigger_dict[i])
    trigger_label_field = SequenceLabelField(token_trigger_labels, text_field,
                                             label_namespace="trigger_labels")

    # Generate fields for text spans, ner labels, coref labels.
    spans = []
    span_ner_labels = []
    span_coref_labels = []
    for start, end in enumerate_spans(sentence, max_span_width=self._max_span_width):
        span_ix = (start, end)
        span_ner_labels.append(ner_dict[span_ix])
        span_coref_labels.append(cluster_dict[span_ix])
        spans.append(SpanField(start, end, text_field))
    span_field = ListField(spans)
    ner_label_field = SequenceLabelField(span_ner_labels, span_field,
                                         label_namespace="ner_labels")
    coref_label_field = SequenceLabelField(span_coref_labels, span_field,
                                           label_namespace="coref_labels")

    # Generate labels for relations and arguments. Only store non-null values.
    # For the arguments, by convention the first span specifies the trigger, and the second
    # specifies the argument. Ideally we'd have an adjacency field between (token, span) pairs
    # for the event arguments field, but AllenNLP doesn't make it possible to express
    # adjacencies between two different sequences.
    n_spans = len(spans)
    span_tuples = [(span.span_start, span.span_end) for span in spans]
    candidate_indices = [(i, j) for i in range(n_spans) for j in range(n_spans)]

    relations = []
    relation_indices = []
    for i, j in candidate_indices:
        span_pair = (span_tuples[i], span_tuples[j])
        relation_label = relation_dict[span_pair]
        if relation_label:
            relation_indices.append((i, j))
            relations.append(relation_label)
    relation_label_field = AdjacencyField(indices=relation_indices,
                                          sequence_field=span_field,
                                          labels=relations,
                                          label_namespace="relation_labels")

    arguments = []
    argument_indices = []
    n_tokens = len(sentence)
    candidate_indices = [(i, j) for i in range(n_tokens) for j in range(n_spans)]
    for i, j in candidate_indices:
        token_span_pair = (i, span_tuples[j])
        argument_label = argument_dict[token_span_pair]
        if argument_label:
            argument_indices.append((i, j))
            arguments.append(argument_label)
    argument_label_field = AdjacencyFieldAssym(indices=argument_indices,
                                               row_field=text_field,
                                               col_field=span_field,
                                               labels=arguments,
                                               label_namespace="argument_labels")

    # Pull it all together.
    fields = dict(text=text_field_with_context,
                  spans=span_field,
                  ner_labels=ner_label_field,
                  coref_labels=coref_label_field,
                  trigger_labels=trigger_label_field,
                  argument_labels=argument_label_field,
                  relation_labels=relation_label_field,
                  metadata=metadata_field)
    return Instance(fields)
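# Since every token index and every (start, end) span is looked up in the label
# dictionaries above, the caller has to supply mappings with a null default.
# A minimal sketch of that convention (the default values are assumptions about
# this reader's label scheme, not confirmed by the source):
from collections import defaultdict

trigger_dict = defaultdict(str)         # token index -> trigger label, "" if none
ner_dict = defaultdict(str)             # (start, end) -> entity label, "" if none
cluster_dict = defaultdict(lambda: -1)  # (start, end) -> cluster id, -1 if none
relation_dict = defaultdict(str)        # (span, span) -> relation label
argument_dict = defaultdict(str)        # (token, span) -> argument role

ner_dict[(0, 1)] = "PER"                # mark one gold entity span
cluster_dict[(0, 1)] = 0                # and put it in cluster 0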
def make_reading_comprehension_instance(question_tokens: List[Token],
                                        passage_tokens: List[Token],
                                        token_indexers: Dict[str, TokenIndexer],
                                        passage_text: str,
                                        token_spans_sent: List[Tuple[int, int]] = None,
                                        sent_labels: List[int] = None,
                                        answer_texts: List[str] = None,
                                        passage_offsets: List[Tuple] = None,
                                        evd_possible_chains: List[List[int]] = None,
                                        ans_sent_idxs: List[int] = None,
                                        article_id: str = None,
                                        para_limit: int = 2250) -> Instance:
    """
    Parameters
    ----------
    question_tokens : ``List[Token]``
        An already-tokenized question.
    passage_tokens : ``List[Token]``
        An already-tokenized passage that contains the answer to the given question.
    token_indexers : ``Dict[str, TokenIndexer]``
        Determines how the question and passage ``TextFields`` will be converted into tensors that
        get input to a model.  See :class:`TokenIndexer`.
    passage_text : ``str``
        The original passage text.  We need this so that we can recover the actual span from the
        original passage that the model predicts as the answer to the question.  This is used in
        official evaluation scripts.
    token_spans_sent : ``List[Tuple[int, int]]``, optional
        Indices into ``passage_tokens`` giving the start and end of each sentence in the passage.
    sent_labels : ``List[int]``, optional
        Binary labels denoting whether each sentence is a supporting fact.
    answer_texts : ``List[str]``, optional
        All valid answer strings for the given question.  This is put into the metadata for use
        with official evaluation scripts, but not used anywhere else.
    passage_offsets : ``List[Tuple]``, optional
        Character offsets of the passage tokens; stored in the metadata.
    evd_possible_chains : ``List[List[int]]``, optional
        Candidate evidence chains (lists of sentence indices); processed into fields and metadata.
    ans_sent_idxs : ``List[int]``, optional
        Indices of the sentences containing the answer; stored in the metadata.
    article_id : ``str``, optional
        The id of the source article; stored in the metadata.
    para_limit : ``int``, optional
        The maximum number of passage tokens to keep.
    """
    fields: Dict[str, Field] = {}
    limit = min(para_limit, len(passage_tokens))
    passage_tokens = passage_tokens[:limit]
    # This is separate so we can reference it later with a known type.
    passage_field = TextField(passage_tokens, token_indexers)
    # sent_spans: list of SpanField(sent_start, sent_end), the start and end offset of each sentence.
    # sent_labels_: list of labels denoting whether each sentence is a supporting fact.
    sent_spans, sent_labels_ = process_sent_spans(token_spans_sent, sent_labels,
                                                  passage_field, para_limit)
    fields['sent_labels'] = ListField(sent_labels_)
    fields['sentence_spans'] = ListField(sent_spans)
    fields['passage'] = passage_field
    fields['question'] = TextField(question_tokens, token_indexers)

    # Filter spans that exceed the para limit so that the info in the metadata is correct.
    token_spans_sent = [(s, e if e < limit else limit - 1)
                        for s, e in token_spans_sent if s < limit]
    sent_labels = sent_labels[:len(token_spans_sent)]

    evd_possible_chains_ = process_evidence_chains(evd_possible_chains, sent_labels_, fields)
    metadata = make_meta_data(passage_text, passage_offsets, question_tokens, passage_tokens,
                              token_spans_sent, sent_labels, answer_texts, evd_possible_chains,
                              evd_possible_chains_, ans_sent_idxs, article_id)
    fields['metadata'] = MetadataField(metadata)
    return Instance(fields)
def __getitem__(self, index):
    # if self.split == 'test':
    #     raise ValueError("blind test mode not supported quite yet")
    item = deepcopy(self.items[index])
    image_id = int(item['img_id'].split('-')[-1])
    with h5py.File(self.tag_feature_path, 'r') as h5:
        tag_features = np.array(h5[str(image_id)]['features'], dtype=np.float32)
        tag_boxes = np.array(h5[str(image_id)]['boxes'], dtype=np.float32)
        tag_obj_indices = np.array(h5[str(image_id)]['obj_indices'], dtype=np.int64)
    with h5py.File(self.non_tag_feature_path, 'r') as h5:
        non_tag_boxes = np.array(h5[str(image_id)]['boxes'], dtype=np.float32)
        non_tag_obj_indices = np.array(h5[str(image_id)]['obj_indices'], dtype=np.int64)
        non_tag_features = np.array(h5[str(image_id)]['features'], dtype=np.float32)

    ###################################################################
    # Load questions and answers
    non_tag_question_annotid2detidx = self.non_tag_question_annotid2detidx[item['annot_id']]
    non_tag_answer_annotid2detidx = self.non_tag_answer_annotid2detidx[item['annot_id']]
    non_tag_rationale_annotid2detidx = self.non_tag_rationale_annotid2detidx[item['annot_id']]
    if self.mode == 'answer':
        question_annotid2detidx = non_tag_question_annotid2detidx
        answer_annotid2detidx = non_tag_answer_annotid2detidx
    else:
        conditioned_label = item['answer_label'] if self.split != 'test' else self.conditioned_answer_choice
        q_len = len(item['question'])
        question_annotid2detidx = {}
        for k, v in non_tag_question_annotid2detidx.items():
            question_annotid2detidx[k] = v
        for k, v in non_tag_answer_annotid2detidx[conditioned_label].items():
            question_annotid2detidx[k + q_len] = v
        answer_annotid2detidx = non_tag_rationale_annotid2detidx
    if self.mode == 'rationale':
        conditioned_label = item['answer_label'] if self.split != 'test' else self.conditioned_answer_choice
        item['question'] += item['answer_choices'][conditioned_label]

    answer_choices = item['{}_choices'.format(self.mode)]
    dets2use, old_det_to_new_ind = self._get_dets_to_use(item)
    non_tag_dets2use, non_tag_old_det_to_new_ind = self._get_non_tag_det_to_use(
        question_annotid2detidx, answer_annotid2detidx, len(non_tag_boxes))
    if self.add_image_as_a_box:
        assert len(dets2use) == np.max(old_det_to_new_ind)
        non_tag_old_det_to_new_ind += 1
    # Shift the non-tag detection indices, effectively appending the non-tag
    # detections to the tag detections.
    non_tag_old_det_to_new_ind[np.where(non_tag_old_det_to_new_ind)[0]] += len(dets2use)
    old_det_to_new_ind = old_det_to_new_ind.tolist()
    non_tag_old_det_to_new_ind = non_tag_old_det_to_new_ind.tolist()

    ###################################################################
    # Load in BERT. We'll get contextual representations of the context and the answer choices.
    # grp_items = {k: np.array(v, dtype=np.float16) for k, v in self.get_h5_group(index).items()}
    with h5py.File(self.h5fn, 'r') as h5:
        grp_items = {k: np.array(v, dtype=np.float16) for k, v in h5[str(index)].items()}

    # Essentially we need to condition on the right answer choice here, if we're doing QA->R.
    # We will always condition on the `conditioned_answer_choice`.
    condition_key = self.conditioned_answer_choice if self.split == "test" and self.mode == "rationale" else ""

    instance_dict = {}
    if 'endingonly' not in self.embs_to_load:
        questions_tokenized, question_tags = zip(*[_my_fix_tokenization(
            item['question'],
            grp_items[f'ctx_{self.mode}{condition_key}{i}'],
            old_det_to_new_ind,
            item['objects'],
            non_tag_old_det_to_new_ind,
            question_annotid2detidx,
            token_indexers=self.token_indexers,
            pad_ind=0 if self.add_image_as_a_box else -1,
        ) for i in range(4)])
        instance_dict['question'] = ListField(questions_tokenized)
        instance_dict['question_tags'] = ListField(question_tags)

    answers_tokenized, answer_tags = zip(*[_my_fix_tokenization(
        answer,
        grp_items[f'answer_{self.mode}{condition_key}{i}'],
        old_det_to_new_ind,
        item['objects'],
        non_tag_old_det_to_new_ind,
        answer_annotid2detidx[i],
        token_indexers=self.token_indexers,
        pad_ind=0 if self.add_image_as_a_box else -1,
    ) for i, answer in enumerate(answer_choices)])
    instance_dict['answers'] = ListField(answers_tokenized)
    instance_dict['answer_tags'] = ListField(answer_tags)
    if self.split != 'test':
        instance_dict['label'] = LabelField(item['{}_label'.format(self.mode)], skip_indexing=True)
    instance_dict['metadata'] = MetadataField({'annot_id': item['annot_id'],
                                               'ind': index,
                                               'movie': item['movie'],
                                               'img_fn': item['img_fn'],
                                               'question_number': item['question_number'],
                                               'img_id': item['img_id']})

    # Node features.
    node_tokenized, node_tags = zip(*[_fix_word(
        i, index, item['annot_id'], self.h5fn_graph, self.h5fn_word, pad_ind=0
    ) for i in range(4)])
    instance_dict['node'] = ListField(node_tokenized)

    # Visual concepts.
    visual_concept_tokenized, visual_concept_tags = zip(*[_fix_visual_concept(
        item['visual_concept'], item['visual_concept_num'], self.h5fn_word, pad_ind=0
    ) for i in range(4)])
    instance_dict['visual_concept'] = ListField(visual_concept_tokenized)

    # Adjacency matrices.
    adj_result, adj_len = zip(*[_fix_adj(
        i, index, item['annot_id'], self.h5fn_graph, pad_ind=0
    ) for i in range(4)])
    instance_dict['adjacent'] = ListField(adj_result)

    ###################################################################
    # The image itself is not loaded or rescaled in this reader.
    # image = load_image(os.path.join(VCR_IMAGES_DIR, item['img_fn']))
    # image, window, img_scale, padding = resize_image(image, random_pad=self.is_train)
    # image = to_tensor_and_normalize(image)
    # c, h, w = image.shape

    ###################################################################
    # Load boxes.
    with open(os.path.join(VCR_IMAGES_DIR, item['metadata_fn']), 'r') as f:
        metadata = json.load(f)
    # Chop off the final dimension, that's the confidence.
    tag_boxes = np.array(metadata['boxes'])[dets2use, :-1]
    if self.add_image_as_a_box:
        # Here we just use a dummy box for the whole-image "background" detection.
        tag_boxes = np.row_stack(([1, 1, 700, 700], tag_boxes))
    non_tag_boxes = non_tag_boxes[non_tag_dets2use]
    boxes = np.concatenate((tag_boxes, non_tag_boxes))

    if self.add_image_as_a_box:
        dets2use = dets2use + 1
        dets2use = np.insert(dets2use, 0, 0)
    tag_det_features = tag_features[dets2use]
    non_tag_det_features = non_tag_features[non_tag_dets2use]
    det_features = np.concatenate((tag_det_features, non_tag_det_features))
    instance_dict['det_features'] = ArrayField(det_features, padding_value=0)
    assert det_features.shape[0] == boxes.shape[0]
    instance_dict['boxes'] = ArrayField(boxes, padding_value=-1)

    instance = Instance(instance_dict)
    instance.index_fields(self.vocab)
    # No image tensor is loaded here; keep the (image, instance) return shape.
    return None, instance
def text_to_instance(self,  # type: ignore
                     tokens: List[str],
                     predicate_indices: List[int],
                     token_representations: FloatTensor = None,
                     labels: List[float] = None):
    """
    Parameters
    ----------
    tokens : ``List[str]``, required.
        The tokens in the sentence to be encoded.
    predicate_indices : ``List[int]``, required.
        A list of ints, where each item denotes the index of a token to predict a value for.
    token_representations : ``FloatTensor``, optional (default = ``None``)
        Precomputed token representations to use in the instance. If ``None``, we use a
        ``Contextualizer`` provided to the dataset reader to calculate the token
        representations. Shape is (seq_len, representation_dim).
    labels : ``List[float]``, optional (default = ``None``)
        The labels of the arcs. ``None`` indicates that labels are not provided.

    Returns
    -------
    An ``Instance`` containing the following fields:
        raw_tokens : ``ListField[MetadataField]``
            The raw str tokens in the sequence. Each MetadataField stores the raw string
            of a single token.
        label_indices : ``SequenceArrayField``
            Array of shape (num_labels,) corresponding to the indices of tokens to predict
            a value for.
        token_representations : ``ArrayField``
            Contains the representation of the tokens.
        labels : ``SequenceArrayField``
            The labels corresponding to each arc represented in ``label_indices``.
    """
    fields: Dict[str, Field] = {}

    # Add raw_tokens to the fields.
    if self._include_raw_tokens:
        fields["raw_tokens"] = ListField([MetadataField(token) for token in tokens])

    # Add label_indices to the fields.
    label_indices_field = SequenceArrayField(
        # Subtract 1 since the original data is 1-indexed.
        # Pad with -1 since 0 (usually the mask token) is a valid label index.
        np.array(predicate_indices, dtype="int64") - 1,
        padding_value=-1)
    fields["label_indices"] = label_indices_field

    if token_representations is None and self._contextualizer:
        # Contextualize the tokens.
        token_representations = self._contextualizer([tokens])[0]

    # Add representations of the tokens at the arc indices to the fields.
    # If we don't have representations, the field is simply omitted.
    if token_representations is not None:
        fields["token_representations"] = ArrayField(token_representations.numpy())

    if labels:
        fields["labels"] = SequenceArrayField(np.array(labels, dtype="float32"))

    return Instance(fields)
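# The only subtle part above is the index bookkeeping. A quick sketch of the
# convention with illustrative values:
import numpy as np

predicate_indices = [1, 4, 7]  # 1-indexed positions, as in the raw data
label_indices = np.array(predicate_indices, dtype="int64") - 1
print(label_indices)           # [0 3 6]; padding later uses -1, since 0 is valid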
def text_to_instance(self, graph, do_print=False) -> Optional[Instance]:
    """
    Does the bulk of the work converting a graph to an ``Instance`` of ``Fields``.
    """
    # pylint: disable=arguments-differ
    fields: Dict[str, Field] = {}
    max_tgt_length = None if self.eval else 60
    d = DecompGraph(graph, drop_syntax=self.drop_syntax, order=self.order)
    list_data = d.get_list_data(bos=START_SYMBOL,
                                eos=END_SYMBOL,
                                bert_tokenizer=self._tokenizer,
                                max_tgt_length=max_tgt_length,
                                semantics_only=self.semantics_only)
    if list_data is None:
        return None
    if do_print:
        self.spot_check(graph, list_data)

    # These four fields are used for the seq2seq model and target-side self copy.
    fields["source_tokens"] = TextField(
        tokens=[Token(x) for x in list_data["src_tokens"]],
        token_indexers=self._source_token_indexers)

    if list_data['src_token_ids'] is not None:
        fields['source_subtoken_ids'] = ArrayField(list_data['src_token_ids'])
        self._number_bert_ids += len(list_data['src_token_ids'])
        self._number_bert_oov_ids += len(
            [bert_id for bert_id in list_data['src_token_ids'] if bert_id == 100])

    if list_data['src_token_subword_index'] is not None:
        fields['source_token_recovery_matrix'] = ArrayField(
            list_data['src_token_subword_index'])

    # Target-side input.  (Exclude the last token, <EOS>.)
    fields["target_tokens"] = TextField(
        tokens=[Token(x) for x in list_data["tgt_tokens"][:-1]],
        token_indexers=self._target_token_indexers)
    if len(list_data['tgt_tokens']) > 60:
        self.over_len += 1

    fields["source_pos_tags"] = SequenceLabelField(
        labels=list_data["src_pos_tags"],
        sequence_field=fields["source_tokens"],
        label_namespace="pos_tags")

    if list_data["tgt_pos_tags"] is not None:
        fields["target_pos_tags"] = SequenceLabelField(
            labels=list_data["tgt_pos_tags"][:-1],
            sequence_field=fields["target_tokens"],
            label_namespace="pos_tags")

    fields["target_node_indices"] = SequenceLabelField(
        labels=list_data["tgt_indices"][:-1],
        sequence_field=fields["target_tokens"],
        label_namespace="node_indices")

    # Target-side output.
    # Include <BOS> here because we want it in the generation vocabulary, so that
    # at the start of inference <BOS> can be correctly initialized.
    fields["generation_outputs"] = TextField(
        tokens=[Token(x) for x in list_data["tgt_tokens_to_generate"]],
        token_indexers=self._generation_token_indexers)

    fields["target_copy_indices"] = SequenceLabelField(
        labels=list_data["tgt_copy_indices"],
        sequence_field=fields["generation_outputs"],
        label_namespace="target_copy_indices")

    fields["target_attention_map"] = AdjacencyField(  # TODO: replace it with ArrayField.
        indices=list_data["tgt_copy_map"],
        sequence_field=fields["generation_outputs"],
        padding_value=0)

    # These two fields are for source copy.
    fields["source_copy_indices"] = SequenceLabelField(
        labels=list_data["src_copy_indices"],
        sequence_field=fields["generation_outputs"],
        label_namespace="source_copy_indices")

    fields["source_attention_map"] = AdjacencyField(  # TODO: replace it with ArrayField.
        indices=list_data["src_copy_map"],
        sequence_field=TextField(
            [Token(x) for x in
             list_data["src_copy_vocab"].get_special_tok_list() + list_data["src_tokens"]],
            None),
        padding_value=0)

    # These two fields are used in the biaffine parser.
    fields["edge_types"] = TextField(
        tokens=[Token(x) for x in list_data["head_tags"]],
        token_indexers=self._edge_type_indexers)

    fields["edge_heads"] = SequenceLabelField(
        labels=list_data["head_indices"],
        sequence_field=fields["edge_types"],
        label_namespace="edge_heads")

    if list_data.get('node_mask', None) is not None:
        # Valid nodes are 1; pads are 0.
        fields['valid_node_mask'] = ArrayField(list_data['node_mask'])

    if list_data.get('edge_mask', None) is not None:
        # A matrix of shape [num_nodes, num_nodes] where entry (i, j) is 1
        # if and only if (1) j < i and (2) j is not an antecedent of i.
        # TODO: try to remove the second constraint.
        fields['edge_head_mask'] = ArrayField(list_data['edge_mask'])

    # Node attributes.
    fields["target_attributes"] = ContinuousLabelField(
        labels=list_data["tgt_attributes"][:-1],
        sequence_field=fields["target_tokens"],
        ontology=NODE_ONTOLOGY)

    # Edge attributes.
    fields["edge_attributes"] = ContinuousLabelField(
        labels=list_data["edge_attributes"][:-1],
        sequence_field=fields["target_tokens"],
        ontology=EDGE_ONTOLOGY)

    # This field is actually needed for scoring later.
    fields["graph"] = MetadataField(list_data['arbor_graph'])

    # Metadata fields, good for debugging.
    fields["src_tokens_str"] = MetadataField(list_data["src_tokens"])
    fields["tgt_tokens_str"] = MetadataField(list_data.get("tgt_tokens", []))
    fields["src_copy_vocab"] = MetadataField(list_data["src_copy_vocab"])
    fields["tag_lut"] = MetadataField(dict(pos=list_data["pos_tag_lut"]))
    fields["source_copy_invalid_ids"] = MetadataField(list_data['src_copy_invalid_ids'])
    fields["node_name_list"] = MetadataField(list_data['node_name_list'])
    fields["target_dynamic_vocab"] = MetadataField(dict())
    fields["instance_meta"] = MetadataField(dict(
        pos_tag_lut=list_data["pos_tag_lut"],
        source_dynamic_vocab=list_data["src_copy_vocab"],
        target_token_indexers=self._target_token_indexers))

    return Instance(fields)
def text_to_instance(self,  # type: ignore
                     rule_text: str,
                     question: str,
                     scenario: str,
                     history: List[Dict[str, str]],
                     utterance_id: str = None,
                     tree_id: str = None,
                     source_url: str = None,
                     answer: str = None,
                     evidence: List[Dict[str, str]] = None) -> Optional[Instance]:
    """
    Turn a rule text, question, scenario, and dialog history into an ``Instance``.

    Parameters
    ----------
    rule_text : ``str``, required
        The rule passage the dialog is grounded in; it forms the CopyNet source side.
    question : ``str``, required
    scenario : ``str``, required
    history : ``List[Dict[str, str]]``, required
        The dialog history, as a list of follow-up question/answer pairs.
    answer : ``str``, optional (default = None)
        The target answer string, used to build the CopyNet target side and the
        classification label.

    Returns
    -------
    Instance
        See the above for a description of the fields that the instance will contain.
    """
    # pylint: disable=arguments-differ
    # For the CopyNet model.
    source_string = rule_text + ' [SEP]'
    target_string = answer
    tokenized_source = self._source_tokenizer.tokenize(source_string)
    tokenized_source.insert(0, Token(START_SYMBOL))
    # No END_SYMBOL is appended: '[SEP]' acts as the end symbol.
    source_field = TextField(tokenized_source, self._source_token_indexers)

    # For each token in the source sentence, we keep track of the matching token
    # in the target sentence (which will be the OOV symbol if there is no match).
    source_to_target_field = NamespaceSwappingField(tokenized_source[1:-1],
                                                    self._target_namespace)
    meta_fields = {"source_tokens": [x.text for x in tokenized_source[1:-1]]}
    fields_dict = {
        "source_tokens": source_field,
        "source_to_target": source_to_target_field,
    }

    # For the BERT model.
    passage_text = rule_text + ' [SEP]'
    question_text = question
    question_text += ' @ss@ ' + scenario
    question_text += ' @hs@ '
    for follow_up_qna in history:
        question_text += '@qs@ '
        question_text += follow_up_qna['follow_up_question'] + ' '
        question_text += follow_up_qna['follow_up_answer'] + ' '
    question_text += '@he@'
    bert_input = passage_text + ' ' + question_text
    bert_input_tokens = self._bert_tokenizer.tokenize(bert_input)
    bert_input_tokens.insert(0, Token(START_SYMBOL))
    fields_dict['bert_input'] = TextField(bert_input_tokens, self._bert_token_indexers)
    meta_fields['passage_tokens'] = self._bert_tokenizer.tokenize(passage_text)

    if target_string is not None:
        tokenized_target = self._target_tokenizer.tokenize(target_string)
        tokenized_target.insert(0, Token(START_SYMBOL))
        tokenized_target.append(Token(END_SYMBOL))
        target_field = TextField(tokenized_target, self._target_token_indexers)
        fields_dict["target_tokens"] = target_field
        meta_fields["target_tokens"] = [y.text for y in tokenized_target[1:-1]]
        source_and_target_token_ids = self._tokens_to_ids(tokenized_source[1:-1] +
                                                          tokenized_target)
        source_token_ids = source_and_target_token_ids[:len(tokenized_source) - 2]
        fields_dict["source_token_ids"] = ArrayField(np.array(source_token_ids))
        target_token_ids = source_and_target_token_ids[len(tokenized_source) - 2:]
        fields_dict["target_token_ids"] = ArrayField(np.array(target_token_ids))
        action = 'More' if answer not in ['Yes', 'No', 'Irrelevant'] else answer
        fields_dict['label'] = LabelField(action)
    else:
        source_token_ids = self._tokens_to_ids(tokenized_source[1:-1])
        fields_dict["source_token_ids"] = ArrayField(np.array(source_token_ids))

    meta_fields['rule_text'] = rule_text
    meta_fields['question'] = question
    meta_fields['scenario'] = scenario
    meta_fields['history'] = history
    fields_dict["metadata"] = MetadataField(meta_fields)

    return Instance(fields_dict)
def text_to_instance(self,  # type: ignore
                     annotation_id: str,
                     documents: Dict[str, List[str]],
                     rationales: Dict[str, List[Tuple[int, int]]],
                     query: str,
                     label: str = None) -> Instance:
    # pylint: disable=arguments-differ
    fields = {}

    tokens = []
    is_evidence = []
    document_to_span_map = {}
    document_to_span_map_whole = {}

    query = query.split("[sep]")
    query = [x.strip() for x in query]

    for docid, docwords in documents.items():
        document_to_span_map_whole[docid] = (len(tokens), len(tokens) + len(docwords))
        tokens += [Token(word) for word in docwords]
        document_to_span_map[docid] = (len(tokens) - len(docwords), len(tokens))
        tokens.append(Token("[SEP]"))

        # Mark every token covered by a rationale span as evidence; the
        # trailing [SEP] is always treated as evidence.
        rationale = [0] * len(docwords)
        if docid in rationales:
            for s, e in rationales[docid]:
                for i in range(s, e):
                    rationale[i] = 1
        is_evidence += rationale + [1]

    always_keep_mask = [1 if t.text.upper() == "[SEP]" else 0 for t in tokens]

    fields["document"] = TextField(tokens, self._token_indexers)
    fields["rationale"] = SequenceLabelField(is_evidence,
                                             sequence_field=fields["document"],
                                             label_namespace="evidence_labels")
    fields["kept_tokens"] = SequenceLabelField(always_keep_mask,
                                               sequence_field=fields["document"],
                                               label_namespace="kept_token_labels")

    metadata = {
        "annotation_id": annotation_id,
        "tokens": tokens,
        "document_to_span_map": document_to_span_map,
        "convert_tokens_to_instance": self.convert_tokens_to_instance,
        "document_to_span_map_whole": document_to_span_map_whole,
        "always_keep_mask": np.array(always_keep_mask),
    }
    fields["metadata"] = MetadataField(metadata)
    # The query holds the answer choices; zip them (and the gold label) with keys A-E.
    fields["label"] = MetadataField(
        {k: v for k, v in zip(["A", "B", "C", "D", "E", "Label"], query + [label])})

    return Instance(fields)
def text_to_instance(self,  # type: ignore
                     tokens: List[Token],
                     pos_tags: List[str] = None,
                     chunk_tags: List[str] = None,
                     ner_tags: List[str] = None,
                     target_verb_lemma: str = None,
                     target_verb_position: int = None,
                     verb_sense: str = None,
                     legal_args: List[str] = None,
                     verb_annotation: List[str] = None,
                     parse: str = None) -> Instance:
    """
    We take `pre-tokenized` input here, because we don't have a tokenizer in this class.
    """
    sequence = TextField(tokens, self._token_indexers)
    instance_fields: Dict[str, Field] = {'tokens': sequence}
    words = [x.text for x in tokens]
    instance_fields["metadata"] = MetadataField({
        "words": words,  # used in ai2's srl model
        "pos_tags": pos_tags,
        "chunk_tags": chunk_tags,
        "ner_tags": ner_tags,
        "target_verb_lemma": target_verb_lemma,
        "target_verb_position": target_verb_position,
        "verb_annotation": verb_annotation,
        "verb_sense": verb_sense,
        "legal_args": legal_args,
        "verb": target_verb_lemma,  # used in ai2's srl model
        "parse": parse  # for constraints on the dev-set srl
    })

    # This is the position of the gold verb predicate.
    # We may or may not use it (the model might predict the predicate), but the reader
    # always sends it.
    # instance_fields["verb_pos"] = IndexField(index=target_verb_position, sequence_field=sequence)
    # TODO: AllenNLP uses a sequence feature field for an indicator vector of the verb position (find this).
    # instance_fields["verb_indicator"] = SequenceFeatureField(index=target_verb_position, sequence_field=sequence)
    verb_indicator = np.zeros(len(tokens))
    verb_indicator[target_verb_position] = 1.0
    instance_fields["verb_indicator"] = ArrayField(array=verb_indicator)

    # Everyone follows the default IOB2 == BIO format here.
    coded_srl = get_bio_from_spans(verb_annotation,
                                   year=self.year,
                                   core_args_only=self.core_args_only)
    coded_chunks = chunk_tags
    coded_ner = ner_tags
    if self.coding_scheme == "BIOUL":
        # coded_srl = get_bio_from_spans(verb_annotation)
        coded_chunks = to_bioul(chunk_tags,
                                encoding=self._original_coding_scheme) if chunk_tags is not None else None
        coded_ner = to_bioul(ner_tags,
                             encoding=self._original_coding_scheme) if ner_tags is not None else None

    if 'pos' in self.feature_labels:
        if pos_tags is None:
            raise ConfigurationError("Dataset reader was specified to use pos_tags as "
                                     "features. Pass them to text_to_instance.")
        instance_fields['pos_tags'] = SequenceLabelField(pos_tags, sequence, "pos_tags")
    if 'chunk' in self.feature_labels:
        if coded_chunks is None:
            raise ConfigurationError("Dataset reader was specified to use chunk tags as "
                                     "features. Pass them to text_to_instance.")
        instance_fields['chunk_tags'] = SequenceLabelField(coded_chunks, sequence, "chunk_tags")
    if 'ner' in self.feature_labels:
        if coded_ner is None:
            raise ConfigurationError("Dataset reader was specified to use NER tags as "
                                     "features. Pass them to text_to_instance.")
        instance_fields['ner_tags'] = SequenceLabelField(coded_ner, sequence, "ner_tags")

    # Add the "tag label" to the instance.
    if self.tag_label == 'srl' and coded_srl is not None:
        instance_fields['tags'] = SequenceLabelField(coded_srl, sequence, self.label_namespace)
    elif self.tag_label == 'pos' and pos_tags is not None:
        instance_fields['tags'] = SequenceLabelField(pos_tags, sequence, self.label_namespace)
    elif self.tag_label == 'chunk' and coded_chunks is not None:
        instance_fields['tags'] = SequenceLabelField(coded_chunks, sequence, self.label_namespace)

    return Instance(instance_fields)
def text_to_instance(self, tokens: List[Token]) -> Instance:  # type: ignore
    """
    We take `pre-tokenized` input here, because we don't have a tokenizer in this class.
    """
    # pylint: disable=arguments-differ
    return Instance({'tokens': TextField(tokens, token_indexers=self._token_indexers)})
def __getitem__(self, index):
    # if self.split == 'test':
    #     raise ValueError("blind test mode not supported quite yet")
    item = deepcopy(self.items[index])

    ###################################################################
    # Load questions and answers
    if self.mode == 'rationale':
        conditioned_label = item['answer_label'] if self.split != 'test' else self.conditioned_answer_choice
        item['question'] += item['answer_choices'][conditioned_label]

    answer_choices = item['{}_choices'.format(self.mode)]
    dets2use, old_det_to_new_ind = self._get_dets_to_use(item)

    ###################################################################
    # Load in BERT. We'll get contextual representations of the context and the answer choices.
    # grp_items = {k: np.array(v, dtype=np.float16) for k, v in self.get_h5_group(index).items()}
    with h5py.File(self.h5fn, 'r') as h5:
        grp_items = {k: np.array(v, dtype=np.float16) for k, v in h5[str(index)].items()}

    # Essentially we need to condition on the right answer choice here, if we're doing QA->R.
    # We will always condition on the `conditioned_answer_choice`.
    condition_key = self.conditioned_answer_choice if self.split == "test" and self.mode == "rationale" else ""

    instance_dict = {}
    if 'endingonly' not in self.embs_to_load:
        questions_tokenized, question_tags = zip(*[_fix_tokenization(
            item['question'],
            grp_items[f'ctx_{self.mode}{condition_key}{i}'],
            old_det_to_new_ind,
            item['objects'],
            token_indexers=self.token_indexers,
            pad_ind=0 if self.add_image_as_a_box else -1
        ) for i in range(4)])
        instance_dict['question'] = ListField(questions_tokenized)
        instance_dict['question_tags'] = ListField(question_tags)

    answers_tokenized, answer_tags = zip(*[_fix_tokenization(
        answer,
        grp_items[f'answer_{self.mode}{condition_key}{i}'],
        old_det_to_new_ind,
        item['objects'],
        token_indexers=self.token_indexers,
        pad_ind=0 if self.add_image_as_a_box else -1
    ) for i, answer in enumerate(answer_choices)])
    instance_dict['answers'] = ListField(answers_tokenized)
    instance_dict['answer_tags'] = ListField(answer_tags)
    if self.split != 'test':
        instance_dict['label'] = LabelField(item['{}_label'.format(self.mode)], skip_indexing=True)
    instance_dict['metadata'] = MetadataField({'annot_id': item['annot_id'],
                                               'ind': index,
                                               'movie': item['movie'],
                                               'img_fn': item['img_fn'],
                                               'question_number': item['question_number']})

    ###################################################################
    # Load the image now and rescale it. Might have to subtract the mean and whatnot here too.
    image = load_image(os.path.join(VCR_IMAGES_DIR, item['img_fn']))
    image, window, img_scale, padding = resize_image(image, random_pad=self.is_train)
    image = to_tensor_and_normalize(image)
    c, h, w = image.shape

    ###################################################################
    # Load boxes.
    with open(os.path.join(VCR_IMAGES_DIR, item['metadata_fn']), 'r') as f:
        metadata = json.load(f)

    # [nobj, 14, 14]
    segms = np.stack([make_mask(mask_size=14,
                                box=metadata['boxes'][i],
                                polygons_list=metadata['segms'][i]) for i in dets2use])

    # Chop off the final dimension, that's the confidence.
    boxes = np.array(metadata['boxes'])[dets2use, :-1]
    # Possibly rescale them if necessary.
    boxes *= img_scale
    boxes[:, :2] += np.array(padding[:2])[None]
    boxes[:, 2:] += np.array(padding[:2])[None]

    obj_labels = [self.coco_obj_to_ind[item['objects'][i]] for i in dets2use.tolist()]
    if self.add_image_as_a_box:
        boxes = np.row_stack((window, boxes))
        segms = np.concatenate((np.ones((1, 14, 14), dtype=np.float32), segms), 0)
        obj_labels = [self.coco_obj_to_ind['__background__']] + obj_labels

    instance_dict['segms'] = ArrayField(segms, padding_value=0)
    instance_dict['objects'] = ListField([LabelField(x, skip_indexing=True) for x in obj_labels])

    # Sanity-check that every box lies inside the (rescaled, padded) image.
    assert np.all((boxes[:, 0] >= 0.) & (boxes[:, 0] < boxes[:, 2]))
    assert np.all((boxes[:, 1] >= 0.) & (boxes[:, 1] < boxes[:, 3]))
    assert np.all(boxes[:, 2] <= w)
    assert np.all(boxes[:, 3] <= h)
    instance_dict['boxes'] = ArrayField(boxes, padding_value=-1)

    instance = Instance(instance_dict)
    instance.index_fields(self.vocab)
    return image, instance
def text_to_instance(self,  # type: ignore
                     tokens: List[str],
                     pos_tags: List[str] = None,
                     gold_tree: Tree = None) -> Instance:
    """
    We take `pre-tokenized` input here, because we don't have a tokenizer in this class.

    # Parameters

    tokens : ``List[str]``, required.
        The tokens in a given sentence.
    pos_tags : ``List[str]``, optional, (default = None).
        The POS tags for the words in the sentence.
    gold_tree : ``Tree``, optional (default = None).
        The gold parse tree to create span labels from.

    # Returns

    An ``Instance`` containing the following fields:
        tokens : ``TextField``
            The tokens in the sentence.
        pos_tags : ``SequenceLabelField``
            The POS tags of the words in the sentence.
            Only returned if ``use_pos_tags`` is ``True``.
        spans : ``ListField[SpanField]``
            A ListField containing all possible subspans of the sentence.
        span_labels : ``SequenceLabelField``, optional.
            The constituency tags for each of the possible spans, with respect to a gold
            parse tree. If a span is not contained within the tree, a span will have a
            ``NO-LABEL`` label.
        gold_tree : ``MetadataField(Tree)``
            The gold NLTK parse tree for use in evaluation.
    """
    if self._convert_parentheses:
        tokens = [PTB_PARENTHESES.get(token, token) for token in tokens]
    text_field = TextField([Token(x) for x in tokens], token_indexers=self._token_indexers)
    fields: Dict[str, Field] = {"tokens": text_field}

    pos_namespace = self._label_namespace_prefix + self._pos_label_namespace
    if self._use_pos_tags and pos_tags is not None:
        pos_tag_field = SequenceLabelField(pos_tags, text_field, label_namespace=pos_namespace)
        fields["pos_tags"] = pos_tag_field
    elif self._use_pos_tags:
        raise ConfigurationError("use_pos_tags was set to True but no gold pos"
                                 " tags were passed to the dataset reader.")

    spans: List[Field] = []
    gold_labels = []
    if gold_tree is not None:
        gold_spans: Dict[Tuple[int, int], str] = {}
        self._get_gold_spans(gold_tree, 0, gold_spans)
    else:
        gold_spans = None
    for start, end in enumerate_spans(tokens):
        spans.append(SpanField(start, end, text_field))
        if gold_spans is not None:
            gold_labels.append(gold_spans.get((start, end), "NO-LABEL"))

    metadata = {"tokens": tokens}
    if gold_tree:
        metadata["gold_tree"] = gold_tree
    if self._use_pos_tags:
        metadata["pos_tags"] = pos_tags
    fields["metadata"] = MetadataField(metadata)

    span_list_field: ListField = ListField(spans)
    fields["spans"] = span_list_field
    if gold_tree is not None:
        fields["span_labels"] = SequenceLabelField(
            gold_labels,
            span_list_field,
            label_namespace=self._label_namespace_prefix + "labels")
    return Instance(fields)
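# Input sketch for the constituency method above: the gold tree is a standard
# NLTK bracketing, from which the tokens and POS tags can be read directly.
# The tree below is a made-up example; with a configured PTB reader, the call
# would be reader.text_to_instance(tokens, pos_tags, tree).
from nltk import Tree

tree = Tree.fromstring("(S (NP (D the) (N dog)) (VP (V chased) (NP (D the) (N cat))))")
tokens = tree.leaves()                      # ['the', 'dog', 'chased', 'the', 'cat']
pos_tags = [pos for _, pos in tree.pos()]   # ['D', 'N', 'V', 'D', 'N']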
def text_to_instance(self,  # type: ignore
                     item: Dict,
                     entity_map: Dict,
                     literals: Set,
                     logical_forms: List = None) -> Instance:
    qid = MetadataField(item['qid'])
    if item['qid'] in [2102902009000]:  # will exceed the maximum length constraint
        return None
    if not self._use_sparql:
        if 's_expression' in item:
            target_string = item['s_expression']
        else:
            target_string = None
    else:
        if 'sparql_query' in item:
            target_string = item['sparql_query']
        else:
            target_string = None

    item['question'] = item['question'].replace(self._delimiter, ' ')

    # Build the constrained target vocabulary.
    if self._use_constrained_vocab and len(entity_map) > 0:
        if not self._training:
            constrained_vocab = self._get_constrained_vocab(entity_map, literals)
        else:
            logical_form = item['s_expression'] if not self._use_sparql else item['sparql_query']
            domains = item['domains'] if not self._gq1 else None
            constrained_vocab = self._get_constrained_vocab(entity_map,
                                                            literals,
                                                            s_expression=logical_form,
                                                            domains=domains)
    elif len(entity_map) == 0 and self._training:
        vocab = set()
        vocab.update(self._schema_constants)
        vocab = list(vocab)
        random.shuffle(vocab)
        vocab = set(vocab[:200])
        if not self._use_sparql:
            vocab.update([x for x in self._target_tokenizer(item['s_expression'])])
        else:
            vocab.update([x for x in self._target_tokenizer(item['sparql_query'])])
        constrained_vocab = list(vocab)
    else:
        vocab = set()
        vocab.update(self._schema_constants)
        for eid in entity_map:
            vocab.add(eid)
        for l in literals:
            vocab.add(l)
        constrained_vocab = list(vocab)

    # Always fix the position of END_SYMBOL and START_SYMBOL in each constrained vocab,
    # because a consistent global shared end_index / start_index is needed by BeamSearch.
    # Here we also fix the position of all other syntactic constants, for the convenience
    # of computing embeddings.
    for k, v in sorted(self._global_syntax_constants_vocab.items(), key=lambda x: x[1]):
        constrained_vocab.insert(v, k)
    schema_constants = constrained_vocab[:]

    # Divide the schema constants into groups of num_constants_per_group each.
    concat_strings = ['' for _ in range(len(schema_constants) // self._num_constants_per_group + 1)]
    for i in range(len(schema_constants) // self._num_constants_per_group + 1):
        if (i + 1) * self._num_constants_per_group <= len(schema_constants):
            right_index = (i + 1) * self._num_constants_per_group
        else:
            right_index = len(schema_constants)
        for constant in schema_constants[i * self._num_constants_per_group:right_index]:
            if constant in entity_map:
                # Represent an entity by its friendly name.
                constant = entity_map[constant]
            if constant == '.':
                # '.' in sparql means "and".
                constant = 'and'
            concat_strings[i] += ' '.join(re.split(r'\.|_', constant.lower())) + self._delimiter

    # Handle sequences of length > 512 by tokenizing each group separately.
    # _source_tokenizer.tokenize appends the leading [CLS] and trailing [SEP] itself.
    tokenized_sources = [
        self._source_tokenizer.tokenize(item['question'] + '[SEP]' + concat_string)
        for concat_string in concat_strings
    ]
    end = []
    start = []
    for tokenized_source in tokenized_sources:
        flag = False
        for i, token in enumerate(tokenized_source):
            if flag and str(token) == self._delimiter:
                end.append(i - 1)
                start.append(i + 1)
            if str(token) == '[SEP]':
                if not flag:
                    start.append(i + 1)
                flag = True
        start = start[:-1]  # ignore the last ';'

    # source_field = ListField(
    #     [TextField(tokenized_source, self._source_token_indexers) for tokenized_source in tokenized_sources])
    source_field = []
    for tokenized_source in tokenized_sources:
        chunk = TextField(tokenized_source, self._source_token_indexers)
        if len(chunk) > self._source_max_tokens:
            raise ValueError("Source chunk of length {} for qid {} exceeds the maximum of {} tokens."
                             .format(len(chunk), item['qid'], self._source_max_tokens))
        source_field.append(chunk)
    source_field = ListField(source_field)

    # vocab_field = TextField([Token(x) for x in constrained_vocab], self._target_token_indexers)
    vocab_field = MetadataField(constrained_vocab)
    assert len(constrained_vocab) == len(start), str(entity_map)

    instance_dict = {
        # The concatenation of the utterance and the schema constants.
        "source_tokens": source_field,
        # The start position of each schema constant in the concatenated input.
        "schema_start": MetadataField(start),
        # The end position of each schema constant in the concatenated input.
        "schema_end": MetadataField(end),
        "constrained_vocab": vocab_field,
        "ids": qid
    }

    # If you want to use F1 during training, uncomment this!
    # if 'answer' in item:
    #     answer = []
    #     for a in item['answer']:
    #         answer.append(a['answer_argument'])
    #     instance_dict['answer'] = MetadataField(answer)

    if not self._training and self._ranking_mode and logical_forms:
        lfs = []
        for lf in logical_forms:
            try:
                lf_field = self._convert_target_to_indices(lf, constrained_vocab, vocab_field)
                lfs.append(lf_field)
            except Exception:
                pass
        if len(lfs) == 0:
            return None
        instance_dict["candidates"] = ListField(lfs)

    if target_string is not None:
        # The id of each target token in constrained_vocab.
        target_field = self._convert_target_to_indices(target_string, constrained_vocab, vocab_field)
        instance_dict["target_tokens"] = target_field

    return Instance(instance_dict)
def make_reading_comprehension_instance(question_tokens: List[Token],
                                        passage_tokens: List[Token],
                                        token_indexers: Dict[str, TokenIndexer],
                                        passage_text: str,
                                        token_spans: List[Tuple[int, int]] = None,
                                        answer_texts: List[str] = None,
                                        additional_metadata: Dict[str, Any] = None) -> Instance:
    """
    Converts a question, a passage, and an optional answer (or answers) to an ``Instance`` for use
    in a reading comprehension model.

    Creates an ``Instance`` with at least these fields: ``question`` and ``passage``, both
    ``TextFields``; and ``metadata``, a ``MetadataField``.  Additionally, if both ``answer_texts``
    and ``char_span_starts`` are given, the ``Instance`` has ``span_start`` and ``span_end``
    fields, which are both ``ListFields`` of ``IndexFields``.

    Parameters
    ----------
    question_tokens : ``List[Token]``
        An already-tokenized question.
    passage_tokens : ``List[Token]``
        An already-tokenized passage that contains the answer to the given question.
    token_indexers : ``Dict[str, TokenIndexer]``
        Determines how the question and passage ``TextFields`` will be converted into tensors that
        get input to a model.  See :class:`TokenIndexer`.
    passage_text : ``str``
        The original passage text.  We need this so that we can recover the actual span from the
        original passage that the model predicts as the answer to the question.  This is used in
        official evaluation scripts.
    token_spans : ``List[Tuple[int, int]]``, optional
        Indices into ``passage_tokens`` to use as the answer to the question for training.  This is
        a list because there might be several possible correct answer spans in the passage.
        Unlike the single-span version of this helper, we keep all annotated spans here.
    answer_texts : ``List[str]``, optional
        All valid answer strings for the given question.  In SQuAD, e.g., the training set has
        exactly one answer per question, but the dev and test sets have several.  TriviaQA has many
        possible answers, which are the aliases for the known correct entity.  This is put into the
        metadata for use with official evaluation scripts, but not used anywhere else.
    additional_metadata : ``Dict[str, Any]``, optional
        The constructed ``metadata`` field will by default contain ``original_passage``,
        ``token_offsets``, ``question_tokens``, ``passage_tokens``, and ``answer_texts`` keys.  If
        you want any other metadata to be associated with each instance, you can pass that in here.
        This dictionary will get added to the ``metadata`` dictionary we already construct.
    """
    additional_metadata = additional_metadata or {}
    fields: Dict[str, Field] = {}
    passage_offsets = [(token.idx, token.idx + len(token.text)) for token in passage_tokens]
    # This is separate so we can reference it later with a known type.
    passage_field = TextField(passage_tokens, token_indexers)
    fields['passage'] = passage_field
    fields['question'] = TextField(question_tokens, token_indexers)
    metadata = {
        'original_passage': passage_text,
        'token_offsets': passage_offsets,
        'question_tokens': [token.text for token in question_tokens],
        'passage_tokens': [token.text for token in passage_tokens],
    }
    if answer_texts:
        metadata['answer_texts'] = answer_texts

    list_span_start = []
    list_span_end = []
    if token_spans:
        # There may be multiple answer annotations; we keep every annotated span rather
        # than picking a single one, so ``span_start`` and ``span_end`` are ListFields.
        for span_start, span_end in token_spans:
            list_span_start.append(IndexField(span_start, passage_field))
            list_span_end.append(IndexField(span_end, passage_field))
        fields['span_start'] = ListField(list_span_start)
        fields['span_end'] = ListField(list_span_end)

    metadata.update(additional_metadata)
    fields['metadata'] = MetadataField(metadata)
    return Instance(fields)
def text_to_instance(  # type: ignore
        self,
        tokens: List[Token],
        pos_tags: List[str] = None,
        chunk_tags: List[str] = None,
        ner_tags: List[str] = None,
) -> Instance:
    """
    We take `pre-tokenized` input here, because we don't have a tokenizer in this class.
    """
    sequence = TextField(tokens, self._token_indexers)
    instance_fields: Dict[str, Field] = {"tokens": sequence}
    instance_fields["metadata"] = MetadataField({"words": [x.text for x in tokens]})

    # Recode the labels if necessary.
    if self.coding_scheme == "BIOUL":
        coded_chunks = (
            to_bioul(chunk_tags, encoding=self._original_coding_scheme)
            if chunk_tags is not None
            else None
        )
        coded_ner = (
            to_bioul(ner_tags, encoding=self._original_coding_scheme)
            if ner_tags is not None
            else None
        )
    else:
        # the default IOB1
        coded_chunks = chunk_tags
        coded_ner = ner_tags

    # Add "feature labels" to instance
    if "pos" in self.feature_labels:
        if pos_tags is None:
            raise ConfigurationError(
                "Dataset reader was specified to use pos_tags as "
                "features. Pass them to text_to_instance."
            )
        instance_fields["pos_tags"] = SequenceLabelField(pos_tags, sequence, "pos_tags")
    if "chunk" in self.feature_labels:
        if coded_chunks is None:
            raise ConfigurationError(
                "Dataset reader was specified to use chunk tags as "
                "features. Pass them to text_to_instance."
            )
        instance_fields["chunk_tags"] = SequenceLabelField(coded_chunks, sequence, "chunk_tags")
    if "ner" in self.feature_labels:
        if coded_ner is None:
            raise ConfigurationError(
                "Dataset reader was specified to use NER tags as "
                "features. Pass them to text_to_instance."
            )
        instance_fields["ner_tags"] = SequenceLabelField(coded_ner, sequence, "ner_tags")

    # Add "tag label" to instance
    if self.tag_label == "ner" and coded_ner is not None:
        instance_fields["tags"] = SequenceLabelField(coded_ner, sequence, self.label_namespace)
    elif self.tag_label == "pos" and pos_tags is not None:
        instance_fields["tags"] = SequenceLabelField(pos_tags, sequence, self.label_namespace)
    elif self.tag_label == "chunk" and coded_chunks is not None:
        instance_fields["tags"] = SequenceLabelField(coded_chunks, sequence, self.label_namespace)

    return Instance(instance_fields)
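# Sketch of the BIOUL recoding used above (illustrative; uses AllenNLP's standard
# `to_bioul` helper, which converts IOB1 tag sequences to the BIOUL scheme).
from allennlp.data.dataset_readers.dataset_utils import to_bioul

# A two-token PER span followed by a single-token LOC span, in IOB1:
print(to_bioul(["I-PER", "I-PER", "O", "I-LOC"], encoding="IOB1"))
# -> ['B-PER', 'L-PER', 'O', 'U-LOC']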
def text_to_instance(
        self,  # type: ignore
        sentence: List[Token],
        gold_clusters: Optional[List[List[Tuple[int, int]]]] = None,
) -> Instance:
    """
    # Parameters

    sentence : `List[Token]`, required.
        The already tokenised sentence to analyse.
    gold_clusters : `Optional[List[List[Tuple[int, int]]]]`, optional (default = None)
        A list of all clusters in the sentence, represented as word spans. Each cluster
        contains some number of spans, which can be nested and overlap, but will never
        exactly match between clusters.

    # Returns

    An `Instance` containing the following `Fields`:
        text : `TextField`
            The text of the full sentence.
        spans : `ListField[SpanField]`
            A ListField containing the spans represented as `SpanFields`
            with respect to the sentence text.
        span_labels : `SequenceLabelField`, optional
            The id of the cluster which each possible span belongs to, or -1 if it does
            not belong to a cluster. As these labels have variable length (it depends on
            how many spans we are considering), we represent this as a `SequenceLabelField`
            with respect to the `spans` `ListField`.
    """
    metadata: Dict[str, Any] = {"original_text": sentence}
    if gold_clusters is not None:
        metadata["clusters"] = gold_clusters

    text_field = TextField(sentence, self._token_indexers)

    cluster_dict = {}
    if gold_clusters is not None:
        for cluster_id, cluster in enumerate(gold_clusters):
            for mention in cluster:
                cluster_dict[tuple(mention)] = cluster_id

    spans: List[Field] = []
    span_labels: Optional[List[int]] = [] if gold_clusters is not None else None

    for start, end in enumerate_spans(sentence, max_span_width=self._max_span_width):
        if span_labels is not None:
            if (start, end) in cluster_dict:
                span_labels.append(cluster_dict[(start, end)])
            else:
                span_labels.append(-1)
        spans.append(SpanField(start, end, text_field))

    span_field = ListField(spans)
    metadata_field = MetadataField(metadata)

    fields: Dict[str, Field] = {
        "text": text_field,
        "spans": span_field,
        "metadata": metadata_field,
    }
    if span_labels is not None:
        fields["span_labels"] = SequenceLabelField(span_labels, span_field)

    return Instance(fields)
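# Sketch of the span enumeration driving the loop above (illustrative).  Spans are
# inclusive (start, end) token indices, capped at `max_span_width`.
from allennlp.data.dataset_readers.dataset_utils import enumerate_spans
from allennlp.data.tokenizers import Token

demo_sentence = [Token(t) for t in ["The", "cat", "sat"]]
print(enumerate_spans(demo_sentence, max_span_width=2))
# -> [(0, 0), (0, 1), (1, 1), (1, 2), (2, 2)]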
def text_to_instance(
        self,
        sentence: str,
        identifier: str,
        image_ids: List[str],
        logical_form: str = None,
        attention_mode: int = None,
        box_annotation: Dict = None,
        denotation: str = None,
) -> Instance:
    tokenized_sentence = self._tokenizer.tokenize(sentence)
    sentence_field = TextField(tokenized_sentence, self._token_indexers)
    world = VisualReasoningNlvr2Language(None, None, None, None, None, None)

    production_rule_fields: List[Field] = []
    instance_action_ids: Dict[str, int] = {}
    for production_rule in world.all_possible_productions():
        instance_action_ids[production_rule] = len(instance_action_ids)
        field = ProductionRuleField(production_rule, is_global_rule=True)
        production_rule_fields.append(field)
    action_field = ListField(production_rule_fields)

    boxes2 = []
    feats2 = []
    max_num_boxes = 0
    for key in image_ids:
        if self.img_data is not None:
            img_info = self.img_data[key]
        else:
            split_name = "train"
            if "dev" in key:
                split_name = "valid"
            img_info = pickle.load(
                open(os.path.join(self._image_feat_cache_dir,
                                  split_name + "_obj36.tsv", key), "rb"))
        boxes = img_info["boxes"].copy()
        feats = img_info["features"].copy()
        assert len(boxes) == len(feats)

        # Normalize the boxes to [0, 1].
        img_h, img_w = img_info["img_h"], img_info["img_w"]
        boxes[..., (0, 2)] /= img_w
        boxes[..., (1, 3)] /= img_h
        np.testing.assert_array_less(boxes, 1 + 1e-5)
        np.testing.assert_array_less(-boxes, 0 + 1e-5)

        if boxes.shape[0] > self._max_boxes:
            boxes = boxes[:self._max_boxes, :]
            feats = feats[:self._max_boxes, :]
        max_num_boxes = max(max_num_boxes, boxes.shape[0])
        boxes2.append(boxes)
        feats2.append(feats)

    # Zero-pad every image's boxes and features to the same number of boxes.
    boxes3 = [np.zeros((max_num_boxes, img_boxes.shape[-1])) for img_boxes in boxes2]
    feats3 = [np.zeros((max_num_boxes, img_feats.shape[-1])) for img_feats in feats2]
    for i in range(len(boxes2)):
        boxes3[i][:boxes2[i].shape[0], :] = boxes2[i]
        feats3[i][:feats2[i].shape[0], :] = feats2[i]
    boxes2 = boxes3
    feats2 = feats3
    feats = np.stack(feats2)
    boxes = np.stack(boxes2)

    metadata: Dict[str, Any] = {
        "utterance": sentence,
        "tokenized_utterance": tokenized_sentence,
        "identifier": identifier,
    }
    fields: Dict[str, Field] = {
        "sentence": sentence_field,
        "actions": action_field,
        "metadata": MetadataField(metadata),
        "image_id": MetadataField(identifier[:-2]),
        "visual_feat": ArrayField(feats),
        "pos": ArrayField(boxes),
    }
    if denotation is not None:
        fields["denotation"] = LabelField(denotation, skip_indexing=True)
    if logical_form:
        lisp_exp = annotation_to_lisp_exp(logical_form)
        target_sequence = world.logical_form_to_action_sequence(lisp_exp)
        index_field = [IndexField(instance_action_ids[action], action_field)
                       for action in target_sequence]
        fields["target_action_sequence"] = ListField(index_field)

        module_attention = annotation_to_module_attention(logical_form)
        target_attention = target_sequence_to_target_attn(target_sequence, module_attention)
        gold_question_attentions = self._assign_attention_to_tokens(
            target_attention, sentence, attention_mode)
        attn_index_field = [
            ListField([IndexField(att, sentence_field) for att in target_att])
            for target_att in gold_question_attentions
        ]
        fields["gold_question_attentions"] = ListField(attn_index_field)

        if box_annotation is None and len(self.box_annotations) > 0:
            fields["gold_box_annotations"] = MetadataField([])
        elif box_annotation is not None:
            # Compute, for each module in the logical form, the indices of its children
            # (modules exactly one indentation level deeper, where indentation is marked
            # by leading periods).
            modules = logical_form.split("\n")
            children = [[] for _ in modules]
            for j, module in enumerate(modules):
                num_periods = len(module) - len(module.strip("."))
                for k in range(j + 1, len(modules)):
                    num_periods_k = len(modules[k]) - len(modules[k].strip("."))
                    if num_periods_k <= num_periods:
                        break
                    if num_periods_k == num_periods + 1:
                        children[j].append(k)
            # Propagate box annotations upward through the image-selection modules.
            for j in range(len(modules) - 1, -1, -1):
                if modules[j].strip(".") == "in_left_image":
                    box_annotation[j] = {}
                    box_annotation[j]["module"] = modules[j].strip(".")
                    box_annotation[j][0] = box_annotation[j + 1][0]
                    box_annotation[j][1] = []
                elif modules[j].strip(".") == "in_right_image":
                    box_annotation[j] = {}
                    box_annotation[j]["module"] = modules[j].strip(".")
                    box_annotation[j][1] = box_annotation[j + 1][1]
                    box_annotation[j][0] = []
                elif modules[j].strip(".") in {"in_one_image", "in_other_image"}:
                    box_annotation[j] = {}
                    box_annotation[j]["module"] = modules[j].strip(".")
                    box_annotation[j][0] = box_annotation[j + 1][0]
                    box_annotation[j][1] = box_annotation[j + 1][1]
            keys = sorted(list(box_annotation.keys()))
            module_boxes = [(mod,
                             box_annotation[mod]["module"],
                             [box_annotation[mod][0], box_annotation[mod][1]])
                            for mod in keys]
            gold_boxes, gold_counts = target_sequence_to_target_boxes(
                target_sequence, module_boxes, children)
            fields["gold_box_annotations"] = MetadataField(gold_boxes)

        metadata["gold"] = world.action_sequence_to_logical_form(target_sequence)
        fields["valid_target_sequence"] = ArrayField(np.array(1, dtype=np.int32))
    else:
        fields["target_action_sequence"] = ListField([IndexField(0, action_field)])
        fields["gold_question_attentions"] = ListField(
            [ListField([IndexField(0, sentence_field)])])
        fields["valid_target_sequence"] = ArrayField(np.array(0, dtype=np.int32))
        if len(self.box_annotations) > 0:
            fields["gold_box_annotations"] = MetadataField([])
    return Instance(fields)
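# Sketch of the box normalization step above (illustrative toy data).  Boxes are
# (x1, y1, x2, y2) pixel coordinates; dividing x's by the image width and y's by the
# height rescales everything into [0, 1], which the assertions then verify.
import numpy as np

demo_boxes = np.array([[10., 20., 100., 80.],
                       [0., 0., 320., 240.]])
demo_img_w, demo_img_h = 320, 240
demo_boxes[..., (0, 2)] /= demo_img_w
demo_boxes[..., (1, 3)] /= demo_img_h
np.testing.assert_array_less(demo_boxes, 1 + 1e-5)   # nothing exceeds the image bounds
np.testing.assert_array_less(-demo_boxes, 0 + 1e-5)  # nothing is negative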
def text_to_instance(self,  # type: ignore
                     question: str,
                     table_lines: List[List[str]],
                     target_values: List[str],
                     offline_search_output: List[str] = None) -> Instance:
    """
    Reads text inputs and makes an instance. WikitableQuestions dataset provides tables as TSV
    files pre-tagged using CoreNLP, which we use for training.

    Parameters
    ----------
    question : ``str``
        Input question
    table_lines : ``List[List[str]]``
        The table content preprocessed by CoreNLP. See ``TableQuestionContext.read_from_lines``
        for the expected format.
    target_values : ``List[str]``
        Target values of the answer, used for denotation evaluation.
    offline_search_output : ``List[str]``, optional
        List of logical forms, produced by offline search. Not required during test.
    """
    # pylint: disable=arguments-differ
    tokenized_question = self._tokenizer.tokenize(question.lower())
    question_field = TextField(tokenized_question, self._question_token_indexers)
    # TODO(pradeep): We'll need a better way to input CoreNLP processed lines.
    table_context = TableQuestionContext.read_from_lines(table_lines, tokenized_question)
    target_values_field = MetadataField(target_values)
    world = WikiTablesVariableFreeWorld(table_context)
    world_field = MetadataField(world)
    # Note: Not passing any feature extractors when instantiating the field below. This will
    # make it use all the available extractors.
    table_field = KnowledgeGraphField(table_context.get_table_knowledge_graph(),
                                      tokenized_question,
                                      self._table_token_indexers,
                                      tokenizer=self._tokenizer,
                                      include_in_vocab=self._use_table_for_vocab,
                                      max_table_tokens=self._max_table_tokens)
    production_rule_fields: List[Field] = []
    for production_rule in world.all_possible_actions():
        _, rule_right_side = production_rule.split(' -> ')
        is_global_rule = not world.is_instance_specific_entity(rule_right_side)
        field = ProductionRuleField(production_rule, is_global_rule=is_global_rule)
        production_rule_fields.append(field)
    action_field = ListField(production_rule_fields)

    fields = {'question': question_field,
              'table': table_field,
              'world': world_field,
              'actions': action_field,
              'target_values': target_values_field}

    # We'll make each target action sequence a List[IndexField], where the index is into
    # the action list we made above.  We need to ignore the type here because mypy doesn't
    # like `action.rule` - it's hard to tell mypy that the ListField is made up of
    # ProductionRuleFields.
    action_map = {action.rule: i  # type: ignore
                  for i, action in enumerate(action_field.field_list)}
    if offline_search_output:
        action_sequence_fields: List[Field] = []
        for logical_form in offline_search_output:
            try:
                expression = world.parse_logical_form(logical_form)
            except ParsingError as error:
                logger.debug(f'Parsing error: {error.message}, skipping logical form')
                logger.debug(f'Question was: {question}')
                logger.debug(f'Logical form was: {logical_form}')
                logger.debug(f'Table info was: {table_lines}')
                continue
            except:
                logger.error(logical_form)
                raise
            action_sequence = world.get_action_sequence(expression)
            try:
                index_fields: List[Field] = []
                for production_rule in action_sequence:
                    index_fields.append(IndexField(action_map[production_rule], action_field))
                action_sequence_fields.append(ListField(index_fields))
            except KeyError as error:
                logger.debug(f'Missing production rule: {error.args}, skipping logical form')
                logger.debug(f'Question was: {question}')
                logger.debug(f'Table info was: {table_lines}')
                logger.debug(f'Logical form was: {logical_form}')
                continue
            if len(action_sequence_fields) >= self._max_offline_logical_forms:
                break

        if not action_sequence_fields:
            # This is not great, but we're only doing it when we're passed logical form
            # supervision, so we're expecting labeled logical forms, but we can't actually
            # produce the logical forms.  We should skip this instance.  Note that this affects
            # _dev_ and _test_ instances, too, so your metrics could be over-estimates on the
            # full test data.
            return None
        fields['target_action_sequences'] = ListField(action_sequence_fields)
    if self._output_agendas:
        agenda_index_fields: List[Field] = []
        for agenda_string in world.get_agenda(conservative=True):
            agenda_index_fields.append(IndexField(action_map[agenda_string], action_field))
        if not agenda_index_fields:
            agenda_index_fields = [IndexField(-1, action_field)]
        fields['agenda'] = ListField(agenda_index_fields)
    return Instance(fields)
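# Sketch of the action-map pattern above (illustrative; the rules here are toy
# stand-ins for `world.all_possible_actions()` output).  Each production rule gets a
# stable index, and a target action sequence becomes a list of indices into the
# action ListField.
demo_actions = ["S -> A", "A -> select", "A -> filter"]
demo_action_map = {rule: i for i, rule in enumerate(demo_actions)}
demo_target_sequence = ["S -> A", "A -> filter"]
print([demo_action_map[rule] for rule in demo_target_sequence])
# -> [0, 2]; these indices become IndexFields into the action ListField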
def text_to_instance(self,  # type: ignore
                     words: List[str],
                     lemmas: List[str] = None,
                     lemma_rules: List[str] = None,
                     upos_tags: List[str] = None,
                     xpos_tags: List[str] = None,
                     feats: List[str] = None,
                     separate_feats: List[Dict[str, str]] = None,
                     dependencies: List[Tuple[str, int]] = None,
                     ids: List[str] = None,
                     multiword_ids: List[str] = None,
                     multiword_forms: List[str] = None,
                     langs: List[str] = None) -> Instance:
    fields: Dict[str, Field] = {}

    if self.use_lang_ids:
        # use ent_type_ for lang_ids
        tokens = TextField([Token(text=w, ent_type_=l) for w, l in zip(words, langs)],
                           self._token_indexers)
    else:
        tokens = TextField([Token(w) for w in words], self._token_indexers)
    fields["tokens"] = tokens

    names = ["upos", "xpos", "feats", "lemmas", "langs"]
    all_tags = [upos_tags, xpos_tags, feats, lemma_rules, langs]
    for name, field in zip(names, all_tags):
        if field:
            fields[name] = SequenceLabelField(field, tokens, label_namespace=name)

    if dependencies is not None:
        # We don't want to expand the label namespace with an additional dummy token, so we'll
        # always give the 'ROOT_HEAD' token a label of 'root'.
        fields["head_tags"] = SequenceLabelField([x[0] for x in dependencies],
                                                 tokens,
                                                 label_namespace="head_tags")
        fields["head_indices"] = SequenceLabelField([int(x[1]) for x in dependencies],
                                                    tokens,
                                                    label_namespace="head_index_tags")

    if self.use_separate_feats:
        feature_seq = []
        for feat_set in separate_feats:
            dimensions = {dimension.replace('[', '_').replace(']', '_'): "_"
                          for dimension in self.ud_feats_schema}
            if feat_set != "_":
                for dimension in feat_set:
                    dimensions[dimension.replace('[', '_').replace(']', '_')] = feat_set[dimension]
            feature_seq.append(dimensions)
        for dimension in self.ud_feats_schema:
            d = dimension.replace('[', '_').replace(']', '_')
            labels = [f[d] for f in feature_seq]
            fields[d] = SequenceLabelField(labels, tokens, label_namespace=d)

    fields["metadata"] = MetadataField({
        "words": words,
        "upos_tags": upos_tags,
        "xpos_tags": xpos_tags,
        "feats": feats,
        "lemmas": lemmas,
        "lemma_rules": lemma_rules,
        "ids": ids,
        "multiword_ids": multiword_ids,
        "multiword_forms": multiword_forms,
        "langs": langs,
    })
    return Instance(fields)
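# Sketch of the language-id trick above (illustrative toy data).  The spaCy-style
# `ent_type_` slot on `Token` is repurposed to carry a per-token language id.
from allennlp.data.tokenizers import Token

demo_words = ["la", "maison", "is", "red"]
demo_langs = ["fr", "fr", "en", "en"]
demo_tokens = [Token(text=w, ent_type_=l) for w, l in zip(demo_words, demo_langs)]
print([(t.text, t.ent_type_) for t in demo_tokens])
# -> [('la', 'fr'), ('maison', 'fr'), ('is', 'en'), ('red', 'en')]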
def text_to_instance(self,  # type: ignore
                     premise: List[Tuple[str, float]],  # Important type information
                     hypothesis: str,
                     pid: str = None,
                     label: str = None) -> Instance:
    fields: Dict[str, Field] = {}

    if self.shuffle_sentences:
        # Potential improvement: shuffle the input sentences.  Maybe disable this for the
        # last several epochs.
        random.shuffle(premise)

    premise_prob_list = []
    premise_tokens_list = []
    for premise_sent, prob in premise:
        tokenized_cur_sent = self.bert_servant.tokenize(premise_sent, modify_from_corenlp=True)
        if self.max_l is not None:
            # Truncate to the maximum length (default 60).
            tokenized_cur_sent = tokenized_cur_sent[:self.max_l]
        premise_tokens_list.extend(tokenized_cur_sent)
        prob_value = np.ones((len(tokenized_cur_sent), 1), dtype=np.float32) * prob
        premise_prob_list.append(prob_value)

    premise_prob = np.concatenate(premise_prob_list, axis=0)

    hypothesis_tokens_list = self.bert_servant.tokenize(hypothesis, modify_from_corenlp=True)
    if self.max_l is not None:
        hypothesis_tokens_list = hypothesis_tokens_list[:self.max_l]
    hypothesis_prob = np.ones((len(hypothesis_tokens_list), 1), dtype=np.float32)

    assert len(premise_tokens_list) == len(premise_prob)
    assert len(hypothesis_tokens_list) == len(hypothesis_prob)

    paired_tokens_sequence = ['[CLS]'] + premise_tokens_list + ['[SEP]'] + \
                             hypothesis_tokens_list + ['[SEP]']
    token_type_ids = [0] * (2 + len(premise_tokens_list)) + \
                     [1] * (1 + len(hypothesis_tokens_list))

    paired_ids_seq = self.bert_servant.tokens_to_ids(paired_tokens_sequence)
    assert len(paired_ids_seq) == len(token_type_ids)

    fields['paired_sequence'] = BertIndexField(np.asarray(paired_ids_seq, dtype=np.int64))
    fields['paired_token_type_ids'] = BertIndexField(np.asarray(token_type_ids, dtype=np.int64))

    premise_span = (1, 1 + len(premise_tokens_list))  # End is exclusive (important for later use)
    hypothesis_span = (premise_span[1] + 1, premise_span[1] + 1 + len(hypothesis_tokens_list))
    assert len(paired_ids_seq) == 1 + (premise_span[1] - premise_span[0]) + 1 + \
                                  (hypothesis_span[1] - hypothesis_span[0]) + 1

    fields['bert_premise_span'] = MetadataField(premise_span)
    fields['bert_hypothesis_span'] = MetadataField(hypothesis_span)
    fields['premise_probs'] = MetadataField(premise_prob)
    fields['hypothesis_probs'] = MetadataField(hypothesis_prob)

    if label:
        fields['label'] = LabelField(label, label_namespace='labels')
    if pid:
        fields['pid'] = IdField(pid)

    return Instance(fields)
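# Sketch of the [CLS]/[SEP] pairing arithmetic above (illustrative; plain lists stand
# in for the BertServant wordpiece output).  Both spans are end-exclusive.
demo_premise = ["the", "cat", "sat"]
demo_hypothesis = ["a", "cat", "sat"]
demo_paired = ["[CLS]"] + demo_premise + ["[SEP]"] + demo_hypothesis + ["[SEP]"]
demo_type_ids = [0] * (2 + len(demo_premise)) + [1] * (1 + len(demo_hypothesis))
assert len(demo_paired) == len(demo_type_ids)
demo_premise_span = (1, 1 + len(demo_premise))
demo_hypothesis_span = (demo_premise_span[1] + 1,
                        demo_premise_span[1] + 1 + len(demo_hypothesis))
print(demo_paired[demo_premise_span[0]:demo_premise_span[1]])        # ['the', 'cat', 'sat']
print(demo_paired[demo_hypothesis_span[0]:demo_hypothesis_span[1]])  # ['a', 'cat', 'sat']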
def text_to_instance(self, source: Token, targets: List[Token] = None) -> Instance:
    # Use None instead of a mutable default argument, which would be shared across calls.
    targets = targets if targets is not None else []
    fields = {'source': TextField([source], self._syllable_indexers),
              'targets': TextField(targets, self._word_indexers)}
    return Instance(fields)
def text_to_instance(self, inputs):
    fields: Dict[str, Field] = {}
    tokens_list_field: List[TextField] = []
    sent_positions_list_field: List[TextField] = []
    position_list_field: List[TextField] = []
    participant_mask_list_field: List[SequenceLabelField] = []
    after_loc_start_list_field: List[IndexField] = []
    after_loc_end_list_field: List[IndexField] = []
    after_category_list_field: List[IndexField] = []
    after_category_mask_list_field: List[IndexField] = []

    # `category_list` and `category_mask_list` are assumed to be defined at module level.
    category_field = ListField([LabelField(str(l), "labels") for l in category_list])
    category_mask_field = ListField([LabelField(str(l), "labels") for l in category_mask_list])

    token_field_step0 = TextField(inputs[0][0], self._token_indexers)
    before_loc_start_field = IndexField(inputs[6][0], token_field_step0)
    before_loc_end_field = IndexField(inputs[7][0], token_field_step0)
    before_category_field = IndexField(inputs[4][0], category_field)
    before_category_mask_field = IndexField(inputs[5][0], category_mask_field)

    for i in range(len(inputs[0])):
        token_field = TextField(inputs[0][i], self._token_indexers)
        tokens_list_field.append(token_field)
        sent_positions_list_field.append(TextField(inputs[1][i], self._sent_position_indexers))
        position_list_field.append(TextField(inputs[2][i], self._token_position_indexers))
        participant_mask_list_field.append(SequenceLabelField(inputs[3][i], token_field, 'tags'))
        after_loc_start_list_field.append(IndexField(inputs[10][i], token_field))
        after_loc_end_list_field.append(IndexField(inputs[11][i], token_field))
        after_category_list_field.append(IndexField(inputs[8][i], category_field))
        after_category_mask_list_field.append(IndexField(inputs[9][i], category_mask_field))

    fields['tokens_list'] = ListField(tokens_list_field)
    fields['positions_list'] = ListField(position_list_field)
    fields['sent_positions_list'] = ListField(sent_positions_list_field)
    fields['before_loc_start'] = before_loc_start_field
    fields['before_loc_end'] = before_loc_end_field
    fields['after_loc_start_list'] = ListField(after_loc_start_list_field)
    fields['after_loc_end_list'] = ListField(after_loc_end_list_field)
    fields['before_category'] = before_category_field
    fields['after_category_list'] = ListField(after_category_list_field)
    fields['before_category_mask'] = before_category_mask_field
    fields['after_category_mask_list'] = ListField(after_category_mask_list_field)
    return Instance(fields)
def text_to_instance(
        self,  # type: ignore
        sentences: List[List[str]],
        gold_clusters: Optional[List[List[Tuple[int, int]]]] = None) -> Instance:
    # pylint: disable=arguments-differ
    """
    Parameters
    ----------
    sentences : ``List[List[str]]``, required.
        A list of lists representing the tokenised words and sentences in the document.
    gold_clusters : ``Optional[List[List[Tuple[int, int]]]]``, optional (default = None)
        A list of all clusters in the document, represented as word spans. Each cluster
        contains some number of spans, which can be nested and overlap, but will never
        exactly match between clusters.

    Returns
    -------
    An ``Instance`` containing the following ``Fields``:
        text : ``TextField``
            The text of the full document.
        spans : ``ListField[SpanField]``
            A ListField containing the spans represented as ``SpanFields``
            with respect to the document text.
        span_labels : ``SequenceLabelField``, optional
            The id of the cluster which each possible span belongs to, or -1 if it does
            not belong to a cluster. As these labels have variable length (it depends on
            how many spans we are considering), we represent this as a
            ``SequenceLabelField`` with respect to the ``spans`` ``ListField``.
    """
    flattened_sentences = [self._normalize_word(word)
                           for sentence in sentences
                           for word in sentence]
    # Align the clusters to the (normalized) tokens.
    gold_clusters = self.align_clusters_to_tokens(flattened_sentences, gold_clusters)

    def tokenizer(s: str):
        return self.token_indexer.wordpiece_tokenizer(s)

    # We need to try this with the other tokenizer as well.
    flattened_sentences = tokenizer(" ".join(flattened_sentences))

    metadata: Dict[str, Any] = {"original_text": flattened_sentences}
    if gold_clusters is not None:
        metadata["clusters"] = gold_clusters

    text_field = TextField([Token("[CLS]")]
                           + [Token(word) for word in flattened_sentences]
                           + [Token("[SEP]")],
                           self._token_indexers)

    cluster_dict = {}
    if gold_clusters is not None:
        for cluster_id, cluster in enumerate(gold_clusters):
            for mention in cluster:
                cluster_dict[tuple(mention)] = cluster_id

    spans: List[Field] = []
    span_labels: Optional[List[int]] = [] if gold_clusters is not None else None

    sentence_offset = 0
    normal = []
    for sentence in sentences:
        # Enumerate the spans.
        for start, end in enumerate_spans(sentence,
                                          offset=sentence_offset,
                                          max_span_width=self._max_span_width):
            if span_labels is not None:
                if (start, end) in cluster_dict:
                    span_labels.append(cluster_dict[(start, end)])
                else:
                    span_labels.append(-1)
            # Align the spans to the BERT tokenization; the span field needs to be over
            # the flattened sentence.
            normal.append((start, end))
            span_field = TextField([Token("[CLS]")]
                                   + [Token(word) for word in flattened_sentences]
                                   + [Token("[SEP]")],
                                   self._token_indexers)
            spans.append(SpanField(start, end, span_field))
        sentence_offset += len(sentence)

    span_field = ListField(spans)
    metadata_field = MetadataField(metadata)

    fields: Dict[str, Field] = {
        "text": text_field,
        "spans": span_field,
        "metadata": metadata_field,
    }
    if span_labels is not None:
        fields["span_labels"] = SequenceLabelField(span_labels, span_field)

    return Instance(fields)
def text_to_instance(self,  # type: ignore
                     tokens: List[str],
                     lemmas: List[str] = None,
                     pos_tags: List[str] = None,
                     arc_indices: List[Tuple[int, int]] = None,
                     arc_tags: List[str] = None,
                     gold_actions: List[str] = None,
                     arc_descendants: List[str] = None,
                     root_id: List[int] = None,
                     meta_info: List[str] = None,
                     tokens_range: List[Tuple[int, int]] = None,
                     gold_mrps: List[str] = None,
                     deprels: List[str] = None,
                     lex_infos: List[List[str]] = None) -> Instance:
    # pylint: disable=arguments-differ
    fields: Dict[str, Field] = {}
    token_field = TextField([Token(t) for t in tokens], self._token_indexers)
    fields["tokens"] = token_field
    meta_dict = {"tokens": tokens}

    if arc_indices is not None and arc_tags is not None:
        meta_dict["arc_indices"] = arc_indices
        meta_dict["arc_tags"] = arc_tags
        fields["arc_tags"] = TextField([Token(a) for a in arc_tags], self._arc_tag_indexers)

    if gold_actions is not None:
        meta_dict["gold_actions"] = gold_actions
        fields["gold_actions"] = TextField([Token(a) for a in gold_actions],
                                           self._action_indexers)

    if pos_tags is not None and self.pos_tags:
        fields["pos_tags"] = SequenceLabelField(pos_tags, token_field, label_namespace="pos")

    if arc_descendants is not None:
        meta_dict["arc_descendants"] = arc_descendants
    if root_id is not None:
        meta_dict["root_id"] = root_id[0]
    if meta_info is not None:
        meta_dict["meta_info"] = meta_info[0]
    if tokens_range is not None:
        meta_dict["tokens_range"] = tokens_range
    if gold_mrps is not None:
        meta_dict["gold_mrps"] = gold_mrps[0]

    if deprels is not None and self.deprels:
        fields["deprels"] = SequenceLabelField(deprels, token_field, label_namespace="deprels")

    if lex_infos is not None:
        bios, lexcat, ss, ss2 = zip(*tuple(lex_infos))
        if self.bios:
            fields["bios"] = SequenceLabelField(bios, token_field, label_namespace="bios")
        if self.lexcat:
            fields["lexcat"] = SequenceLabelField(lexcat, token_field, label_namespace="lexcat")
        if self.ss:
            fields["ss"] = SequenceLabelField(ss, token_field, label_namespace="ss")
        if self.ss2:
            fields["ss2"] = SequenceLabelField(ss2, token_field, label_namespace="ss2")

    fields["metadata"] = MetadataField(meta_dict)
    return Instance(fields)
def text_to_instance(self,  # type: ignore
                     utterances: List[str],
                     sql_query_labels: List[str] = None) -> Instance:
    # pylint: disable=arguments-differ
    """
    Parameters
    ----------
    utterances : ``List[str]``, required.
        List of utterances in the interaction; the last element is the current utterance.
    sql_query_labels : ``List[str]``, optional
        The SQL queries that are given as labels during training or validation.
    """
    utterance = utterances[-1]
    action_sequence: List[str] = []

    if not utterance:
        return None

    world = AtisWorld(utterances=utterances, database_file=self._database_file)

    if sql_query_labels:
        # If there are multiple SQL queries given as labels, we use the shortest
        # one for training.
        sql_query = min(sql_query_labels, key=len)
        try:
            action_sequence = world.get_action_sequence(sql_query)
        except ParseError:
            logger.debug('Parsing error')

    tokenized_utterance = self._tokenizer.tokenize(utterance.lower())
    utterance_field = TextField(tokenized_utterance, self._token_indexers)

    production_rule_fields: List[Field] = []
    for production_rule in world.all_possible_actions():
        nonterminal, _ = production_rule.split(' ->')
        # The whitespaces are not semantically meaningful, so we filter them out.
        production_rule = ' '.join([token for token in production_rule.split(' ')
                                    if token != 'ws'])
        field = ProductionRuleField(production_rule, self._is_global_rule(nonterminal))
        production_rule_fields.append(field)

    action_field = ListField(production_rule_fields)
    action_map = {action.rule: i  # type: ignore
                  for i, action in enumerate(action_field.field_list)}
    index_fields: List[Field] = []
    world_field = MetadataField(world)
    fields = {'utterance': utterance_field,
              'actions': action_field,
              'world': world_field,
              'linking_scores': ArrayField(world.linking_scores)}

    if sql_query_labels is not None:
        fields['sql_queries'] = MetadataField(sql_query_labels)
        if action_sequence:
            for production_rule in action_sequence:
                index_fields.append(IndexField(action_map[production_rule], action_field))
            action_sequence_field = ListField(index_fields)
            fields['target_action_sequence'] = action_sequence_field
        else:
            # If we are given a SQL query but are unable to parse it, we skip the instance.
            return None
    return Instance(fields)
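# Sketch of the label selection and whitespace filtering above (illustrative; the
# rule string is a hypothetical stand-in for `world.all_possible_actions()` output).
demo_sql_query_labels = ["SELECT city . city_code FROM city ;",
                         "SELECT DISTINCT city . city_code FROM city ;"]
print(min(demo_sql_query_labels, key=len))  # the shortest query is used for training

demo_rule = "select_clause -> [ SELECT ws col_refs ]"
print(' '.join(t for t in demo_rule.split(' ') if t != 'ws'))
# -> "select_clause -> [ SELECT col_refs ]"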
def text_to_instance(
        self,  # type: ignore
        sentence: str,
        structured_representations: List[List[List[JsonDict]]],
        labels: List[str] = None,
        target_sequences: List[List[str]] = None,
        identifier: str = None,
) -> Instance:
    """
    Parameters
    ----------
    sentence : ``str``
        The query sentence.
    structured_representations : ``List[List[List[JsonDict]]]``
        A list of Json representations of all the worlds. See expected format in this class'
        docstring.
    labels : ``List[str]`` (optional)
        List of string representations of the labels (true or false) corresponding to the
        ``structured_representations``. Not required while testing.
    target_sequences : ``List[List[str]]`` (optional)
        List of target action sequences for each element which lead to the correct denotation
        in worlds corresponding to the structured representations.
    identifier : ``str`` (optional)
        The identifier from the dataset if available.
    """
    worlds = []
    for structured_representation in structured_representations:
        boxes = {Box(object_list, box_id)
                 for box_id, object_list in enumerate(structured_representation)}
        worlds.append(NlvrLanguage(boxes))
    tokenized_sentence = self._tokenizer.tokenize(sentence)
    sentence_field = TextField(tokenized_sentence, self._sentence_token_indexers)
    production_rule_fields: List[Field] = []
    instance_action_ids: Dict[str, int] = {}
    # TODO(pradeep): Assuming that possible actions are the same in all worlds. This may
    # change later.
    for production_rule in worlds[0].all_possible_productions():
        instance_action_ids[production_rule] = len(instance_action_ids)
        field = ProductionRuleField(production_rule, is_global_rule=True)
        production_rule_fields.append(field)
    action_field = ListField(production_rule_fields)
    worlds_field = ListField([MetadataField(world) for world in worlds])
    metadata: Dict[str, Any] = {"sentence_tokens": [x.text for x in tokenized_sentence]}
    fields: Dict[str, Field] = {
        "sentence": sentence_field,
        "worlds": worlds_field,
        "actions": action_field,
        "metadata": MetadataField(metadata),
    }
    if identifier is not None:
        fields["identifier"] = MetadataField(identifier)
    # Depending on the type of supervision used for training the parser, we may want either
    # target action sequences or an agenda in our instance. We check if target sequences are
    # provided, and include them if they are. If not, we'll get an agenda for the sentence,
    # and include that in the instance.
    if target_sequences:
        action_sequence_fields: List[Field] = []
        for target_sequence in target_sequences:
            index_fields = ListField([
                IndexField(instance_action_ids[action], action_field)
                for action in target_sequence
            ])
            action_sequence_fields.append(index_fields)
        # TODO(pradeep): Define a max length for this field.
        fields["target_action_sequences"] = ListField(action_sequence_fields)
    elif self._output_agendas:
        # TODO(pradeep): Assuming every world gives the same agenda for a sentence. This is
        # true now, but may change later too.
        agenda = worlds[0].get_agenda_for_sentence(sentence)
        assert agenda, "No agenda found for sentence: %s" % sentence
        # agenda_field contains indices into actions.
        agenda_field = ListField([
            IndexField(instance_action_ids[action], action_field)
            for action in agenda
        ])
        fields["agenda"] = agenda_field
    if labels:
        labels_field = ListField([
            LabelField(label, label_namespace="denotations")
            for label in labels
        ])
        fields["labels"] = labels_field
    return Instance(fields)
def make_marginal_drop_instance(
        question_tokens: List[Token],
        passage_tokens: List[Token],
        number_tokens: List[Token],
        number_indices: List[int],
        token_indexers: Dict[str, TokenIndexer],
        passage_text: str,
        answer_info: Dict[str, Any] = None,
        additional_metadata: Dict[str, Any] = None,
) -> Instance:
    additional_metadata = additional_metadata or {}
    fields: Dict[str, Field] = {}
    passage_offsets = [(token.idx, token.idx + len(token.text)) for token in passage_tokens]
    question_offsets = [(token.idx, token.idx + len(token.text)) for token in question_tokens]

    # This is separate so we can reference it later with a known type.
    passage_field = TextField(passage_tokens, token_indexers)
    question_field = TextField(question_tokens, token_indexers)
    fields["passage"] = passage_field
    fields["question"] = question_field
    number_index_fields: List[Field] = [IndexField(index, passage_field)
                                        for index in number_indices]
    fields["number_indices"] = ListField(number_index_fields)
    # This field is actually not required in the model; it is only used to create the
    # `answer_as_plus_minus_combinations` field, which is a `SequenceLabelField`.  We cannot
    # use the `number_indices` field for creating that, because the `ListField` will not be
    # empty when we want to create a new empty field, which would lead to an error.
    numbers_in_passage_field = TextField(number_tokens, token_indexers)
    metadata = {
        "original_passage": passage_text,
        "passage_token_offsets": passage_offsets,
        "question_token_offsets": question_offsets,
        "question_tokens": [token.text for token in question_tokens],
        "passage_tokens": [token.text for token in passage_tokens],
        "number_tokens": [token.text for token in number_tokens],
        "number_indices": number_indices,
    }
    if answer_info:
        metadata["answer_texts"] = answer_info["answer_texts"]

        passage_span_fields: List[Field] = [SpanField(span[0], span[1], passage_field)
                                            for span in answer_info["answer_passage_spans"]]
        if not passage_span_fields:
            passage_span_fields.append(SpanField(-1, -1, passage_field))
        fields["answer_as_passage_spans"] = ListField(passage_span_fields)

        question_span_fields: List[Field] = [SpanField(span[0], span[1], question_field)
                                             for span in answer_info["answer_question_spans"]]
        if not question_span_fields:
            question_span_fields.append(SpanField(-1, -1, question_field))
        fields["answer_as_question_spans"] = ListField(question_span_fields)

        add_sub_signs_field: List[Field] = []
        for signs_for_one_add_sub_expression in answer_info["signs_for_add_sub_expressions"]:
            add_sub_signs_field.append(
                SequenceLabelField(signs_for_one_add_sub_expression, numbers_in_passage_field))
        if not add_sub_signs_field:
            add_sub_signs_field.append(
                SequenceLabelField([0] * len(number_tokens), numbers_in_passage_field))
        fields["answer_as_add_sub_expressions"] = ListField(add_sub_signs_field)

        count_fields: List[Field] = [LabelField(count_label, skip_indexing=True)
                                     for count_label in answer_info["counts"]]
        if not count_fields:
            count_fields.append(LabelField(-1, skip_indexing=True))
        fields["answer_as_counts"] = ListField(count_fields)

    metadata.update(additional_metadata)
    fields["metadata"] = MetadataField(metadata)
    return Instance(fields)
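# Sketch of the "dummy entry" convention above (illustrative).  When an answer type
# has no annotation, a -1 placeholder keeps the ListField non-empty so it can still
# be batched; the -1 values are masked out downstream.
from allennlp.data.fields import LabelField, ListField, SpanField, TextField
from allennlp.data.token_indexers import SingleIdTokenIndexer
from allennlp.data.tokenizers import Token

demo_passage_field = TextField([Token("7"), Token("cats")],
                               {"tokens": SingleIdTokenIndexer()})
demo_answer_as_passage_spans = ListField([SpanField(-1, -1, demo_passage_field)])
demo_answer_as_counts = ListField([LabelField(-1, skip_indexing=True)])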
def text_to_instance(self,  # pylint: disable=arguments-differ
                     premises: List[str],
                     hypotheses: List[str],
                     answer_index: int = None,
                     relevant_sentence_idxs: List[int] = None) -> Instance:
    fields = {}
    premises_tokens = [self._tokenizer.tokenize(premise)[-self._premise_max_tokens:]
                       for premise in premises]
    hypotheses_tokens = [self._tokenizer.tokenize(hypothesis)[-self._hypothesis_max_tokens:]
                         for hypothesis in hypotheses]
    if premises:
        premises_text_fields = [TextField(premise_tokens, self._token_indexers)
                                for premise_tokens in premises_tokens]
        premises_field = ListField(premises_text_fields)
    else:
        empty_stub = ListField([TextField([Token('dummy')], self._token_indexers)])
        premises_field = empty_stub.empty_field()
    fields['premises'] = premises_field

    hypotheses_text_fields = [TextField(hypothesis_tokens, self._token_indexers)
                              for hypothesis_tokens in hypotheses_tokens]
    hypotheses_field = ListField(hypotheses_text_fields)
    fields['hypotheses'] = hypotheses_field

    # If sentence relevance is available
    if relevant_sentence_idxs is not None:
        relevance_presence_mask = np.zeros(len(premises))
        for idx in relevant_sentence_idxs:
            relevance_presence_mask[idx] = 1
        fields['relevance_presence_mask'] = ArrayField(np.array(relevance_presence_mask))

    # If entailment labels are available
    if answer_index is not None:
        # if answer_index not in range(0, len(hypotheses)):
        #     raise ConfigurationError("Provided label must be in 0 to {}".format(len(hypotheses)))
        fields['answer_index'] = ArrayField(np.array(answer_index),
                                            padding_value=-1,
                                            dtype=np.int64)  # np.long is deprecated in recent NumPy

    paragraph_tokens = [token for premise_tokens in premises_tokens
                        for token in premise_tokens]
    if not paragraph_tokens:
        return None
    paragraph_text_field = TextField(paragraph_tokens, self._token_indexers)
    fields['paragraph'] = paragraph_text_field
    return Instance(fields)
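# Sketch of the relevance mask construction above (illustrative toy data): a binary
# vector over the premises with ones at the annotated relevant sentence indices.
import numpy as np

demo_premises = ["P0", "P1", "P2", "P3"]
demo_relevant_idxs = [1, 3]
demo_mask = np.zeros(len(demo_premises))
for idx in demo_relevant_idxs:
    demo_mask[idx] = 1
print(demo_mask)  # [0. 1. 0. 1.]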