def text_to_instance(
            self,  # type: ignore
            sentence: str,
            structured_representations: List[List[List[JsonDict]]],
            labels: List[str] = None,
            target_sequences: List[List[str]] = None,
            identifier: str = None) -> Instance:
        """
        Parameters
        ----------
        sentence : ``str``
            The query sentence.
        structured_representations : ``List[List[List[JsonDict]]]``
            A list of JSON representations of all the worlds. See the expected format in this class' docstring.
        labels : ``List[str]`` (optional)
            List of string representations of the labels (true or false) corresponding to the
            ``structured_representations``. Not required while testing.
        target_sequences : ``List[List[str]]`` (optional)
            List of target action sequences for each element which lead to the correct denotation in
            worlds corresponding to the structured representations.
        identifier : ``str`` (optional)
            The identifier from the dataset if available.
        """
        # pylint: disable=arguments-differ
        worlds = [NlvrWorld(data) for data in structured_representations]
        tokenized_sentence = self._tokenizer.tokenize(sentence)
        sentence_field = TextField(tokenized_sentence,
                                   self._sentence_token_indexers)
        production_rule_fields: List[Field] = []
        instance_action_ids: Dict[str, int] = {}
        # TODO(pradeep): Assuming that possible actions are the same in all worlds. This may change
        # later.
        for production_rule in worlds[0].all_possible_actions():
            instance_action_ids[production_rule] = len(instance_action_ids)
            field = ProductionRuleField(production_rule, is_global_rule=True)
            production_rule_fields.append(field)
        action_field = ListField(production_rule_fields)
        worlds_field = ListField([MetadataField(world) for world in worlds])
        fields: Dict[str, Field] = {
            "sentence": sentence_field,
            "worlds": worlds_field,
            "actions": action_field
        }
        if identifier is not None:
            fields["identifier"] = MetadataField(identifier)
        # Depending on the type of supervision used for training the parser, we may want either
        # target action sequences or an agenda in our instance. We check if target sequences are
        # provided, and include them if they are. If not, we'll get an agenda for the sentence, and
        # include that in the instance.
        if target_sequences:
            action_sequence_fields: List[Field] = []
            for target_sequence in target_sequences:
                index_fields = ListField([
                    IndexField(instance_action_ids[action], action_field)
                    for action in target_sequence
                ])
                action_sequence_fields.append(index_fields)
                # TODO(pradeep): Define a max length for this field.
            fields["target_action_sequences"] = ListField(
                action_sequence_fields)
        elif self._output_agendas:
            # TODO(pradeep): Assuming every world gives the same agenda for a sentence. This is true
            # now, but may change later too.
            agenda = worlds[0].get_agenda_for_sentence(
                sentence, add_paths_to_agenda=False)
            assert agenda, "No agenda found for sentence: %s" % sentence
            # agenda_field contains indices into actions.
            agenda_field = ListField([
                IndexField(instance_action_ids[action], action_field)
                for action in agenda
            ])
            fields["agenda"] = agenda_field
        if labels:
            labels_field = ListField([
                LabelField(label, label_namespace='denotations')
                for label in labels
            ])
            fields["labels"] = labels_field

        return Instance(fields)
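A minimal usage sketch for the reader above (the ``NlvrDatasetReader`` construction with default
arguments and the single-object world are assumptions made for illustration):

    # Hypothetical call; a real NLVR structured representation contains several boxes and objects.
    reader = NlvrDatasetReader()
    instance = reader.text_to_instance(
        sentence="There is exactly one black square.",
        structured_representations=[[[{"x_loc": 10, "y_loc": 10, "size": 20,
                                       "color": "Black", "type": "square"}]]],
        labels=["true"])
    # Depending on the supervision settings, the instance contains "sentence", "worlds", "actions",
    # plus "labels" and either "target_action_sequences" or an "agenda".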
Example #2
 def text_to_instance(self, tokens: List[Token]) -> Instance:  # type: ignore
     """
     We take `pre-tokenized` input here, because we don't have a tokenizer in this class.
     """
     # pylint: disable=arguments-differ
     return Instance({'tokens': TextField(tokens, token_indexers=self._token_indexers)})
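Because the input here is pre-tokenized, calling it is a one-liner (a sketch; the configured reader
instance and the ``Token`` class from the surrounding library are assumed):

    instance = reader.text_to_instance([Token("The"), Token("cat"), Token("sat")])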
Example #3
    def text_to_instance(
            self,  # type: ignore
            utterances,
            sql_query=None):
        # pylint: disable=arguments-differ
        u"""
        Parameters
        ----------
        utterances: ``List[str]``, required.
            List of utterances in the interaction, the last element is the current utterance.
        sql_query: ``str``, optional
            The SQL query, given as label during training or validation.
        """
        utterance = utterances[-1]
        action_sequence = []

        if not utterance:
            return None

        world = AtisWorld(utterances)

        if sql_query:
            try:
                action_sequence = world.get_action_sequence(sql_query)
            except ParseError:
                logger.debug('Parsing error')

        tokenized_utterance = self._tokenizer.tokenize(utterance.lower())
        utterance_field = TextField(tokenized_utterance, self._token_indexers)

        production_rule_fields = []

        for production_rule in world.all_possible_actions():
            lhs, _ = production_rule.split(u' ->')
            is_global_rule = lhs not in [u'number', u'string']
            # The whitespaces are not semantically meaningful, so we filter them out.
            production_rule = u' '.join([
                token for token in production_rule.split(u' ')
                if token != u'ws'
            ])
            field = ProductionRuleField(production_rule, is_global_rule)
            production_rule_fields.append(field)

        action_field = ListField(production_rule_fields)
        action_map = dict((action.rule, i)  # type: ignore
                          for i, action in enumerate(action_field.field_list))
        index_fields = []
        world_field = MetadataField(world)
        fields = {
            u'utterance': utterance_field,
            u'actions': action_field,
            u'world': world_field,
            u'linking_scores': ArrayField(world.linking_scores)
        }

        if sql_query:
            if action_sequence:
                for production_rule in action_sequence:
                    index_fields.append(
                        IndexField(action_map[production_rule], action_field))

                action_sequence_field = []
                action_sequence_field.append(ListField(index_fields))
                fields[u'target_action_sequence'] = ListField(
                    action_sequence_field)
            else:
                # If we are given a SQL query, but we are unable to parse it, then we will skip it.
                return None

        return Instance(fields)
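A hedged usage sketch of a test-time call (the configured reader instance is assumed; with no gold
SQL the target action sequence field is simply omitted):

    instance = reader.text_to_instance(
        utterances=["show me flights from denver to boston"])
    # With a gold SQL query that fails to parse, the method returns None instead of an Instance.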
Example #4
    def __getitem_detector__(self, index):
        item = self.items[index]
        sample = {}
        if self.expanded and index >= self.train_size:
            image_file_name = "COCO_val2014_{:0>12d}.jpg".format(item['image_id'])
        else:
            image_file_name = "COCO_{}2014_{:0>12d}.jpg".format(self.split_name, item['image_id'])

        image_info = self.masks[image_file_name]
        if "train" in image_file_name:
            image_file_path = os.path.join(self.data_root, "train2014", image_file_name)
        elif "val" in image_file_name:
            image_file_path = os.path.join(self.data_root, "val2014", image_file_name)

        ###################################################################
        # Most of this is adapted from VCR.
        # Load image now and rescale it. Might have to subtract the mean and whatnot here too.
        image = load_image(image_file_path)
        image, window, img_scale, padding = resize_image(image, random_pad=self.is_train)
        image = to_tensor_and_normalize(image)
        c, h, w = image.shape
        ###################################################################
        metadata = self.masks[image_file_name] # Get the metadata
        # Load boxes.
        # We will use all detections
        dets2use = np.arange(len(metadata['boxes']))
        # [nobj, 14, 14]
        segms = np.stack([make_mask(mask_size=14, box=metadata['boxes'][i], polygons_list=metadata['segms'][i]) for i in dets2use])

        # Chop off the final dimension, that's the confidence
        boxes = np.array(metadata['boxes'])[dets2use, :-1]
        # Possibly rescale them if necessary
        boxes *= img_scale
        boxes[:, :2] += np.array(padding[:2])[None]
        boxes[:, 2:] += np.array(padding[:2])[None]
        
        try:
            # Best-effort normalization of the detector class names; annotation formats vary,
            # so any failure here is ignored and the raw names are kept.
            metadata['names'] = [i.split(" ")[1][1:-1] for i in metadata["names"]]
        except:
            pass
        obj_labels = [self.coco_obj_to_ind[metadata['names'][i]] for i in dets2use.tolist()]
        boxes = np.row_stack((window, boxes))
        segms = np.concatenate((np.ones((1, 14, 14), dtype=np.float32), segms), 0)
        obj_labels = [self.coco_obj_to_ind['__background__']] + obj_labels

        sample['segms'] = ArrayField(segms, padding_value=0)
        sample['objects'] = ListField([LabelField(x, skip_indexing=True) for x in obj_labels])

        if not np.all((boxes[:, 0] >= 0.) & (boxes[:, 0] < boxes[:, 2])):
            import ipdb
            ipdb.set_trace()
        assert np.all((boxes[:, 1] >= 0.) & (boxes[:, 1] < boxes[:, 3]))
        assert np.all((boxes[:, 2] <= w))
        assert np.all((boxes[:, 3] <= h))
        sample['boxes'] = ArrayField(boxes, padding_value=-1)

        caption_a = item["caption"]
        imageID = item["image_id"]
        
        sample["label"] = sample['objects']  # This field is unused; it only lets downstream code infer the batch size.

        if self.expanded and index >= self.train_size:
            coco = self.coco_val
        else:
            coco = self.coco

        rest_anns = coco.loadAnns([i for i in coco.getAnnIds(imgIds=imageID) if i != item['id']])

        if self.args.get("two_sentence", True):
            if random.random() > 0.5:
                item_b = self.items[random.randint(0, len(self.items) - 1)]
                while item_b["image_id"] == imageID:
                    item_b = self.items[random.randint(0, len(self.items) - 1)]
                flag = False
            else:
                item_b = rest_anns[random.randint(0, len(rest_anns) - 1)]
                flag = True # is next sentence

            caption_b = item_b["caption"]
            subword_tokens_a = self.tokenizer.tokenize(caption_a)
            subword_tokens_b = self.tokenizer.tokenize(caption_b)
            bert_example = InputExample(unique_id = index, text_a = subword_tokens_a, text_b = subword_tokens_b, is_correct=flag, max_seq_length = self.max_seq_length)
        elif not self.args.get("no_next_sentence", False):
            if random.random() < self.args.false_caption_ratio:
                item_b = self.items[random.randint(0, len(self.items) - 1)]
                while item_b["image_id"] == imageID:
                    item_b = self.items[random.randint(0, len(self.items) - 1)]
                flag = False
            else:
                item_b = item
                flag = True # is next sentence

            caption_b = item_b["caption"]
            subword_tokens_b = self.tokenizer.tokenize(caption_b)
            bert_example = InputExample(unique_id = index, text_a = subword_tokens_b, text_b = None, is_correct=flag, max_seq_length = self.max_seq_length)
        else:
            subword_tokens_a = self.tokenizer.tokenize(caption_a)
            bert_example = InputExample(unique_id = index, text_a = subword_tokens_a, text_b = None, is_correct=None, max_seq_length = self.max_seq_length)

        bert_feature = InputFeatures.convert_one_example_to_features_pretraining(
                    example = bert_example,
                    tokenizer=self.tokenizer,
                    probability = self.masked_lm_prob)
        bert_feature.insert_field_into_dict(sample)

        return image, Instance(sample)
Example #5
 def text_to_instance(
         self,
         task_name: str,
         domain_name: str,
         source_string: str,
         target_string: str = None) -> Instance:  # type: ignore
     task_field = LabelField(task_name, label_namespace="task_labels")
     domain_field = LabelField(domain_name, label_namespace="domain_labels")
     # pylint: disable=arguments-differ
     tokenized_source = self._source_tokenizer.tokenize(source_string)
     if self._source_add_start_token:
         tokenized_source.insert(0, Token(START_SYMBOL))
     tokenized_source.append(Token(END_SYMBOL))
     source_field = TextField(tokenized_source, self._source_token_indexers)
     inst = Instance({
         'source_tokens':
         source_field,
         "task_token":
         task_field,
         "domain_token":
         domain_field,
         'upos_tokens':
         TextField(
             [Token(START_SYMBOL), Token(END_SYMBOL)],
             self._upos_token_indexers),
         'ner_tokens':
         TextField(
             [Token(START_SYMBOL), Token(END_SYMBOL)],
             self._ner_token_indexers),
         'chunk_tokens':
         TextField(
             [Token(START_SYMBOL), Token(END_SYMBOL)],
             self._chunk_token_indexers)
     })
     if target_string is not None:
         tokenized_target = self._target_tokenizer.tokenize(target_string)
         tokenized_target.insert(0, Token(START_SYMBOL))
         tokenized_target.append(Token(END_SYMBOL))
         target_field = TextField(tokenized_target,
                                  self._task_to_indexers[task_name])
         if task_name == 'upos':
             inst = Instance({
                 'source_tokens':
                 source_field,
                 "task_token":
                 task_field,
                 "domain_token":
                 domain_field,
                 'upos_tokens':
                 target_field,
                 'ner_tokens':
                 TextField([Token(START_SYMBOL),
                            Token(END_SYMBOL)], self._ner_token_indexers),
                 'chunk_tokens':
                 TextField([Token(START_SYMBOL),
                            Token(END_SYMBOL)], self._chunk_token_indexers)
             })
         if task_name == 'ner':
             inst = Instance({
                 'source_tokens':
                 source_field,
                 "task_token":
                 task_field,
                 "domain_token":
                 domain_field,
                 'upos_tokens':
                 TextField([Token(START_SYMBOL),
                            Token(END_SYMBOL)], self._upos_token_indexers),
                 'ner_tokens':
                 target_field,
                 'chunk_tokens':
                 TextField([Token(START_SYMBOL),
                            Token(END_SYMBOL)], self._chunk_token_indexers)
             })
         if task_name == 'chunk':
             inst = Instance({
                 'source_tokens':
                 source_field,
                 "task_token":
                 task_field,
                 "domain_token":
                 domain_field,
                 'upos_tokens':
                 TextField([Token(START_SYMBOL),
                            Token(END_SYMBOL)], self._upos_token_indexers),
                 'ner_tokens':
                 TextField([Token(START_SYMBOL),
                            Token(END_SYMBOL)], self._ner_token_indexers),
                 'chunk_tokens':
                 target_field
             })
     return inst
Example #6
    def formatted_text_to_instance(self,  # type: ignore
                                   item_id: Any,
                                   question_text: str,
                                   documents_text_list: List[str],
                                   flattened_p1_list: List[int],
                                   flattened_p1_list_e1wh: List[int],
                                   flattened_p2_list_e1: List[int],
                                   flattened_p2_list: List[int],
                                   flattened_he_locs_list: List[Tuple[int, int]],
                                   flattened_e1wh_locs_list: List[Tuple[int, int]],
                                   flattened_e1_locs_list: List[Tuple[int, int]],
                                   flattened_ca_locs_list: List[Tuple[int, int]],
                                   he_tracks: List[List[int]],
                                   e1wh_tracks: List[List[int]],
                                   e1_tracks: List[List[int]],
                                   ca_tracks: List[List[int]],
                                   max_paths: int,
                                   max_he_locs: int, max_e1wh_locs: int,
                                   max_e1_locs: int, max_ca_locs: int,
                                   choice_text_list: List[str],
                                   all_choice_locs: List[List[Tuple[int, int]]],
                                   all_choice_docidxs: List[List[int]],
                                   answer_id: int) -> Instance:
        # pylint: disable=arguments-differ
        fields: Dict[str, Field] = {}
        question_tokens = self._tokenizer.tokenize(question_text)
        documents_list_tokens = [self._tokenizer.tokenize(dt) for dt in documents_text_list]
        if len(sum(documents_list_tokens, [])) == 0:
            documents_list_tokens = [question_tokens]

        choices_list_tokens = [self._tokenizer.tokenize(x) for x in choice_text_list]

        fields['question'] = TextField(question_tokens, self._token_indexers)
        document_text_fields = [TextField(x, self._token_indexers) for x in documents_list_tokens]
        document_field = ListField(document_text_fields)
        fields['documents'] = document_field
        fields['candidates'] = ListField([TextField(x, self._token_indexers) for x in choices_list_tokens])

        fields['flattened_p1list'] = ListField([IndexField(x, document_field)
                                                for x in flattened_p1_list])
        fields['flattened_p1list_e1wh'] = ListField([IndexField(x, document_field)
                                                     for x in flattened_p1_list_e1wh])
        fields['flattened_p2list_e1'] = ListField([IndexField(x, document_field)
                                                   for x in flattened_p2_list_e1])
        fields['flattened_p2list'] = ListField([IndexField(x, document_field)
                                                for x in flattened_p2_list])

        fields['flat_he_spans'] = ListField([SpanField(x[0], x[1], document_text_fields[flattened_p1_list[xidx]])
                                             for xidx, x in enumerate(flattened_he_locs_list)])
        fields['flat_e1wh_spans'] = ListField([SpanField(x[0], x[1], document_text_fields[flattened_p1_list_e1wh[xidx]])
                                               for xidx, x in enumerate(flattened_e1wh_locs_list)])
        fields['flat_e1_spans'] = ListField([SpanField(x[0], x[1], document_text_fields[flattened_p2_list_e1[xidx]])
                                             for xidx, x in enumerate(flattened_e1_locs_list)])
        fields['flat_choice_spans'] = ListField([SpanField(x[0], x[1], document_text_fields[flattened_p2_list[xidx]])
                                                 for xidx, x in enumerate(flattened_ca_locs_list)])

        # all choice fields
        all_choice_docidx_field = []
        all_choice_span_fields = []
        for choice_docidxs, choice_spans in zip(all_choice_docidxs, all_choice_locs):
            all_choice_docidx_field.append(ListField([IndexField(x, document_field)
                                                      for x in choice_docidxs]))
            all_choice_span_fields.append(ListField([SpanField(x[0], x[1],
                                                               document_text_fields[choice_docidxs[xidx]])
                                                     for xidx, x in enumerate(choice_spans)]))
        fields['all_choice_docidxs'] = ListField(all_choice_docidx_field)
        fields['all_choice_locs'] = ListField(all_choice_span_fields)

        if answer_id is not None:
            fields['label'] = LabelField(answer_id, skip_indexing=True)

        metadata = {
            "id": item_id,
            "question_text": question_text,
            "documents_text": documents_text_list,
            "choice_text_list": choice_text_list,
            "he_tracks": he_tracks,
            "e1wh_tracks": e1wh_tracks,
            "e1_tracks": e1_tracks,
            "choice_tracks": ca_tracks,
            "max_num_paths": max_paths,
            "max_num_he_locs": max_he_locs,
            "max_num_e1wh_locs": max_e1wh_locs,
            "max_num_e1_locs": max_e1_locs,
            "max_num_ca_locs": max_ca_locs,
        }

        fields["metadata"] = MetadataField(metadata)

        return Instance(fields)
Example #7
    def text_to_instance(
            self,  # type: ignore
            tokens: List[str],
            pos_tags: List[str] = None,
            gold_tree: Tree = None) -> Instance:
        """
        We take `pre-tokenized` input here, because we don't have a tokenizer in this class.

        Parameters
        ----------
        tokens : ``List[str]``, required.
            The tokens in a given sentence.
        pos_tags : ``List[str]``, optional (default = None).
            The POS tags for the words in the sentence.
        gold_tree : ``Tree``, optional (default = None).
            The gold parse tree to create span labels from.

        Returns
        -------
        An ``Instance`` containing the following fields:
            tokens : ``TextField``
                The tokens in the sentence.
            pos_tags : ``SequenceLabelField``
                The POS tags of the words in the sentence.
                Only returned if ``use_pos_tags`` is ``True``
            spans : ``ListField[SpanField]``
                A ListField containing all possible subspans of the
                sentence.
            span_labels : ``SequenceLabelField``, optional.
                The constituency tags for each of the possible spans, with
                respect to a gold parse tree. If a span is not contained
                within the tree, a span will have a ``NO-LABEL`` label.
            gold_tree : ``MetadataField(Tree)``
                The gold NLTK parse tree for use in evaluation.
        """
        # pylint: disable=arguments-differ
        text_field = TextField([Token(x) for x in tokens],
                               token_indexers=self._token_indexers)
        fields: Dict[str, Field] = {"tokens": text_field}

        if self._use_pos_tags and pos_tags is not None:
            pos_tag_field = SequenceLabelField(pos_tags, text_field,
                                               "pos_tags")
            fields["pos_tags"] = pos_tag_field
        elif self._use_pos_tags:
            raise ConfigurationError(
                "use_pos_tags was set to True but no gold pos"
                " tags were passed to the dataset reader.")
        spans: List[Field] = []
        gold_labels = []

        if gold_tree is not None:
            gold_spans_with_pos_tags: Dict[Tuple[int, int], str] = {}
            self._get_gold_spans(gold_tree, 0, gold_spans_with_pos_tags)
            gold_spans = {
                span: label
                for (span, label) in gold_spans_with_pos_tags.items()
                if "-POS" not in label
            }
        else:
            gold_spans = None
        for start, end in enumerate_spans(tokens):
            spans.append(SpanField(start, end, text_field))

            if gold_spans is not None:
                if (start, end) in gold_spans.keys():
                    gold_labels.append(gold_spans[(start, end)])
                else:
                    gold_labels.append("NO-LABEL")

        metadata = {"tokens": tokens}
        if gold_tree:
            metadata["gold_tree"] = gold_tree

        fields["metadata"] = MetadataField(metadata)

        span_list_field: ListField = ListField(spans)
        fields["spans"] = span_list_field
        if gold_tree is not None:
            fields["span_labels"] = SequenceLabelField(gold_labels,
                                                       span_list_field)
        return Instance(fields)
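A minimal sketch of a call to this method (the reader construction with ``use_pos_tags=True`` is
an assumption):

    reader = PennTreeBankConstituencySpanDatasetReader()
    instance = reader.text_to_instance(
        tokens=["The", "dog", "barks", "."],
        pos_tags=["DT", "NN", "VBZ", "."])
    # Without a gold_tree, the instance has "tokens", "pos_tags", "spans" and "metadata",
    # but no "span_labels".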
Example #8
    def text_to_instance(self, rule_text, question, scenario, history, answer=None, evidence=None) -> Instance:  # type: ignore
        """
        Turn the rule text, question, scenario and dialogue history into an ``Instance``.

        Parameters
        ----------
        rule_text : ``str``, required
            The rule text to extract the predicted span from (or to embed it in).
        question : ``str``, required
        scenario : ``str``, required
        history : ``List[Dict[str, str]]``, required
            Previous follow-up question/answer pairs from the dialogue.
        answer : ``str``, optional (default = None)
            The gold answer, used as the target string during training.
        evidence : optional (default = None)
            Not used by this method.

        Returns
        -------
        Instance
            An ``Instance`` with source (and, during training, target) token fields, the
            copy-mechanism id fields, and a ``metadata`` field.
        """
        # pylint: disable=arguments-differ

        if answer and answer in ['Yes', 'No', 'Irrelevant']:
            return None
        target_string = answer

        if self.train_using_gold and answer is not None: # i.e. during training and validation
            predicted_label = answer if answer in ['Yes', 'No', 'Irrelevant'] else 'More'
            predicted_span_ixs = self.dataset_reader.find_lcs(rule_text, answer, self._source_tokenizer.tokenize)
            if predicted_span_ixs is None:
                return None
            else:
                rule_offsets = [(token.idx, token.idx + len(token.text)) for token in self._source_tokenizer.tokenize(rule_text)]
                predicted_span = rule_text[rule_offsets[predicted_span_ixs[0]][0]: rule_offsets[predicted_span_ixs[1]][1]]
        else:
            predicted_span, predicted_label = self.get_prediction(rule_text, question, scenario, history)

        if self.add_rule:
            if self.embed_span:
                source_string = self.get_embedded_span(rule_text, predicted_span)
            else:
                source_string = rule_text + ' @pss@ ' + predicted_span + ' @pse@'
        else:
            source_string = predicted_span
        if self.add_question:
            source_string += ' @qs@ ' + question + ' @qe'
        if self.add_followup_ques:
            for follow_up_qna in history:
                source_string += ' @fs@ ' + follow_up_qna['follow_up_question'] + ' @fe'

        tokenized_source = self._source_tokenizer.tokenize(source_string)
        tokenized_source.insert(0, Token(START_SYMBOL))
        tokenized_source.append(Token(END_SYMBOL))
        source_field = TextField(tokenized_source, self._source_token_indexers)

        # For each token in the source sentence, we keep track of the matching token
        # in the target sentence (which will be the OOV symbol if there is no match).
        source_to_target_field = NamespaceSwappingField(tokenized_source[1:-1], self._target_namespace)

        meta_fields = {"source_tokens": [x.text for x in tokenized_source[1:-1]]}
        fields_dict = {
                "source_tokens": source_field,
                "source_to_target": source_to_target_field,
        }

        if target_string is not None:
            tokenized_target = self._target_tokenizer.tokenize(target_string)
            tokenized_target.insert(0, Token(START_SYMBOL))
            tokenized_target.append(Token(END_SYMBOL))
            target_field = TextField(tokenized_target, self._target_token_indexers)

            fields_dict["target_tokens"] = target_field
            meta_fields["target_tokens"] = [y.text for y in tokenized_target[1:-1]]
            source_and_target_token_ids = self._tokens_to_ids(tokenized_source[1:-1] +
                                                              tokenized_target)
            source_token_ids = source_and_target_token_ids[:len(tokenized_source)-2]
            fields_dict["source_token_ids"] = ArrayField(np.array(source_token_ids))
            target_token_ids = source_and_target_token_ids[len(tokenized_source)-2:]
            fields_dict["target_token_ids"] = ArrayField(np.array(target_token_ids))
        else:
            source_token_ids = self._tokens_to_ids(tokenized_source[1:-1])
            fields_dict["source_token_ids"] = ArrayField(np.array(source_token_ids))

        meta_fields['label'] = predicted_label
        fields_dict["metadata"] = MetadataField(meta_fields)

        return Instance(fields_dict)
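Example #9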
    def text_to_instance(
            self,  # type: ignore
            question: str,
            table_lines: List[str],
            example_lisp_string: str = None,
            dpd_output: List[str] = None,
            tokenized_question: List[Token] = None) -> Instance:
        """
        Reads text inputs and makes an instance. The WikiTableQuestions dataset provides tables as
        TSV files, which we use for training.

        Parameters
        ----------
        question : ``str``
            Input question
        table_lines : ``List[str]``
            The table content itself, as a list of rows. See
            ``TableQuestionKnowledgeGraph.read_from_lines`` for the expected format.
        example_lisp_string : ``str``, optional
            The original (lisp-formatted) example string in the WikiTableQuestions dataset.  This
            comes directly from the ``.examples`` file provided with the dataset.  We pass this to
            SEMPRE for evaluating logical forms during training.  It isn't otherwise used for
            anything.
        dpd_output : ``List[str]``, optional
            List of logical forms, produced by dynamic programming on denotations. Not required
            during test.
        tokenized_question : ``List[Token]``, optional
            If you have already tokenized the question, you can pass that in here, so we don't
            duplicate that work.  You might, for example, do batch processing on the questions in
            the whole dataset, then pass the result in here.
        """
        # pylint: disable=arguments-differ
        tokenized_question = tokenized_question or self._tokenizer.tokenize(
            question.lower())
        question_field = TextField(tokenized_question,
                                   self._question_token_indexers)
        metadata: Dict[str, Any] = {
            "question_tokens": [x.text for x in tokenized_question]
        }
        metadata["original_table"] = "\n".join(table_lines)
        table_knowledge_graph = TableQuestionKnowledgeGraph.read_from_lines(
            table_lines, tokenized_question)
        table_metadata = MetadataField(table_lines)
        table_field = KnowledgeGraphField(
            table_knowledge_graph,
            tokenized_question,
            self._table_token_indexers,
            tokenizer=self._tokenizer,
            feature_extractors=self._linking_feature_extractors,
            include_in_vocab=self._use_table_for_vocab,
            max_table_tokens=self._max_table_tokens)
        world = WikiTablesWorld(table_knowledge_graph)
        world_field = MetadataField(world)

        production_rule_fields: List[Field] = []
        for production_rule in world.all_possible_actions():
            _, rule_right_side = production_rule.split(' -> ')
            is_global_rule = not world.is_table_entity(rule_right_side)
            field = ProductionRuleField(production_rule, is_global_rule)
            production_rule_fields.append(field)
        action_field = ListField(production_rule_fields)

        fields = {
            'question': question_field,
            'metadata': MetadataField(metadata),
            'table': table_field,
            'world': world_field,
            'actions': action_field
        }
        if self._include_table_metadata:
            fields['table_metadata'] = table_metadata
        if example_lisp_string:
            fields['example_lisp_string'] = MetadataField(example_lisp_string)

        # We'll make each target action sequence a List[IndexField], where the index is into
        # the action list we made above.  We need to ignore the type here because mypy doesn't
        # like `action.rule` - it's hard to tell mypy that the ListField is made up of
        # ProductionRuleFields.
        action_map = {
            action.rule: i
            for i, action in enumerate(action_field.field_list)
        }  # type: ignore
        if dpd_output:
            action_sequence_fields: List[Field] = []
            for logical_form in dpd_output:
                if not self._should_keep_logical_form(logical_form):
                    logger.debug(f'Question was: {question}')
                    logger.debug(f'Table info was: {table_lines}')
                    continue
                try:
                    expression = world.parse_logical_form(logical_form)
                except ParsingError as error:
                    logger.debug(
                        f'Parsing error: {error.message}, skipping logical form'
                    )
                    logger.debug(f'Question was: {question}')
                    logger.debug(f'Logical form was: {logical_form}')
                    logger.debug(f'Table info was: {table_lines}')
                    continue
                except:
                    logger.error(logical_form)
                    raise
                action_sequence = world.get_action_sequence(expression)
                try:
                    index_fields: List[Field] = []
                    for production_rule in action_sequence:
                        index_fields.append(
                            IndexField(action_map[production_rule],
                                       action_field))
                    action_sequence_fields.append(ListField(index_fields))
                except KeyError as error:
                    logger.debug(
                        f'Missing production rule: {error.args}, skipping logical form'
                    )
                    logger.debug(f'Question was: {question}')
                    logger.debug(f'Table info was: {table_lines}')
                    logger.debug(f'Logical form was: {logical_form}')
                    continue
                if len(action_sequence_fields) >= self._max_dpd_logical_forms:
                    break

            if not action_sequence_fields:
                # This is not great, but we're only doing it when we're passed logical form
                # supervision, so we're expecting labeled logical forms, but we can't actually
                # produce the logical forms.  We should skip this instance.  Note that this affects
                # _dev_ and _test_ instances, too, so your metrics could be over-estimates on the
                # full test data.
                return None
            fields['target_action_sequences'] = ListField(
                action_sequence_fields)
        if self._output_agendas:
            agenda_index_fields: List[Field] = []
            for agenda_string in world.get_agenda():
                agenda_index_fields.append(
                    IndexField(action_map[agenda_string], action_field))
            if not agenda_index_fields:
                agenda_index_fields = [IndexField(-1, action_field)]
            fields['agenda'] = ListField(agenda_index_fields)
        return Instance(fields)
Example #10
    def text_to_instance(
            self,  # type: ignore
            utterances: List[str],
            sql_query_labels: List[str] = None) -> Instance:
        # pylint: disable=arguments-differ
        """
        Parameters
        ----------
        utterances: ``List[str]``, required.
            List of utterances in the interaction, the last element is the current utterance.
        sql_query_labels: ``List[str]``, optional
            The SQL queries that are given as labels during training or validation.
        """
        if self._num_turns_to_concatenate:
            utterances[-1] = f' {END_OF_UTTERANCE_TOKEN} '.join(
                utterances[-self._num_turns_to_concatenate:])

        utterance = utterances[-1]
        action_sequence: List[str] = []

        if not utterance:
            return None

        world = AtisWorld(utterances=utterances)

        if sql_query_labels:
            # If there are multiple sql queries given as labels, we use the shortest
            # one for training.
            sql_query = min(sql_query_labels, key=len)
            try:
                action_sequence = world.get_action_sequence(sql_query)
            except ParseError:
                action_sequence = []
                logger.debug('Parsing error')

        tokenized_utterance = self._tokenizer.tokenize(utterance.lower())
        utterance_field = TextField(tokenized_utterance, self._token_indexers)

        production_rule_fields: List[Field] = []

        for production_rule in world.all_possible_actions():
            nonterminal, _ = production_rule.split(' ->')
            # The whitespaces are not semantically meaningful, so we filter them out.
            production_rule = ' '.join([
                token for token in production_rule.split(' ') if token != 'ws'
            ])
            field = ProductionRuleField(production_rule,
                                        self._is_global_rule(nonterminal))
            production_rule_fields.append(field)

        action_field = ListField(production_rule_fields)
        action_map = {
            action.rule: i  # type: ignore
            for i, action in enumerate(action_field.field_list)
        }
        index_fields: List[Field] = []
        world_field = MetadataField(world)
        fields = {
            'utterance': utterance_field,
            'actions': action_field,
            'world': world_field,
            'linking_scores': ArrayField(world.linking_scores)
        }

        if sql_query_labels is not None:
            fields['sql_queries'] = MetadataField(sql_query_labels)
            if self._keep_if_unparseable or action_sequence:
                for production_rule in action_sequence:
                    index_fields.append(
                        IndexField(action_map[production_rule], action_field))
                if not action_sequence:
                    index_fields = [IndexField(-1, action_field)]
                action_sequence_field = ListField(index_fields)
                fields['target_action_sequence'] = action_sequence_field
            else:
                # If we are given a SQL query but we are unable to parse it, and we have not
                # explicitly asked to keep unparseable queries, then we skip this instance.
                return None

        return Instance(fields)
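A hedged sketch of a training-time call (the reader instance and the gold SQL strings are
assumptions; when several gold queries are given, the shortest is the one parsed):

    instance = reader.text_to_instance(
        utterances=["show me flights from denver to boston"],
        sql_query_labels=["SELECT flight_id FROM flight WHERE from_airport = 'DENVER'",
                          "SELECT flight_id FROM flight"])
    # If the chosen query cannot be parsed and _keep_if_unparseable is False, this returns None;
    # otherwise the unparseable case is marked with a single IndexField(-1) target.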
Example #11
    def text_to_instance(
            self,  # type: ignore
            query: List[str],
            derived_cols: List[Tuple[str, str]],
            derived_tables: List[str],
            prelinked_entities: Dict[str, Dict[str, str]] = None,
            sql: List[str] = None,
            alignment: List[str] = None) -> Instance:
        # pylint: disable=arguments-differ
        fields: Dict[str, Field] = {}
        tokens = TextField([Token(t) for t in query], self._token_indexers)
        fields["tokens"] = tokens

        # Note that ``action_sequence``, ``all_actions`` and ``alignment`` are only defined in this
        # branch, so the rest of this method effectively assumes gold SQL is provided.
        if sql is not None:
            action_sequence, all_actions = self._world.get_action_sequence_and_all_actions(
                query=sql,
                derived_cols=derived_cols,
                derived_tables=derived_tables,
                prelinked_entities=prelinked_entities)
            if action_sequence is None:
                return None

            if alignment is not None:
                # Modify the alignment according to the action sequence
                alignment = AttnSupGrammarBasedWorld.modify_alignment(
                    action_sequence=action_sequence, alignment=alignment)
            else:
                # having a list of NO_ALIGN is basically equivalent to mask all the alignment
                alignment = ['NO_ALIGN'] * len(action_sequence)

        index_fields: List[Field] = []
        production_rule_fields: List[Field] = []

        for production_rule in all_actions:
            nonterminal, _ = production_rule.split(' ->')
            production_rule = ' '.join(production_rule.split(' '))
            field = ProductionRuleField(
                production_rule,
                self._world.is_global_rule(nonterminal),
                nonterminal=nonterminal)
            production_rule_fields.append(field)

        valid_actions_field = ListField(production_rule_fields)
        fields["valid_actions"] = valid_actions_field

        action_map = {
            action.rule: i  # type: ignore
            for i, action in enumerate(valid_actions_field.field_list)
        }

        for production_rule in action_sequence:
            index_fields.append(
                IndexField(action_map[production_rule], valid_actions_field))
        if not action_sequence:
            index_fields = [IndexField(-1, valid_actions_field)]
        # if not action_sequence and re.findall(r"COUNT \( \* \) (?:<|>|<>|=) 0", " ".join(sql)):
        #     index_fields = [IndexField(-2, valid_actions_field)]

        action_sequence_field = ListField(index_fields)
        fields["action_sequence"] = action_sequence_field

        alignment_index_fields: List[IndexField] = []
        tmp_tokens_as_strings = [t.text for t in tokens]
        for aligned_token in alignment:
            try:
                aligned_token_index = int(
                    tmp_tokens_as_strings.index(aligned_token))
                alignment_index_fields.append(
                    IndexField(aligned_token_index, tokens))
            except ValueError:
                # a special "no alignment" index
                alignment_index_fields.append(
                    IndexField(-1, tokens.empty_field()))
        fields["alignment_sequence"] = ListField(alignment_index_fields)

        return Instance(fields)
Example #12
    def text_to_instance(
            self,  # type: ignore
            tokens: List[Token],
            pos_tags: List[str] = None,
            chunk_tags: List[str] = None,
            ner_tags: List[str] = None) -> Instance:
        """
        We take `pre-tokenized` input here, because we don't have a tokenizer in this class.
        """
        # pylint: disable=arguments-differ
        sequence = TextField(tokens, self._token_indexers)
        instance_fields: Dict[str, Field] = {'tokens': sequence}
        instance_fields["metadata"] = MetadataField(
            {"words": [x.text for x in tokens]})

        # Recode the labels if necessary.
        if self.coding_scheme == "BIOUL":
            coded_chunks = to_bioul(chunk_tags,
                                    encoding=self._original_coding_scheme
                                    ) if chunk_tags is not None else None
            coded_ner = to_bioul(ner_tags,
                                 encoding=self._original_coding_scheme
                                 ) if ner_tags is not None else None
        else:
            # the default IOB1
            coded_chunks = chunk_tags
            coded_ner = ner_tags

        # Add "feature labels" to instance
        if 'pos' in self.feature_labels:
            if pos_tags is None:
                raise ConfigurationError(
                    "Dataset reader was specified to use pos_tags as "
                    "features. Pass them to text_to_instance.")
            instance_fields['pos_tags'] = SequenceLabelField(
                pos_tags, sequence, "pos_tags")
        if 'chunk' in self.feature_labels:
            if coded_chunks is None:
                raise ConfigurationError(
                    "Dataset reader was specified to use chunk tags as "
                    "features. Pass them to text_to_instance.")
            instance_fields['chunk_tags'] = SequenceLabelField(
                coded_chunks, sequence, "chunk_tags")
        if 'ner' in self.feature_labels:
            if coded_ner is None:
                raise ConfigurationError(
                    "Dataset reader was specified to use NER tags as "
                    " features. Pass them to text_to_instance.")
            instance_fields['ner_tags'] = SequenceLabelField(
                coded_ner, sequence, "ner_tags")

        # Add "tag label" to instance
        if self.tag_label == 'ner' and coded_ner is not None:
            instance_fields['tags'] = SequenceLabelField(
                coded_ner, sequence, self.label_namespace)
        elif self.tag_label == 'pos' and pos_tags is not None:
            instance_fields['tags'] = SequenceLabelField(
                pos_tags, sequence, self.label_namespace)
        elif self.tag_label == 'chunk' and coded_chunks is not None:
            instance_fields['tags'] = SequenceLabelField(
                coded_chunks, sequence, self.label_namespace)

        return Instance(instance_fields)
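A minimal sketch (the ``Conll2003DatasetReader`` construction and its defaults, IOB1 coding and
``tag_label='ner'``, are assumptions here):

    reader = Conll2003DatasetReader()
    tokens = [Token(t) for t in ["U.N.", "official", "Ekeus", "heads", "for", "Baghdad", "."]]
    instance = reader.text_to_instance(
        tokens=tokens,
        ner_tags=["I-ORG", "O", "I-PER", "O", "O", "I-LOC", "O"])
    # With tag_label == 'ner' the NER tags land in the "tags" field; pos/chunk fields are only
    # added when requested via feature_labels.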
Example #13
 def text_to_instance(self, line: str) -> Instance:  # type: ignore
     # pylint: disable=arguments-differ
     tokens = self._tokenizer.tokenize(line)
     return Instance({"line": TextField(tokens, self._token_indexers)})
Example #14
    def text_to_instance(
            self,
            question_text: str,
            passage_text: str,
            passage_tokens: List[Token],
            passage_spans: List[Tuple[int, int]],
            numbers_in_passage: List[Any],
            number_words: List[str],
            number_indices: List[int],
            number_len: List[int],
            question_id: str = None,
            passage_id: str = None,
            answer_annotations: List[Dict] = None,
            count_gold_spans_text: List[str] = None) -> Union[Instance, None]:
        # Tokenize question and passage
        question_tokens = self.tokenizer.tokenize(question_text)
        qlen = len(question_tokens)
        plen = len(passage_tokens)

        question_passage_tokens = [Token('[CLS]')] + question_tokens + [
            Token('[SEP]')
        ] + passage_tokens
        if len(question_passage_tokens) > self.max_pieces - 1:
            question_passage_tokens = question_passage_tokens[:self.max_pieces - 1]
            passage_tokens = passage_tokens[:self.max_pieces - qlen - 3]
            plen = len(passage_tokens)
            number_indices, number_len, numbers_in_passage = \
                clipped_passage_num(number_indices, number_len, numbers_in_passage, plen)

        question_passage_tokens += [Token('[SEP]')]
        number_indices = [index + qlen + 2 for index in number_indices] + [-1]
        # Not done in-place so they won't change the numbers saved for the passage
        number_len = number_len + [1]
        numbers_in_passage = numbers_in_passage + [0]
        number_tokens = [Token(str(number)) for number in numbers_in_passage]
        extra_number_tokens = [Token(str(num)) for num in self.extra_numbers]

        mask_indices = [0, qlen + 1, len(question_passage_tokens) - 1]

        fields: Dict[str, Field] = {}

        # Add feature fields
        question_passage_field = TextField(question_passage_tokens,
                                           self.token_indexers)
        fields["question_passage"] = question_passage_field

        number_token_indices = \
            [ArrayField(np.arange(start_ind, start_ind + number_len[i]), padding_value=-1)
             for i, start_ind in enumerate(number_indices)]
        fields["number_indices"] = ListField(number_token_indices)
        numbers_in_passage_field = TextField(number_tokens,
                                             self.token_indexers)
        extra_numbers_field = TextField(extra_number_tokens,
                                        self.token_indexers)
        all_numbers_field = TextField(extra_number_tokens + number_tokens,
                                      self.token_indexers)
        mask_index_fields: List[Field] = [
            IndexField(index, question_passage_field) for index in mask_indices
        ]
        fields["mask_indices"] = ListField(mask_index_fields)

        # Compile question, passage, answer metadata
        metadata = {
            "original_passage": passage_text,
            "original_question": question_text,
            "original_numbers": numbers_in_passage,
            "original_number_words": number_words,
            "extra_numbers": self.extra_numbers,
            "passage_tokens": passage_tokens,
            "question_tokens": question_tokens,
            "question_passage_tokens": question_passage_tokens,
            "passage_id": passage_id,
            "question_id": question_id
        }

        if self.extract_spans:
            metadata["passage_spans"] = passage_spans

        if count_gold_spans_text is not None:
            metadata["count_gold_spans_text"] = count_gold_spans_text

        if answer_annotations:
            for annotation in answer_annotations:
                tokenized_spans = [[
                    token.text for token in self.tokenizer.tokenize(answer)
                ] for answer in annotation['spans']]
                annotation['spans'] = [
                    tokenlist_to_passage(token_list)
                    for token_list in tokenized_spans
                ]

            # Get answer type, answer text, tokenize
            answer_type, answer_texts = DropReader.extract_answer_info_from_annotation(
                answer_annotations[0])
            tokenized_answer_texts = []
            num_spans = min(len(answer_texts), self.max_spans)
            for answer_text in answer_texts:
                answer_tokens = self.tokenizer.tokenize(answer_text)
                tokenized_answer_texts.append(' '.join(
                    token.text for token in answer_tokens))

            metadata["answer_annotations"] = answer_annotations
            metadata["answer_texts"] = answer_texts
            metadata["answer_tokens"] = tokenized_answer_texts

            # Find answer text in question and passage
            valid_question_spans = DropReader.find_valid_spans(
                question_tokens, tokenized_answer_texts)
            for span_ind, span in enumerate(valid_question_spans):
                valid_question_spans[span_ind] = (span[0] + 1, span[1] + 1)
            valid_passage_spans = DropReader.find_valid_spans(
                passage_tokens, tokenized_answer_texts)
            for span_ind, span in enumerate(valid_passage_spans):
                valid_passage_spans[span_ind] = (span[0] + qlen + 2,
                                                 span[1] + qlen + 2)

            # Get target numbers
            target_numbers = []
            for answer_text in answer_texts:
                number = self.word_to_num(answer_text)
                if number is not None:
                    target_numbers.append(number)

            # Get possible ways to arrive at target numbers with add/sub

            valid_expressions: List[List[int]] = []
            exp_strings = None
            if answer_type in ["number", "date"]:
                if self.exp_search == 'full':
                    expressions = get_full_exp(
                        list(enumerate(self.extra_numbers +
                                       numbers_in_passage)), target_numbers,
                        self.operations, self.op_dict, self.max_depth)
                    zipped = list(zip(*expressions))
                    if zipped:
                        valid_expressions = list(zipped[0])
                        exp_strings = list(zipped[1])
                elif self.exp_search == 'add_sub':
                    valid_expressions = \
                        DropReader.find_valid_add_sub_expressions(self.extra_numbers + numbers_in_passage,
                                                                  target_numbers,
                                                                  self.max_numbers_expression)
                elif self.exp_search == 'template':
                    valid_expressions, exp_strings = \
                        get_template_exp(self.extra_numbers + numbers_in_passage,
                                         target_numbers,
                                         self.templates,
                                         self.template_strings)
                    exp_strings = sum(exp_strings, [])

            # Get possible ways to arrive at target numbers with counting
            valid_counts: List[int] = []
            if answer_type in ["number"]:
                numbers_for_count = list(range(self.max_count + 1))
                valid_counts = DropReader.find_valid_counts(
                    numbers_for_count, target_numbers)

            # Update metadata with answer info
            answer_info = {
                "answer_passage_spans": valid_passage_spans,
                "answer_question_spans": valid_question_spans,
                "num_spans": num_spans,
                "expressions": valid_expressions,
                "counts": valid_counts
            }
            if self.exp_search in ['template', 'full']:
                answer_info['expr_text'] = exp_strings
            metadata["answer_info"] = answer_info

            # Add answer fields
            passage_span_fields: List[Field] = [
                SpanField(span[0], span[1], question_passage_field)
                for span in valid_passage_spans
            ]
            if not passage_span_fields:
                passage_span_fields.append(
                    SpanField(-1, -1, question_passage_field))
            fields["answer_as_passage_spans"] = ListField(passage_span_fields)

            question_span_fields: List[Field] = [
                SpanField(span[0], span[1], question_passage_field)
                for span in valid_question_spans
            ]
            if not question_span_fields:
                question_span_fields.append(
                    SpanField(-1, -1, question_passage_field))
            fields["answer_as_question_spans"] = ListField(
                question_span_fields)

            if self.exp_search == 'add_sub':
                add_sub_signs_field: List[Field] = []
                extra_signs_field: List[Field] = []
                for signs_for_one_add_sub_expressions in valid_expressions:
                    extra_signs = signs_for_one_add_sub_expressions[:len(
                        self.extra_numbers)]
                    normal_signs = signs_for_one_add_sub_expressions[
                        len(self.extra_numbers):]
                    add_sub_signs_field.append(
                        SequenceLabelField(normal_signs,
                                           numbers_in_passage_field))
                    extra_signs_field.append(
                        SequenceLabelField(extra_signs, extra_numbers_field))
                if not add_sub_signs_field:
                    add_sub_signs_field.append(
                        SequenceLabelField([0] * len(number_tokens),
                                           numbers_in_passage_field))
                if not extra_signs_field:
                    extra_signs_field.append(
                        SequenceLabelField([0] * len(self.extra_numbers),
                                           extra_numbers_field))
                fields["answer_as_expressions"] = ListField(
                    add_sub_signs_field)
                if self.extra_numbers:
                    fields["answer_as_expressions_extra"] = ListField(
                        extra_signs_field)
            elif self.exp_search in ['template', 'full']:
                expression_indices = []
                for expression in valid_expressions:
                    if not expression:
                        expression.append(3 * [-1])
                    expression_indices.append(
                        ArrayField(np.array(expression), padding_value=-1))
                if not expression_indices:
                    expression_indices = \
                        [ArrayField(np.array([3 * [-1]]), padding_value=-1) for _ in range(len(self.templates))]
                fields["answer_as_expressions"] = ListField(expression_indices)

            count_fields: List[Field] = [
                LabelField(count_label, skip_indexing=True)
                for count_label in valid_counts
            ]
            if not count_fields:
                count_fields.append(LabelField(-1, skip_indexing=True))
            fields["answer_as_counts"] = ListField(count_fields)

            fields["num_spans"] = LabelField(num_spans, skip_indexing=True)

        fields["metadata"] = MetadataField(metadata)

        return Instance(fields)
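
The `answer_as_expressions` field above stores each candidate arithmetic answer as a sequence of per-number signs. A standalone sketch of that encoding follows; the sign convention (0 = unused, 1 = add, 2 = subtract) and the variable names are assumed for illustration only.

passage_numbers = [3, 7, 2]
signs = [1, 1, 0]                      # encodes the expression 3 + 7
sign_to_factor = {0: 0, 1: 1, 2: -1}   # assumed sign convention
value = sum(sign_to_factor[s] * n for s, n in zip(signs, passage_numbers))
assert value == 10
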
    def text_to_instance(self, origin_obj: Any) -> Instance:

        prev_obj = origin_obj['prev']
        fol_obj = origin_obj['follow']

        abs_prev_tokens, col_counter, val_counter = abstract_utterance(prev_obj)
        abs_fol_tokens, _, _ = abstract_utterance(fol_obj, col_counter, val_counter)

        # token level tokenizing
        prev_tokens = self._tokenizer.tokenize(" ".join(abs_prev_tokens))
        prev_tokens = TextField(prev_tokens, self._token_indexers)

        fol_tokens = self._tokenizer.tokenize(" ".join(abs_fol_tokens))
        fol_tokens = TextField(fol_tokens, self._token_indexers)

        # char level tokenizing
        prev_tag_tokens = []
        prev_anno: StandardSpan = origin_obj["prev"]
        for ind, tag in enumerate(prev_anno.tags):
            if tag is None:
                prev_tag_tokens.append(prev_tokens[ind].text)
            elif tag.class_type in COLUMN_BIND_TYPES:
                prev_tag_tokens.append(tag.header.replace(" ", "_"))
            elif tag.class_type in VALUE_BIND_TYPES:
                if len(tag.header) > 0:
                    prev_tag_tokens.append(tag.header[0].replace(" ", "_"))
                else:
                    prev_tag_tokens.append(tag.origin.replace(" ", "_"))
            else:
                prev_tag_tokens.append(prev_tokens[ind].text)

        fol_char_str = []
        fol_anno: StandardSpan = origin_obj["follow"]
        for ind, tag in enumerate(fol_anno.tags):
            if tag is None:
                fol_char_str.append(fol_tokens[ind].text)
            elif tag.class_type in COLUMN_BIND_TYPES:
                fol_char_str.append(tag.header.replace(" ", "_"))
            elif tag.class_type in VALUE_BIND_TYPES:
                if len(tag.header) > 0:
                    fol_char_str.append(tag.header[0].replace(" ", "_"))
                else:
                    fol_char_str.append(tag.origin.replace(" ", "_"))
            else:
                fol_char_str.append(fol_tokens[ind].text)

        # split into char-based tokens
        prev_tag_str = " ".join(prev_tag_tokens)
        prev_tag_tokens = self._tokenizer.tokenize(prev_tag_str)
        prev_tag_field = TextField(prev_tag_tokens, self._char_indexers)

        fol_tag_str = " ".join(fol_char_str)
        fol_char_str = self._tokenizer.tokenize(fol_tag_str)
        fol_tag_field = TextField(fol_char_str, self._char_indexers)

        fields = {'prev_tokens': prev_tokens,
                  'fol_tokens': fol_tokens,
                  'prev_tags': prev_tag_field,
                  'fol_tags': fol_tag_field}

        metadata = {"origin_obj": origin_obj,
                    "tokens_origin": abs_prev_tokens + abs_fol_tokens}
        metadata_field = MetadataField(metadata)
        fields['metadata'] = metadata_field

        # pre-training object caching
        prev_snippets = origin_obj['prev'].snippet
        fol_snippets = origin_obj['follow'].snippet
        conflict = origin_obj['conflicts']

        origin_obj.pop('conflicts')

        prev_labels = SequenceLabelField(prev_snippets, prev_tokens)
        fields['prev_labels'] = prev_labels
        fol_labels = SequenceLabelField(fol_snippets, fol_tokens)
        fields['fol_labels'] = fol_labels
        conflict_field = MetadataField(conflict)
        fields['conflicts'] = conflict_field

        fields['metadata'].metadata['origin_obj']['prev_labels'] = prev_snippets
        fields['metadata'].metadata['origin_obj']['fol_labels'] = fol_snippets

        return Instance(fields)
    def _json_blob_to_instance(self, json_obj: JsonDict) -> Instance:
        question_tokens = self._read_tokens_from_json_list(
            json_obj['question_tokens'])
        question_field = TextField(question_tokens,
                                   self._question_token_indexers)
        question_metadata = MetadataField(
            {"question_tokens": [x.text for x in question_tokens]})
        table_knowledge_graph = TableQuestionKnowledgeGraph.read_from_lines(
            json_obj['table_lines'], question_tokens)
        entity_tokens = [
            self._read_tokens_from_json_list(token_list)
            for token_list in json_obj['entity_texts']
        ]
        table_field = KnowledgeGraphField(
            table_knowledge_graph,
            question_tokens,
            tokenizer=None,
            token_indexers=self._table_token_indexers,
            entity_tokens=entity_tokens,
            linking_features=json_obj['linking_features'],
            include_in_vocab=self._use_table_for_vocab,
            max_table_tokens=self._max_table_tokens)
        world = WikiTablesWorld(table_knowledge_graph)
        world_field = MetadataField(world)

        production_rule_fields: List[Field] = []
        for production_rule in world.all_possible_actions():
            _, rule_right_side = production_rule.split(' -> ')
            is_global_rule = not world.is_table_entity(rule_right_side)
            field = ProductionRuleField(production_rule, is_global_rule)
            production_rule_fields.append(field)
        action_field = ListField(production_rule_fields)

        example_string_field = MetadataField(json_obj['example_lisp_string'])

        fields = {
            'question': question_field,
            'metadata': question_metadata,
            'table': table_field,
            'world': world_field,
            'actions': action_field,
            'example_lisp_string': example_string_field
        }

        if 'target_action_sequences' in json_obj or 'agenda' in json_obj:
            action_map = {
                action.rule: i
                for i, action in enumerate(action_field.field_list)
            }  # type: ignore
        if 'target_action_sequences' in json_obj:
            action_sequence_fields: List[Field] = []
            for sequence in json_obj['target_action_sequences']:
                index_fields: List[Field] = []
                for production_rule in sequence:
                    index_fields.append(
                        IndexField(action_map[production_rule], action_field))
                action_sequence_fields.append(ListField(index_fields))
            fields['target_action_sequences'] = ListField(
                action_sequence_fields)
        if 'agenda' in json_obj:
            agenda_index_fields: List[Field] = []
            for agenda_action in json_obj['agenda']:
                agenda_index_fields.append(
                    IndexField(action_map[agenda_action], action_field))
            fields['agenda'] = ListField(agenda_index_fields)
        return Instance(fields)
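
The target action sequences and agendas above are stored as indices into the global action list. A minimal sketch of that indexing, with made-up production rules:

all_actions = ["S -> A", "A -> cell:foo", "A -> number:1"]   # illustrative rules
action_map = {rule: i for i, rule in enumerate(all_actions)}
target_sequence = ["S -> A", "A -> number:1"]
indexed_sequence = [action_map[rule] for rule in target_sequence]
assert indexed_sequence == [0, 2]
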
Example #17
    def text_to_instance(self,
                         source_string: str,
                         target_lang: str,
                         target_string: str = None) -> Instance:
        """
        Turn raw source string and target string into an ``Instance``.
        Parameters
        ----------
        source_string : ``str``, required
        target_lang : ``str``, required
        target_string : ``str``, optional (default = None)
        Returns
        -------
        Instance
            See the above for a description of the fields that the instance will contain.
        """
        tokenized_source = self._source_tokenizer.tokenize(source_string)
        tokenized_source.insert(0, Token(START_SYMBOL))
        tokenized_source.append(Token(END_SYMBOL))
        source_field = TextField(tokenized_source, self._source_token_indexers)

        # For each token in the source sentence, we keep track of the matching token
        # in the target sentence (which will be the OOV symbol if there is no match).
        source_to_target_field = NamespaceSwappingField(
            tokenized_source[1:-1], self._target_namespace)

        meta_fields = {
            "source_tokens": [x.text for x in tokenized_source[1:-1]]
        }

        fields_dict = {
            "source_tokens": source_field,
            "source_to_target": source_to_target_field,
        }

        if self._provide_trg_lang:
            lang_id_field = LabelField(
                target_lang, label_namespace=self._language_id_namespace)
            metadata_trg_lang = MetadataField(target_lang)

            fields_dict["target_lang"] = lang_id_field
            fields_dict["target_language"] = metadata_trg_lang

        if target_string is not None:
            tokenized_target = self._target_tokenizer.tokenize(target_string)
            tokenized_target.insert(0, Token(START_SYMBOL))
            tokenized_target.append(Token(END_SYMBOL))
            target_field = TextField(tokenized_target,
                                     self._target_token_indexers)

            fields_dict["target_tokens"] = target_field
            meta_fields["target_tokens"] = [
                y.text for y in tokenized_target[1:-1]
            ]
            source_and_target_token_ids = self._tokens_to_ids(
                tokenized_source[1:-1] + tokenized_target)
            source_token_ids = source_and_target_token_ids[:len(
                tokenized_source) - 2]
            fields_dict["source_token_ids"] = ArrayField(
                np.array(source_token_ids))
            target_token_ids = source_and_target_token_ids[len(tokenized_source
                                                               ) - 2:]
            fields_dict["target_token_ids"] = ArrayField(
                np.array(target_token_ids))
        else:
            source_token_ids = self._tokens_to_ids(tokenized_source[1:-1])
            fields_dict["source_token_ids"] = ArrayField(
                np.array(source_token_ids))

        fields_dict["metadata"] = MetadataField(meta_fields)

        return Instance(fields_dict)
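
The `source_token_ids`/`target_token_ids` arrays above give identical surface tokens a shared id so a copy mechanism can tell which target tokens also occur in the source. A hedged sketch of that idea; `tokens_to_ids` below is a stand-in for the reader's `_tokens_to_ids` helper, not its actual implementation.

def tokens_to_ids(tokens):
    ids = {}
    return [ids.setdefault(t, len(ids)) for t in tokens]

source = ["the", "cat", "sat"]
target = ["le", "chat", "cat"]
joint_ids = tokens_to_ids(source + target)
source_ids, target_ids = joint_ids[:len(source)], joint_ids[len(source):]
assert source_ids[1] == target_ids[2]   # "cat" shares an id across source and target
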
Example #18
    def text_to_instance(self,  # type: ignore
                         document_id: str,
                         part_number: str,
                         sentences: List[List[str]],
                         gold_clusters: Optional[List[List[Tuple[int, int]]]] = None) -> Instance:
        # pylint: disable=arguments-differ
        """
        Parameters
        ----------
        document_id : ``str``, required.
            The id of the document.
        part_number : ``str``, required.
            The part of the document (as given in the dataset, e.g. a CoNLL part number).
        sentences : ``List[List[str]]``, required.
            A list of lists representing the tokenised words and sentences in the document.
        gold_clusters : ``Optional[List[List[Tuple[int, int]]]]``, optional (default = None)
            A list of all clusters in the document, represented as word spans. Each cluster
            contains some number of spans, which can be nested and overlap, but will never
            exactly match between clusters.

        Returns
        -------
        An ``Instance`` containing the following ``Fields``:
            text : ``TextField``
                The text of the full document.
            spans : ``ListField[SpanField]``
                A ListField containing the spans represented as ``SpanFields``
                with respect to the document text.
            span_labels : ``SequenceLabelField``, optional
                The id of the cluster which each possible span belongs to, or -1 if it does
                not belong to a cluster. As these labels have variable length (it depends on
                how many spans we are considering), we represent this as a ``SequenceLabelField``
                with respect to the ``spans`` ``ListField``.
        """
        flattened_sentences = [self._normalize_word(word)
                               for sentence in sentences
                               for word in sentence]

        metadata: Dict[str, Any] = {
            "document_id": document_id,
            "part_number": part_number,
            "original_text": flattened_sentences,
        }

        if gold_clusters is not None:
            metadata["clusters"] = gold_clusters

        text_field = TextField([Token(word) for word in flattened_sentences], self._token_indexers)

        cluster_dict = {}
        if gold_clusters is not None:
            for cluster_id, cluster in enumerate(gold_clusters):
                for mention in cluster:
                    cluster_dict[tuple(mention)] = cluster_id

        spans: List[Field] = []
        span_labels: Optional[List[int]] = [] if gold_clusters is not None else None

        sentence_offset = 0
        for sentence in sentences:
            for start, end in enumerate_spans(sentence,
                                              offset=sentence_offset,
                                              max_span_width=self._max_span_width):
                if span_labels is not None:
                    if (start, end) in cluster_dict:
                        span_labels.append(cluster_dict[(start, end)])
                    else:
                        span_labels.append(-1)

                spans.append(SpanField(start, end, text_field))
            sentence_offset += len(sentence)

        span_field = ListField(spans)
        metadata_field = MetadataField(metadata)

        fields: Dict[str, Field] = {"text": text_field,
                                    "spans": span_field,
                                    "metadata": metadata_field}
        if span_labels is not None:
            fields["span_labels"] = SequenceLabelField(span_labels, span_field)

        return Instance(fields)
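
The span enumeration and cluster labelling above can be summarised without AllenNLP fields: every span up to `max_span_width` gets the id of the gold cluster it exactly matches, or -1. A self-contained sketch:

def label_spans(tokens, gold_clusters, max_span_width):
    cluster_dict = {tuple(span): cluster_id
                    for cluster_id, cluster in enumerate(gold_clusters)
                    for span in cluster}
    spans, labels = [], []
    for start in range(len(tokens)):
        for end in range(start, min(start + max_span_width, len(tokens))):
            spans.append((start, end))
            labels.append(cluster_dict.get((start, end), -1))
    return spans, labels

spans, labels = label_spans(["Alice", "saw", "her"], [[(0, 0), (2, 2)]], max_span_width=2)
assert labels[spans.index((0, 0))] == 0 and labels[spans.index((2, 2))] == 0
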
Example #19
    def text_to_instance(  # type: ignore
        self, tokens: List[Token], verb_label: List[int], img, tags: List[str] = None
    ) -> Instance:
        """
        We take `pre-tokenized` input here, along with a verb label.  The verb label should be a
        one-hot binary vector, the same length as the tokens, indicating the position of the verb
        to find arguments for.
        """

        metadata_dict: Dict[str, Any] = {}
        if self.bert_tokenizer is not None:
            wordpieces, offsets, start_offsets = self._wordpiece_tokenize_input(
                [t.text for t in tokens]
            )
            new_verbs = _convert_verb_indices_to_wordpiece_indices(verb_label, offsets)
            metadata_dict["offsets"] = start_offsets
            # In order to override the indexing mechanism, we need to set the `text_id`
            # attribute directly. This causes the indexing to use this id.
            text_field = TextField(
                [Token(t, text_id=self.bert_tokenizer.vocab[t]) for t in wordpieces],
                token_indexers=self._token_indexers,
            )
            verb_indicator = SequenceLabelField(new_verbs, text_field)

        else:
            text_field = TextField(tokens, token_indexers=self._token_indexers)
            verb_indicator = SequenceLabelField(verb_label, text_field)

        # Image features are needed regardless of the tokenizer used above, so build them here.
        img_feats = img['features'].copy()
        img_boxes = img['boxes'].copy()
        obj_num = img['num_boxes']
        assert len(img_feats) == len(img_boxes) == obj_num

        # Normalize the boxes to 0 ~ 1
        img_boxes[:, (0, 2)] /= img['img_w']
        img_boxes[:, (1, 3)] /= img['img_h']
        np.testing.assert_array_less(img_boxes, 1 + 1e-5)
        np.testing.assert_array_less(-img_boxes, 0 + 1e-5)

        # Concatenate the box coordinates to each object's features.
        img_concat = np.hstack((img_feats, img_boxes))
        img_field = ArrayField(img_concat)

        fields: Dict[str, Field] = {}
        fields["tokens"] = text_field
        fields["verb_indicator"] = verb_indicator
        fields["img_emb"] = img_field

        if all([x == 0 for x in verb_label]):
            verb = None
            verb_index = None
        else:
            verb_index = verb_label.index(1)
            verb = tokens[verb_index].text

        metadata_dict["words"] = [x.text for x in tokens]
        metadata_dict["verb"] = verb
        metadata_dict["verb_index"] = verb_index

        if tags:
            if self.bert_tokenizer is not None:
                new_tags = _convert_tags_to_wordpiece_tags(tags, offsets)
                fields["tags"] = SequenceLabelField(new_tags, text_field)
            else:
                fields["tags"] = SequenceLabelField(tags, text_field)
            metadata_dict["gold_tags"] = tags

        fields["metadata"] = MetadataField(metadata_dict)

        return Instance(fields)
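
When a BERT tokenizer is used above, the one-hot verb indicator has to be re-aligned from original tokens to wordpieces; the helper `_convert_verb_indices_to_wordpiece_indices` does this in the reader. The sketch below only illustrates the idea, under the assumption that `offsets` holds the index of the last wordpiece of each original token; it is not the library implementation.

def verb_indices_to_wordpieces(verb_label, offsets):
    new_label = [0]                    # position of [CLS]
    previous_offset = 0
    for flag, offset in zip(verb_label, offsets):
        new_label.extend([flag] * (offset - previous_offset))
        previous_offset = offset
    new_label.append(0)                # position of [SEP]
    return new_label

# "eat" is split into two wordpieces, so its verb flag is repeated twice.
assert verb_indices_to_wordpieces([0, 1, 0], [1, 3, 4]) == [0, 0, 1, 1, 0, 0]
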
Example #20
    def text_to_instance(
        self,  # type: ignore
        sentence: List[Token],
        gold_clusters: Optional[List[List[Tuple[int, int]]]] = None,
    ) -> Instance:
        """
        Parameters
        ----------
        sentence : ``List[Token]``, required.
            The already tokenised sentence to analyse.
        gold_clusters : ``Optional[List[List[Tuple[int, int]]]]``, optional (default = None)
            A list of all clusters in the sentence, represented as word spans. Each cluster
            contains some number of spans, which can be nested and overlap, but will never
            exactly match between clusters.

        Returns
        -------
        An ``Instance`` containing the following ``Fields``:
            text : ``TextField``
                The text of the full sentence.
            spans : ``ListField[SpanField]``
                A ListField containing the spans represented as ``SpanFields``
                with respect to the sentence text.
            span_labels : ``SequenceLabelField``, optional
                The id of the cluster which each possible span belongs to, or -1 if it does
                not belong to a cluster. As these labels have variable length (it depends on
                how many spans we are considering), we represent this as a ``SequenceLabelField``
                with respect to the ``spans`` ``ListField``.
        """
        metadata: Dict[str, Any] = {"original_text": sentence}
        if gold_clusters is not None:
            metadata["clusters"] = gold_clusters

        text_field = TextField(sentence, self._token_indexers)

        cluster_dict = {}
        if gold_clusters is not None:
            for cluster_id, cluster in enumerate(gold_clusters):
                for mention in cluster:
                    cluster_dict[tuple(mention)] = cluster_id

        spans: List[Field] = []
        span_labels: Optional[
            List[int]] = [] if gold_clusters is not None else None

        for start, end in enumerate_spans(sentence,
                                          max_span_width=self._max_span_width):
            if span_labels is not None:
                if (start, end) in cluster_dict:
                    span_labels.append(cluster_dict[(start, end)])
                else:
                    span_labels.append(-1)

            spans.append(SpanField(start, end, text_field))

        span_field = ListField(spans)
        metadata_field = MetadataField(metadata)

        fields: Dict[str, Field] = {
            "text": text_field,
            "spans": span_field,
            "metadata": metadata_field,
        }
        if span_labels is not None:
            fields["span_labels"] = SequenceLabelField(span_labels, span_field)

        return Instance(fields)
Example #21
    def __getitem__(self, index):
        if self.image_feature_type == "r2c":
            return self.__getitem_detector__(index)

        item = self.items[index]
        sample = {}
        if not self.text_only:
            image_feat_variable, image_boxes, image_dim_variable = self.get_image_features_by_training_index(index)
            image_feat_variable = ArrayField(image_feat_variable)
            image_dim_variable = IntArrayField(np.array(image_dim_variable))
            sample["image_feat_variable"] = image_feat_variable
            sample["image_dim_variable"] = image_dim_variable
            sample["label"] = image_dim_variable
        else:
            sample["label"] = IntArrayField(np.array([0]))

        caption_a = item["caption"]
        imageID = item["image_id"]

        if self.expanded and index >= self.train_size:
            coco = self.coco_val
        else:
            coco = self.coco

        rest_anns = coco.loadAnns([i for i in coco.getAnnIds(imgIds=imageID) if i != item['id']])

        if self.args.get("two_sentence", True):
            if random.random() > 0.5:
                item_b = self.items[random.randint(0, len(self.items) - 1)]
                while item_b["image_id"] == imageID:
                    item_b = self.items[random.randint(0, len(self.items) - 1)]
                flag = False
            else:
                item_b = rest_anns[random.randint(0, len(rest_anns) - 1)]
                flag = True

            caption_b = item_b["caption"]
            subword_tokens_a = self.tokenizer.tokenize(caption_a)
            subword_tokens_b = self.tokenizer.tokenize(caption_b)
            bert_example = InputExample(unique_id = index, text_a = subword_tokens_a, text_b = subword_tokens_b, is_correct=flag, max_seq_length = self.max_seq_length)
        elif not self.args.get("no_next_sentence", False):
            if random.random() < self.args.false_caption_ratio:
                item_b = self.items[random.randint(0, len(self.items) - 1)]
                while item_b["image_id"] == imageID:
                    item_b = self.items[random.randint(0, len(self.items) - 1)]
                flag = False
            else:
                item_b = item
                flag = True

            caption_b = item_b["caption"]
            subword_tokens_b = self.tokenizer.tokenize(caption_b)
            bert_example = InputExample(unique_id = index, text_a = subword_tokens_b, text_b = None, is_correct=flag, max_seq_length = self.max_seq_length)
        else:
            caption_b = item["caption"]
            subword_tokens_b = self.tokenizer.tokenize(caption_b)
            bert_example = InputExample(unique_id = index, text_a = subword_tokens_b, text_b = None, is_correct=None, max_seq_length = self.max_seq_length)

        bert_feature = InputFeatures.convert_one_example_to_features_pretraining(
                    example = bert_example,
                    tokenizer=self.tokenizer,
                    probability = self.masked_lm_prob)
        bert_feature.insert_field_into_dict(sample)

        return Instance(sample)
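
The two-sentence branch above pairs a caption either with another caption from the same image (a positive example) or with a caption from a different image (a negative example). A small illustrative sketch of that sampling, with hypothetical item dictionaries:

import random

def sample_caption_pair(items, same_image_annotations, item):
    if random.random() > 0.5:
        other = random.choice(items)
        while other["image_id"] == item["image_id"]:
            other = random.choice(items)
        return other["caption"], False      # caption from a different image
    return random.choice(same_image_annotations)["caption"], True

items = [{"image_id": 1, "caption": "a cat"}, {"image_id": 2, "caption": "a dog"}]
caption_b, is_match = sample_caption_pair(items, [{"caption": "a sleeping cat"}], items[0])
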
Example #22
    def text_to_instance(
            self,  # type: ignore
            tokens: List[Token],
            verb_label: List[int],
            verb_index: int,
            constituents: List[List[str]] = None,
            srl_args: List[List[str]] = None) -> Instance:
        """
        We take `pre-tokenized` input here, along with a verb label.  The verb label should be a
        one-hot binary vector, the same length as the tokens, indicating the position of the verb
        to find arguments for.
        """
        # pylint: disable=arguments-differ

        # Input fields.
        text_field = TextField(tokens, token_indexers=self._token_indexers)
        verb_field = SequenceLabelField(verb_label, text_field)
        target_field = IndexField(verb_index, text_field)

        # Span-based output fields.
        span_starts: List[Field] = []
        span_ends: List[Field] = []
        span_mask: List[int] = [
            1 for _ in range(len(tokens) * self.max_span_width)
        ]
        span_labels: Optional[List[str]] = [] if srl_args is not None else None
        constit_labels: Optional[
            List[str]] = [] if constituents is not None else None

        for j in range(len(tokens)):
            for diff in range(self.max_span_width):
                width = diff
                if j - diff < 0:
                    # This is an invalid span.
                    span_mask[j * self.max_span_width + diff] = 0
                    width = j

                span_starts.append(IndexField(j - width, text_field))
                span_ends.append(IndexField(j, text_field))

                if srl_args:
                    current_label = srl_args[j][diff]
                    span_labels.append(current_label)

                if constituents:
                    label = constituents[j][diff]
                    constit_labels.append(label)

        start_fields = ListField(span_starts)
        end_fields = ListField(span_ends)
        span_mask_fields = SequenceLabelField(span_mask, start_fields)

        fields: Dict[str, Field] = {
            'tokens': text_field,
            'verb_indicator': verb_field,
            'target_index': target_field,
            'span_starts': start_fields,
            'span_ends': end_fields,
            'span_mask': span_mask_fields
        }

        if srl_args:
            fields['tags'] = SequenceLabelField(span_labels, start_fields)
        if constituents:
            fields['constituents'] = SequenceLabelField(
                constit_labels, start_fields, label_namespace="constit_labels")
        return Instance(fields)
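
The span layout above is fixed-size: for each end index there are exactly `max_span_width` candidate spans, and spans that would start before the sentence are masked out and clamped. A plain-Python sketch of that layout:

def enumerate_fixed_width_spans(num_tokens, max_span_width):
    starts, ends, mask = [], [], []
    for j in range(num_tokens):
        for diff in range(max_span_width):
            valid = j - diff >= 0
            starts.append(j - diff if valid else 0)   # clamp invalid spans to start at 0
            ends.append(j)
            mask.append(1 if valid else 0)
    return starts, ends, mask

starts, ends, mask = enumerate_fixed_width_spans(num_tokens=3, max_span_width=2)
assert list(zip(starts, ends, mask))[:4] == [(0, 0, 1), (0, 0, 0), (1, 1, 1), (0, 1, 1)]
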
Example #23
    def make_marginal_drop_instance(
        question_tokens: List[Token],
        passage_tokens: List[Token],
        number_tokens: List[Token],
        number_indices: List[int],
        token_indexers: Dict[str, TokenIndexer],
        passage_text: str,
        answer_info: Dict[str, Any] = None,
        additional_metadata: Dict[str, Any] = None,
    ) -> Instance:
        additional_metadata = additional_metadata or {}
        fields: Dict[str, Field] = {}
        passage_offsets = [(token.idx, token.idx + len(token.text))
                           for token in passage_tokens]
        question_offsets = [(token.idx, token.idx + len(token.text))
                            for token in question_tokens]

        # This is separate so we can reference it later with a known type.
        passage_field = TextField(passage_tokens, token_indexers)
        question_field = TextField(question_tokens, token_indexers)
        fields["passage"] = passage_field
        fields["question"] = question_field
        number_index_fields: List[Field] = [
            IndexField(index, passage_field) for index in number_indices
        ]
        fields["number_indices"] = ListField(number_index_fields)
        # This field is not actually required by the model; it is only used to create the
        # `answer_as_add_sub_expressions` field, which is a `SequenceLabelField`. We cannot use the
        # `number_indices` field for that, because the `ListField` will not be empty when we need to
        # create a new empty field, which would lead to an error.
        numbers_in_passage_field = TextField(number_tokens, token_indexers)
        metadata = {
            "original_passage": passage_text,
            "passage_token_offsets": passage_offsets,
            "question_token_offsets": question_offsets,
            "question_tokens": [token.text for token in question_tokens],
            "passage_tokens": [token.text for token in passage_tokens],
            "number_tokens": [token.text for token in number_tokens],
            "number_indices": number_indices,
        }
        if answer_info:
            metadata["answer_texts"] = answer_info["answer_texts"]

            passage_span_fields: List[Field] = [
                SpanField(span[0], span[1], passage_field)
                for span in answer_info["answer_passage_spans"]
            ]
            if not passage_span_fields:
                passage_span_fields.append(SpanField(-1, -1, passage_field))
            fields["answer_as_passage_spans"] = ListField(passage_span_fields)

            question_span_fields: List[Field] = [
                SpanField(span[0], span[1], question_field)
                for span in answer_info["answer_question_spans"]
            ]
            if not question_span_fields:
                question_span_fields.append(SpanField(-1, -1, question_field))
            fields["answer_as_question_spans"] = ListField(
                question_span_fields)

            add_sub_signs_field: List[Field] = []
            for signs_for_one_add_sub_expression in answer_info[
                    "signs_for_add_sub_expressions"]:
                add_sub_signs_field.append(
                    SequenceLabelField(signs_for_one_add_sub_expression,
                                       numbers_in_passage_field))
            if not add_sub_signs_field:
                add_sub_signs_field.append(
                    SequenceLabelField([0] * len(number_tokens),
                                       numbers_in_passage_field))
            fields["answer_as_add_sub_expressions"] = ListField(
                add_sub_signs_field)

            count_fields: List[Field] = [
                LabelField(count_label, skip_indexing=True)
                for count_label in answer_info["counts"]
            ]
            if not count_fields:
                count_fields.append(LabelField(-1, skip_indexing=True))
            fields["answer_as_counts"] = ListField(count_fields)

        metadata.update(additional_metadata)
        fields["metadata"] = MetadataField(metadata)
        return Instance(fields)
Example #24
    def test_instances_must_have_homogeneous_fields(self):
        instance1 = Instance({"tag": (LabelField(1))})
        instance2 = Instance({"words": TextField(["hello"], {})})
        with pytest.raises(ConfigurationError):
            _ = Dataset([instance1, instance2])
Example #25
    def text_to_instance(  # type: ignore
        self,
        question: str,
        table_lines: List[List[str]],
        target_values: List[str] = None,
        offline_search_output: List[str] = None,
    ) -> Instance:
        """
        Reads text inputs and makes an instance. We pass the ``table_lines`` to ``TableQuestionContext``, and that
        method accepts this field either as lines from CoreNLP processed tagged files that come with the dataset,
        or simply in a tsv format where each line corresponds to a row and the cells are tab-separated.

        Parameters
        ----------
        question : ``str``
            Input question
        table_lines : ``List[List[str]]``
            The table content optionally preprocessed by CoreNLP. See ``TableQuestionContext.read_from_lines``
            for the expected format.
        target_values : ``List[str]``, optional
            Target values for the denotations the logical forms should execute to. Not required for testing.
        offline_search_output : ``List[str]``, optional
            List of logical forms, produced by offline search. Not required during test.
        """

        tokenized_question = self._tokenizer.tokenize(question.lower())
        question_field = TextField(tokenized_question,
                                   self._question_token_indexers)
        metadata: Dict[str, Any] = {
            "question_tokens": [x.text for x in tokenized_question]
        }
        table_context = TableQuestionContext.read_from_lines(
            table_lines, tokenized_question)
        world = WikiTablesLanguage(table_context)
        world_field = MetadataField(world)
        # Note: Not passing any feature extractors when instantiating the field below. This will
        # make it use all the available extractors.
        table_field = KnowledgeGraphField(
            table_context.get_table_knowledge_graph(),
            tokenized_question,
            self._table_token_indexers,
            tokenizer=self._tokenizer,
            include_in_vocab=self._use_table_for_vocab,
            max_table_tokens=self._max_table_tokens,
        )
        production_rule_fields: List[Field] = []
        for production_rule in world.all_possible_productions():
            _, rule_right_side = production_rule.split(" -> ")
            is_global_rule = not world.is_instance_specific_entity(
                rule_right_side)
            field = ProductionRuleField(production_rule,
                                        is_global_rule=is_global_rule)
            production_rule_fields.append(field)
        action_field = ListField(production_rule_fields)

        fields = {
            "question": question_field,
            "metadata": MetadataField(metadata),
            "table": table_field,
            "world": world_field,
            "actions": action_field,
        }

        if target_values is not None:
            target_values_field = MetadataField(target_values)
            fields["target_values"] = target_values_field

        # We'll make each target action sequence a List[IndexField], where the index is into
        # the action list we made above.  We need to ignore the type here because mypy doesn't
        # like `action.rule` - it's hard to tell mypy that the ListField is made up of
        # ProductionRuleFields.
        action_map = {
            action.rule: i
            for i, action in enumerate(action_field.field_list)  # type: ignore
        }
        if offline_search_output:
            action_sequence_fields: List[Field] = []
            for logical_form in offline_search_output:
                try:
                    action_sequence = world.logical_form_to_action_sequence(
                        logical_form)
                    index_fields: List[Field] = []
                    for production_rule in action_sequence:
                        index_fields.append(
                            IndexField(action_map[production_rule],
                                       action_field))
                    action_sequence_fields.append(ListField(index_fields))
                except ParsingError as error:
                    logger.debug(
                        f"Parsing error: {error.message}, skipping logical form"
                    )
                    logger.debug(f"Question was: {question}")
                    logger.debug(f"Logical form was: {logical_form}")
                    logger.debug(f"Table info was: {table_lines}")
                    continue
                except KeyError as error:
                    logger.debug(
                        f"Missing production rule: {error.args}, skipping logical form"
                    )
                    logger.debug(f"Question was: {question}")
                    logger.debug(f"Table info was: {table_lines}")
                    logger.debug(f"Logical form was: {logical_form}")
                    continue
                except:  # noqa
                    logger.error(logical_form)
                    raise
                if len(action_sequence_fields
                       ) >= self._max_offline_logical_forms:
                    break

            if not action_sequence_fields:
                # This is not great, but we're only doing it when we're passed logical form
                # supervision, so we're expecting labeled logical forms, but we can't actually
                # produce the logical forms.  We should skip this instance.  Note that this affects
                # _dev_ and _test_ instances, too, so your metrics could be over-estimates on the
                # full test data.
                return None
            fields["target_action_sequences"] = ListField(
                action_sequence_fields)
        if self._output_agendas:
            agenda_index_fields: List[Field] = []
            for agenda_string in world.get_agenda(conservative=True):
                agenda_index_fields.append(
                    IndexField(action_map[agenda_string], action_field))
            if not agenda_index_fields:
                agenda_index_fields = [IndexField(-1, action_field)]
            fields["agenda"] = ListField(agenda_index_fields)
        return Instance(fields)
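
As in the earlier table-question examples, the offline search output and the agenda above are stored as indices into the global action list via `action_map`; see the indexing sketch after the `_json_blob_to_instance` example for a minimal illustration of that mapping.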
Example #26
    def __call__(self,
                 instances: Iterable[Instance],
                 num_epochs: int = None,
                 shuffle: bool = False) -> Iterator[TensorDict]:

        key = id(instances)
        starting_epoch = self._epochs[key]

        # In order to ensure that we are (almost) constantly streaming data to the model we
        # need to have all of the instances in memory ($$$)
        instance_list = list(instances)

        if (self._batch_size > len(instance_list)) and self._truncate:
            raise ConfigurationError('FancyIterator will not return any data when the batch size '
                                     'is larger than number of instances and truncation is enabled. '
                                     'To fix this either use a smaller batch size (better for '
                                     'training) or disable truncation (better for validation).')

        if num_epochs is None:
            epochs: Iterable[int] = itertools.count(starting_epoch)
        else:
            epochs = range(starting_epoch, starting_epoch + num_epochs)

        for epoch in epochs:

            if shuffle:
                random.shuffle(instance_list)

            # We create queues for each instance in the batch, and greedily fill them to try and
            # ensure each queue's length is roughly equal in size.
            queues: List[Deque[Instance]] = [deque() for _ in range(self._batch_size)]
            queue_lengths = np.zeros(self._batch_size, dtype=int)
            for instance in instance_list:

                # Now we split the instance into chunks.
                chunks, length = self._split(instance)

                # Next we identify which queue is the shortest and add the chunks to that queue.
                destination = np.argmin(queue_lengths)
                queues[destination].extend(chunks)
                queue_lengths[destination] += length

            # We need a NULL instance to replace the output of an exhausted queue if we are evaluating
            prototype = deepcopy(chunks[-1])
            new_fields: Dict[str, Field] = {}
            for name, field in prototype.fields.items():
                if isinstance(field, MetadataField):
                    new_fields[name] = field
                else:
                    new_fields[name] = field.empty_field()
            blank_instance = Instance(new_fields)

            for batch in self._generate_batches(queues, blank_instance):
                if self._track_epoch:
                    add_epoch_number(batch, epoch)

                if self.vocab is not None:
                    batch.index_instances(self.vocab)

                padding_lengths = batch.get_padding_lengths()
                yield batch.as_tensor_dict(padding_lengths), 1

            self._epochs[key] = epoch + 1
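
The iterator above splits each instance into chunks and greedily assigns them to whichever of the `batch_size` queues is currently shortest, so the queues stay roughly balanced. A standalone sketch of that balancing step (here chunk count stands in for the length returned by `_split`):

from collections import deque

import numpy as np

def balance_into_queues(chunked_instances, batch_size):
    queues = [deque() for _ in range(batch_size)]
    queue_lengths = np.zeros(batch_size, dtype=int)
    for chunks in chunked_instances:
        destination = int(np.argmin(queue_lengths))
        queues[destination].extend(chunks)
        queue_lengths[destination] += len(chunks)
    return queues

queues = balance_into_queues([["a1", "a2"], ["b1"], ["c1"]], batch_size=2)
assert [len(q) for q in queues] == [2, 2]
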
    def _read(self, file_path: str):
        file_path = cached_path(file_path)

        logger.info("Reading file at %s", file_path)
        with open(file_path) as dataset_file:
            dataset = json.load(dataset_file)

        span_file = open(self._span_file_path)

        logger.info("Reading the dataset")
        for data, best_span in zip(dataset, span_file):
            answer = data['answers'][0]
            question = data['query']
            well_formed_answer = data['wellFormedAnswers'][0]
            passages_json = data['passages']
            passages = [passages_json[i]['passage_text'] for i in range(len(passages_json))]
            passages_is_selected = [passages_json[i]['is_selected'] for i in range(len(passages_json))]
            
            normalized_answer = util.normalize_text_msmarco(answer)
            tokenized_answer = self._tokenizer.tokenize(normalized_answer)
            # set question field
            normalized_question = util.normalize_text_msmarco(question)
            tokenized_question = self._tokenizer.tokenize(normalized_question)
            question_field = TextField(tokenized_question, self._token_indexers)
            fields = {'question': question_field}
            # get preprocessed span
            start_idx, end_idx, rouge_score, passage_idx = None, None, None, None
            start_idx, end_idx, passage_idx, rouge_score = best_span.strip().split(' ')
            start_idx, end_idx, passage_idx, rouge_score = int(start_idx), int(end_idx), int(passage_idx), float(rouge_score)
            # skip contexts that have less than 4 paragraphs
            if len(passages) < 4:
                continue
            # only train instances with rouge score larger than 0.9
            if rouge_score > 0.9:
                # rank passages based on tf-idf score
                passage_features = self._tfidf.fit_transform(passages)
                question_features = self._tfidf.transform([normalized_question])
                distances = pairwise_distances(question_features, passage_features, "cosine").ravel()
                sorted_passages = np.lexsort((passages, distances))
                # choose 4 passages with highest tf-idf score
                selected_passages = []
                ## choose golden passage first
                normalized_passage = util.normalize_text_msmarco(passages[passage_idx])
                tokenized_passage = self._tokenizer.tokenize(normalized_passage)
                passage_field = TextField(tokenized_passage, self._token_indexers)
                selected_passages.append(passage_field)
                ## set span field from golden passage
                span_start_field = IndexField(start_idx, passage_field)
                span_end_field = IndexField(end_idx, passage_field)
                fields['span_start'] = span_start_field
                fields['span_end'] = span_end_field
                ## choose three others with highest tf-idf score
                idx = 0
                while len(selected_passages) < 4:
                    if sorted_passages[idx] != passage_idx:
                        normalized_passage = util.normalize_text_msmarco(passages[sorted_passages[idx]])
                        tokenized_passage = self._tokenizer.tokenize(normalized_passage)
                        passage_field = TextField(tokenized_passage, self._token_indexers)
                        selected_passages.append(passage_field)
                    idx += 1
                fields['passage'] = ListField(selected_passages)
                yield Instance(fields)
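
The passage selection above keeps the gold passage plus the passages closest to the question under tf-idf cosine distance. A hedged sketch of just the ranking step, using scikit-learn:

from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics import pairwise_distances

def rank_passages(question, passages):
    tfidf = TfidfVectorizer()
    passage_features = tfidf.fit_transform(passages)
    question_features = tfidf.transform([question])
    distances = pairwise_distances(question_features, passage_features, metric="cosine").ravel()
    return [passages[i] for i in distances.argsort()]

ranked = rank_passages("capital of france",
                       ["paris is the capital of france", "bananas are yellow"])
assert ranked[0].startswith("paris")
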
Example #28
    def text_to_instance(
            self,  # type: ignore
            sentences: List[List[str]],
            document_id: str,
            sentence_id: int,
            gold_clusters: Optional[List[List[Tuple[int, int]]]] = None,
            user_threshold: Optional[float] = 0.0) -> Instance:
        # pylint: disable=arguments-differ
        """
        Parameters
        ----------
        sentences : ``List[List[str]]``, required.
            A list of lists representing the tokenised words and sentences in the document.
        document_id : ``str``, required.
            A string representing the document ID.
        sentence_id : ``int``, required.
            An int representing the sentence ID.
        gold_clusters : ``Optional[List[List[Tuple[int, int]]]]``, optional (default = None)
            A list of all clusters in the document, represented as word spans. Each cluster
            contains some number of spans, which can be nested and overlap, but will never
            exactly match between clusters.
        user_threshold : ``Optional[float]``, optional (default = 0.0)
            Approximate fraction of gold labels to hold out as simulated user input,
            e.g. 0.5, 0.33, 0.25, 0.125.

        Returns
        -------
        An ``Instance`` containing the following ``Fields``:
            text : ``TextField``
                The text of the full document.
            spans : ``ListField[SpanField]``
                A ListField containing the spans represented as ``SpanFields``
                with respect to the document text.
            span_labels : ``SequenceLabelField``, optional
                The id of the cluster which each possible span belongs to, or -1 if it does
                not belong to a cluster. As these labels have variable length (it depends on
                how many spans we are considering), we represent this as a ``SequenceLabelField``
                with respect to the ``spans`` ``ListField``.
        """
        flattened_sentences = [
            self._normalize_word(word) for sentence in sentences
            for word in sentence
        ]

        metadata: Dict[str, Any] = {
            "original_text": flattened_sentences,
            "ID": document_id + ";" + str(sentence_id)
        }
        if gold_clusters is not None:
            metadata["clusters"] = gold_clusters
            metadata["num_gold_clusters"] = len(gold_clusters)

        text_field = TextField([Token(word) for word in flattened_sentences],
                               self._token_indexers)

        user_threshold_mod = int(
            1 / user_threshold
        ) if self._simulate_user_inputs and user_threshold > 0 else 0
        cluster_dict = {}
        simulated_user_cluster_dict = {}

        if gold_clusters is not None:
            for cluster_id, cluster in enumerate(gold_clusters):
                for i in range(len(cluster)):
                    # use modulo to have a relatively even distribution of user labels across length of document,
                    # (since clusters are sorted)--so user simulated clusters are spread evenly across document
                    if user_threshold_mod == 0 or i % user_threshold_mod != user_threshold_mod - 1:
                        cluster_dict[tuple(cluster[i])] = cluster_id
                    simulated_user_cluster_dict[tuple(cluster[i])] = cluster_id

        # Note simulated_user_cluster_dict encompasses ALL gold labels, including those in cluster_dict
        # Consequently user_labels encompasses all gold labels
        spans: List[Field] = []
        if gold_clusters is not None:
            span_labels: Optional[List[int]] = []
            user_labels: Optional[List[
                int]] = [] if self._simulate_user_inputs and user_threshold > 0 else None
        else:
            span_labels = user_labels = None

        # our must-link and cannot-link constraints, derived from user labels
        # using gold_clusters being None as an indicator of whether we're running training or not
        # TODO: confirm ^^
        must_link: Optional[
            List[int]] = [] if gold_clusters is not None else None
        cannot_link: Optional[
            List[int]] = [] if gold_clusters is not None else None

        sentence_offset = 0
        for sentence in sentences:
            for start, end in enumerate_spans(
                    sentence,
                    offset=sentence_offset,
                    max_span_width=self._max_span_width):
                if span_labels is not None:
                    if (start, end) in cluster_dict:
                        span_labels.append(cluster_dict[(start, end)])
                    else:
                        span_labels.append(-1)
                    if self._simulate_user_inputs and user_threshold > 0:
                        if (start, end) in simulated_user_cluster_dict:
                            user_labels.append(
                                simulated_user_cluster_dict[(start, end)])
                        else:
                            user_labels.append(-1)

                spans.append(SpanField(start, end, text_field))
            sentence_offset += len(sentence)

        span_field = ListField(spans)
        metadata_field = MetadataField(metadata)

        fields: Dict[str, Field] = {
            "text": text_field,
            "spans": span_field,
            "metadata": metadata_field
        }
        if span_labels is not None:
            fields["span_labels"] = SequenceLabelField(span_labels, span_field)
            if user_labels is not None:
                fields["user_labels"] = SequenceLabelField(
                    user_labels, span_field)

        return Instance(fields)
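
With `user_threshold > 0`, the reader above holds out roughly every `1 / user_threshold`-th gold mention per cluster as simulated user input and keeps the rest as ordinary gold labels. A small numeric sketch of that modulo split:

user_threshold = 0.25
user_threshold_mod = int(1 / user_threshold)
cluster = [(0, 1), (5, 5), (9, 10), (14, 14)]
held_out_for_user = [mention for i, mention in enumerate(cluster)
                     if i % user_threshold_mod == user_threshold_mod - 1]
gold_labelled = [mention for i, mention in enumerate(cluster)
                 if i % user_threshold_mod != user_threshold_mod - 1]
assert held_out_for_user == [(14, 14)] and len(gold_labelled) == 3
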
Example #29
    def text_to_instance(self, dialog: Dict, ignore_fact: bool = False):
        msg_texts = []
        msg_senders = []
        msg_likes = []
        msg_acts = []
        msg_act_mask = []
        msg_facts = []
        msg_fact_labels = []
        metadata_fact_labels = []
        if len(dialog['messages']) == 0:
            raise ValueError('There are no dialog messages')

        known_entities = [
            Token(text='ENTITY/' + t.replace(' ', '_'), idx=idx)
            for idx, t in enumerate(dialog['known_entities'])
        ]
        if len(known_entities) == 0:
            known_entities.append(Token(text='@@YOUKNOWNOTHING@@', idx=0))
        known_entities_field = TextField(known_entities,
                                         self._mention_indexers)

        focus_entity = dialog['focus_entity']
        focus_entity_field = TextField(
            [Token(text='ENTITY/' + focus_entity.replace(' ', '_'), idx=0)],
            self._mention_indexers)
        prev_msg = ''
        for msg in dialog['messages']:
            # Accumulate the dialog history, truncating it to roughly the last
            # DIALOG_MAX_LENGTH characters (dropping any partial leading word).
            if prev_msg == '':
                cur_message = msg['message']
            else:
                if len(prev_msg) > DIALOG_MAX_LENGTH:
                    prev_msg = ' '.join(
                        prev_msg[-DIALOG_MAX_LENGTH:].split(' ')[1:])
                cur_message = prev_msg + ' ' + msg['message']
            prev_msg = cur_message

            tokenized_msg = self._tokenizer.tokenize(cur_message)
            msg_texts.append(TextField(tokenized_msg, self._token_indexers))
            msg_senders.append(0 if msg['sender'] == USER else 1)
            msg_likes.append(
                LabelField('liked' if msg['liked'] else 'not_liked',
                           label_namespace='like_labels'))
            if msg['dialog_acts'] is None:
                dialog_acts = ['@@NODA@@']
                act_mask = 0
            else:
                dialog_acts = msg['dialog_acts']
                act_mask = 1
            dialog_acts_field = MultiLabelFieldListCompat(
                dialog_acts, label_namespace=DIALOG_ACT_LABELS)
            msg_acts.append(dialog_acts_field)
            msg_act_mask.append(act_mask)
            curr_facts_text = []
            curr_facts_labels = []
            curr_metadata_fact_labels = []
            if msg['sender'] == ASSISTANT:
                for idx, f in enumerate(msg['facts']):
                    if ignore_fact:
                        fact_text = 'dummy fact'
                    else:
                        fact = self._fact_lookup[f['fid']]
                        fact_text = fact.text
                    # TODO: These are already space tokenized
                    tokenized_fact = self._tokenizer.tokenize(fact_text)
                    # 99% of text length is 77
                    tokenized_fact = tokenized_fact[:DIALOG_MAX_LENGTH]
                    curr_facts_text.append(
                        TextField(tokenized_fact, self._token_indexers))
                    if f['used']:
                        curr_facts_labels.append(idx)
                        curr_metadata_fact_labels.append(idx)
            else:
                # Users don't have facts, but lets avoid divide by zero
                curr_facts_text.append(
                    TextField([Token(text='@@NOFACT@@', idx=0)],
                              self._token_indexers))

            msg_facts.append(ListField(curr_facts_text))
            # Add in a label if there are no correct indices
            if len(curr_facts_labels) == 0:
                curr_metadata_fact_labels.append(-1)
            n_facts = len(curr_facts_text)
            fact_label_arr = np.zeros(n_facts, dtype=np.float32)
            if len(curr_facts_labels) > 0:
                fact_label_arr[curr_facts_labels] = 1
            msg_fact_labels.append(ArrayField(fact_label_arr,
                                              dtype=np.float32))
            metadata_fact_labels.append(curr_metadata_fact_labels)

        return Instance({
            'messages':
            ListField(msg_texts),
            'facts':
            ListField(msg_facts),
            'fact_labels':
            ListField(msg_fact_labels),
            'likes':
            ListField(msg_likes),
            'dialog_acts':
            ListField(msg_acts),
            'dialog_acts_mask':
            to_long_field(msg_act_mask),
            'senders':
            to_long_field(msg_senders),
            'focus_entity':
            focus_entity_field,
            'known_entities':
            known_entities_field,
            'metadata':
            MetadataField({
                'dialog_id': dialog['dialog_id'],
                'n_message': len(msg_texts),
                'fact_labels': metadata_fact_labels,
                'known_entities': dialog['known_entities'],
                'focus_entity': dialog['focus_entity']
            })
        })
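
Each assistant message above gets a float array over its candidate facts, with a 1 at every fact marked as used. A standalone sketch of that encoding:

import numpy as np

facts = [{"used": False}, {"used": True}, {"used": True}]
fact_label_arr = np.zeros(len(facts), dtype=np.float32)
used_indices = [i for i, f in enumerate(facts) if f["used"]]
if used_indices:
    fact_label_arr[used_indices] = 1
assert fact_label_arr.tolist() == [0.0, 1.0, 1.0]
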
Example #30
    def __getitem__(self, index):

        if self.complete_shuffle:
            # Each item expands into several sub-examples, so recover which
            # sub-example this index refers to before mapping back to the item.
            if self.pretraining_include_qa_and_qar:
                which = index % 8
                index = index // 8
            else:
                which = index % 4
                index = index // 4
        else:
            which = None

        item = deepcopy(self.items[index])

        ###################################################################
        # Load questions and answers
        
        answer_choices = item['{}_choices'.format(self.mode)]

        if self.complete_shuffle and which < 4:
            only_use_answer = True
        else:
            only_use_answer = False

        if self.complete_shuffle and which >= 4:
            only_use_qar = True
        else:
            only_use_qar = False

        dets2use, old_det_to_new_ind = self._get_dets_to_use(item, only_use_answer = only_use_answer, only_use_qar = only_use_qar)

        # The only_use_qar flag is ambiguous...

        instance_dict = {}
        if self.split != 'test':
            instance_dict['label'] = LabelField(item['{}_label'.format(self.mode)], skip_indexing=True)
        instance_dict['metadata'] = MetadataField({'annot_id': item['annot_id'], 'ind': index, 'movie': item['movie'],
                                                   'img_fn': item['img_fn'],
                                                   'question_number': item['question_number']})

        ###################################################################
        # Load image now and rescale it. Might have to subtract the mean and whatnot here too.
        image = load_image(os.path.join(self.vcr_image_dir, item['img_fn']))
        #image = self.imagedatas(item['img_fn'])

        image, window, img_scale, padding = resize_image(image, random_pad=self.is_train)
        image = to_tensor_and_normalize(image)
        c, h, w = image.shape

        ###################################################################
        # Load boxes.
        with open(os.path.join(self.vcr_image_dir, item['metadata_fn']), 'r') as f:
            metadata = json.load(f)

        # [nobj, 14, 14]
        segms = np.stack([make_mask(mask_size=14, box=metadata['boxes'][i], polygons_list=metadata['segms'][i]) for i in dets2use])

        # Chop off the final dimension, that's the confidence
        boxes = np.array(metadata['boxes'])[dets2use, :-1]
        # Possibly rescale them if necessary
        boxes *= img_scale
        boxes[:, :2] += np.array(padding[:2])[None]
        boxes[:, 2:] += np.array(padding[:2])[None]
        obj_labels = [self.coco_obj_to_ind[item['objects'][i]] for i in dets2use.tolist()]
        if self.add_image_as_a_box:
            boxes = np.row_stack((window, boxes))
            segms = np.concatenate((np.ones((1, 14, 14), dtype=np.float32), segms), 0)
            obj_labels = [self.coco_obj_to_ind['__background__']] + obj_labels

        examples = data_iter_item(item, tokenizer=self.tokenizer,
                                            max_seq_length=self.max_seq_length,
                                            endingonly=False,
                                            include_qar = self.pretraining_include_qa_and_qar,
                                            only_qar = self.only_qar)
        self.getitem_bert_part(examples, item, instance_dict, which)

        if self.use_alignment: # Alignment between objects and text
            ######################
            examples_alginment_pack = []
            for i in range(len(examples)):
                if self.pretraining_include_qa_and_qar:
                    if i < 4:
                        raw_text_a = item["question"]
                        raw_text_b = item['answer_choices'][i]
                    else:
                        raw_text_a = item["question"] + item['answer_choices'][item['answer_label']]
                        raw_text_b = item['rationale_choices'][i - 4]
                elif self.only_qar:
                    raw_text_a = item["question"] + item['answer_choices'][item['answer_label']] # This is the correct alignment right now.
                    raw_text_b = item['rationale_choices'][i]
                else:
                    raw_text_a = item["question"]
                    raw_text_b = item['answer_choices'][i]

                true_text_a = examples[i][0].text_a
                true_text_b = examples[i][0].text_b
                text_alignment_a = examples[i][1]
                text_alignment_b = examples[i][2]

                examples_alginment_pack.append((raw_text_a, raw_text_b, true_text_a, true_text_b, text_alignment_a, text_alignment_b))

            image_box_position = []

            if which is not None:
                raw_text_a, raw_text_b, true_text_a, true_text_b, text_alignment_a, text_alignment_b = examples_alginment_pack[which]
                box_record = defaultdict(list)
                self.get_alignment_original(raw_text_a, text_alignment_a, old_det_to_new_ind, box_record, offset = 1)
                self.get_alignment_original(raw_text_b, text_alignment_b, old_det_to_new_ind, box_record, offset = 1 + len(text_alignment_a) + 1)
                image_text_alignment = ListField([IntArrayField(np.array(box_record[i]), padding_value = -1) for i in range(len(boxes))])
            else:
                for raw_text_a, raw_text_b, true_text_a, true_text_b, text_alignment_a, text_alignment_b in examples_alginment_pack:

                    box_record = defaultdict(list)
                    self.get_alignment_original(raw_text_a, text_alignment_a, old_det_to_new_ind, box_record, offset = 1)
                    self.get_alignment_original(raw_text_b, text_alignment_b, old_det_to_new_ind, box_record, offset = 1 + len(text_alignment_a) + 1)

                    image_box_position.append(ListField([IntArrayField(np.array(box_record[i]), padding_value = -1) for i in range(len(boxes))]))

                image_text_alignment = ListField(image_box_position)
            ######################

            instance_dict["image_text_alignment"] = image_text_alignment

        instance_dict['segms'] = ArrayField(segms, padding_value=0)
        instance_dict['objects'] = ListField([LabelField(x, skip_indexing=True) for x in obj_labels])

        if not np.all((boxes[:, 0] >= 0.) & (boxes[:, 0] < boxes[:, 2])):
            import ipdb
            ipdb.set_trace()
        assert np.all((boxes[:, 1] >= 0.) & (boxes[:, 1] < boxes[:, 3]))
        assert np.all((boxes[:, 2] <= w))
        assert np.all((boxes[:, 3] <= h))
        instance_dict['boxes'] = ArrayField(boxes, padding_value=-1)

        instance = Instance(instance_dict)
        instance.index_fields(self.vocab)
        return image, instance
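
The box handling above rescales detector boxes with the resized image, shifts them by the padding offset, and then sanity-checks them against the padded width and height. A standalone numeric sketch; the padding layout `(left, top, ...)` is assumed here for illustration:

import numpy as np

boxes = np.array([[10., 20., 50., 80.]])   # one box as (x1, y1, x2, y2)
img_scale = 2.0
padding = (5, 7, 0, 0)                     # assumed (left, top, right, bottom) layout

boxes *= img_scale
boxes[:, :2] += np.array(padding[:2])[None]
boxes[:, 2:] += np.array(padding[:2])[None]
assert np.all((boxes[:, 0] >= 0.) & (boxes[:, 0] < boxes[:, 2]))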