Example #1
0
    def _read(self, file_path):
        # if `file_path` is a URL, redirect to the cache
        file_path = cached_path(file_path)

        if file_path.endswith("zip"):
            archive = zipfile.ZipFile(file_path, "r")
            data_file = archive.open(os.path.basename(file_path)[:-4])
        else:
            data_file = open(file_path, "r")

        logger.info("Reading instances from lines in file at: %s", file_path)

        dialogs = json.load(data_file)

        for dial_name in dialogs:
            dialog = dialogs[dial_name]["log"]
            for turn in dialog:
                tokens = turn["text"].split()
                spans = turn["span_info"]
                tags = []
                domain = "None"
                intent = "None"
                for i in range(len(tokens)):
                    for span in spans:
                        if i == span[3]:
                            new_domain, new_intent = span[0].split("-", 1)
                            if domain == "None":
                                domain = new_domain
                            elif domain != new_domain:
                                continue
                            if intent == "None":
                                intent = new_intent
                            elif intent != new_intent:
                                continue
                            tags.append("B-" + span[1])
                            break
                        if i > span[3] and i <= span[4]:
                            new_domain, new_intent = span[0].split("-", 1)
                            if domain != new_domain:
                                continue
                            if intent != new_intent:
                                continue
                            tags.append("I-" + span[1])
                            break
                    else:
                        tags.append("O")

                if domain != "None":
                    assert intent != "None", "intent must not be None when domain is not None"
                elif turn["dialog_act"] != {}:
                    assert intent == "None", "intent must be None when domain is None"
                    di = list(turn["dialog_act"].keys())[0]
                    dai = turn["dialog_act"][di][0]
                    domain = di.split("-")[0]
                    intent = di.split("-", 1)[-1] + "+" + dai[0] + "*" + dai[1]

                dialog_act = {}
                for dacts in turn["span_info"]:
                    if dacts[0] not in dialog_act:
                        dialog_act[dacts[0]] = []
                    dialog_act[dacts[0]].append(
                        [dacts[1], " ".join(tokens[dacts[3]:dacts[4] + 1])])

                for dacts in turn["dialog_act"]:
                    for dact in turn["dialog_act"][dacts]:
                        if dacts not in dialog_act:
                            dialog_act[dacts] = turn["dialog_act"][dacts]
                            break
                        elif dact[0] not in [
                                sv[0] for sv in dialog_act[dacts]
                        ]:
                            dialog_act[dacts].append(dact)

                tokens = [Token(token) for token in tokens]

                # yield self.text_to_instance(tokens, tags, domain, intent, turn["dialog_act"])
                yield self.text_to_instance(tokens, tags, domain, intent,
                                            dialog_act)
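
For readers unfamiliar with the MultiWOZ-style span_info layout, here is a minimal, self-contained sketch of the BIO-tagging loop above with the domain/intent bookkeeping stripped out. The entry format [dialog_act, slot, value, start_token, end_token] with inclusive indices is inferred from the span[3]/span[4] indexing above, and the helper name spans_to_bio is hypothetical, not part of the reader.

def spans_to_bio(tokens, spans):
    """Assign B-/I-/O tags to tokens from [act, slot, value, start, end] spans."""
    tags = []
    for i in range(len(tokens)):
        for span in spans:
            if i == span[3]:                    # first token of a slot span
                tags.append("B-" + span[1])
                break
            if span[3] < i <= span[4]:          # inside a slot span
                tags.append("I-" + span[1])
                break
        else:                                   # no span covers this token
            tags.append("O")
    return tags

tokens = "i want a cheap hotel in the north".split()
spans = [["Hotel-Inform", "Price", "cheap", 3, 3],
         ["Hotel-Inform", "Area", "north", 7, 7]]
print(spans_to_bio(tokens, spans))
# ['O', 'O', 'O', 'B-Price', 'O', 'O', 'O', 'B-Area']
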
Example #2
0
    def text_to_instance(  # type: ignore
        self, tokens: List[Token], verb_label: List[int], img, tags: List[str] = None
    ) -> Instance:
        """
        We take `pre-tokenized` input here, along with a verb label.  The verb label should be a
        one-hot binary vector, the same length as the tokens, indicating the position of the verb
        to find arguments for.  ``img`` is a dict of visual features (``features``, ``boxes``,
        ``num_boxes``, ``img_w``, ``img_h``) whose box-augmented features end up in the
        ``img_emb`` ArrayField.
        """

        metadata_dict: Dict[str, Any] = {}
        if self.bert_tokenizer is not None:
            wordpieces, offsets, start_offsets = self._wordpiece_tokenize_input(
                [t.text for t in tokens]
            )
            new_verbs = _convert_verb_indices_to_wordpiece_indices(verb_label, offsets)
            metadata_dict["offsets"] = start_offsets
            # In order to override the indexing mechanism, we need to set the `text_id`
            # attribute directly. This causes the indexing to use this id.
            text_field = TextField(
                [Token(t, text_id=self.bert_tokenizer.vocab[t]) for t in wordpieces],
                token_indexers=self._token_indexers,
            )
            verb_indicator = SequenceLabelField(new_verbs, text_field)

        else:
            text_field = TextField(tokens, token_indexers=self._token_indexers)
            verb_indicator = SequenceLabelField(verb_label, text_field)

        # The image features are needed regardless of which tokenizer branch was
        # taken above, so process them here rather than only in the non-BERT branch
        # (where they were previously unreachable for the BERT case).
        img_feats = img['features'].copy()
        img_boxes = img['boxes'].copy()
        obj_num = img['num_boxes']
        assert len(img_feats) == len(img_boxes) == obj_num

        # Normalize the boxes to 0 ~ 1
        img_boxes[:, (0, 2)] /= img['img_w']
        img_boxes[:, (1, 3)] /= img['img_h']
        np.testing.assert_array_less(img_boxes, 1 + 1e-5)
        np.testing.assert_array_less(-img_boxes, 0 + 1e-5)

        # Concatenate the box coordinates onto each object's feature vector.
        img_concat = np.hstack((img_feats, img_boxes))
        img_field = ArrayField(img_concat)

        fields: Dict[str, Field] = {}
        fields["tokens"] = text_field
        fields["verb_indicator"] = verb_indicator
        fields["img_emb"] = img_field

        if all([x == 0 for x in verb_label]):
            verb = None
            verb_index = None
        else:
            verb_index = verb_label.index(1)
            verb = tokens[verb_index].text

        metadata_dict["words"] = [x.text for x in tokens]
        metadata_dict["verb"] = verb
        metadata_dict["verb_index"] = verb_index

        if tags:
            if self.bert_tokenizer is not None:
                new_tags = _convert_tags_to_wordpiece_tags(tags, offsets)
                fields["tags"] = SequenceLabelField(new_tags, text_field)
            else:
                fields["tags"] = SequenceLabelField(tags, text_field)
            metadata_dict["gold_tags"] = tags

        fields["metadata"] = MetadataField(metadata_dict)

        return Instance(fields)
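
To make the verb_label convention in the docstring above concrete, here is a tiny standalone illustration of the one-hot indicator format and of how the predicate and its index are recovered, mirroring the check near the end of the method; the example sentence is invented.

tokens = ["The", "cat", "chased", "the", "mouse"]
verb_label = [0, 0, 1, 0, 0]            # one-hot indicator for the predicate "chased"

if all(x == 0 for x in verb_label):     # no predicate marked in this sentence
    verb, verb_index = None, None
else:
    verb_index = verb_label.index(1)
    verb = tokens[verb_index]

print(verb_index, verb)                 # 2 chased
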
Example #3
0
    def text_to_instance(
            self,  # type: ignore
            sentence_tokens: List[str],
            predicates: List[int],
            predicate_index: int,
            constits: List[List[str]] = None,
            parents: List[List[str]] = None) -> Instance:
        """
        We take `pre-tokenized` input here, along with a verb label.  The verb label should be a
        one-hot binary vector, the same length as the tokens, indicating the position of the verb
        to find arguments for.  Optional per-token constituent and parent label matrices (one row
        per token, ``max_span_width`` columns) can also be passed to create gold span labels.
        """
        # pylint: disable=arguments-differ
        text_field = TextField([Token(t) for t in sentence_tokens],
                               token_indexers=self._token_indexers)
        verb_field = SequenceLabelField(predicates, text_field)
        predicate_field = IndexField(predicate_index, text_field)

        # Span-based output fields.
        span_starts: List[Field] = []
        span_ends: List[Field] = []
        span_mask: List[int] = [
            1 for _ in range(len(sentence_tokens) * self.max_span_width)
        ]
        span_labels: Optional[List[str]] = [] if constits is not None else None
        parent_labels: Optional[
            List[str]] = [] if parents is not None else None

        for j in range(len(sentence_tokens)):
            for diff in range(self.max_span_width):
                width = diff
                if j - diff < 0:
                    # This is an invalid span.
                    span_mask[j * self.max_span_width + diff] = 0
                    width = j

                span_starts.append(IndexField(j - width, text_field))
                span_ends.append(IndexField(j, text_field))

                if constits is not None:
                    label = constits[j][diff]
                    span_labels.append(label)

                if parents is not None:
                    parent_labels.append(parents[j][diff])

        start_fields = ListField(span_starts)
        end_fields = ListField(span_ends)
        span_mask_fields = SequenceLabelField(span_mask, start_fields)

        fields: Dict[str, Field] = {
            "tokens": text_field,
            "targets": verb_field,
            "span_starts": start_fields,
            "span_ends": end_fields,
            "span_mask": span_mask_fields,
            "target_index": predicate_field
        }

        if constits is not None:
            fields['tags'] = SequenceLabelField(
                span_labels,
                start_fields,
                label_namespace=self.label_namespace)
        if parents is not None:
            fields['parent_tags'] = SequenceLabelField(
                parent_labels,
                start_fields,
                label_namespace=self.parent_label_namespace)
        return Instance(fields)
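
The span enumeration above stores one candidate span per (end token j, width offset diff) pair, laid out at position j * max_span_width + diff, and masks spans that would start before the sentence. A small standalone sketch of that layout under the same convention; enumerate_fixed_width_spans is a hypothetical name.

def enumerate_fixed_width_spans(num_tokens, max_span_width):
    """Return (start, end, valid) triples laid out as j * max_span_width + diff."""
    spans = []
    for j in range(num_tokens):
        for diff in range(max_span_width):
            width = diff
            valid = True
            if j - diff < 0:             # span would start before token 0
                valid = False
                width = j                # clip to a valid placeholder span
            spans.append((j - width, j, valid))
    return spans

print(enumerate_fixed_width_spans(3, 2))
# [(0, 0, True), (0, 0, False), (1, 1, True), (0, 1, True), (2, 2, True), (1, 2, True)]
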
Example #4
0
    def _process_sentence(
            self, sentence_tokens: List[str],
            constits: Dict[Tuple[int, int], str], verbal_predicates: List[int],
            predicate_argument_labels: List[List[str]]) -> List[Instance]:
        """
        Parameters
        ----------
        sentence_tokens : ``List[str]``, required.
            The tokenised sentence.
        constits : ``Dict[Tuple[int, int], str]``, required.
            A dictionary mapping (start, end) token spans to constituent labels.
        verbal_predicates : ``List[int]``, required.
            The indexes of the verbal predicates in the
            sentence which have an associated annotation.
        predicate_argument_labels : ``List[List[str]]``, required.
            A list of predicate argument labels, one for each verbal_predicate. The
            internal lists are of length: len(sentence).

        Returns
        -------
        A list of Instances.

        """
        default = "*"

        def get_new_label(original: str, newer: str):
            return newer if original == default else "{}|{}".format(
                newer, original)

        constit_matrix = [[default for _ in range(self.max_span_width)]
                          for _ in sentence_tokens]
        for span in constits:
            start, end = span
            diff = end - start
            if diff >= self.max_span_width:
                continue
            constit_matrix[end][diff] = get_new_label(
                constit_matrix[end][diff], constits[span])

        tokens = [Token(t) for t in sentence_tokens]
        if not verbal_predicates:
            # Sentence contains no predicates.
            tags = ["O" for _ in sentence_tokens]
            verb_label = [0 for _ in sentence_tokens]
            srl_args = self._convert_bio_into_matrix(tags)
            dummy_verb_index = 0
            return [
                self.text_to_instance(tokens, verb_label, dummy_verb_index,
                                      constit_matrix, srl_args)
            ]
        else:
            instances = []

            for verb_index, tags in zip(verbal_predicates,
                                        predicate_argument_labels):
                verb_label = [0 for _ in sentence_tokens]
                verb_label[verb_index] = 1
                srl_args = self._convert_bio_into_matrix(tags)
                instances.append(
                    self.text_to_instance(tokens, verb_label, verb_index,
                                          constit_matrix, srl_args))
                self.find_overlap(srl_args, constit_matrix)
            return instances
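
The constituent dictionary is unrolled above into a num_tokens x max_span_width matrix indexed by span end and width, joining labels with '|' when two spans land in the same cell and skipping spans wider than max_span_width. A minimal sketch of that conversion with plain data; build_constit_matrix is a hypothetical name.

def build_constit_matrix(num_tokens, constits, max_span_width, default="*"):
    """constits maps (start, end) token spans to labels; rows are end tokens, columns are widths."""
    matrix = [[default for _ in range(max_span_width)] for _ in range(num_tokens)]
    for (start, end), label in constits.items():
        diff = end - start
        if diff >= max_span_width:       # span is too wide to represent; skip it
            continue
        old = matrix[end][diff]
        matrix[end][diff] = label if old == default else "{}|{}".format(label, old)
    return matrix

print(build_constit_matrix(4, {(0, 1): "NP", (2, 3): "VP", (0, 3): "S"}, 3))
# [['*', '*', '*'], ['*', 'NP', '*'], ['*', '*', '*'], ['*', 'VP', '*']]
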
Example #5
0
    def text_to_instance(
            self,  # type: ignore
            sentences: List[List[str]],
            document_id: str,
            sentence_id: int,
            gold_clusters: Optional[List[List[Tuple[int, int]]]] = None,
            user_threshold: Optional[float] = 0.0) -> Instance:
        # pylint: disable=arguments-differ
        """
        Parameters
        ----------
        sentences : ``List[List[str]]``, required.
            A list of lists representing the tokenised words and sentences in the document.
        document_id : ``str``, required.
            A string representing the document ID.
        sentence_id : ``int``, required.
            An int representing the sentence ID.
        gold_clusters : ``Optional[List[List[Tuple[int, int]]]]``, optional (default = None)
            A list of all clusters in the document, represented as word spans. Each cluster
            contains some number of spans, which can be nested and overlap, but will never
            exactly match between clusters.
        user_threshold: ``Optional[float]``, optional (default = 0.0)
            Approximate fraction of gold labels to hold out as simulated user input,
            e.g. 0.5, 0.33, 0.25, 0.125.

        Returns
        -------
        An ``Instance`` containing the following ``Fields``:
            text : ``TextField``
                The text of the full document.
            spans : ``ListField[SpanField]``
                A ListField containing the spans represented as ``SpanFields``
                with respect to the document text.
            span_labels : ``SequenceLabelField``, optional
                The id of the cluster which each possible span belongs to, or -1 if it does
                not belong to a cluster. As these labels have variable length (it depends on
                how many spans we are considering), we represent this as a ``SequenceLabelField``
                with respect to the ``spans`` ``ListField``.
        """
        flattened_sentences = [
            self._normalize_word(word) for sentence in sentences
            for word in sentence
        ]

        metadata: Dict[str, Any] = {
            "original_text": flattened_sentences,
            "ID": document_id + ";" + str(sentence_id)
        }
        if gold_clusters is not None:
            metadata["clusters"] = gold_clusters
            metadata["num_gold_clusters"] = len(gold_clusters)

        text_field = TextField([Token(word) for word in flattened_sentences],
                               self._token_indexers)

        user_threshold_mod = int(
            1 / user_threshold
        ) if self._simulate_user_inputs and user_threshold > 0 else 0
        cluster_dict = {}
        simulated_user_cluster_dict = {}

        if gold_clusters is not None:
            for cluster_id, cluster in enumerate(gold_clusters):
                for i in range(len(cluster)):
                    # use modulo to have a relatively even distribution of user labels across length of document,
                    # (since clusters are sorted)--so user simulated clusters are spread evenly across document
                    if user_threshold_mod == 0 or i % user_threshold_mod != user_threshold_mod - 1:
                        cluster_dict[tuple(cluster[i])] = cluster_id
                    simulated_user_cluster_dict[tuple(cluster[i])] = cluster_id

        # Note simulated_user_cluster_dict encompasses ALL gold labels, including those in cluster_dict
        # Consequently user_labels encompasses all gold labels
        spans: List[Field] = []
        if gold_clusters is not None:
            span_labels: Optional[List[int]] = []
            user_labels: Optional[List[
                int]] = [] if self._simulate_user_inputs and user_threshold > 0 else None
        else:
            span_labels = user_labels = None

        # our must-link and cannot-link constraints, derived from user labels
        # using gold_clusters being None as an indicator of whether we're running training or not
        # TODO: confirm ^^
        must_link: Optional[
            List[int]] = [] if gold_clusters is not None else None
        cannot_link: Optional[
            List[int]] = [] if gold_clusters is not None else None

        sentence_offset = 0
        for sentence in sentences:
            for start, end in enumerate_spans(
                    sentence,
                    offset=sentence_offset,
                    max_span_width=self._max_span_width):
                if span_labels is not None:
                    if (start, end) in cluster_dict:
                        span_labels.append(cluster_dict[(start, end)])
                    else:
                        span_labels.append(-1)
                    if self._simulate_user_inputs and user_threshold > 0:
                        if (start, end) in simulated_user_cluster_dict:
                            user_labels.append(
                                simulated_user_cluster_dict[(start, end)])
                        else:
                            user_labels.append(-1)

                spans.append(SpanField(start, end, text_field))
            sentence_offset += len(sentence)

        span_field = ListField(spans)
        metadata_field = MetadataField(metadata)

        fields: Dict[str, Field] = {
            "text": text_field,
            "spans": span_field,
            "metadata": metadata_field
        }
        if span_labels is not None:
            fields["span_labels"] = SequenceLabelField(span_labels, span_field)
            if user_labels is not None:
                fields["user_labels"] = SequenceLabelField(
                    user_labels, span_field)

        return Instance(fields)
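
The modulo test above decides which gold mentions stay visible to the model and which are held out as simulated user input (simulated_user_cluster_dict keeps all of them either way). A small sketch of just that split, assuming user_threshold is the approximate fraction to hold out; split_gold_mentions is a hypothetical name.

def split_gold_mentions(cluster, user_threshold):
    """Return (model_visible, user_only) mention lists for one gold cluster."""
    mod = int(1 / user_threshold) if user_threshold > 0 else 0
    model_visible, user_only = [], []
    for i, mention in enumerate(cluster):
        # every mod-th mention is withheld from the model and kept as user input
        if mod == 0 or i % mod != mod - 1:
            model_visible.append(mention)
        else:
            user_only.append(mention)
    return model_visible, user_only

cluster = [(0, 1), (5, 6), (9, 9), (14, 15)]
print(split_gold_mentions(cluster, user_threshold=0.25))
# ([(0, 1), (5, 6), (9, 9)], [(14, 15)])
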
Example #6
0
    def text_to_instance(
            self,  # type: ignore
            query: List[str],
            derived_cols: List[Tuple[str, str]],
            derived_tables: List[str],
            prelinked_entities: Dict[str, Dict[str, str]] = None,
            sql: List[str] = None,
            alignment: List[str] = None) -> Instance:
        # pylint: disable=arguments-differ
        fields: Dict[str, Field] = {}
        tokens = TextField([Token(t) for t in query], self._token_indexers)
        fields["tokens"] = tokens

        if sql is not None:
            action_sequence, all_actions = self._world.get_action_sequence_and_all_actions(
                query=sql,
                derived_cols=derived_cols,
                derived_tables=derived_tables,
                prelinked_entities=prelinked_entities)
            if action_sequence is None:
                return None

            if alignment is not None:
                # Modify the alignment according to the action sequence
                alignment = AttnSupGrammarBasedWorld.modify_alignment(
                    action_sequence=action_sequence, alignment=alignment)
            else:
                # having a list of NO_ALIGN is basically equivalent to mask all the alignment
                alignment = ['NO_ALIGN'] * len(action_sequence)

        # NOTE: the rest of this method assumes ``sql`` was provided above, since
        # ``all_actions`` and ``action_sequence`` are only defined in that branch.
        index_fields: List[Field] = []
        production_rule_fields: List[Field] = []

        for production_rule in all_actions:
            nonterminal, _ = production_rule.split(' ->')
            field = ProductionRuleField(
                production_rule,
                self._world.is_global_rule(nonterminal),
                nonterminal=nonterminal)
            production_rule_fields.append(field)

        valid_actions_field = ListField(production_rule_fields)
        fields["valid_actions"] = valid_actions_field

        action_map = {
            action.rule: i  # type: ignore
            for i, action in enumerate(valid_actions_field.field_list)
        }

        for production_rule in action_sequence:
            index_fields.append(
                IndexField(action_map[production_rule], valid_actions_field))
        if not action_sequence:
            index_fields = [IndexField(-1, valid_actions_field)]
        # if not action_sequence and re.findall(r"COUNT \( \* \) (?:<|>|<>|=) 0", " ".join(sql)):
        #     index_fields = [IndexField(-2, valid_actions_field)]

        action_sequence_field = ListField(index_fields)
        fields["action_sequence"] = action_sequence_field

        alignment_index_fields: List[IndexField] = []
        tmp_tokens_as_strings = [t.text for t in tokens]
        for aligned_token in alignment:
            try:
                aligned_token_index = int(
                    tmp_tokens_as_strings.index(aligned_token))
                alignment_index_fields.append(
                    IndexField(aligned_token_index, tokens))
            except ValueError:
                # a special "no alignment" index
                alignment_index_fields.append(
                    IndexField(-1, tokens.empty_field()))
        fields["alignment_sequence"] = ListField(alignment_index_fields)

        return Instance(fields)
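
Stripped of the AllenNLP field types, the alignment bookkeeping at the end of the method above maps each aligned token to its position among the question tokens, with -1 standing in for NO_ALIGN or any token that cannot be found. A plain-Python sketch; alignment_to_indices is a hypothetical name.

def alignment_to_indices(question_tokens, alignment):
    """Map each aligned token to its index in the question, or -1 if absent."""
    indices = []
    for aligned_token in alignment:
        try:
            indices.append(question_tokens.index(aligned_token))
        except ValueError:               # covers 'NO_ALIGN' and unmatched tokens
            indices.append(-1)
    return indices

question = ["show", "flights", "from", "denver"]
print(alignment_to_indices(question, ["NO_ALIGN", "flights", "denver"]))   # [-1, 1, 3]
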
Example #7
0
 def _read_tokens_from_json_list(json_list) -> List[Token]:
     return [
         Token(text=json_obj['text'], lemma=json_obj['lemma'])
         for json_obj in json_list
     ]
Example #8
0
    def text_to_instance(
            self,  # type: ignore
            rule_text: str,
            question: str,
            scenario: str,
            history: List[Dict[str, str]],
            utterance_id: str = None,
            tree_id: str = None,
            source_url: str = None,
            answer: str = None,
            evidence: List[Dict[str, str]] = None) -> Optional[Instance]:
        """
        Turn a rule text, question, scenario and dialogue history into an ``Instance``
        containing both the CopyNet source/target fields and the BERT input fields.

        Parameters
        ----------
        rule_text : ``str``, required
        question : ``str``, required
        scenario : ``str``, required
        history : ``List[Dict[str, str]]``, required
        utterance_id : ``str``, optional (default = None)
        tree_id : ``str``, optional (default = None)
        source_url : ``str``, optional (default = None)
        answer : ``str``, optional (default = None)
        evidence : ``List[Dict[str, str]]``, optional (default = None)

        Returns
        -------
        Instance
            An ``Instance`` with the source/target token fields, BERT input fields,
            token-id arrays, label and metadata built below.
        """

        # For CopyNet Model
        source_string = rule_text + ' [SEP]'
        target_string = answer

        # pylint: disable=arguments-differ
        tokenized_source = self._source_tokenizer.tokenize(source_string)
        tokenized_source.insert(0, Token(START_SYMBOL))
        # tokenized_source.append(Token(END_SYMBOL))  -- not needed: ' @@SEP@@' acts as the end symbol
        source_field = TextField(tokenized_source, self._source_token_indexers)

        # For each token in the source sentence, we keep track of the matching token
        # in the target sentence (which will be the OOV symbol if there is no match).
        source_to_target_field = NamespaceSwappingField(
            tokenized_source[1:-1], self._target_namespace)

        meta_fields = {
            "source_tokens": [x.text for x in tokenized_source[1:-1]]
        }
        fields_dict = {
            "source_tokens": source_field,
            "source_to_target": source_to_target_field,
        }

        # For Bert model
        passage_text1 = rule_text + ' [SEP]'
        question_text1 = question

        passage_text2 = rule_text + ' [SEP]'
        question_text2 = scenario

        bert_input1 = passage_text1 + ' ' + question_text1
        bert_input2 = passage_text2 + ' ' + question_text2

        bert_input_tokens1 = self.get_tokens_with_history_encoding(
            bert_input1, history)
        bert_input_tokens2 = self._bert_tokenizer.tokenize(bert_input2)
        bert_input_tokens1.insert(0, Token(START_SYMBOL))
        bert_input_tokens2.insert(0, Token(START_SYMBOL))
        fields_dict['bert_input1'] = TextField(bert_input_tokens1,
                                               self._bert_token_indexers)
        fields_dict['bert_input2'] = TextField(bert_input_tokens2,
                                               self._bert_token_indexers)
        meta_fields['passage_tokens1'] = self._bert_tokenizer.tokenize(
            passage_text1)
        meta_fields['passage_tokens2'] = self._bert_tokenizer.tokenize(
            passage_text2)

        if target_string is not None:
            tokenized_target = self._target_tokenizer.tokenize(target_string)
            tokenized_target.insert(0, Token(START_SYMBOL))
            tokenized_target.append(Token(END_SYMBOL))
            target_field = TextField(tokenized_target,
                                     self._target_token_indexers)

            fields_dict["target_tokens"] = target_field
            meta_fields["target_tokens"] = [
                y.text for y in tokenized_target[1:-1]
            ]
            source_and_target_token_ids = self._tokens_to_ids(
                tokenized_source[1:-1] + tokenized_target)
            source_token_ids = source_and_target_token_ids[:len(tokenized_source) - 2]
            fields_dict["source_token_ids"] = ArrayField(
                np.array(source_token_ids))
            target_token_ids = source_and_target_token_ids[len(tokenized_source) - 2:]
            fields_dict["target_token_ids"] = ArrayField(
                np.array(target_token_ids))

            action = 'More' if answer not in ['Yes', 'No', 'Irrelevant'] else answer
            fields_dict['label'] = LabelField(action)
        else:
            source_token_ids = self._tokens_to_ids(tokenized_source[1:-1])
            fields_dict["source_token_ids"] = ArrayField(
                np.array(source_token_ids))

        meta_fields['rule_text'] = rule_text
        meta_fields['question'] = question
        meta_fields['scenario'] = scenario
        meta_fields['history'] = history
        fields_dict["metadata"] = MetadataField(meta_fields)

        return Instance(fields_dict)
Example #9
0
    def text_to_instance(
        self,  # type: ignore
        tokens: List[str],
        pos_tags: List[str] = None,
        gold_tree: Tree = None,
    ) -> Instance:
        """
        We take `pre-tokenized` input here, because we don't have a tokenizer in this class.

        # Parameters

        tokens : ``List[str]``, required.
            The tokens in a given sentence.
        pos_tags : ``List[str]``, optional, (default = None).
            The POS tags for the words in the sentence.
        gold_tree : ``Tree``, optional (default = None).
            The gold parse tree to create span labels from.

        # Returns

        An ``Instance`` containing the following fields:
            tokens : ``TextField``
                The tokens in the sentence.
            pos_tags : ``SequenceLabelField``
                The POS tags of the words in the sentence.
                Only returned if ``use_pos_tags`` is ``True``
            spans : ``ListField[SpanField]``
                A ListField containing all possible subspans of the
                sentence.
            span_labels : ``SequenceLabelField``, optional.
                The constituency tags for each of the possible spans, with
                respect to a gold parse tree. If a span is not contained
                within the tree, a span will have a ``NO-LABEL`` label.
            gold_tree : ``MetadataField(Tree)``
                The gold NLTK parse tree for use in evaluation.
        """

        if self._convert_parentheses:
            tokens = [PTB_PARENTHESES.get(token, token) for token in tokens]
        text_field = TextField([Token(x) for x in tokens], token_indexers=self._token_indexers)
        fields: Dict[str, Field] = {"tokens": text_field}

        pos_namespace = self._label_namespace_prefix + self._pos_label_namespace
        if self._use_pos_tags and pos_tags is not None:
            pos_tag_field = SequenceLabelField(pos_tags, text_field, label_namespace=pos_namespace)
            fields["pos_tags"] = pos_tag_field
        elif self._use_pos_tags:
            raise ConfigurationError(
                "use_pos_tags was set to True but no gold pos"
                " tags were passed to the dataset reader."
            )
        spans: List[Field] = []
        gold_labels = []

        if gold_tree is not None:
            gold_spans: Dict[Tuple[int, int], str] = {}
            self._get_gold_spans(gold_tree, 0, gold_spans)

        else:
            gold_spans = None
        for start, end in enumerate_spans(tokens):
            spans.append(SpanField(start, end, text_field))

            if gold_spans is not None:
                gold_labels.append(gold_spans.get((start, end), "NO-LABEL"))

        metadata = {"tokens": tokens}
        if gold_tree:
            metadata["gold_tree"] = gold_tree
        if self._use_pos_tags:
            metadata["pos_tags"] = pos_tags

        fields["metadata"] = MetadataField(metadata)

        span_list_field: ListField = ListField(spans)
        fields["spans"] = span_list_field
        if gold_tree is not None:
            fields["span_labels"] = SequenceLabelField(
                gold_labels,
                span_list_field,
                label_namespace=self._label_namespace_prefix + "labels",
            )
        return Instance(fields)
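
Every candidate span above either takes its constituent label from the gold tree or falls back to "NO-LABEL". A self-contained sketch of that pairing, using a plain double loop over inclusive (start, end) pairs in place of AllenNLP's enumerate_spans; span_labels_for_sentence is a hypothetical name.

def span_labels_for_sentence(tokens, gold_spans):
    """Label every (start, end) span, inclusive indices, from a gold-span dict."""
    labels = []
    for start in range(len(tokens)):
        for end in range(start, len(tokens)):
            labels.append(((start, end), gold_spans.get((start, end), "NO-LABEL")))
    return labels

tokens = ["The", "dog", "barks"]
gold_spans = {(0, 1): "NP", (2, 2): "VP", (0, 2): "S"}
for span, label in span_labels_for_sentence(tokens, gold_spans):
    print(span, label)
# (0, 0) NO-LABEL, (0, 1) NP, (0, 2) S, (1, 1) NO-LABEL, (1, 2) NO-LABEL, (2, 2) VP
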
Example #10
0
    def text_to_instance(
            self,
            source_string: str,
            target_string: str = None) -> Instance:  # type: ignore
        """
        Turn raw source string and target string into an `Instance`.

        Parameters
        ----------
        source_string : ``str``, required
        target_string : ``str``, optional

        Returns
        -------
        Instance
            An Instance containing at least the following fields:

            - `source_tokens`: a `TextField` containing the tokenized source sentence,
               including the `START_SYMBOL` and `END_SYMBOL`.
               This will result in a tensor of shape `(batch_size, source_length)`.

            - `source_token_ids`: an `ArrayField` of size `(batch_size, trimmed_source_length)`
              that contains an ID for each token in the source sentence. Tokens that
              match at the lowercase level will share the same ID. If `target_tokens`
              is passed as well, these IDs will also correspond to the `target_token_ids`
              field, i.e. any tokens that match at the lowercase level in both
              the source and target sentences will share the same ID. Note that these IDs
              have no correlation with the token indices from the corresponding
              vocabulary namespaces.

            - `source_to_target`: a `CopyMapField` that keeps track of the index
              of the target token that matches each token in the source sentence.
              When there is no matching target token, the OOV index is used.
              This will result in a tensor of shape `(batch_size, trimmed_source_length)`.

            - `metadata`: a `MetadataField` which contains the source tokens and
              potentially target tokens as lists of strings.

            When `target_string` is passed, the instance will also contain these fields:

            - `target_tokens`: a `TextField` containing the tokenized target sentence,
              including the `START_SYMBOL` and `END_SYMBOL`. This will result in
              a tensor of shape `(batch_size, target_length)`.

            - `target_token_ids`: an `ArrayField` of size `(batch_size, target_length)`.
              This is calculated in the same way as `source_token_ids`.

        Notes
        -----
        By `source_length` we are referring to the number of tokens in the source
        sentence including the `START_SYMBOL` and `END_SYMBOL`, while
        `trimmed_source_length` refers to the number of tokens in the source sentence
        *excluding* the `START_SYMBOL` and `END_SYMBOL`, i.e.
        `trimmed_source_length = source_length - 2`.

        On the other hand, `target_length` is the number of tokens in the target sentence
        *including* the `START_SYMBOL` and `END_SYMBOL`.

        In the context where there is a `batch_size` dimension, the above refer
        to the maximum of their individual values across the batch.
        """
        # pylint: disable=arguments-differ
        tokenized_source = self._source_tokenizer.tokenize(source_string)
        tokenized_source.insert(0, Token(START_SYMBOL))
        tokenized_source.append(Token(END_SYMBOL))
        source_field = TextField(tokenized_source, self._source_token_indexers)

        # For each token in the source sentence, we keep track of the matching token
        # in the target sentence (which will be the OOV symbol if there is no match).
        source_to_target_field = CopyMapField(tokenized_source[1:-1],
                                              self._target_namespace)

        meta_fields = {
            "source_tokens": [x.text for x in tokenized_source[1:-1]]
        }
        fields_dict = {
            "source_tokens": source_field,
            "source_to_target": source_to_target_field,
        }

        if target_string is not None:
            tokenized_target = self._target_tokenizer.tokenize(target_string)
            tokenized_target.insert(0, Token(START_SYMBOL))
            tokenized_target.append(Token(END_SYMBOL))
            target_field = TextField(tokenized_target,
                                     self._target_token_indexers)

            fields_dict["target_tokens"] = target_field
            meta_fields["target_tokens"] = [
                y.text for y in tokenized_target[1:-1]
            ]
            source_and_target_token_ids = self._tokens_to_ids(
                tokenized_source[1:-1] + tokenized_target)
            source_token_ids = source_and_target_token_ids[:len(tokenized_source) - 2]
            fields_dict["source_token_ids"] = ArrayField(
                np.array(source_token_ids))
            target_token_ids = source_and_target_token_ids[len(tokenized_source) - 2:]
            fields_dict["target_token_ids"] = ArrayField(
                np.array(target_token_ids))
        else:
            source_token_ids = self._tokens_to_ids(tokenized_source[1:-1])
            fields_dict["source_token_ids"] = ArrayField(
                np.array(source_token_ids))

        fields_dict["metadata"] = MetadataField(meta_fields)

        return Instance(fields_dict)
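
The docstring above relies on _tokens_to_ids giving the same ID to any tokens that match at the lowercase level across the concatenated source and target. That helper is not shown in this example, so the following is only a hedged sketch of the behaviour it describes, using plain strings instead of Token objects.

def tokens_to_ids(tokens):
    """Assign consecutive IDs so that tokens equal after lowercasing share an ID."""
    ids, out = {}, []
    for token in tokens:
        out.append(ids.setdefault(token.lower(), len(ids)))
    return out

source = ["the", "cat", "sat"]
target = ["The", "mat"]
combined = tokens_to_ids(source + target)
print(combined[:len(source)], combined[len(source):])
# [0, 1, 2] [0, 3]   -- "the" and "The" share ID 0
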
Example #11
0
    def text_to_instance(self, text: str, 
                         targets: Optional[List[str]] = None,
                         target_sentiments: Optional[List[Union[str, int]]] = None,
                         spans: Optional[List[List[int]]] = None,
                         categories: Optional[List[str]] = None,
                         category_sentiments: Optional[List[Union[str, int]]] = None,
                         **kwargs) -> Instance:
        '''
        The original text, text tokens as well as the targets and target 
        tokens are stored in the MetadataField.

        :NOTE: At least one of targets or categories must be present.
        :NOTE: The left and right contexts returned in the instance are 
               a List of a List of tokens, one list for each target.

        :param text: The text that contains the target(s) and/or categories.
        :param targets: The targets that are within the text
        :param target_sentiments: The sentiment of the targets. To be used if 
                                  training the classifier
        :param spans: The spans that represent the character offsets for each 
                      of the targets given in the targets list.
        :param categories: The categories that are within the text
        :param category_sentiments: The sentiment of the categories
        :returns: An Instance object with all of the above encoded for a
                  PyTorch model.
        :raises ValueError: If both targets and categories are None
        :raises ValueError: If `self._target_sequences` is True and the passed 
                            `spans` argument is None.
        :raises ValueError: If `self._left_right_contexts` is True and the 
                            passed `spans` argument is None.
        '''
        if targets is None and categories is None:
            raise ValueError('Either targets or categories must be given if you '
                             'want to predict the sentiment of a target '
                             'or a category')

        instance_fields: Dict[str, Field] = {}
        

        # Metadata field
        metadata_dict = {}

        if targets is not None:
            # TODO: handle the case where the position flags are set but
            # target sequences are not.
            if self._target_sequences or self._position_embeddings or self._position_weights:
                if spans is None:
                    raise ValueError('To create target sequences requires `spans`')
                spans = [Span(span[0], span[1]) for span in spans]
                target_text_object = TargetText(text=text, spans=spans, 
                                                targets=targets, text_id='anything')
                target_text_object.force_targets()
                text = target_text_object['text']
                allen_tokens = self._tokenizer.tokenize(text)
                tokens = [x.text for x in allen_tokens]
                target_text_object['tokenized_text'] = tokens
                target_text_object.sequence_labels(per_target=True)
                target_sequences = target_text_object['sequence_labels']
                # Need to add the target sequences to the instances
                in_label = {'B', 'I'}
                number_targets = len(targets)
                all_target_tokens: List[List[Token]] = [[] for _ in range(number_targets)]
                target_sequence_fields = []
                target_indicators: List[List[int]] = []
                for target_index in range(number_targets):
                    one_values = []
                    target_ones = [0] * len(allen_tokens)
                    for token_index, token in enumerate(allen_tokens):
                        target_sequence_value = target_sequences[target_index][token_index]
                        in_target = 1 if target_sequence_value in in_label else 0
                        if in_target:
                            all_target_tokens[target_index].append(allen_tokens[token_index])
                            one_value_list = [0] * len(allen_tokens)
                            one_value_list[token_index] = 1
                            one_values.append(one_value_list)
                            target_ones[token_index] = 1
                    one_values = np.array(one_values)
                    target_sequence_fields.append(ArrayField(one_values, dtype=np.int32))
                    target_indicators.append(target_ones)
                if self._position_embeddings:
                    target_distances = self._target_indicators_to_distances(target_indicators, 
                                                                            max_distance=self._max_position_distance, 
                                                                            as_string=True)
                    target_text_distances = []
                    for target_distance in target_distances:
                        token_distances = [Token(distance) for distance in target_distance]
                        token_distances = TextField(token_distances, self._position_indexers)
                        target_text_distances.append(token_distances)
                    instance_fields['position_embeddings'] = ListField(target_text_distances)
                if self._position_weights:
                    target_distances = self._target_indicators_to_distances(target_indicators, 
                                                                            max_distance=self._max_position_distance, 
                                                                            as_string=False)
                    target_distances = np.array(target_distances)
                    instance_fields['position_weights'] = ArrayField(target_distances, 
                                                                     dtype=np.int32)
                if self._target_sequences:
                    instance_fields['target_sequences'] = ListField(target_sequence_fields)
                instance_fields['tokens'] = TextField(allen_tokens, self._token_indexers)
                metadata_dict['text words'] = tokens
                metadata_dict['text'] = text
                # update target variable as the targets could have changed due 
                # to the force_targets function
                targets = target_text_object['targets']
            else:
                all_target_tokens = [self._tokenizer.tokenize(target) 
                                     for target in targets]
            target_fields = [TextField(target_tokens, self._token_indexers)  
                            for target_tokens in all_target_tokens]
            target_fields = ListField(target_fields)
            instance_fields['targets'] = target_fields
            # Add the targets and the tokenised targets to the metadata
            metadata_dict['targets'] = [target for target in targets]
            metadata_dict['target words'] = [[x.text for x in target_tokens] 
                                             for target_tokens in all_target_tokens]

            # Target sentiment if it exists
            if target_sentiments is not None:
                target_sentiments_field = SequenceLabelField(target_sentiments, 
                                                             target_fields,
                                                             label_namespace='target-sentiment-labels')
                instance_fields['target_sentiments'] = target_sentiments_field

        if categories is not None and self._use_categories:
            category_fields = TextField([Token(category) for category in categories], 
                                        self._token_indexers)
            instance_fields['categories'] = category_fields
            # Category sentiment if it exists
            if category_sentiments is not None:
                category_sentiments_field = SequenceLabelField(category_sentiments, 
                                                               category_fields,
                                                               label_namespace='category-sentiment-labels')
                instance_fields['category_sentiments'] = category_sentiments_field
            # Add the categories to the metadata
            metadata_dict['categories'] = [category for category in categories]

        if 'tokens' not in instance_fields:
            tokens = self._tokenizer.tokenize(text)
            instance_fields['tokens'] = TextField(tokens, self._token_indexers)
            metadata_dict['text'] = text
            metadata_dict['text words'] = [x.text for x in tokens]

        # If required processes the left and right contexts
        left_contexts = None
        right_contexts = None
        if self._left_right_contexts:
            if spans is None:
                raise ValueError('To create left, right, target contexts requires'
                                 ' the `spans` of the targets which is None')
            spans = [Span(span[0], span[1]) for span in spans]
            target_text_object = TargetText(text=text, spans=spans, 
                                            targets=targets, text_id='anything')
            # left, right, and target contexts for each target in the 
            # the text
            left_right_targets = target_text_object.left_right_target_contexts(incl_target=self._incl_target)
            left_contexts: List[str] = []
            right_contexts: List[str] = []
            for left_right_target in left_right_targets:
                left, right, _ = left_right_target
                left_contexts.append(left)
                if self._reverse_right_context:
                    right_tokens = self._tokenizer.tokenize(right)
                    reversed_right_tokens = []
                    for token in reversed(right_tokens):
                        reversed_right_tokens.append(token.text)
                    right = ' '.join(reversed_right_tokens)
                right_contexts.append(right)
        
        if left_contexts is not None:
            left_field = self._add_context_field(left_contexts)
            instance_fields["left_contexts"] = left_field
        if right_contexts is not None:
            right_field = self._add_context_field(right_contexts)
            instance_fields["right_contexts"] = right_field

        instance_fields["metadata"] = MetadataField(metadata_dict)
        
        return Instance(instance_fields)
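
Each target's per-token sequence labels are reduced above to a 0/1 indicator over the tokenised text (plus a stack of one-hot rows for target_sequences). A tiny sketch of the indicator step, assuming B/I/O sequence labels as in the in_label set above; sequence_labels_to_indicator is a hypothetical name.

def sequence_labels_to_indicator(sequence_labels, in_label=("B", "I")):
    """1 where the token belongs to the target, 0 elsewhere."""
    return [1 if label in in_label else 0 for label in sequence_labels]

# "the food was great but the service was poor" with two targets
food_labels    = ["O", "B", "O", "O", "O", "O", "O", "O", "O"]
service_labels = ["O", "O", "O", "O", "O", "O", "B", "O", "O"]
print(sequence_labels_to_indicator(food_labels))      # [0, 1, 0, 0, 0, 0, 0, 0, 0]
print(sequence_labels_to_indicator(service_labels))   # [0, 0, 0, 0, 0, 0, 1, 0, 0]
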
Example #12
0
 def text_to_instance(self, data: Dict[str, Any]) -> Instance:  # pylint: disable=arguments-differ
     # Wrap the raw tokens in Token objects and a TextField.
     tokens = data['tokens']
     tokens = [Token(x) for x in tokens]
     fields = {'tokens': TextField(tokens, self._token_indexers)}
     return Instance(fields)
Example #13
0
    def text_to_instance(self, data: Dict[str, Any]) -> Instance:  # pylint: disable=arguments-differ
        # Flatten and pad tokens
        tokens = _flatten(data['tokens'])
        tokens = ['@@START@@', *tokens, '@@END@@']
        source = [Token(x) for x in tokens[:-1]]
        target = [Token(x) for x in tokens[1:]]
        fields = {
            'source': TextField(source, self._token_indexers),
            'target': TextField(target, self._token_indexers)
        }

        # Process annotations
        if 'annotations' in data:

            # We maintain a "shortlist" of observed entities, that is used for baseline models
            # that only select entities from the set that appear in the document (as opposed to
            # the set of all possible entities).
            shortlist = [DEFAULT_PADDING_TOKEN]
            reverse_shortlist = {DEFAULT_PADDING_TOKEN: 0}

            entity_ids = [DEFAULT_PADDING_TOKEN] * len(target)
            shortlist_inds = np.zeros(shape=(len(target), ))
            alias_copy_inds = np.zeros(shape=(len(target), ))
            alias_tokens = [TextField([], self._token_indexers)] * len(target)
            alias_inds: List[List[int]] = [[]] * len(target)
            max_len = 0

            # Process annotations
            for annotation in data['annotations']:

                # Obtain the entity identifier for the annotated span
                entity_id = annotation['id']
                alias = annotation['alias']
                alias_map = {
                    token: i + 1
                    for i, token in enumerate(set(alias))
                }

                # If necessary, update the shortlist. Obtain the index of the entity identifier in
                # the shortlist.
                if entity_id not in reverse_shortlist:
                    reverse_shortlist[entity_id] = len(reverse_shortlist)
                    shortlist.append(entity_id)
                shortlist_ind = reverse_shortlist[entity_id]

                # Update the outputs
                for i in range(*annotation['span']):
                    # Note: +1 offset to account for start token.
                    if tokens[i + 1] not in alias_map:
                        continue
                    else:
                        entity_ids[i] = entity_id
                        shortlist_inds[i] = shortlist_ind
                        alias_copy_inds[i] = alias_map[tokens[i + 1]]
                        alias_inds[i] = [alias_map[token] for token in alias]
                        alias_tokens[i] = TextField([Token(x) for x in alias],
                                                    self._token_indexers)
                        max_len = max(max_len, len(alias))

            # Make alias_inds into a numpy array
            alias_ind_array = np.zeros((len(target), max_len))
            for i, arr in enumerate(alias_inds):
                for j, ind in enumerate(arr):
                    alias_ind_array[i, j] = ind

            fields['entity_ids'] = TextField(
                [Token(x) for x in entity_ids],
                token_indexers=self._entity_indexers)
            fields['alias_copy_inds'] = SequentialArrayField(alias_copy_inds,
                                                             dtype=np.int64)
            fields['shortlist'] = TextField(
                [Token(x) for x in shortlist],
                token_indexers=self._entity_indexers)
            fields['shortlist_inds'] = SequentialArrayField(shortlist_inds,
                                                            dtype=np.int64)
            fields['alias_tokens'] = ListField(alias_tokens)
            fields['alias_inds'] = SequentialArrayField(alias_ind_array,
                                                        dtype=np.int64)

        return Instance(fields)
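
The annotation loop above keeps a per-document shortlist of observed entity identifiers so that later fields can index into it. A compact sketch of just that bookkeeping, with a plain padding string standing in for DEFAULT_PADDING_TOKEN; build_shortlist is a hypothetical name.

def build_shortlist(annotations, padding="@@PADDING@@"):
    """Assign each observed entity id a stable index in a per-document shortlist."""
    shortlist = [padding]
    reverse = {padding: 0}
    for annotation in annotations:
        entity_id = annotation["id"]
        if entity_id not in reverse:
            reverse[entity_id] = len(reverse)
            shortlist.append(entity_id)
    return shortlist, reverse

annotations = [{"id": "Q42"}, {"id": "Q1"}, {"id": "Q42"}]
print(build_shortlist(annotations))
# (['@@PADDING@@', 'Q42', 'Q1'], {'@@PADDING@@': 0, 'Q42': 1, 'Q1': 2})
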
Example #14
0
def _tokenize(iterable: Iterable[str]):
    return [Token(x) for x in iterable]
Example #15
0
 def _read(self, file_path: str) -> Iterator[Instance]:
     with open(file_path) as f:
         for line in f:
             pairs = line.strip().split()
             sentence, tags = zip(*(pair.split("###") for pair in pairs))
             yield self.text_to_instance([Token(word) for word in sentence], tags)
Example #16
0
 def tokenizer(self, text):
     text = [
         Token(mrph.midasi)
         for mrph in self.jumanpp.analysis(text).mrph_list()
     ][0:self.max_tokens]
     return text
Example #17
0
 def prepare_text(text, max_tokens):
     tokens = self._tokenizer.tokenize(text)[:max_tokens]
     tokens.insert(0, Token(START_SYMBOL))
     tokens.append(Token(END_SYMBOL))
     return tokens
Example #18
0
    def text_to_instance(
        self,  # type: ignore
        tokens: List[str],
        ccg_categories: List[str] = None,
        original_pos_tags: List[str] = None,
        modified_pos_tags: List[str] = None,
        predicate_arg_categories: List[str] = None,
    ) -> Instance:
        """
        We take `pre-tokenized` input here, because we don't have a tokenizer in this class.

        Parameters
        ----------
        tokens : ``List[str]``, required.
            The tokens in a given sentence.
        ccg_categories : ``List[str]``, optional, (default = None).
            The CCG categories for the words in the sentence. (e.g. N/N)
        original_pos_tags : ``List[str]``, optional, (default = None).
            The tag assigned to the word in the Penn Treebank.
        modified_pos_tags : ``List[str]``, optional, (default = None).
            The POS tag might have changed during the translation to CCG.
        predicate_arg_categories : ``List[str]``, optional, (default = None).
            Encodes the word-word dependencies in the underlying predicate-
            argument structure.

        Returns
        -------
        An ``Instance`` containing the following fields:
            tokens : ``TextField``
                The tokens in the sentence.
            tags : ``SequenceLabelField``
                The tags corresponding to the ``tag_label`` constructor argument.
            feature_label_tags : ``SequenceLabelField``
                Tags corresponding to each feature_label (if any) specified in the
                ``feature_labels`` constructor argument.
        """

        text_field = TextField([Token(x) for x in tokens],
                               token_indexers=self._token_indexers)
        fields: Dict[str, Field] = {"tokens": text_field}

        # Add "feature labels" to instance
        if "ccg" in self.feature_labels:
            if ccg_categories is None:
                raise ConfigurationError(
                    "Dataset reader was specified to use CCG categories as "
                    "features. Pass them to text_to_instance.")
            fields["ccg_tags"] = SequenceLabelField(ccg_categories, text_field,
                                                    "ccg_tags")
        if "original_pos" in self.feature_labels:
            if original_pos_tags is None:
                raise ConfigurationError(
                    "Dataset reader was specified to use original POS tags as "
                    "features. Pass them to text_to_instance.")
            fields["original_pos_tags"] = SequenceLabelField(
                original_pos_tags, text_field, "original_pos_tags")
        if "modified_pos" in self.feature_labels:
            if modified_pos_tags is None:
                raise ConfigurationError(
                    "Dataset reader was specified to use modified POS tags as "
                    "features. Pass them to text_to_instance.")
            fields["modified_pos_tags"] = SequenceLabelField(
                modified_pos_tags, text_field, "modified_pos_tags")
        if "predicate_arg" in self.feature_labels:
            if predicate_arg_categories is None:
                raise ConfigurationError(
                    "Dataset reader was specified to use predicate arg tags as "
                    "features. Pass them to text_to_instance.")
            fields["predicate_arg_tags"] = SequenceLabelField(
                predicate_arg_categories, text_field, "predicate_arg_tags")

        # Add "tag label" to instance
        if self.tag_label == "ccg" and ccg_categories is not None:
            fields["tags"] = SequenceLabelField(ccg_categories, text_field,
                                                self.label_namespace)
        elif self.tag_label == "original_pos" and original_pos_tags is not None:
            fields["tags"] = SequenceLabelField(original_pos_tags, text_field,
                                                self.label_namespace)
        elif self.tag_label == "modified_pos" and modified_pos_tags is not None:
            fields["tags"] = SequenceLabelField(modified_pos_tags, text_field,
                                                self.label_namespace)
        elif self.tag_label == "predicate_arg" and predicate_arg_categories is not None:
            fields["tags"] = SequenceLabelField(predicate_arg_categories,
                                                text_field,
                                                self.label_namespace)

        return Instance(fields)
Example #19
0
    def text_to_instance(self, rule_text, question, scenario, history, answer=None, evidence=None) -> Instance:  # type: ignore
        """
        Turn a rule text, question, scenario and dialogue history into an ``Instance``.

        Parameters
        ----------
        rule_text : ``str``, required
        question : ``str``, required
        scenario : ``str``, required
        history : ``List[Dict[str, str]]``, required
        answer : ``str``, optional (default = None)
        evidence : optional (default = None)

        Returns
        -------
        Instance
            An ``Instance`` with the CopyNet-style source/target fields and metadata
            built below, or ``None`` if the example is skipped.
        """
        # pylint: disable=arguments-differ

        if answer and answer in ['Yes', 'No', 'Irrelevant']:
            return None
        target_string = answer

        if self.train_using_gold and answer is not None: # i.e. during training and validation
            predicted_label = answer if answer in ['Yes', 'No', 'Irrelevant'] else 'More'
            predicted_span_ixs = self.dataset_reader.find_lcs(rule_text, answer, self._source_tokenizer.tokenize)
            if predicted_span_ixs is None:
                return None
            else:
                rule_offsets = [(token.idx, token.idx + len(token.text)) for token in self._source_tokenizer.tokenize(rule_text)]
                predicted_span = rule_text[rule_offsets[predicted_span_ixs[0]][0]: rule_offsets[predicted_span_ixs[1]][1]]
        else:
            predicted_span, predicted_label = self.get_prediction(rule_text, question, scenario, history)

        if self.add_rule:
            if self.embed_span:
                source_string = self.get_embedded_span(rule_text, predicted_span)
            else:
                source_string = rule_text + ' @pss@ ' + predicted_span + ' @pse@'
        else:
            source_string = predicted_span
        if self.add_question:
            source_string += ' @qs@ ' + question + ' @qe'
        if self.add_followup_ques:
            for follow_up_qna in history:
                source_string += ' @fs@ ' + follow_up_qna['follow_up_question'] + ' @fe'

        tokenized_source = self._source_tokenizer.tokenize(source_string)
        tokenized_source.insert(0, Token(START_SYMBOL))
        tokenized_source.append(Token(END_SYMBOL))
        source_field = TextField(tokenized_source, self._source_token_indexers)

        # For each token in the source sentence, we keep track of the matching token
        # in the target sentence (which will be the OOV symbol if there is no match).
        source_to_target_field = NamespaceSwappingField(tokenized_source[1:-1], self._target_namespace)

        meta_fields = {"source_tokens": [x.text for x in tokenized_source[1:-1]]}
        fields_dict = {
                "source_tokens": source_field,
                "source_to_target": source_to_target_field,
        }

        if target_string is not None:
            tokenized_target = self._target_tokenizer.tokenize(target_string)
            tokenized_target.insert(0, Token(START_SYMBOL))
            tokenized_target.append(Token(END_SYMBOL))
            target_field = TextField(tokenized_target, self._target_token_indexers)

            fields_dict["target_tokens"] = target_field
            meta_fields["target_tokens"] = [y.text for y in tokenized_target[1:-1]]
            source_and_target_token_ids = self._tokens_to_ids(tokenized_source[1:-1] +
                                                              tokenized_target)
            source_token_ids = source_and_target_token_ids[:len(tokenized_source)-2]
            fields_dict["source_token_ids"] = ArrayField(np.array(source_token_ids))
            target_token_ids = source_and_target_token_ids[len(tokenized_source)-2:]
            fields_dict["target_token_ids"] = ArrayField(np.array(target_token_ids))
        else:
            source_token_ids = self._tokens_to_ids(tokenized_source[1:-1])
            fields_dict["source_token_ids"] = ArrayField(np.array(source_token_ids))

        meta_fields['label'] = predicted_label
        fields_dict["metadata"] = MetadataField(meta_fields)

        return Instance(fields_dict)
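The reader above delegates span recovery to ``dataset_reader.find_lcs``. Below is a self-contained sketch of one way such a helper could work, returning the start and end token indices in the rule text of the longest contiguous token run shared with the answer; the name and details are assumptions for illustration, not the project's actual implementation.

from typing import List, Optional, Tuple


def find_longest_common_span(rule_tokens: List[str],
                             answer_tokens: List[str]) -> Optional[Tuple[int, int]]:
    """Return (start, end) indices into ``rule_tokens`` of the longest
    contiguous token run shared with ``answer_tokens``, or None if the two
    sequences share no tokens at all."""
    best_len, best_end = 0, -1
    # dp[i][j] = length of the common suffix of rule_tokens[:i] and answer_tokens[:j]
    dp = [[0] * (len(answer_tokens) + 1) for _ in range(len(rule_tokens) + 1)]
    for i in range(1, len(rule_tokens) + 1):
        for j in range(1, len(answer_tokens) + 1):
            if rule_tokens[i - 1].lower() == answer_tokens[j - 1].lower():
                dp[i][j] = dp[i - 1][j - 1] + 1
                if dp[i][j] > best_len:
                    best_len, best_end = dp[i][j], i - 1
    if best_len == 0:
        return None
    return best_end - best_len + 1, best_end


# e.g. find_longest_common_span("you must be over 18".split(),
#                               "over 18".split())  ->  (3, 4)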
Example #20
    def attack_from_json(self,
                         inputs: JsonDict = None,
                         input_field_to_attack: str = 'tokens',
                         grad_input_field: str = 'grad_input_1',
                         ignore_tokens: List[str] = None) -> JsonDict:
        """
        Replaces one token at a time from the input until the model's prediction changes.
        ``input_field_to_attack`` names the input field to modify (e.g. ``tokens``).
        ``grad_input_field`` is the key into the gradients dictionary returned by the
        predictor (e.g. ``grad_input_1``).

        The method computes the gradient w.r.t. the tokens, finds the token with the maximum
        gradient (by L2 norm), and replaces it with another token based on the first-order Taylor
        approximation of the loss.  This process is iteratively repeated until the prediction
        changes.  Once a token is replaced, it is not flipped again.
        """
        if self.token_embedding is None:
            self.initialize()
        ignore_tokens = ["@@NULL@@", '.', ',', ';', '!', '?'] if ignore_tokens is None else ignore_tokens
        original_instances = self.predictor.json_to_labeled_instances(inputs)
        original_text_field: TextField = original_instances[0][input_field_to_attack]  # type: ignore
        original_tokens = deepcopy(original_text_field.tokens)
        final_tokens = []
        for current_instance in original_instances:
            # Gets a list of the fields that we want to check to see if they change.
            fields_to_compare = utils.get_fields_to_compare(inputs, current_instance, input_field_to_attack)
            current_text_field: TextField = current_instance[input_field_to_attack]  # type: ignore
            current_tokens = current_text_field.tokens
            grads, outputs = self.predictor.get_gradients([current_instance])

            # ignore any token that is in the ignore_tokens list by setting the token to already flipped
            flipped: List[int] = []
            for index, token in enumerate(current_tokens):
                if token.text in ignore_tokens:
                    flipped.append(index)
            while True:
                # Compute L2 norm of all grads.
                grad = grads[grad_input_field]
                grads_magnitude = [g.dot(g) for g in grad]

                # only flip a token once
                for index in flipped:
                    grads_magnitude[index] = -1

                # we flip the token with highest gradient norm
                index_of_token_to_flip = numpy.argmax(grads_magnitude)
                # when we have already flipped all the tokens once
                if grads_magnitude[index_of_token_to_flip] == -1:
                    break
                flipped.append(index_of_token_to_flip)

                # Get new token using taylor approximation
                input_tokens = current_text_field._indexed_tokens["tokens"]
                original_id_of_token_to_flip = input_tokens[index_of_token_to_flip]
                new_id_of_flipped_token = _first_order_taylor(grad[index_of_token_to_flip],
                                                              self.token_embedding.weight,  # type: ignore
                                                              original_id_of_token_to_flip)
                # flip token
                new_token = Token(self.vocab._index_to_token["tokens"][new_id_of_flipped_token])  # type: ignore
                current_text_field.tokens[index_of_token_to_flip] = new_token
                current_instance.indexed = False

                # Get model predictions on current_instance, and then label the instances
                grads, outputs = self.predictor.get_gradients([current_instance])  # predictions
                for key, output in outputs.items():
                    if isinstance(output, torch.Tensor):
                        outputs[key] = output.detach().cpu().numpy().squeeze()
                    elif isinstance(output, list):
                        outputs[key] = output[0]

                # add labels to current_instances
                current_instance_labeled = self.predictor.predictions_to_labeled_instances(current_instance,
                                                                                           outputs)[0]
                # if the prediction has changed, then stop
                if any(current_instance_labeled[field] != fields_to_compare[field] for field in fields_to_compare):
                    break

            final_tokens.append(current_tokens)
        return sanitize({"final": final_tokens,
                         "original": original_tokens,
                         "outputs": outputs})
Example #21
    def text_to_instance(self,  # type: ignore
                         document_id: str,
                         part_number: str,
                         sentences: List[List[str]],
                         gold_clusters: Optional[List[List[Tuple[int, int]]]] = None) -> Instance:
        # pylint: disable=arguments-differ
        """
        Parameters
        ----------
        document_id: ``str``, required.
            The id of the document.
        sentences : ``List[List[str]]``, required.
            A list of lists representing the tokenised words and sentences in the document.
        gold_clusters : ``Optional[List[List[Tuple[int, int]]]]``, optional (default = None)
            A list of all clusters in the document, represented as word spans. Each cluster
            contains some number of spans, which can be nested and overlap, but will never
            exactly match between clusters.

        Returns
        -------
        An ``Instance`` containing the following ``Fields``:
            text : ``TextField``
                The text of the full document.
            spans : ``ListField[SpanField]``
                A ListField containing the spans represented as ``SpanFields``
                with respect to the document text.
            span_labels : ``SequenceLabelField``, optional
                The id of the cluster which each possible span belongs to, or -1 if it does
                 not belong to a cluster. As these labels have variable length (it depends on
                 how many spans we are considering), we represent this as a ``SequenceLabelField``
                 with respect to the ``spans`` ``ListField``.
        """
        flattened_sentences = [self._normalize_word(word)
                               for sentence in sentences
                               for word in sentence]

        metadata: Dict[str, Any] = {
            "document_id": document_id,
            "part_number": part_number,
            "original_text": flattened_sentences,
        }

        if gold_clusters is not None:
            metadata["clusters"] = gold_clusters

        text_field = TextField([Token(word) for word in flattened_sentences], self._token_indexers)

        cluster_dict = {}
        if gold_clusters is not None:
            for cluster_id, cluster in enumerate(gold_clusters):
                for mention in cluster:
                    cluster_dict[tuple(mention)] = cluster_id

        spans: List[Field] = []
        span_labels: Optional[List[int]] = [] if gold_clusters is not None else None

        sentence_offset = 0
        for sentence in sentences:
            for start, end in enumerate_spans(sentence,
                                              offset=sentence_offset,
                                              max_span_width=self._max_span_width):
                if span_labels is not None:
                    if (start, end) in cluster_dict:
                        span_labels.append(cluster_dict[(start, end)])
                    else:
                        span_labels.append(-1)

                spans.append(SpanField(start, end, text_field))
            sentence_offset += len(sentence)

        span_field = ListField(spans)
        metadata_field = MetadataField(metadata)

        fields: Dict[str, Field] = {"text": text_field,
                                    "spans": span_field,
                                    "metadata": metadata_field}
        if span_labels is not None:
            fields["span_labels"] = SequenceLabelField(span_labels, span_field)

        return Instance(fields)
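``enumerate_spans`` comes from AllenNLP's span utilities. A pure-Python sketch of the behaviour the reader relies on, namely every span of width up to ``max_span_width`` with inclusive end indices, shifted by ``offset`` so they index into the flattened document:

from typing import List, Tuple


def enumerate_spans_sketch(sentence: List[str],
                           offset: int = 0,
                           max_span_width: int = 3) -> List[Tuple[int, int]]:
    """All (start, end) spans (inclusive ends) of width <= max_span_width,
    shifted by ``offset`` so the indices refer to the flattened document."""
    spans = []
    for start in range(len(sentence)):
        for end in range(start, min(start + max_span_width, len(sentence))):
            spans.append((start + offset, end + offset))
    return spans


# e.g. for the second sentence of a document whose first sentence had 5 tokens:
print(enumerate_spans_sketch(["It", "barked", "."], offset=5, max_span_width=2))
# [(5, 5), (5, 6), (6, 6), (6, 7), (7, 7)]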
Example #22
    def tokenize(self, text: str) -> List[Token]:
        return [Token(token) for token in self.tokenizer.tokenize(text)]

    def _read(self, file_path):
        # if `file_path` is a URL, redirect to the cache
        file_path = cached_path(file_path)

        if file_path.endswith("zip"):
            archive = zipfile.ZipFile(file_path, "r")
            data_file = archive.open(os.path.basename(file_path)[:-4])
        else:
            data_file = open(file_path, "r")

        logger.info("Reading instances from lines in file at: %s", file_path)

        dialogs = json.load(data_file)

        for dial_name in dialogs:
            dialog = dialogs[dial_name]["log"]
            context_tokens_list = []
            for i, turn in enumerate(dialog):
                if self._agent and self._agent == "user" and i % 2 != 1:
                    continue
                if self._agent and self._agent == "system" and i % 2 != 0:
                    continue

                tokens = turn["text"].split()

                dialog_act = {}
                for dacts in turn["span_info"]:
                    if dacts[0] not in dialog_act:
                        dialog_act[dacts[0]] = []
                    dialog_act[dacts[0]].append(
                        [dacts[1], " ".join(tokens[dacts[3]:dacts[4] + 1])])

                spans = turn["span_info"]
                tags = []
                for j in range(len(tokens)):
                    for span in spans:
                        if j == span[3]:
                            tags.append("B-" + span[0] + "+" + span[1])
                            break
                        if j > span[3] and j <= span[4]:
                            tags.append("I-" + span[0] + "+" + span[1])
                            break
                    else:
                        tags.append("O")

                intents = []
                for dacts in turn["dialog_act"]:
                    for dact in turn["dialog_act"][dacts]:
                        if dacts not in dialog_act or dact[0] not in [
                                sv[0] for sv in dialog_act[dacts]
                        ]:
                            if dact[1] in [
                                    "none", "?", "yes", "no", "do nt care",
                                    "do n't care"
                            ]:
                                intents.append(dacts + "+" + dact[0] + "*" +
                                               dact[1])

                for dacts in turn["dialog_act"]:
                    for dact in turn["dialog_act"][dacts]:
                        if dacts not in dialog_act:
                            dialog_act[dacts] = turn["dialog_act"][dacts]
                            break
                        elif dact[0] not in [
                                sv[0] for sv in dialog_act[dacts]
                        ]:
                            dialog_act[dacts].append(dact)

                num_context = random.randint(
                    0, self._context_size
                ) if self._random_context_size else self._context_size
                if len(context_tokens_list) > 0 and num_context > 0:
                    wrapped_context_tokens = [
                        Token(token) for context_tokens in
                        context_tokens_list[-num_context:]
                        for token in context_tokens
                    ]
                else:
                    wrapped_context_tokens = [Token("SENT_END")]
                wrapped_tokens = [Token(token) for token in tokens]
                context_tokens_list.append(tokens + ["SENT_END"])

                yield self.text_to_instance(wrapped_context_tokens,
                                            wrapped_tokens, tags, intents,
                                            dialog_act)
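Each ``span_info`` record above is a tuple of (act, slot, value, start token index, end token index). A self-contained sketch of the BIO tagging step, using a made-up MultiWOZ-style turn for illustration:

from typing import List, Sequence


def bio_tags_from_spans(tokens: List[str],
                        span_info: Sequence[Sequence]) -> List[str]:
    """Tag each token B-<act>+<slot> / I-<act>+<slot> if it falls inside an
    annotated span, otherwise O (the same convention as the reader above)."""
    tags = []
    for i in range(len(tokens)):
        for act, slot, _value, start, end in span_info:
            if i == start:
                tags.append("B-" + act + "+" + slot)
                break
            if start < i <= end:
                tags.append("I-" + act + "+" + slot)
                break
        else:
            tags.append("O")
    return tags


# Hypothetical turn "i need a cheap hotel" with one annotated span.
print(bio_tags_from_spans("i need a cheap hotel".split(),
                          [["Hotel-Inform", "Price", "cheap", 3, 3]]))
# ['O', 'O', 'O', 'B-Hotel-Inform+Price', 'O']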
Example #24
    def text_to_instance(
            self,
            question_text: str,
            passage_text: str,
            passage_tokens: List[Token],
            passage_spans: List[Tuple[int, int]],
            numbers_in_passage: List[Any],
            number_words: List[str],
            number_indices: List[int],
            number_len: List[int],
            question_id: str = None,
            passage_id: str = None,
            answer_annotations: List[Dict] = None,
            count_gold_spans_text: List[str] = None) -> Union[Instance, None]:
        # Tokenize question and passage
        question_tokens = self.tokenizer.tokenize(question_text)
        qlen = len(question_tokens)
        plen = len(passage_tokens)

        question_passage_tokens = [Token('[CLS]')] + question_tokens + [
            Token('[SEP]')
        ] + passage_tokens
        if len(question_passage_tokens) > self.max_pieces - 1:
            question_passage_tokens = question_passage_tokens[:self.max_pieces - 1]
            passage_tokens = passage_tokens[:self.max_pieces - qlen - 3]
            plen = len(passage_tokens)
            number_indices, number_len, numbers_in_passage = \
                clipped_passage_num(number_indices, number_len, numbers_in_passage, plen)

        question_passage_tokens += [Token('[SEP]')]
        number_indices = [index + qlen + 2 for index in number_indices] + [-1]
        # Not done in-place so they won't change the numbers saved for the passage
        number_len = number_len + [1]
        numbers_in_passage = numbers_in_passage + [0]
        number_tokens = [Token(str(number)) for number in numbers_in_passage]
        extra_number_tokens = [Token(str(num)) for num in self.extra_numbers]

        mask_indices = [0, qlen + 1, len(question_passage_tokens) - 1]

        fields: Dict[str, Field] = {}

        # Add feature fields
        question_passage_field = TextField(question_passage_tokens,
                                           self.token_indexers)
        fields["question_passage"] = question_passage_field

        number_token_indices = \
            [ArrayField(np.arange(start_ind, start_ind + number_len[i]), padding_value=-1)
             for i, start_ind in enumerate(number_indices)]
        fields["number_indices"] = ListField(number_token_indices)
        numbers_in_passage_field = TextField(number_tokens,
                                             self.token_indexers)
        extra_numbers_field = TextField(extra_number_tokens,
                                        self.token_indexers)
        all_numbers_field = TextField(extra_number_tokens + number_tokens,
                                      self.token_indexers)
        mask_index_fields: List[Field] = [
            IndexField(index, question_passage_field) for index in mask_indices
        ]
        fields["mask_indices"] = ListField(mask_index_fields)

        # Compile question, passage, answer metadata
        metadata = {
            "original_passage": passage_text,
            "original_question": question_text,
            "original_numbers": numbers_in_passage,
            "original_number_words": number_words,
            "extra_numbers": self.extra_numbers,
            "passage_tokens": passage_tokens,
            "question_tokens": question_tokens,
            "question_passage_tokens": question_passage_tokens,
            "passage_id": passage_id,
            "question_id": question_id
        }

        if self.extract_spans:
            metadata["passage_spans"] = passage_spans

        if count_gold_spans_text is not None:
            metadata["count_gold_spans_text"] = count_gold_spans_text

        if answer_annotations:
            for annotation in answer_annotations:
                tokenized_spans = [[
                    token.text for token in self.tokenizer.tokenize(answer)
                ] for answer in annotation['spans']]
                annotation['spans'] = [
                    tokenlist_to_passage(token_list)
                    for token_list in tokenized_spans
                ]

            # Get answer type, answer text, tokenize
            answer_type, answer_texts = DropReader.extract_answer_info_from_annotation(
                answer_annotations[0])
            tokenized_answer_texts = []
            num_spans = min(len(answer_texts), self.max_spans)
            for answer_text in answer_texts:
                answer_tokens = self.tokenizer.tokenize(answer_text)
                tokenized_answer_texts.append(' '.join(
                    token.text for token in answer_tokens))

            metadata["answer_annotations"] = answer_annotations
            metadata["answer_texts"] = answer_texts
            metadata["answer_tokens"] = tokenized_answer_texts

            # Find answer text in question and passage
            valid_question_spans = DropReader.find_valid_spans(
                question_tokens, tokenized_answer_texts)
            for span_ind, span in enumerate(valid_question_spans):
                valid_question_spans[span_ind] = (span[0] + 1, span[1] + 1)
            valid_passage_spans = DropReader.find_valid_spans(
                passage_tokens, tokenized_answer_texts)
            for span_ind, span in enumerate(valid_passage_spans):
                valid_passage_spans[span_ind] = (span[0] + qlen + 2,
                                                 span[1] + qlen + 2)

            # Get target numbers
            target_numbers = []
            for answer_text in answer_texts:
                number = self.word_to_num(answer_text)
                if number is not None:
                    target_numbers.append(number)

            # Get possible ways to arrive at target numbers with add/sub

            valid_expressions: List[List[int]] = []
            exp_strings = None
            if answer_type in ["number", "date"]:
                if self.exp_search == 'full':
                    expressions = get_full_exp(
                        list(enumerate(self.extra_numbers +
                                       numbers_in_passage)), target_numbers,
                        self.operations, self.op_dict, self.max_depth)
                    zipped = list(zip(*expressions))
                    if zipped:
                        valid_expressions = list(zipped[0])
                        exp_strings = list(zipped[1])
                elif self.exp_search == 'add_sub':
                    valid_expressions = \
                        DropReader.find_valid_add_sub_expressions(self.extra_numbers + numbers_in_passage,
                                                                  target_numbers,
                                                                  self.max_numbers_expression)
                elif self.exp_search == 'template':
                    valid_expressions, exp_strings = \
                        get_template_exp(self.extra_numbers + numbers_in_passage,
                                         target_numbers,
                                         self.templates,
                                         self.template_strings)
                    exp_strings = sum(exp_strings, [])

            # Get possible ways to arrive at target numbers with counting
            valid_counts: List[int] = []
            if answer_type in ["number"]:
                numbers_for_count = list(range(self.max_count + 1))
                valid_counts = DropReader.find_valid_counts(
                    numbers_for_count, target_numbers)

            # Update metadata with answer info
            answer_info = {
                "answer_passage_spans": valid_passage_spans,
                "answer_question_spans": valid_question_spans,
                "num_spans": num_spans,
                "expressions": valid_expressions,
                "counts": valid_counts
            }
            if self.exp_search in ['template', 'full']:
                answer_info['expr_text'] = exp_strings
            metadata["answer_info"] = answer_info

            # Add answer fields
            passage_span_fields: List[Field] = [
                SpanField(span[0], span[1], question_passage_field)
                for span in valid_passage_spans
            ]
            if not passage_span_fields:
                passage_span_fields.append(
                    SpanField(-1, -1, question_passage_field))
            fields["answer_as_passage_spans"] = ListField(passage_span_fields)

            question_span_fields: List[Field] = [
                SpanField(span[0], span[1], question_passage_field)
                for span in valid_question_spans
            ]
            if not question_span_fields:
                question_span_fields.append(
                    SpanField(-1, -1, question_passage_field))
            fields["answer_as_question_spans"] = ListField(
                question_span_fields)

            if self.exp_search == 'add_sub':
                add_sub_signs_field: List[Field] = []
                extra_signs_field: List[Field] = []
                for signs_for_one_add_sub_expressions in valid_expressions:
                    extra_signs = signs_for_one_add_sub_expressions[:len(
                        self.extra_numbers)]
                    normal_signs = signs_for_one_add_sub_expressions[
                        len(self.extra_numbers):]
                    add_sub_signs_field.append(
                        SequenceLabelField(normal_signs,
                                           numbers_in_passage_field))
                    extra_signs_field.append(
                        SequenceLabelField(extra_signs, extra_numbers_field))
                if not add_sub_signs_field:
                    add_sub_signs_field.append(
                        SequenceLabelField([0] * len(number_tokens),
                                           numbers_in_passage_field))
                if not extra_signs_field:
                    extra_signs_field.append(
                        SequenceLabelField([0] * len(self.extra_numbers),
                                           extra_numbers_field))
                fields["answer_as_expressions"] = ListField(
                    add_sub_signs_field)
                if self.extra_numbers:
                    fields["answer_as_expressions_extra"] = ListField(
                        extra_signs_field)
            elif self.exp_search in ['template', 'full']:
                expression_indices = []
                for expression in valid_expressions:
                    if not expression:
                        expression.append(3 * [-1])
                    expression_indices.append(
                        ArrayField(np.array(expression), padding_value=-1))
                if not expression_indices:
                    expression_indices = \
                        [ArrayField(np.array([3 * [-1]]), padding_value=-1) for _ in range(len(self.templates))]
                fields["answer_as_expressions"] = ListField(expression_indices)

            count_fields: List[Field] = [
                LabelField(count_label, skip_indexing=True)
                for count_label in valid_counts
            ]
            if not count_fields:
                count_fields.append(LabelField(-1, skip_indexing=True))
            fields["answer_as_counts"] = ListField(count_fields)

            fields["num_spans"] = LabelField(num_spans, skip_indexing=True)

        fields["metadata"] = MetadataField(metadata)

        return Instance(fields)
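``DropReader.find_valid_add_sub_expressions`` does the heavy lifting for the ``add_sub`` search above. A brute-force sketch of the idea follows: assign each passage number a sign and keep the assignments whose signed sum hits a target. The 0/1/2 encoding (unused/plus/minus) is a common convention in DROP readers; the real implementation handles limits and duplicates more carefully.

import itertools
from typing import List


def valid_add_sub_signs(numbers: List[float],
                        targets: List[float],
                        max_numbers: int = 2) -> List[List[int]]:
    """Enumerate sign vectors (0 = unused, 1 = plus, 2 = minus) over the
    passage numbers whose signed sum equals one of the target values."""
    sign_value = {0: 0, 1: 1, 2: -1}
    valid = []
    for signs in itertools.product([0, 1, 2], repeat=len(numbers)):
        used = sum(1 for s in signs if s != 0)
        if used == 0 or used > max_numbers:
            continue
        total = sum(sign_value[s] * n for s, n in zip(signs, numbers))
        if any(abs(total - t) < 1e-5 for t in targets):
            valid.append(list(signs))
    return valid


# e.g. numbers [3, 10, 7] and target 7: either "7" alone or "10 - 3".
print(valid_add_sub_signs([3, 10, 7], [7]))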
Example #25
    def text_to_instance(self,
                         source_string: str,
                         target_lang: str,
                         target_string: str = None) -> Instance:
        """
        Turn raw source string and target string into an ``Instance``.
        Parameters
        ----------
        source_string : ``str``, required
        target_lang : ``str``, required
        target_string : ``str``, optional (default = None)
        Returns
        -------
        Instance
            See the above for a description of the fields that the instance will contain.
        """
        tokenized_source = self._source_tokenizer.tokenize(source_string)
        tokenized_source.insert(0, Token(START_SYMBOL))
        tokenized_source.append(Token(END_SYMBOL))
        source_field = TextField(tokenized_source, self._source_token_indexers)

        # For each token in the source sentence, we keep track of the matching token
        # in the target sentence (which will be the OOV symbol if there is no match).
        source_to_target_field = NamespaceSwappingField(
            tokenized_source[1:-1], self._target_namespace)

        meta_fields = {
            "source_tokens": [x.text for x in tokenized_source[1:-1]]
        }

        fields_dict = {
            "source_tokens": source_field,
            "source_to_target": source_to_target_field,
        }

        if self._provide_trg_lang:
            lang_id_field = LabelField(
                target_lang, label_namespace=self._language_id_namespace)
            metadata_trg_lang = MetadataField(target_lang)

            fields_dict["target_lang"] = lang_id_field
            fields_dict["target_language"] = metadata_trg_lang

        if target_string is not None:
            tokenized_target = self._target_tokenizer.tokenize(target_string)
            tokenized_target.insert(0, Token(START_SYMBOL))
            tokenized_target.append(Token(END_SYMBOL))
            target_field = TextField(tokenized_target,
                                     self._target_token_indexers)

            fields_dict["target_tokens"] = target_field
            meta_fields["target_tokens"] = [
                y.text for y in tokenized_target[1:-1]
            ]
            source_and_target_token_ids = self._tokens_to_ids(
                tokenized_source[1:-1] + tokenized_target)
            source_token_ids = source_and_target_token_ids[:len(tokenized_source) - 2]
            fields_dict["source_token_ids"] = ArrayField(
                np.array(source_token_ids))
            target_token_ids = source_and_target_token_ids[len(tokenized_source) - 2:]
            fields_dict["target_token_ids"] = ArrayField(
                np.array(target_token_ids))
        else:
            source_token_ids = self._tokens_to_ids(tokenized_source[1:-1])
            fields_dict["source_token_ids"] = ArrayField(
                np.array(source_token_ids))

        fields_dict["metadata"] = MetadataField(meta_fields)

        return Instance(fields_dict)
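``_tokens_to_ids`` gives the concatenated source and target tokens a shared id space so that the copy mechanism can line up repeated surface forms. Below is a sketch of what such a helper typically computes; the exact behaviour of the project's method is an assumption here.

from typing import Dict, List


def tokens_to_ids_sketch(tokens: List[str]) -> List[int]:
    """Assign each distinct surface form a small integer id, so the same word
    gets the same id whether it appears on the source or the target side."""
    ids: Dict[str, int] = {}
    return [ids.setdefault(token, len(ids)) for token in tokens]


# Source and target are concatenated before the call, so a repeated word such
# as "chat" gets one id for both occurrences (made-up tokens for illustration):
print(tokens_to_ids_sketch(["le", "chat", "@end@", "the", "cat", "chat"]))
# [0, 1, 2, 3, 4, 1]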
Example #26
    def text_to_instance(
            self,  # type: ignore
            question_text: str,
            passage_text: str,
            question_id: str = None,
            passage_id: str = None,
            answer_annotations: List[Dict] = None,
            passage_tokens: List[Token] = None) -> Union[Instance, None]:
        # pylint: disable=arguments-differ
        if not passage_tokens:
            passage_tokens = self._tokenizer.tokenize(passage_text)
            passage_tokens = split_tokens_by_hyphen(passage_tokens)
        question_tokens = self._tokenizer.tokenize(question_text)
        question_tokens = split_tokens_by_hyphen(question_tokens)
        if self.passage_length_limit is not None:
            passage_tokens = passage_tokens[:self.passage_length_limit]
        if self.question_length_limit is not None:
            question_tokens = question_tokens[:self.question_length_limit]

        answer_type: str = None
        answer_texts: List[str] = []
        if answer_annotations:
            # Currently we only use the first annotated answer here, but actually this doesn't affect
            # the training, because we only have one annotation for the train set.
            answer_type, answer_texts = self.extract_answer_info_from_annotation(
                answer_annotations[0])

        # Tokenize the answer text in order to find the matched span based on token
        tokenized_answer_texts = []
        for answer_text in answer_texts:
            answer_tokens = self._tokenizer.tokenize(answer_text)
            answer_tokens = split_tokens_by_hyphen(answer_tokens)
            tokenized_answer_texts.append(' '.join(token.text
                                                   for token in answer_tokens))

        if self.instance_format == "squad":
            valid_passage_spans = \
                self.find_valid_spans(passage_tokens, tokenized_answer_texts) if tokenized_answer_texts else []
            if not valid_passage_spans:
                if "passage_span" in self.skip_when_all_empty:
                    return None
                else:
                    valid_passage_spans.append(
                        (len(passage_tokens) - 1, len(passage_tokens) - 1))
            return make_reading_comprehension_instance(
                question_tokens,
                passage_tokens,
                self._token_indexers,
                passage_text,
                valid_passage_spans,
                # this `answer_texts` will not be used for evaluation
                answer_texts,
                additional_metadata={
                    "original_passage": passage_text,
                    "original_question": question_text,
                    "passage_id": passage_id,
                    "question_id": question_id,
                    "valid_passage_spans": valid_passage_spans,
                    "answer_annotations": answer_annotations
                })
        elif self.instance_format == "bert":
            question_concat_passage_tokens = question_tokens + [
                Token("[SEP]")
            ] + passage_tokens
            valid_passage_spans = []
            for span in self.find_valid_spans(passage_tokens,
                                              tokenized_answer_texts):
                # This span is for `question + [SEP] + passage`.
                valid_passage_spans.append(
                    (span[0] + len(question_tokens) + 1,
                     span[1] + len(question_tokens) + 1))
            if not valid_passage_spans:
                if "passage_span" in self.skip_when_all_empty:
                    return None
                else:
                    valid_passage_spans.append(
                        (len(question_concat_passage_tokens) - 1,
                         len(question_concat_passage_tokens) - 1))
            answer_info = {
                "answer_texts":
                answer_texts,  # this `answer_texts` will not be used for evaluation
                "answer_passage_spans": valid_passage_spans
            }
            return self.make_bert_drop_instance(question_tokens,
                                                passage_tokens,
                                                question_concat_passage_tokens,
                                                self._token_indexers,
                                                passage_text,
                                                answer_info,
                                                additional_metadata={
                                                    "original_passage":
                                                    passage_text,
                                                    "original_question":
                                                    question_text,
                                                    "passage_id":
                                                    passage_id,
                                                    "question_id":
                                                    question_id,
                                                    "answer_annotations":
                                                    answer_annotations
                                                })
        elif self.instance_format == "drop":
            numbers_in_passage = []
            number_indices = []
            for token_index, token in enumerate(passage_tokens):
                number = self.convert_word_to_number(token.text)
                if number is not None:
                    numbers_in_passage.append(number)
                    number_indices.append(token_index)
            # hack to guarantee minimal length of padded number
            numbers_in_passage.append(0)
            number_indices.append(-1)
            numbers_as_tokens = [
                Token(str(number)) for number in numbers_in_passage
            ]

            valid_passage_spans = \
                self.find_valid_spans(passage_tokens, tokenized_answer_texts) if tokenized_answer_texts else []
            valid_question_spans = \
                self.find_valid_spans(question_tokens, tokenized_answer_texts) if tokenized_answer_texts else []

            target_numbers = []
            # `answer_texts` is a list of valid answers.
            for answer_text in answer_texts:
                number = self.convert_word_to_number(answer_text)
                if number is not None:
                    target_numbers.append(number)
            valid_signs_for_add_sub_expressions: List[List[int]] = []
            valid_counts: List[int] = []
            if answer_type in ["number", "date"]:
                valid_signs_for_add_sub_expressions = self.find_valid_add_sub_expressions(
                    numbers_in_passage, target_numbers)
            if answer_type in ["number"]:
                # Currently we only support count number 0 ~ 9
                numbers_for_count = list(range(10))
                valid_counts = self.find_valid_counts(numbers_for_count,
                                                      target_numbers)

            type_to_answer_map = {
                "passage_span": valid_passage_spans,
                "question_span": valid_question_spans,
                "addition_subtraction": valid_signs_for_add_sub_expressions,
                "counting": valid_counts
            }

            if self.skip_when_all_empty \
                    and not any(type_to_answer_map[skip_type] for skip_type in self.skip_when_all_empty):
                return None

            answer_info = {
                "answer_texts":
                answer_texts,  # this `answer_texts` will not be used for evaluation
                "answer_passage_spans": valid_passage_spans,
                "answer_question_spans": valid_question_spans,
                "signs_for_add_sub_expressions":
                valid_signs_for_add_sub_expressions,
                "counts": valid_counts
            }

            return self.make_marginal_drop_instance(question_tokens,
                                                    passage_tokens,
                                                    numbers_as_tokens,
                                                    number_indices,
                                                    self._token_indexers,
                                                    passage_text,
                                                    answer_info,
                                                    additional_metadata={
                                                        "original_passage":
                                                        passage_text,
                                                        "original_question":
                                                        question_text,
                                                        "original_numbers":
                                                        numbers_in_passage,
                                                        "passage_id":
                                                        passage_id,
                                                        "question_id":
                                                        question_id,
                                                        "answer_info":
                                                        answer_info,
                                                        "answer_annotations":
                                                        answer_annotations
                                                    })
        else:
            raise ValueError(
                f"Expect the instance format to be \"drop\", \"squad\" or \"bert\", "
                f"but got {self.instance_format}")
Example #27
    def text_to_instance(
            self,  # type: ignore
            tokens: List[str],
            pos_tags: List[str] = None,
            gold_tree: Tree = None) -> Instance:
        """
        We take `pre-tokenized` input here, because we don't have a tokenizer in this class.

        Parameters
        ----------
        tokens : ``List[str]``, required.
            The tokens in a given sentence.
        pos_tags : ``List[str]``, optional (default = None).
            The POS tags for the words in the sentence.
        gold_tree : ``Tree``, optional (default = None).
            The gold parse tree to create span labels from.

        Returns
        -------
        An ``Instance`` containing the following fields:
            tokens : ``TextField``
                The tokens in the sentence.
            pos_tags : ``SequenceLabelField``
                The POS tags of the words in the sentence.
                Only returned if ``use_pos_tags`` is ``True``
            spans : ``ListField[SpanField]``
                A ListField containing all possible subspans of the
                sentence.
            span_labels : ``SequenceLabelField``, optional.
                The constituency tags for each of the possible spans, with
                respect to a gold parse tree. If a span is not contained
                within the tree, a span will have a ``NO-LABEL`` label.
            gold_tree : ``MetadataField(Tree)``
                The gold NLTK parse tree for use in evaluation.
        """
        # pylint: disable=arguments-differ
        text_field = TextField([Token(x) for x in tokens],
                               token_indexers=self._token_indexers)
        fields: Dict[str, Field] = {"tokens": text_field}

        if self._use_pos_tags and pos_tags is not None:
            pos_tag_field = SequenceLabelField(pos_tags, text_field,
                                               "pos_tags")
            fields["pos_tags"] = pos_tag_field
        elif self._use_pos_tags:
            raise ConfigurationError(
                "use_pos_tags was set to True but no gold pos"
                " tags were passed to the dataset reader.")
        spans: List[Field] = []
        gold_labels = []

        if gold_tree is not None:
            gold_spans_with_pos_tags: Dict[Tuple[int, int], str] = {}
            self._get_gold_spans(gold_tree, 0, gold_spans_with_pos_tags)
            gold_spans = {
                span: label
                for (span, label) in gold_spans_with_pos_tags.items()
                if "-POS" not in label
            }
        else:
            gold_spans = None
        for start, end in enumerate_spans(tokens):
            spans.append(SpanField(start, end, text_field))

            if gold_spans is not None:
                if (start, end) in gold_spans.keys():
                    gold_labels.append(gold_spans[(start, end)])
                else:
                    gold_labels.append("NO-LABEL")

        metadata = {"tokens": tokens}
        if gold_tree:
            metadata["gold_tree"] = gold_tree

        fields["metadata"] = MetadataField(metadata)

        span_list_field: ListField = ListField(spans)
        fields["spans"] = span_list_field
        if gold_tree is not None:
            fields["span_labels"] = SequenceLabelField(gold_labels,
                                                       span_list_field)
        return Instance(fields)
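``self._get_gold_spans`` walks the gold NLTK tree and records a label for every constituent span. A simplified sketch of that traversal is shown below (it skips POS-level spans, which the real method records with a "-POS" suffix and the reader then filters out); it requires ``nltk``.

from typing import Dict, Tuple

from nltk import Tree


def gold_spans_sketch(tree: Tree,
                      index: int,
                      spans: Dict[Tuple[int, int], str]) -> int:
    """Record an inclusive (start, end) -> label entry for every constituent
    and return the index just past this subtree."""
    if isinstance(tree[0], str):      # preterminal: a POS tag over one word
        return index + 1
    child_index = index
    for child in tree:
        child_index = gold_spans_sketch(child, child_index, spans)
    spans[(index, child_index - 1)] = tree.label()
    return child_index


spans: Dict[Tuple[int, int], str] = {}
tree = Tree.fromstring("(S (NP (DT the) (NN dog)) (VP (VBZ barks)))")
gold_spans_sketch(tree, 0, spans)
print(spans)   # {(0, 1): 'NP', (2, 2): 'VP', (0, 2): 'S'}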
Example #28
    def _read(self, file_path: str):
        file_path = cached_path(file_path)

        logger.info("Reading file at %s", file_path)
        with open(file_path) as dataset_file:
            dataset = json.load(dataset_file)

        # Predicted spans for each example, produced by the span extraction model.
        with open(self._span_file_path) as span_fp:
            span_file = json.load(span_fp)

        archive = load_archive(self._extraction_model_path)
        model = archive.model
        p1_dataset_reader = DatasetReader.from_params(
            archive.config["dataset_reader"])
        p1_token_indexers = p1_dataset_reader._token_indexers

        logger.info("Reading the dataset")
        for data, best_span in zip(dataset, span_file):
            answer = data['answers'][0]
            question = data['query']
            well_formed_answer = data['wellFormedAnswers'][0]
            passages_json = data['passages']
            passages = [
                passages_json[i]['passage_text']
                for i in range(len(passages_json))
            ]
            # passages_length = [len(p) for p in passages]
            passages_is_selected = [
                passages_json[i]['is_selected']
                for i in range(len(passages_json))
            ]
            # concatenated_passage = ' '.join(passages)
            tokenized_passages_list = [
                self._tokenizer.tokenize(util.normalize_text(p))
                for p in passages
            ]
            passages_length = [len(p) for p in tokenized_passages_list]
            cumulative_passages_length = np.cumsum(passages_length)

            normalized_answer = None
            if answer is not None:
                normalized_answer = util.normalize_text(answer)
            normalized_question = util.normalize_text(question)

            tokenized_answer = self._tokenizer.tokenize(normalized_answer)
            tokenized_question = self._tokenizer.tokenize(normalized_question)

            question_field = TextField(tokenized_question,
                                       self._token_indexers)
            fields = {'question': question_field}

            start_idx, end_idx, rouge_score, passage_idx = None, None, None, None

            tokenized_answer.insert(0, Token(START_SYMBOL))
            tokenized_answer.append(Token(END_SYMBOL))
            tokenized_passage = [
                token for sublist in tokenized_passages_list
                for token in sublist
            ]
            passage_field = TextField(tokenized_passage, self._token_indexers)
            fields['passage'] = passage_field

            p1_question_field = TextField(tokenized_question,
                                          p1_token_indexers)
            p1_passage_field = TextField(tokenized_passage, p1_token_indexers)
            p1_fields = {
                'question': p1_question_field,
                'passage': p1_passage_field
            }
            p1_instance = Instance(p1_fields)
            outputs = model.forward_on_instance(p1_instance, -1)

            start_idx = outputs['span_start_idx']
            end_idx = outputs['span_end_idx']
            for idx in range(len(cumulative_passages_length)):
                if start_idx < cumulative_passages_length[idx]:
                    break

            if idx != 0:
                start_idx = start_idx - cumulative_passages_length[idx - 1]
                end_idx = end_idx - cumulative_passages_length[idx - 1]

            assert start_idx <= end_idx, "Span prediction does not make sense!!!"

            # yield instance from predicted span
            span_start_field = IndexField(int(start_idx), passage_field)
            span_end_field = IndexField(int(end_idx), passage_field)
            answer_field = TextField(tokenized_answer, self._token_indexers)

            fields['passage'] = passage_field
            fields['span_start'] = span_start_field
            fields['span_end'] = span_end_field
            fields['answer'] = answer_field

            evidence = self.get_evidence(tokenized_passage, int(start_idx),
                                         int(end_idx))
            fields['metadata'] = MetadataField({
                'evidence': evidence,
                'question_text': normalized_question,
                'answer_text': normalized_answer
            })

            yield Instance(fields)

            # yield instances from gold spans
            for item in best_span:
                if item['score'] > 0.5:
                    passage_field = TextField(
                        tokenized_passages_list[item['passage']],
                        self._token_indexers)
                    span_start_field = IndexField(item['start'], passage_field)
                    span_end_field = IndexField(item['end'], passage_field)
                    answer_field = TextField(tokenized_answer,
                                             self._token_indexers)

                    fields['passage'] = passage_field
                    fields['span_start'] = span_start_field
                    fields['span_end'] = span_end_field
                    fields['answer'] = answer_field

                    evidence = self.get_evidence(
                        tokenized_passages_list[item['passage']],
                        int(start_idx), int(end_idx))
                    fields['metadata'] = MetadataField({
                        'evidence':
                        evidence,
                        'question_text':
                        normalized_question,
                        'answer_text':
                        normalized_answer
                    })

                    yield Instance(fields)
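The index arithmetic around ``cumulative_passages_length`` maps a span predicted over the concatenated passage back into the passage that contains it. The same computation, written as a standalone sketch with ``bisect``:

import bisect
from typing import List, Tuple


def localize_span(start_idx: int, end_idx: int,
                  passage_lengths: List[int]) -> Tuple[int, int, int]:
    """Map token indices over the concatenated passages back to indices inside
    the passage that contains the span start (the same arithmetic as the
    reader above, just written with ``bisect``)."""
    cumulative = []
    total = 0
    for length in passage_lengths:
        total += length
        cumulative.append(total)
    passage_idx = bisect.bisect_right(cumulative, start_idx)
    offset = cumulative[passage_idx - 1] if passage_idx > 0 else 0
    return passage_idx, start_idx - offset, end_idx - offset


# Passages of 5, 8 and 6 tokens; a span at global positions 9..11 lives in
# passage 1 at local positions 4..6.
print(localize_span(9, 11, [5, 8, 6]))   # (1, 4, 6)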