Code Example #1
    def text_to_instance(self, source_string: str, target_string: str = None) -> Instance:  # type: ignore
        """
        Turn raw source string and target string into an ``Instance``.

        Parameters
        ----------
        source_string : ``str``, required
        target_string : ``str``, optional (default = None)

        Returns
        -------
        Instance
            See the above for a description of the fields that the instance will contain.
        """
        # pylint: disable=arguments-differ
        tokenized_source = self._source_tokenizer.tokenize(source_string)
        tokenized_source.insert(0, Token(START_SYMBOL))
        tokenized_source.append(Token(END_SYMBOL))
        source_field = TextField(tokenized_source, self._source_token_indexers)

        # For each token in the source sentence, we keep track of the matching token
        # in the target sentence (which will be the OOV symbol if there is no match).
        source_to_target_field = NamespaceSwappingField(tokenized_source[1:-1], self._target_namespace)

        meta_fields = {"source_tokens": [x.text for x in tokenized_source[1:-1]]}
        fields_dict = {
                "source_tokens": source_field,
                "source_to_target": source_to_target_field,
        }

        if target_string is not None:
            tokenized_target = self._target_tokenizer.tokenize(target_string)
            tokenized_target.insert(0, Token(START_SYMBOL))
            tokenized_target.append(Token(END_SYMBOL))
            target_field = TextField(tokenized_target, self._target_token_indexers)

            fields_dict["target_tokens"] = target_field
            meta_fields["target_tokens"] = [y.text for y in tokenized_target[1:-1]]
            source_and_target_token_ids = self._tokens_to_ids(tokenized_source[1:-1] +
                                                              tokenized_target)
            source_token_ids = source_and_target_token_ids[:len(tokenized_source)-2]
            fields_dict["source_token_ids"] = ArrayField(np.array(source_token_ids))
            target_token_ids = source_and_target_token_ids[len(tokenized_source)-2:]
            fields_dict["target_token_ids"] = ArrayField(np.array(target_token_ids))
        else:
            source_token_ids = self._tokens_to_ids(tokenized_source[1:-1])
            fields_dict["source_token_ids"] = ArrayField(np.array(source_token_ids))

        fields_dict["metadata"] = MetadataField(meta_fields)

        return Instance(fields_dict)
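
Note: every example in this listing calls a _tokens_to_ids helper that is not shown in the snippets. In the AllenNLP CopyNet reader it simply assigns each distinct token string an id local to the instance, so the model can tell which source and target positions carry the same token. A minimal sketch of that behavior (the exact code in the individual projects may differ slightly):

    @staticmethod
    def _tokens_to_ids(tokens: List[Token]) -> List[int]:
        # Map each distinct token text to an instance-local id in order of
        # first appearance; shared source/target tokens get the same id.
        ids: Dict[str, int] = {}
        out: List[int] = []
        for token in tokens:
            out.append(ids.setdefault(token.text, len(ids)))
        return out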
Code Example #2
    def text_to_instance(
        self, source_string: str, target_string: str = None
    ) -> Instance:  # type: ignore
        """
        Turn raw source string and target string into an `Instance`.

        # Parameters

        source_string : `str`, required
        target_string : `str`, optional (default = None)

        # Returns

        Instance
            See the above for a description of the fields that the instance will contain.
        """

        tokenized_source = self._source_tokenizer.tokenize(source_string)
        if not tokenized_source:
            # If the tokenized source is empty, it will cause issues downstream.
            raise ValueError(f"source tokenizer produced no tokens from source '{source_string}'")
        source_field = TextField(tokenized_source, self._source_token_indexers)

        # For each token in the source sentence, we keep track of the matching token
        # in the target sentence (which will be the OOV symbol if there is no match).
        source_to_target_field = NamespaceSwappingField(tokenized_source, self._target_namespace)

        meta_fields = {"source_tokens": [x.text for x in tokenized_source]}
        fields_dict = {"source_tokens": source_field, "source_to_target": source_to_target_field}

        if target_string is not None:
            tokenized_target = self._target_tokenizer.tokenize(target_string)
            tokenized_target.insert(0, Token(START_SYMBOL))
            tokenized_target.append(Token(END_SYMBOL))
            target_field = TextField(tokenized_target, self._target_token_indexers)

            fields_dict["target_tokens"] = target_field
            meta_fields["target_tokens"] = [y.text for y in tokenized_target[1:-1]]
            source_and_target_token_ids = self._tokens_to_ids(tokenized_source + tokenized_target)
            source_token_ids = source_and_target_token_ids[: len(tokenized_source)]
            fields_dict["source_token_ids"] = ArrayField(np.array(source_token_ids))
            target_token_ids = source_and_target_token_ids[len(tokenized_source) :]
            fields_dict["target_token_ids"] = ArrayField(np.array(target_token_ids))
        else:
            source_token_ids = self._tokens_to_ids(tokenized_source)
            fields_dict["source_token_ids"] = ArrayField(np.array(source_token_ids))

        fields_dict["metadata"] = MetadataField(meta_fields)

        return Instance(fields_dict)
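
Note: none of the snippets include their imports. Under a typical AllenNLP setup the shared names resolve roughly as below; module paths shifted slightly between AllenNLP 0.x, 1.x and 2.x, so treat this as an assumption rather than a drop-in header. Later examples additionally use ListField, LabelField, SequenceLabelField, MultiLabelField and, in the newest variant, TensorField together with torch.

    from typing import Dict, List, Optional

    import numpy as np

    from allennlp.common.util import START_SYMBOL, END_SYMBOL
    from allennlp.data import Instance
    from allennlp.data.fields import (
        ArrayField,
        MetadataField,
        NamespaceSwappingField,
        TextField,
    )
    from allennlp.data.tokenizers import Token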
Code Example #3
    def text_to_instance(
            self,  # type: ignore
            rule_text: str,
            question: str,
            scenario: str,
            history: List[Dict[str, str]],
            utterance_id: str = None,
            tree_id: str = None,
            source_url: str = None,
            answer: str = None,
            evidence: List[Dict[str, str]] = None) -> Optional[Instance]:
        """
        Turn the rule text, question, scenario and follow-up history into an
        ``Instance`` containing both the CopyNet and the BiDAF fields.

        Parameters
        ----------
        rule_text : ``str``, required
        question : ``str``, required
        scenario : ``str``, required
        history : ``List[Dict[str, str]]``, required
        utterance_id : ``str``, optional (default = None)
        tree_id : ``str``, optional (default = None)
        source_url : ``str``, optional (default = None)
        answer : ``str``, optional (default = None)
        evidence : ``List[Dict[str, str]]``, optional (default = None)

        Returns
        -------
        Optional[Instance]
            See the above for a description of the fields that the instance will contain.
        """

        # For CopyNet Model
        source_string = rule_text
        for follow_up_qna in history:
            source_string += ' @@||@@ '
            source_string += follow_up_qna['follow_up_question']
        target_string = answer

        # pylint: disable=arguments-differ
        tokenized_source = self._source_tokenizer.tokenize(source_string)
        tokenized_source.insert(0, Token(START_SYMBOL))
        tokenized_source.append(Token(END_SYMBOL))
        source_field = TextField(tokenized_source, self._source_token_indexers)

        # For each token in the source sentence, we keep track of the matching token
        # in the target sentence (which will be the OOV symbol if there is no match).
        source_to_target_field = NamespaceSwappingField(
            tokenized_source[1:-1], self._target_namespace)

        meta_fields = {
            "source_tokens": [x.text for x in tokenized_source[1:-1]]
        }
        fields_dict = {
            "source_tokens": source_field,
            "source_to_target": source_to_target_field,
        }

        # For BiDAF model
        passage_text = rule_text
        question_text = question + ' @@||@@ ' + scenario
        for follow_up_qna in history:
            question_text += ' @@||@@ '
            question_text += follow_up_qna['follow_up_question']
            question_text += ' @@?@@ '
            question_text += follow_up_qna['follow_up_answer']

        passage_tokens = self._bidaf_tokenizer.tokenize(passage_text)
        question_tokens = self._bidaf_tokenizer.tokenize(question_text)

        fields_dict['passage'] = TextField(passage_tokens,
                                           self._bidaf_token_indexers)
        fields_dict['question'] = TextField(question_tokens,
                                            self._bidaf_token_indexers)

        if target_string is not None:
            tokenized_target = self._target_tokenizer.tokenize(target_string)
            tokenized_target.insert(0, Token(START_SYMBOL))
            tokenized_target.append(Token(END_SYMBOL))
            target_field = TextField(tokenized_target,
                                     self._target_token_indexers)

            fields_dict["target_tokens"] = target_field
            meta_fields["target_tokens"] = [
                y.text for y in tokenized_target[1:-1]
            ]
            source_and_target_token_ids = self._tokens_to_ids(
                tokenized_source[1:-1] + tokenized_target)
            source_token_ids = source_and_target_token_ids[:len(tokenized_source) - 2]
            fields_dict["source_token_ids"] = ArrayField(np.array(source_token_ids))
            target_token_ids = source_and_target_token_ids[len(tokenized_source) - 2:]
            fields_dict["target_token_ids"] = ArrayField(np.array(target_token_ids))

            action = 'More' if answer not in ['Yes', 'No', 'Irrelevant'] else answer
            fields_dict['label'] = LabelField(action)
        else:
            source_token_ids = self._tokens_to_ids(tokenized_source[1:-1])
            fields_dict["source_token_ids"] = ArrayField(
                np.array(source_token_ids))

        meta_fields['rule_text'] = rule_text
        meta_fields['question'] = question
        meta_fields['scenario'] = scenario
        meta_fields['history'] = history
        fields_dict["metadata"] = MetadataField(meta_fields)

        return Instance(fields_dict)
Code Example #4
    def text_to_instance(self, source_key: str, target_key: str = None, line_obj: Dict = {}) -> Instance:
        """
        Turn json object into an ``Instance``.
        Parameters
        ----------
        source_key : ``str``, required, json object key name of the source sequence
        target_key : ``str``, optional (default = None), json object key name of the target sequence
        line_obj : ``Dict``, required, json object containing the raw instance info
        Returns
        -------
        Instance
            See the above for a description of the fields that the instance will contain.
        """

        # Read source and target
        target_sequence = line_obj.get(target_key, None)
        lang_src_token = line_obj["src_lang"].upper()
        lang_tgt_token = line_obj["tgt_lang"].upper()

        # Read Predicate Indicator and make Array
        verb_label = [0, 0] + [1 if label[-2:] == "-V" else 0 for label in line_obj["BIO"]] + [0]

        # Read Language Indicator and make Array
        lang_src_ix = self._available_languages[lang_src_token]
        lang_tgt_ix = self._available_languages[lang_tgt_token]
        # This array goes to the encoder as a whole
        lang_src_ix_arr = [0, 0] + [lang_src_ix for tok in line_obj[source_key]] + [0]
        # This array goes to each one of the decoder_steps
        lang_tgt_ix_arr = lang_tgt_ix # is just int for step decoder dimensionality

        # Tokenize Source
        tokenized_source = list(map(Token, line_obj[source_key])) # Data comes already tokenized!
        tokenized_source.insert(0, Token(lang_tgt_token))
        tokenized_source.insert(0, Token(START_SYMBOL))
        tokenized_source.append(Token(END_SYMBOL))
        source_field = TextField(tokenized_source, self._source_token_indexers)

        # For each token in the source sentence, we keep track of the matching token
        # in the target sentence (which will be the OOV symbol if there is no match).
        source_to_target_field = NamespaceSwappingField(tokenized_source[1:-1], self._target_namespace)

        meta_fields = {"source_tokens": [x.text for x in tokenized_source[1:-1]]}
        fields_dict = {
                "source_tokens": source_field,
                "source_to_target": source_to_target_field,
        }

        # Process Target info during training...
        if target_sequence is not None:
            tokenized_target = list(map(Token, line_obj[target_key]))
            tokenized_target.insert(0, Token(START_SYMBOL))
            tokenized_target.append(Token(END_SYMBOL))
            target_field = TextField(tokenized_target, self._target_token_indexers)

            fields_dict["target_tokens"] = target_field
            meta_fields["target_tokens"] = [y.text for y in tokenized_target[1:-1]]
            source_and_target_token_ids = self._tokens_to_ids(tokenized_source[1:-1] +
                                                              tokenized_target)
            source_token_ids = source_and_target_token_ids[:len(tokenized_source)-2]
            fields_dict["source_token_ids"] = ArrayField(np.array(source_token_ids))
            target_token_ids = source_and_target_token_ids[len(tokenized_source)-2:]
            fields_dict["target_token_ids"] = ArrayField(np.array(target_token_ids))
        else:
            source_token_ids = self._tokens_to_ids(tokenized_source[1:-1])
            fields_dict["source_token_ids"] = ArrayField(np.array(source_token_ids))

        # Add Verb Indicator to the Fields
        fields_dict['verb_indicator'] = SequenceLabelField(verb_label, source_field)
        if all([x == 0 for x in verb_label]):
            verb = None
        else:
            verb = tokenized_source[verb_label.index(1)].text
        meta_fields["verb"] = verb

        # Add Language Indicator to the Fields
        meta_fields["src_lang"] = lang_src_token
        meta_fields["tgt_lang"] = lang_tgt_token
        meta_fields["original_BIO"] = line_obj.get("BIO", [])
        meta_fields["original_predicate_senses"] = line_obj.get("pred_sense_origin", [])
        meta_fields["predicate_senses"] = line_obj.get("pred_sense", [])
        meta_fields["original_target"] = line_obj.get("seq_tag_tokens", [])
        fields_dict['language_enc_indicator'] = ArrayField(np.array(lang_src_ix_arr))
        fields_dict['language_dec_indicator'] = ArrayField(np.array(lang_tgt_ix_arr))

        fields_dict["metadata"] = MetadataField(meta_fields)
        return Instance(fields_dict)
Code Example #5
    def text_to_instance(self,
            source: str,
            target: str = None
        ) -> Instance:
        def prepare_text(text, max_tokens):
            tokens = self.tokenizer.tokenize(text)[0:max_tokens]
            tokens.insert(0,Token(START_SYMBOL))
            tokens.append(Token(END_SYMBOL))

            return tokens

        # tokenize source sequence
        source_tokens = prepare_text(source,self.source_max_tokens)
        source_tokens_indexed = TextField(source_tokens, self.source_token_indexers)

        result = {'source_tokens': source_tokens_indexed}

        # meta_fields

        meta_fields = {}

        # copy

        if self.save_copy_fields:
            source_to_target_field = NamespaceSwappingField(source_tokens[1:-1],self.target_namespace)
            result['source_to_target'] = source_to_target_field
            meta_fields['source_tokens'] = [x.text for x in source_tokens[1:-1]]
        # pointer

        if self.save_pgn_fields:
            source_to_target_field = NamespaceSwappingField(source_tokens, self.target_namespace)
            result['source_to_target'] = source_to_target_field
            meta_fields['source_tokens'] = [x.text for x in source_tokens]

        if target:
            # target_tokens
            target_tokens = prepare_text(target,self.target_max_tokens)
            target_tokens_indexed = TextField(target_tokens,self.target_token_indexers)
            result['target_tokens'] = target_tokens_indexed

            if self.save_copy_fields:
                meta_fields['target_tokens'] = [y.text for y in target_tokens[1:-1]]
                source_and_target_token_ids = self._tokens_to_ids(source_tokens[1:-1] + target_tokens)
                source_token_ids = source_and_target_token_ids[:len(source_tokens)-2]
                result['source_token_ids'] = ArrayField(np.array(source_token_ids,dtype='long'))

                target_token_ids = source_and_target_token_ids[len(source_tokens)-2:]
                result['target_token_ids'] = ArrayField(np.array(target_token_ids,dtype='long'))


            if self.save_pgn_fields:
                meta_fields['target_tokens'] = [y.text for y in target_tokens]
                source_and_target_token_ids = self._tokens_to_ids(source_tokens + target_tokens)
                source_token_ids = source_and_target_token_ids[:len(source_tokens)]
                result['source_token_ids'] = ArrayField(np.array(source_token_ids,dtype='long'))

                target_token_ids = source_and_target_token_ids[len(source_tokens):]
                result['target_token_ids'] = ArrayField(np.array(target_token_ids,dtype='long'))

        elif self.save_copy_fields:
            source_token_ids = self._tokens_to_ids(source_tokens[1:-1])
            result['source_token_ids'] = ArrayField(np.array(source_token_ids))
        elif self.save_pgn_fields:
            source_token_ids = self._tokens_to_ids(source_tokens)
            result['source_token_ids'] = ArrayField(np.array(source_token_ids))

        if self.save_copy_fields or self.save_pgn_fields:
            result['metadata'] = MetadataField(meta_fields)

        return Instance(result)
Code Example #6
    def text_to_instance(self, source: str, target: str = None) -> Instance:
        def prepare_text(text, max_tokens):
            text = text.lower() if self._lowercase else text
            tokens = self._tokenizer.tokenize(text)[:max_tokens]
            tokens.insert(0, Token(START_SYMBOL))
            tokens.append(Token(END_SYMBOL))
            return tokens

        source_tokens = prepare_text(source, self._source_max_tokens)
        source_tokens_indexed = TextField(source_tokens,
                                          self._source_token_indexers)
        result = {'source_tokens': source_tokens_indexed}
        meta_fields = {}

        if self._save_copy_fields:
            source_to_target_field = NamespaceSwappingField(
                source_tokens[1:-1], self._target_namespace)
            result["source_to_target"] = source_to_target_field
            meta_fields["source_tokens"] = [
                x.text for x in source_tokens[1:-1]
            ]

        if self._save_pgn_fields:
            source_to_target_field = NamespaceSwappingField(
                source_tokens, self._target_namespace)
            result["source_to_target"] = source_to_target_field
            meta_fields["source_tokens"] = [x.text for x in source_tokens]

        if target:
            target_tokens = prepare_text(target, self._target_max_tokens)
            target_tokens_indexed = TextField(target_tokens,
                                              self._target_token_indexers)
            result['target_tokens'] = target_tokens_indexed

            if self._save_pgn_fields:
                meta_fields["target_tokens"] = [y.text for y in target_tokens]
                source_and_target_token_ids = self._tokens_to_ids(
                    source_tokens + target_tokens, self._lowercase)
                source_token_ids = source_and_target_token_ids[:len(
                    source_tokens)]
                result["source_token_ids"] = ArrayField(
                    np.array(source_token_ids, dtype='long'))
                target_token_ids = source_and_target_token_ids[
                    len(source_tokens):]
                result["target_token_ids"] = ArrayField(
                    np.array(target_token_ids, dtype='long'))

            if self._save_copy_fields:
                meta_fields["target_tokens"] = [
                    y.text for y in target_tokens[1:-1]
                ]
                source_and_target_token_ids = self._tokens_to_ids(
                    source_tokens[1:-1] + target_tokens, self._lowercase)
                source_token_ids = source_and_target_token_ids[:len(
                    source_tokens) - 2]
                result["source_token_ids"] = ArrayField(
                    np.array(source_token_ids))
                target_token_ids = source_and_target_token_ids[
                    len(source_tokens) - 2:]
                result["target_token_ids"] = ArrayField(
                    np.array(target_token_ids))

        elif self._save_copy_fields:
            source_token_ids = self._tokens_to_ids(source_tokens[1:-1],
                                                   self._lowercase)
            result["source_token_ids"] = ArrayField(np.array(source_token_ids))
        elif self._save_pgn_fields:
            source_token_ids = self._tokens_to_ids(source_tokens,
                                                   self._lowercase)
            result["source_token_ids"] = ArrayField(np.array(source_token_ids))
        if self._save_copy_fields or self._save_pgn_fields:
            result["metadata"] = MetadataField(meta_fields)
        return Instance(result)
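
Note: example #6 passes an extra lowercase flag into _tokens_to_ids. A plausible sketch of that two-argument variant, under the same assumptions as the sketch after example #1 (this is not the project's verified code):

    @staticmethod
    def _tokens_to_ids(tokens: List[Token], lowercase: bool = False) -> List[int]:
        # Optionally lowercase before deduplicating so that "The" and "the"
        # map to the same instance-local id.
        ids: Dict[str, int] = {}
        out: List[int] = []
        for token in tokens:
            text = token.text.lower() if lowercase else token.text
            out.append(ids.setdefault(text, len(ids)))
        return out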
Code Example #7
File: copy_seq2multiseq.py  Project: vaibhavad/imojie
    def text_to_instance(
            self,
            source_string: str,
            target_strings: List[str] = None,
            example_id: str = None,
            validation: bool = False,
            gradients: bool = False,
            confidences: List[float] = None) -> Optional[Instance]:  # type: ignore
        """
        Turn a raw source string and a list of target strings into an ``Instance``.

        Parameters
        ----------
        source_string : ``str``, required
        target_strings : ``List[str]``, optional (default = None)
        example_id : ``str``, optional (default = None)
        validation : ``bool``, optional (default = False)
        gradients : ``bool``, optional (default = False)
        confidences : ``List[float]``, optional (default = None)

        Returns
        -------
        Optional[Instance]
            See the above for a description of the fields that the instance will contain.
        """
        # pylint: disable=arguments-differ
        if target_strings is not None:
            target_strings += ['EOE']  ## End of extractions
            confidences += [1]

        if self._bert:
            source_string = bert_utils.replace_strings(source_string)
            if target_strings is not None:
                rep_target_strings = []
                for target_string in target_strings:
                    rep_target_strings.append(
                        bert_utils.replace_strings(target_string))
                target_strings = rep_target_strings

        tokenized_source = self._source_tokenizer.tokenize(source_string)
        tokenized_source.insert(0, Token(START_SYMBOL))
        tokenized_source.append(Token(END_SYMBOL))
        source_field = TextField(tokenized_source, self._source_token_indexers)

        # For each token in the source sentence, we keep track of the matching token
        # in the target sentence (which will be the OOV symbol if there is no match).
        source_to_target_field = NamespaceSwappingField(
            tokenized_source[1:-1], self._target_namespace)

        meta_fields = {
            "source_tokens": [x.text for x in tokenized_source[1:-1]],
            "example_ids": example_id,
            "validation": validation,
            "gradients": gradients,
            "confidences": confidences
        }
        fields_dict = {
            "source_tokens": source_field,
            "source_to_target": source_to_target_field,
        }

        if target_strings is not None:
            target_fields, tokenized_targets, source_token_idss, target_token_idss = [], [], [], []
            num_target_tokens = 0
            for i in range(len(target_strings)):
                tokenized_target = self._target_tokenizer.tokenize(
                    target_strings[i])
                tokenized_target.insert(0, Token(START_SYMBOL))
                tokenized_target.append(Token(END_SYMBOL))
                tokenized_targets.append(tokenized_target)
                num_target_tokens += len(tokenized_target)
                target_field = TextField(tokenized_target,
                                         self._target_token_indexers)
                target_fields.append(target_field)

                source_and_target_token_ids = self._tokens_to_ids(
                    tokenized_source[1:-1] + tokenized_target)

                source_token_ids = source_and_target_token_ids[:len(tokenized_source) - 2]

                target_token_ids = source_and_target_token_ids[len(tokenized_source) - 2:]
                target_token_idss.append(ArrayField(
                    np.array(target_token_ids)))

            fields_dict["target_tokens"] = ListField(target_fields)
            meta_fields["target_tokens"] = [[
                y.text for y in tokenized_target[1:-1]
            ] for tokenized_target in tokenized_targets]
            fields_dict["source_token_ids"] = ArrayField(
                np.array(source_token_ids))
            fields_dict["target_token_ids"] = ListField(target_token_idss)

            # confidences = np.array(confidences)
            # confidence_field = ArrayField(confidences)
            # fields_dict['confidences'] = confidence_field
        else:
            source_token_ids = self._tokens_to_ids(tokenized_source[1:-1])
            fields_dict["source_token_ids"] = ArrayField(
                np.array(source_token_ids))

        fields_dict["metadata"] = MetadataField(meta_fields)

        if (self._max_tokens is not None and target_strings is not None and
                len(tokenized_source) + num_target_tokens >= self._max_tokens):
            return None

        return Instance(fields_dict)
Code Example #8
    def text_to_instance(self,
                         document: List[str],
                         topics: List[str],
                         context: List[str],
                         cloze: Optional[str] = None) -> Instance:
        """
        Parameters
        ----------
        document:
            The list of document sentences.
        topics:
            The list of topics.
        context:
            The list of context sentences.
        cloze:
            The cloze string.
        """
        fields = {}

        # There is some weirdness that can happen if the document tokens are lowercased
        # but the context/cloze tokens are not (or vice versa). We will deal with that when
        # it's necessary. For now, we don't allow it.
        assert self.document_token_indexers['tokens'].lowercase_tokens == self.cloze_token_indexers['tokens'].lowercase_tokens
        assert self.context_token_indexers['tokens'].lowercase_tokens == self.cloze_token_indexers['tokens'].lowercase_tokens
        if self.document_token_indexers['tokens'].lowercase_tokens:
            document = [sentence.lower() for sentence in document]
            context = [sentence.lower() for sentence in context]
            if cloze is not None:
                cloze = cloze.lower()

        # Setup the document field
        tokenized_document = self.document_tokenizer.tokenize(document)
        if self.max_document_length is not None:
            tokenized_document = tokenized_document[:self.max_document_length]
        fields['document'] = TextField(tokenized_document, self.document_token_indexers)

        # Get the document token indices but in the cloze namespace
        fields['document_in_cloze_namespace'] = NamespaceSwappingField(tokenized_document, self.cloze_namespace)

        # Build a map from token to all of the indices that token appears
        document_token_to_indices = get_token_to_index_map(tokenized_document)

        # Get a field that, for every document token, has the first index within
        # the document that token appears
        fields['document_token_first_indices'] = \
            get_first_indices_field(tokenized_document, document_token_to_indices)

        # Setup the topics
        tokenized_topics = [self.topic_tokenizer.tokenize(topic) for topic in topics]
        topic_fields = [TextField(tokenized_topic, self.topic_token_indexers) for tokenized_topic in tokenized_topics]
        fields['topics'] = ListField(topic_fields)

        # Setup the context
        tokenized_context = self.context_tokenizer.tokenize(context)
        if self.max_context_length is not None:
            # We take the last tokens instead of the first because the cloze
            # comes immediately after the context
            tokenized_context = tokenized_context[-self.max_context_length:]
        fields['context'] = TextField(tokenized_context, self.context_token_indexers)

        context_token_document_indices_field, mask_field = \
            get_token_mapping_field(document_token_to_indices, tokenized_context)
        fields['context_token_document_indices'] = context_token_document_indices_field
        fields['context_token_document_indices_mask'] = mask_field

        # Setup the cloze field, if it exists
        if cloze is not None:
            tokenized_cloze = self.cloze_tokenizer.tokenize(cloze)
            if self.max_cloze_length is not None:
                tokenized_cloze = tokenized_cloze[:self.max_cloze_length]
            fields['cloze'] = TextField(tokenized_cloze, self.cloze_token_indexers)

            cloze_token_document_indices_field, mask_field = \
                get_token_mapping_field(document_token_to_indices, tokenized_cloze)
            fields['cloze_token_document_indices'] = cloze_token_document_indices_field
            fields['cloze_token_document_indices_mask'] = mask_field

        # Pass the original data through as metadata
        metadata = {}
        metadata['document'] = document
        metadata['document_tokens'] = [str(token) for token in tokenized_document]
        metadata['topics'] = topics
        metadata['context'] = context
        if cloze is not None:
            metadata['cloze'] = cloze
        fields['metadata'] = MetadataField(metadata)

        return Instance(fields)
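
Note: example #8 depends on project-specific helpers (get_token_to_index_map, get_first_indices_field, get_token_mapping_field) that are not reproduced here. Going only by the inline comment ("Build a map from token to all of the indices that token appears"), the first one plausibly looks like the hypothetical sketch below; the other two are omitted because their contracts cannot be inferred as safely.

    from collections import defaultdict
    from typing import Dict, List

    from allennlp.data.tokenizers import Token

    def get_token_to_index_map(tokens: List[Token]) -> Dict[str, List[int]]:
        # Hypothetical reconstruction: map each token text to every position
        # at which it occurs in the tokenized document.
        token_to_indices: Dict[str, List[int]] = defaultdict(list)
        for index, token in enumerate(tokens):
            token_to_indices[token.text].append(index)
        return dict(token_to_indices)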
Code Example #9
File: multiwoz.py  Project: voidforall/DialSummar
    def text_to_instance(
            self,
            user: List[str],
            system: List[str],
            domains: List[str],
            usr_value_dict: Dict[int, str],
            sys_value_dict: Dict[int, str],
            # acts: List[List[str]],
            target: List[str] = None) -> Instance:

        fields_dict: Dict[str, Field] = {}

        # Note that it's a non-hierarchical model,
        # so the user/system/target are all TextField
        user_string = " ".join(user)
        tokenized_user = self._tokenizer.tokenize(user_string)
        tokenized_user.insert(0, Token(START_SYMBOL))
        tokenized_user.append(Token(END_SYMBOL))
        user_field = TextField(tokenized_user, self._token_indexers)
        fields_dict["user_tokens"] = user_field

        sys_string = " ".join(system)
        tokenized_sys = self._tokenizer.tokenize(sys_string)
        tokenized_sys.insert(0, Token(START_SYMBOL))
        tokenized_sys.append(Token(END_SYMBOL))
        sys_field = TextField(tokenized_sys, self._token_indexers)
        fields_dict["sys_tokens"] = sys_field

        # For each token in the source sentence, we keep track of the matching token
        # in the target sentence (which will be the OOV symbol) if there is no match
        # p.s. separate matching of user and system. wait, do they need separate namespace?
        user_to_target_field = NamespaceSwappingField(tokenized_user,
                                                      self._target_namespace)
        sys_to_target_field = NamespaceSwappingField(tokenized_sys,
                                                     self._target_namespace)
        fields_dict["user_to_target"] = user_to_target_field
        fields_dict["sys_to_target"] = sys_to_target_field

        meta_fields = {
            "user_tokens": [x.text for x in tokenized_user],
            "sys_tokens": [x.text for x in tokenized_sys],
            "user_values_dict": usr_value_dict,
            "sys_values_dict": sys_value_dict
        }

        # add: generate the mask of "delex" slots
        usr_mask = np.zeros(len(user_field))
        for k in usr_value_dict.keys():
            usr_mask[k] = 1
        fields_dict["user_value_mask"] = ArrayField(usr_mask)
        sys_mask = np.zeros(len(sys_field))
        for k in sys_value_dict.keys():
            sys_mask[k] = 1
        fields_dict["sys_value_mask"] = ArrayField(sys_mask)

        if target is not None:
            target_string = " ".join(target)
            tokenized_target = self._tokenizer.tokenize(target_string)
            tokenized_target.insert(0, Token(START_SYMBOL))
            tokenized_target.append(Token(END_SYMBOL))
            target_field = TextField(tokenized_target,
                                     self._target_token_indexer)

            fields_dict["target_tokens"] = target_field
            meta_fields["target_tokens"] = [y.text for y in tokenized_target]

            user_token_ids = self._tokens_to_ids(tokenized_user)
            sys_token_ids = self._tokens_to_ids(tokenized_sys)
            target_token_ids = self._tokens_to_ids(tokenized_target)
            fields_dict["user_token_ids"] = ArrayField(
                np.array(user_token_ids))
            fields_dict["sys_token_ids"] = ArrayField(np.array(sys_token_ids))
            fields_dict["target_token_ids"] = ArrayField(
                np.array(target_token_ids))

        else:
            user_token_ids = self._tokens_to_ids(tokenized_user)
            sys_token_ids = self._tokens_to_ids(tokenized_sys)
            fields_dict["user_token_ids"] = ArrayField(
                np.array(user_token_ids))
            fields_dict["sys_token_ids"] = ArrayField(np.array(sys_token_ids))

        domain_field = MultiLabelField(domains,
                                       label_namespace="domain_labels")
        fields_dict["domain_labels"] = domain_field

        fields_dict["metadata"] = MetadataField(meta_fields)

        return Instance(fields_dict)
Code Example #10
    def text_to_instance(
        self,
        source_string: str,
        target_string: str = None,
        weight: float = None,
    ) -> Instance:  # type: ignore
        """
        Turn raw source string and target string into an `Instance`.

        # Parameters

        source_string : `str`, required

        target_string : `str`, optional (default = `None`)

        weight : `float`, optional (default = `None`)
            An optional weight to assign to this instance when calculating the loss in
            [CopyNetSeq2Seq.forward()](../../models/copynet_seq2seq/#forward.parameters).

        # Returns

        `Instance`
            See the above for a description of the fields that the instance will contain.
        """

        tokenized_source = self._source_tokenizer.tokenize(source_string)
        if not tokenized_source:
            # If the tokenized source is empty, it will cause issues downstream.
            raise ValueError(
                f"source tokenizer produced no tokens from source '{source_string}'"
            )
        source_field = TextField(tokenized_source)

        # For each token in the source sentence, we keep track of the matching token
        # in the target sentence (which will be the OOV symbol if there is no match).
        source_to_target_field = NamespaceSwappingField(
            tokenized_source, self._target_namespace)

        meta_fields = {"source_tokens": [x.text for x in tokenized_source]}
        fields_dict = {
            "source_tokens": source_field,
            "source_to_target": source_to_target_field
        }

        if target_string is not None:
            tokenized_target = self._target_tokenizer.tokenize(target_string)
            tokenized_target.insert(0, Token(START_SYMBOL))
            tokenized_target.append(Token(END_SYMBOL))
            target_field = TextField(tokenized_target)

            fields_dict["target_tokens"] = target_field
            meta_fields["target_tokens"] = [
                y.text for y in tokenized_target[1:-1]
            ]
            source_and_target_token_ids = self._tokens_to_ids(
                tokenized_source + tokenized_target)
            source_token_ids = source_and_target_token_ids[:len(tokenized_source)]
            fields_dict["source_token_ids"] = TensorField(torch.tensor(source_token_ids))
            target_token_ids = source_and_target_token_ids[len(tokenized_source):]
            fields_dict["target_token_ids"] = TensorField(
                torch.tensor(target_token_ids))
        else:
            source_token_ids = self._tokens_to_ids(tokenized_source)
            fields_dict["source_token_ids"] = TensorField(
                torch.tensor(source_token_ids))

        fields_dict["metadata"] = MetadataField(meta_fields)

        if weight is not None:
            fields_dict["weight"] = TensorField(
                torch.tensor(float(weight), dtype=torch.float))

        return Instance(fields_dict)
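
Note: in example #10 the TextFields are built without token indexers (TextField(tokenized_source) rather than TextField(tokenized_source, self._source_token_indexers)). That is the AllenNLP 2.x convention: the reader attaches indexers later in apply_token_indexers so instances stay cheap to pickle for multi-process data loading. A sketch of that companion method, following the pattern used by allennlp-models readers and assuming the same attribute names as above:

    def apply_token_indexers(self, instance: Instance) -> None:
        # Attach the indexers that text_to_instance deliberately left off.
        instance.fields["source_tokens"]._token_indexers = self._source_token_indexers
        if "target_tokens" in instance.fields:
            instance.fields["target_tokens"]._token_indexers = self._target_token_indexers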
Code Example #11
File: ps_pipeline.py  Project: pombredanne/UrcaNet
    def text_to_instance(self,
                         rule_text,
                         question,
                         scenario,
                         history,
                         answer=None,
                         evidence=None) -> Optional[Instance]:  # type: ignore
        """
        Turn the rule text, question, scenario and follow-up history into an
        ``Instance``, or return ``None`` for Yes/No/Irrelevant answers or when
        no usable target span can be found.

        Parameters
        ----------
        rule_text : ``str``, required
        question : ``str``, required
        scenario : ``str``, required
        history : ``List[Dict[str, str]]``, required
        answer : ``str``, optional (default = None)
        evidence : optional (default = None)

        Returns
        -------
        Optional[Instance]
            See the above for a description of the fields that the instance will contain.
        """
        # pylint: disable=arguments-differ

        if answer and answer in ['Yes', 'No', 'Irrelevant']:
            return None

        predicted_span, predicted_label = self.get_prediction(
            rule_text, question, scenario, history)

        if answer is not None:  # while training and validation
            token_span = self.dataset_reader.find_lcs(
                answer,
                predicted_span,
                self._source_tokenizer.tokenize,
                fuzzy_matching=False)
            if token_span is None:
                return None

            answer_offsets = [
                (token.idx, token.idx + len(token.text))
                for token in self._source_tokenizer.tokenize(answer)
            ]
            try:
                target_string1 = answer[:answer_offsets[token_span[0] - 1][1]]
                target_string2 = answer[answer_offsets[token_span[1] + 1][0]:]
            except IndexError:
                return None
        else:
            target_string1 = None
            target_string2 = None

        if self.add_rule:
            if self.embed_span:
                source_string = self.get_embedded_span(rule_text,
                                                       predicted_span)
            else:
                source_string = rule_text + ' @pss@ ' + predicted_span + ' @pse@'
        else:
            source_string = predicted_span
        if self.add_question:
            source_string += ' @qs@ ' + question + ' @qe'
        if self.add_followup_ques:
            for follow_up_qna in history:
                source_string += ' @fs@ ' + follow_up_qna[
                    'follow_up_question'] + ' @fe'

        tokenized_source = self._source_tokenizer.tokenize(source_string)
        tokenized_source.insert(0, Token(START_SYMBOL))
        tokenized_source.append(Token(END_SYMBOL))
        source_field = TextField(tokenized_source, self._source_token_indexers)

        # For each token in the source sentence, we keep track of the matching token
        # in the target sentence (which will be the OOV symbol if there is no match).
        source_to_target_field = NamespaceSwappingField(
            tokenized_source[1:-1], self._target_namespace)

        meta_fields = {
            "source_tokens": [x.text for x in tokenized_source[1:-1]],
            "predicted_span_tokens": [
                token.text
                for token in self._source_tokenizer.tokenize(predicted_span)
            ]
        }
        fields_dict = {
            "source_tokens": source_field,
            "source_to_target": source_to_target_field,
        }

        if target_string1 is not None and target_string2 is not None:
            tokenized_target1 = self._target_tokenizer.tokenize(target_string1)
            tokenized_target1.insert(0, Token(START_SYMBOL))
            tokenized_target1.append(Token(END_SYMBOL))

            tokenized_target2 = self._target_tokenizer.tokenize(target_string2)
            tokenized_target2.insert(0, Token(START_SYMBOL))
            tokenized_target2.append(Token(END_SYMBOL))

            target_field1 = TextField(tokenized_target1,
                                      self._target_token_indexers)
            target_field2 = TextField(tokenized_target2,
                                      self._target_token_indexers)

            fields_dict["target_tokens1"] = target_field1
            fields_dict["target_tokens2"] = target_field2

            meta_fields["target_tokens1"] = [
                y.text for y in tokenized_target1[1:-1]
            ]
            meta_fields["target_tokens2"] = [
                y.text for y in tokenized_target2[1:-1]
            ]

            source_and_target_token_ids1 = self._tokens_to_ids(
                tokenized_source[1:-1] + tokenized_target1)
            source_and_target_token_ids2 = self._tokens_to_ids(
                tokenized_source[1:-1] + tokenized_target2)

            source_token_ids1 = np.array(
                source_and_target_token_ids1[:len(tokenized_source) - 2])
            source_token_ids2 = np.array(
                source_and_target_token_ids2[:len(tokenized_source) - 2])
            assert np.array_equal(source_token_ids1, source_token_ids2)
            fields_dict["source_token_ids"] = ArrayField(source_token_ids1)

            target_token_ids1 = np.array(
                source_and_target_token_ids1[len(tokenized_source) - 2:])
            target_token_ids2 = np.array(
                source_and_target_token_ids2[len(tokenized_source) - 2:])

            fields_dict["target_token_ids1"] = ArrayField(target_token_ids1)
            fields_dict["target_token_ids2"] = ArrayField(target_token_ids2)
        else:
            source_token_ids = self._tokens_to_ids(tokenized_source[1:-1])
            fields_dict["source_token_ids"] = ArrayField(
                np.array(source_token_ids))

        meta_fields['label'] = predicted_label
        fields_dict["metadata"] = MetadataField(meta_fields)

        return Instance(fields_dict)
Code Example #12
    def text_to_instance(
            self,  # type: ignore
            rule_text: str,
            question: str,
            scenario: str,
            history: List[Dict[str, str]],
            utterance_id: str = None,
            tree_id: str = None,
            source_url: str = None,
            answer: str = None,
            evidence: List[Dict[str, str]] = None) -> Optional[Instance]:
        """
        Turn the rule text, question, scenario and follow-up history into an
        ``Instance``, marking the span predicted by ``self.predictor`` inside
        the rule text.

        Parameters
        ----------
        rule_text : ``str``, required
        question : ``str``, required
        scenario : ``str``, required
        history : ``List[Dict[str, str]]``, required
        utterance_id : ``str``, optional (default = None)
        tree_id : ``str``, optional (default = None)
        source_url : ``str``, optional (default = None)
        answer : ``str``, optional (default = None)
        evidence : ``List[Dict[str, str]]``, optional (default = None)

        Returns
        -------
        Optional[Instance]
            See the above for a description of the fields that the instance will contain.
        """

        utterance = {'snippet': rule_text, 'question': question,
                     'scenario': scenario, 'history': history}
        span = self.predictor.predict_json(utterance)['best_span_str']
        span_size = len(span)
        span_start_index = rule_text.find(span)
        if span_start_index != -1:
            source_string = ''
            source_string += rule_text[:span_start_index] + '@@**@@ ' + span
            source_string += ' @@**@@ ' + rule_text[span_start_index +
                                                    span_size:]
        else:
            source_string = rule_text
            print('Can\'t find span.')

        source_string += ' @@||@@ ' + question
        for follow_up_qna in history:
            source_string += ' @@||@@ '
            source_string += follow_up_qna['follow_up_question']
            source_string += ' @@?@@ '
            source_string += follow_up_qna['follow_up_answer']
        target_string = answer

        # pylint: disable=arguments-differ
        tokenized_source = self._source_tokenizer.tokenize(source_string)
        tokenized_source.insert(0, Token(START_SYMBOL))
        tokenized_source.append(Token(END_SYMBOL))
        source_field = TextField(tokenized_source, self._source_token_indexers)

        # For each token in the source sentence, we keep track of the matching token
        # in the target sentence (which will be the OOV symbol if there is no match).
        source_to_target_field = NamespaceSwappingField(
            tokenized_source[1:-1], self._target_namespace)

        meta_fields = {
            "source_tokens": [x.text for x in tokenized_source[1:-1]]
        }
        fields_dict = {
            "source_tokens": source_field,
            "source_to_target": source_to_target_field,
        }

        if target_string is not None:
            tokenized_target = self._target_tokenizer.tokenize(target_string)
            tokenized_target.insert(0, Token(START_SYMBOL))
            tokenized_target.append(Token(END_SYMBOL))
            target_field = TextField(tokenized_target,
                                     self._target_token_indexers)

            fields_dict["target_tokens"] = target_field
            meta_fields["target_tokens"] = [
                y.text for y in tokenized_target[1:-1]
            ]
            source_and_target_token_ids = self._tokens_to_ids(
                tokenized_source[1:-1] + tokenized_target)
            source_token_ids = source_and_target_token_ids[:len(tokenized_source) - 2]
            fields_dict["source_token_ids"] = ArrayField(np.array(source_token_ids))
            target_token_ids = source_and_target_token_ids[len(tokenized_source) - 2:]
            fields_dict["target_token_ids"] = ArrayField(np.array(target_token_ids))
        else:
            source_token_ids = self._tokens_to_ids(tokenized_source[1:-1])
            fields_dict["source_token_ids"] = ArrayField(
                np.array(source_token_ids))

        meta_fields['rule_text'] = rule_text
        meta_fields['question'] = question
        meta_fields['scenario'] = scenario
        meta_fields['history'] = history
        fields_dict["metadata"] = MetadataField(meta_fields)

        return Instance(fields_dict)
Code Example #13
    def text_to_instance(self, rule_text, question, scenario, history, answer=None, evidence=None) -> Optional[Instance]:  # type: ignore
        """
        Turn the rule text, question, scenario and follow-up history into an
        ``Instance``, or return ``None`` for Yes/No/Irrelevant answers or when
        no target span can be found.

        Parameters
        ----------
        rule_text : ``str``, required
        question : ``str``, required
        scenario : ``str``, required
        history : ``List[Dict[str, str]]``, required
        answer : ``str``, optional (default = None)
        evidence : optional (default = None)

        Returns
        -------
        Optional[Instance]
            See the above for a description of the fields that the instance will contain.
        """
        # pylint: disable=arguments-differ

        if answer and answer in ['Yes', 'No', 'Irrelevant']:
            return None
        target_string = answer

        if self.train_using_gold and answer is not None: # i.e. during training and validation
            predicted_label = answer if answer in ['Yes', 'No', 'Irrelevant'] else 'More'
            predicted_span_ixs = self.dataset_reader.find_lcs(rule_text, answer, self._source_tokenizer.tokenize)
            if predicted_span_ixs is None:
                return None
            else:
                rule_offsets = [(token.idx, token.idx + len(token.text)) for token in self._source_tokenizer.tokenize(rule_text)]
                predicted_span = rule_text[rule_offsets[predicted_span_ixs[0]][0]: rule_offsets[predicted_span_ixs[1]][1]]
        else:
            predicted_span, predicted_label = self.get_prediction(rule_text, question, scenario, history)

        if self.add_rule:
            if self.embed_span:
                source_string = self.get_embedded_span(rule_text, predicted_span)
            else:
                source_string = rule_text + ' @pss@ ' + predicted_span + ' @pse@'
        else:
            source_string = predicted_span
        if self.add_question:
            source_string += ' @qs@ ' + question + ' @qe'
        if self.add_followup_ques:
            for follow_up_qna in history:
                source_string += ' @fs@ ' + follow_up_qna['follow_up_question'] + ' @fe'

        tokenized_source = self._source_tokenizer.tokenize(source_string)
        tokenized_source.insert(0, Token(START_SYMBOL))
        tokenized_source.append(Token(END_SYMBOL))
        source_field = TextField(tokenized_source, self._source_token_indexers)

        # For each token in the source sentence, we keep track of the matching token
        # in the target sentence (which will be the OOV symbol if there is no match).
        source_to_target_field = NamespaceSwappingField(tokenized_source[1:-1], self._target_namespace)

        meta_fields = {"source_tokens": [x.text for x in tokenized_source[1:-1]]}
        fields_dict = {
                "source_tokens": source_field,
                "source_to_target": source_to_target_field,
        }

        if target_string is not None:
            tokenized_target = self._target_tokenizer.tokenize(target_string)
            tokenized_target.insert(0, Token(START_SYMBOL))
            tokenized_target.append(Token(END_SYMBOL))
            target_field = TextField(tokenized_target, self._target_token_indexers)

            fields_dict["target_tokens"] = target_field
            meta_fields["target_tokens"] = [y.text for y in tokenized_target[1:-1]]
            source_and_target_token_ids = self._tokens_to_ids(tokenized_source[1:-1] +
                                                              tokenized_target)
            source_token_ids = source_and_target_token_ids[:len(tokenized_source)-2]
            fields_dict["source_token_ids"] = ArrayField(np.array(source_token_ids))
            target_token_ids = source_and_target_token_ids[len(tokenized_source)-2:]
            fields_dict["target_token_ids"] = ArrayField(np.array(target_token_ids))
        else:
            source_token_ids = self._tokens_to_ids(tokenized_source[1:-1])
            fields_dict["source_token_ids"] = ArrayField(np.array(source_token_ids))

        meta_fields['label'] = predicted_label
        fields_dict["metadata"] = MetadataField(meta_fields)

        return Instance(fields_dict)
Code Example #14
    def text_to_instance(
            self,  # type: ignore
            rule_text: str,
            question: str,
            scenario: str,
            history: List[Dict[str, str]],
            utterance_id: str = None,
            tree_id: str = None,
            source_url: str = None,
            answer: str = None,
            evidence: List[Dict[str, str]] = None) -> Optional[Instance]:
        """
        Turn the rule text, question, scenario and follow-up history into an
        ``Instance`` containing both the CopyNet and the BERT input fields.

        Parameters
        ----------
        rule_text : ``str``, required
        question : ``str``, required
        scenario : ``str``, required
        history : ``List[Dict[str, str]]``, required
        utterance_id : ``str``, optional (default = None)
        tree_id : ``str``, optional (default = None)
        source_url : ``str``, optional (default = None)
        answer : ``str``, optional (default = None)
        evidence : ``List[Dict[str, str]]``, optional (default = None)

        Returns
        -------
        Optional[Instance]
            See the above for a description of the fields that the instance will contain.
        """

        # For CopyNet Model
        source_string = rule_text + ' [SEP]'
        target_string = answer

        # pylint: disable=arguments-differ
        tokenized_source = self._source_tokenizer.tokenize(source_string)
        tokenized_source.insert(0, Token(START_SYMBOL))
        # tokenized_source.append(Token(END_SYMBOL))  # not needed: the trailing ' [SEP]' token acts as the end symbol
        source_field = TextField(tokenized_source, self._source_token_indexers)

        # For each token in the source sentence, we keep track of the matching token
        # in the target sentence (which will be the OOV symbol if there is no match).
        source_to_target_field = NamespaceSwappingField(
            tokenized_source[1:-1], self._target_namespace)

        meta_fields = {
            "source_tokens": [x.text for x in tokenized_source[1:-1]]
        }
        fields_dict = {
            "source_tokens": source_field,
            "source_to_target": source_to_target_field,
        }

        # For Bert model
        passage_text1 = rule_text + ' [SEP]'
        question_text1 = question

        passage_text2 = rule_text + ' [SEP]'
        question_text2 = scenario

        bert_input1 = passage_text1 + ' ' + question_text1
        bert_input2 = passage_text2 + ' ' + question_text2

        bert_input_tokens1 = self.get_tokens_with_history_encoding(
            bert_input1, history)
        bert_input_tokens2 = self._bert_tokenizer.tokenize(bert_input2)
        bert_input_tokens1.insert(0, Token(START_SYMBOL))
        bert_input_tokens2.insert(0, Token(START_SYMBOL))
        fields_dict['bert_input1'] = TextField(bert_input_tokens1,
                                               self._bert_token_indexers)
        fields_dict['bert_input2'] = TextField(bert_input_tokens2,
                                               self._bert_token_indexers)
        meta_fields['passage_tokens1'] = self._bert_tokenizer.tokenize(
            passage_text1)
        meta_fields['passage_tokens2'] = self._bert_tokenizer.tokenize(
            passage_text2)

        if target_string is not None:
            tokenized_target = self._target_tokenizer.tokenize(target_string)
            tokenized_target.insert(0, Token(START_SYMBOL))
            tokenized_target.append(Token(END_SYMBOL))
            target_field = TextField(tokenized_target,
                                     self._target_token_indexers)

            fields_dict["target_tokens"] = target_field
            meta_fields["target_tokens"] = [
                y.text for y in tokenized_target[1:-1]
            ]
            source_and_target_token_ids = self._tokens_to_ids(
                tokenized_source[1:-1] + tokenized_target)
            source_token_ids = source_and_target_token_ids[:len(tokenized_source) - 2]
            fields_dict["source_token_ids"] = ArrayField(np.array(source_token_ids))
            target_token_ids = source_and_target_token_ids[len(tokenized_source) - 2:]
            fields_dict["target_token_ids"] = ArrayField(np.array(target_token_ids))

            action = 'More' if answer not in ['Yes', 'No', 'Irrelevant'] else answer
            fields_dict['label'] = LabelField(action)
        else:
            source_token_ids = self._tokens_to_ids(tokenized_source[1:-1])
            fields_dict["source_token_ids"] = ArrayField(
                np.array(source_token_ids))

        meta_fields['rule_text'] = rule_text
        meta_fields['question'] = question
        meta_fields['scenario'] = scenario
        meta_fields['history'] = history
        fields_dict["metadata"] = MetadataField(meta_fields)

        return Instance(fields_dict)
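
Note: as a closing orientation, a minimal driver for the plain CopyNet variants (examples #1, #2 and #10) might look like the following. The class and module names are the allennlp-models ones and are an assumption here; the project-specific readers in the other examples take different constructor arguments and text_to_instance parameters.

    from allennlp.data.tokenizers import WhitespaceTokenizer
    from allennlp_models.generation import CopyNetDatasetReader

    reader = CopyNetDatasetReader(
        target_namespace="target_tokens",
        source_tokenizer=WhitespaceTokenizer(),
    )
    instance = reader.text_to_instance(
        source_string="copy these rare tokens",
        target_string="these rare tokens were copied",
    )
    # Expected fields: source_tokens, source_to_target, source_token_ids,
    # target_tokens, target_token_ids, metadata.
    print(sorted(instance.fields.keys()))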