Ejemplo n.º 1
0
 def _add_target_prob_to_res(self, res, row_data):
     """Store the parsed target columns of ``row_data`` in ``res``.

     For each (``Target`` field, ``DFColumn`` column) pair below, the column
     value is read from ``row_data``, passed through ``parse_json_array`` and
     written to ``res`` under the field name. ``res`` is modified in place
     and nothing is returned.
     """
     field_to_column = (
         (Target.TARGET_PROB_FIELD, DFColumn.TARGET_PROBS),
         (Target.TARGET_LABEL_FIELD, DFColumn.TARGET_LABELS),
         (Target.TARGET_LOGITS_FIELD, DFColumn.TARGET_LOGITS),
     )
     # Entries are handled one at a time, in this order, so a failure on a
     # later column leaves the earlier fields already written to ``res``.
     for field_name, column in field_to_column:
         res[field_name] = parse_json_array(row_data[column])
Ejemplo n.º 2
0
    def preprocess_row(self, row_data: Dict[str, Any]) -> Dict[str, Any]:
        """Featurize every utterance of one row and assemble the output fields.

        The ``DFColumn.UTTERANCE`` value is parsed with
        ``data.parse_json_array`` and each resulting item is featurized on
        its own through ``self.featurizer``.

        Args:
            row_data: One input row keyed by column name. It must contain
                ``DFColumn.UTTERANCE`` and ``DFColumn.DOC_LABEL``.

        Returns:
            A dict with the per-utterance token lists
            (``DatasetFieldName.TEXT_FIELD``), the document label
            (``DatasetFieldName.DOC_LABEL_FIELD``) and the unparsed utterance
            column value (``DatasetFieldName.UTTERANCE_FIELD``).
        """
        utterances = data.parse_json_array(row_data[DFColumn.UTTERANCE])

        # Featurize everything first; tokens are collected afterwards.
        featurized = []
        for text in utterances:
            featurized.append(
                self.featurizer.featurize(InputRecord(raw_text=text))
            )

        output = {}
        # features
        output[DatasetFieldName.TEXT_FIELD] = [
            feature.tokens for feature in featurized
        ]
        # labels
        output[DatasetFieldName.DOC_LABEL_FIELD] = row_data[DFColumn.DOC_LABEL]
        output[DatasetFieldName.UTTERANCE_FIELD] = row_data[DFColumn.UTTERANCE]
        return output
Ejemplo n.º 3
0
    def preprocess_row(self, row_data: Dict[str, Any]) -> Dict[str, Any]:
        """Featurize every utterance of one row and assemble the output fields.

        The ``DFColumn.UTTERANCE`` value is parsed with
        ``data.parse_json_array`` and each resulting item is featurized on
        its own through ``self.featurizer``.

        Args:
            row_data: One input row keyed by column name. It must contain
                ``DFColumn.UTTERANCE`` and ``DFColumn.DOC_LABEL``;
                ``DFColumn.DENSE_FEAT`` is optional.

        Returns:
            A dict with the per-utterance token lists
            (``ModelInput.WORD_FEAT``), the document label
            (``DatasetFieldName.DOC_LABEL_FIELD``), the unparsed utterance
            column value (``DatasetFieldName.UTTERANCE_FIELD``) and, only
            when the row has a ``DFColumn.DENSE_FEAT`` entry, that value
            under ``ModelInput.DENSE_FEAT``.
        """
        utterances = data.parse_json_array(row_data[DFColumn.UTTERANCE])

        # Featurize everything first; tokens are collected afterwards.
        featurized = []
        for text in utterances:
            featurized.append(
                self.featurizer.featurize(InputRecord(raw_text=text))
            )

        res = {}
        # features
        res[ModelInput.WORD_FEAT] = [feature.tokens for feature in featurized]
        # labels
        res[DatasetFieldName.DOC_LABEL_FIELD] = row_data[DFColumn.DOC_LABEL]
        res[DatasetFieldName.UTTERANCE_FIELD] = row_data[DFColumn.UTTERANCE]

        # The dense feature is passed through untouched, and only if present.
        if DFColumn.DENSE_FEAT not in row_data:
            return res
        res[ModelInput.DENSE_FEAT] = row_data[DFColumn.DENSE_FEAT]
        return res
    def preprocess_row(self, row_data: Dict[str, Any]) -> Dict[str, Any]:
        """Turn one raw row into model inputs, labels and extra fields.

        The ``RawData.TEXT`` column is parsed into a sequence of utterances
        and every utterance is featurized. Only the last utterance is
        featurized together with the row's dictionary (gazetteer) features,
        and the word, dictionary, character and pretrained-embedding inputs
        are all taken from that last utterance.

        Args:
            row_data (Dict[str, Any]): One input row keyed by column name.
                ``RawData.TEXT``, ``RawData.DOC_LABEL`` and
                ``RawData.WORD_LABEL`` are read unconditionally;
                ``ModelInput.DICT``, ``RawData.DOC_WEIGHT`` and
                ``RawData.WORD_WEIGHT`` are optional.

        Returns:
            Dict[str, Any]: The preprocessed row with these keys:

                ``ModelInput.SEQ``
                    ``tokens`` of every utterance, in sequence order
                ``ModelInput.TEXT``
                    ``tokens`` of the last utterance
                ``ModelInput.DICT``
                    tuple of the last utterance's gazetteer features, their
                    weights and their lengths
                ``ModelInput.CHAR``
                    ``characters`` of the last utterance
                ``ModelInput.PRETRAINED``
                    ``pretrained_token_embedding`` of the last utterance
                ``DocLabelConfig._name``
                    the row's ``RawData.DOC_LABEL`` value
                ``ExtraField.DOC_WEIGHT`` / ``ExtraField.WORD_WEIGHT``
                    the row's weight value, or 1.0 when it is missing or
                    falsy
                ``ExtraField.RAW_WORD_LABEL``
                    the row's ``RawData.WORD_LABEL`` value
                ``ExtraField.UTTERANCE``
                    the unparsed ``RawData.TEXT`` value
                ``ExtraField.TOKEN_RANGE``
                    ``token_ranges`` of the last utterance
                ``WordLabelConfig._name`` (only when present in
                ``self.labels``)
                    result of ``data.align_slot_labels`` for the last
                    utterance's token ranges and the raw word label

        """
        utterances = data.parse_json_array(row_data[RawData.TEXT])

        # Context utterances (everything before the last one) are featurized
        # from their text alone, without dictionary features.
        featurized = []
        for context_text in utterances[:-1]:
            featurized.append(
                self.featurizer.featurize(InputRecord(raw_text=context_text))
            )

        # The last (current) utterance also carries the dictionary features.
        current_record = InputRecord(
            raw_text=utterances[-1],
            raw_gazetteer_feats=row_data.get(ModelInput.DICT, ""),
        )
        featurized.append(self.featurizer.featurize(current_record))
        current = featurized[-1]

        res = {
            # features
            ModelInput.SEQ: [feature.tokens for feature in featurized],
            ModelInput.TEXT: current.tokens,
            ModelInput.DICT: (
                current.gazetteer_feats,
                current.gazetteer_feat_weights,
                current.gazetteer_feat_lengths,
            ),
            ModelInput.CHAR: current.characters,
            ModelInput.PRETRAINED: current.pretrained_token_embedding,
            # labels
            DocLabelConfig._name: row_data[RawData.DOC_LABEL],
            # extra data
            # TODO move the logic to FloatField
            # Missing or falsy weights fall back to 1.0.
            ExtraField.DOC_WEIGHT: row_data.get(RawData.DOC_WEIGHT) or 1.0,
            ExtraField.WORD_WEIGHT: row_data.get(RawData.WORD_WEIGHT) or 1.0,
            ExtraField.RAW_WORD_LABEL: row_data[RawData.WORD_LABEL],
            ExtraField.UTTERANCE: row_data[RawData.TEXT],
            ExtraField.TOKEN_RANGE: current.token_ranges,
        }

        word_label_name = WordLabelConfig._name
        if word_label_name not in self.labels:
            return res

        # TODO move it into word label field
        res[word_label_name] = data.align_slot_labels(
            current.token_ranges,
            row_data[RawData.WORD_LABEL],
            self.labels[word_label_name].use_bio_labels,
        )
        return res