Beispiel #1
0
 def test_align_slot_labels(self):
     """Check that slot annotations are aligned to token ranges as BIO tags.

     The slot string lists the later span (20:25) before the earlier one
     (5:14), and the 5:14 span covers two token ranges, so the expected
     output contains a ``B-`` tag followed by an ``I-`` tag for it.
     """
     token_ranges = [[0, 4], [5, 8], [9, 14], [15, 19], [20, 25]]
     slot_annotation = "20:25:music/type,5:14:music/artistName"
     expected_tags = (
         "NoLabel B-music/artistName I-music/artistName NoLabel B-music/type"
     )

     aligned = align_slot_labels(token_ranges, slot_annotation, True)

     self.assertEqual(aligned, expected_tags)
 def preprocess_row(self, row_data: Dict[str, Any]) -> Dict[str, Any]:
     """Convert one raw row into a dict keyed by dataset field names.

     The row is featurized once via ``self.featurize``. Feature fields are
     read from that result, while weights, utterance and dense features are
     copied from ``row_data``. Label entries are only added for the label
     fields present in ``self.labels``.

     Args:
         row_data: One raw row keyed by ``DFColumn`` column names (the dense
             feature is looked up under ``DatasetFieldName.DENSE_FIELD``).

     Returns:
         Dict mapping ``DatasetFieldName`` entries to preprocessed values.
         Doc and word weights fall back to ``1.0`` when the column is
         missing or holds a falsy value (``None``, ``0``, ...).
     """
     features = self.featurize(row_data)

     # feature field
     # TODO move the logic to text field
     tokens = self._get_tokens(features)
     gazetteer = (
         features.gazetteer_feats,
         features.gazetteer_feat_weights,
         features.gazetteer_feat_lengths,
     )
     chars = self._get_chars(features)
     pretrained_embedding = features.pretrained_token_embedding

     # extra data
     # TODO move the logic to FloatField
     doc_weight = row_data.get(DFColumn.DOC_WEIGHT) or 1.0
     word_weight = row_data.get(DFColumn.WORD_WEIGHT) or 1.0

     res = {
         DatasetFieldName.TEXT_FIELD: tokens,
         DatasetFieldName.DICT_FIELD: gazetteer,
         DatasetFieldName.CHAR_FIELD: chars,
         DatasetFieldName.PRETRAINED_MODEL_EMBEDDING: pretrained_embedding,
         DatasetFieldName.DOC_WEIGHT_FIELD: doc_weight,
         DatasetFieldName.WORD_WEIGHT_FIELD: word_weight,
         DatasetFieldName.UTTERANCE_FIELD: row_data.get(DFColumn.UTTERANCE),
         DatasetFieldName.DENSE_FIELD: row_data.get(DatasetFieldName.DENSE_FIELD),
         DatasetFieldName.TOKEN_RANGE: features.token_ranges,
     }

     label_configs = self.labels
     if DatasetFieldName.DOC_LABEL_FIELD in label_configs:
         res[DatasetFieldName.DOC_LABEL_FIELD] = row_data.get(DFColumn.DOC_LABEL)

     if DatasetFieldName.WORD_LABEL_FIELD in label_configs:
         # TODO move it into word label field
         word_label_config = label_configs[DatasetFieldName.WORD_LABEL_FIELD]
         raw_word_label = row_data.get(DFColumn.WORD_LABEL)
         # Aligned labels are stored next to the untouched raw annotation.
         res[DatasetFieldName.WORD_LABEL_FIELD] = data.align_slot_labels(
             features.token_ranges,
             raw_word_label,
             word_label_config.use_bio_labels,
         )
         res[DatasetFieldName.RAW_WORD_LABEL] = raw_word_label

     return res
    def preprocess_row(self, row_data: Dict[str, Any]) -> Dict[str, Any]:
        """Preprocess one row that holds a sequence of utterances.

        ``row_data[RawData.TEXT]`` is parsed with ``data.parse_json_array``
        into the utterance sequence. Every utterance is featurized, but only
        the last (current) one also receives the row's dictionary features.
        The last utterance supplies the token, dictionary, character and
        pretrained-embedding inputs, and word labels are aligned against its
        token ranges.

        Args:
            row_data (Dict[str, Any]): One raw row keyed by column name.
                ``RawData.TEXT``, ``RawData.DOC_LABEL`` and
                ``RawData.WORD_LABEL`` are indexed directly and must be
                present; ``RawData.DOC_WEIGHT``, ``RawData.WORD_WEIGHT`` and
                ``ModelInput.DICT`` are optional.

        Returns:
            Dict[str, Any]: Preprocessed row with these keys:

                ``ModelInput.SEQ``
                    tokens of every utterance, one entry per utterance
                ``ModelInput.TEXT``
                    tokens of the last utterance
                ``ModelInput.DICT``
                    tuple of (gazetteer feats, feat weights, feat lengths)
                    for the last utterance
                ``ModelInput.CHAR``
                    characters of the last utterance
                ``ModelInput.PRETRAINED``
                    pretrained token embedding of the last utterance
                ``DocLabelConfig._name``
                    raw doc label taken from the row
                ``ExtraField.DOC_WEIGHT`` / ``ExtraField.WORD_WEIGHT``
                    row weights, ``1.0`` when missing or falsy
                ``ExtraField.RAW_WORD_LABEL``
                    raw word label taken from the row
                ``ExtraField.UTTERANCE``
                    the unparsed ``RawData.TEXT`` value
                ``ExtraField.TOKEN_RANGE``
                    token ranges of the last utterance
                ``WordLabelConfig._name`` (only if configured in
                ``self.labels``)
                    word labels aligned to the last utterance's token ranges

        Raises:
            IndexError: If the parsed utterance sequence is an empty list.
        """
        utterances = data.parse_json_array(row_data[RawData.TEXT])

        # Context utterances (all but the last) are featurized from raw text
        # only; no dictionary features are passed for them.
        features_list = []
        for context_utterance in utterances[:-1]:
            context_record = InputRecord(raw_text=context_utterance)
            features_list.append(self.featurizer.featurize(context_record))

        # The last (current) message additionally gets the dictionary feature.
        current = self.featurizer.featurize(
            InputRecord(
                raw_text=utterances[-1],
                raw_gazetteer_feats=row_data.get(ModelInput.DICT, ""),
            )
        )
        features_list.append(current)

        res = {
            # features
            ModelInput.SEQ: [feats.tokens for feats in features_list],
            ModelInput.TEXT: current.tokens,
            ModelInput.DICT: (
                current.gazetteer_feats,
                current.gazetteer_feat_weights,
                current.gazetteer_feat_lengths,
            ),
            ModelInput.CHAR: current.characters,
            ModelInput.PRETRAINED: current.pretrained_token_embedding,
            # labels
            DocLabelConfig._name: row_data[RawData.DOC_LABEL],
            # extra data
            # TODO move the logic to FloatField
            ExtraField.DOC_WEIGHT: row_data.get(RawData.DOC_WEIGHT) or 1.0,
            ExtraField.WORD_WEIGHT: row_data.get(RawData.WORD_WEIGHT) or 1.0,
            ExtraField.RAW_WORD_LABEL: row_data[RawData.WORD_LABEL],
            ExtraField.UTTERANCE: row_data[RawData.TEXT],
            ExtraField.TOKEN_RANGE: current.token_ranges,
        }

        if WordLabelConfig._name not in self.labels:
            return res

        # TODO move it into word label field
        word_label_config = self.labels[WordLabelConfig._name]
        res[WordLabelConfig._name] = data.align_slot_labels(
            current.token_ranges,
            row_data[RawData.WORD_LABEL],
            word_label_config.use_bio_labels,
        )
        return res
Beispiel #4
0
 def test_align_slot_labels_with_none_label(self):
     """Check that a ``None`` slot annotation yields ``NoLabel`` per token."""
     token_ranges = [[0, 4], [5, 8]]

     aligned = align_slot_labels(token_ranges, None, True)

     self.assertEqual(aligned, "NoLabel NoLabel")