def test_split_with_regex(self):
    """A custom split_regex strips punctuation and splits on whitespace runs."""
    featurizer = SimpleFeaturizer.from_config(
        SimpleFeaturizer.Config(split_regex=r"[\s,;!.?\"\(\)\-]+"),
        FeatureConfig())

    paragraph = """
            Your bones don't break, mine do. That's clear. Your cells react to
            bacteria and viruses differently than mine. You don't get sick,
            I do. That's also clear. But for some reason, you and I react the
            exact same way to water. We swallow it too fast, we choke. We get
            some in our lungs, we drown. However unreal it may seem, we are
            connected, you and I. We're on the same curve, just on opposite
            ends.
        """
    wanted = """
            your bones don't break mine do that's clear your cells react to
            bacteria and viruses differently than mine you don't get sick
            i do that's also clear but for some reason you and i react the
            exact same way to water we swallow it too fast we choke we get
            some in our lungs we drown however unreal it may seem we are
            connected you and i we're on the same curve just on opposite ends
        """.split()
    tokenized = featurizer.featurize(InputRecord(raw_text=paragraph)).tokens
    self.assertListEqual(wanted, tokenized)

    # Punctuation glued directly onto words (quotes, hyphen) also counts
    # as a separator.
    paragraph = '"Please, buy me a coffee?" He implored-in vain.'
    wanted = "please buy me a coffee he implored in vain".split()
    tokenized = featurizer.featurize(InputRecord(raw_text=paragraph)).tokens
    self.assertListEqual(wanted, tokenized)
 def preprocess_row(self, row_data: Dict[str, Any]) -> Dict[str, Any]:
     """Tokenize both text columns of a pair-classification row.

     Returns token lists for TEXT1/TEXT2, the document label, and a
     combined "text1 | text2" string kept as extra (debug) data.
     """
     featurize = self.featurizer.featurize
     tokens1 = featurize(InputRecord(raw_text=row_data[RawData.TEXT1])).tokens
     tokens2 = featurize(InputRecord(raw_text=row_data[RawData.TEXT2])).tokens
     return {
         ModelInput.TEXT1: tokens1,
         ModelInput.TEXT2: tokens2,
         DocLabelConfig._name: row_data[RawData.DOC_LABEL],
         ExtraField.UTTERANCE_PAIR:
         f"{row_data[RawData.TEXT1]} | {row_data[RawData.TEXT2]}",
     }
Beispiel #3
0
 def preprocess_row(self, row_data: Dict[str, Any]) -> Dict[str, Any]:
     """Tokenize the positive/negative responses and the query of one row.

     All three columns get identical treatment, so build the result with a
     single comprehension over the column names (order preserved).
     """
     featurize = self.featurizer.featurize
     return {
         column: featurize(InputRecord(raw_text=row_data[column])).tokens
         for column in (
             ModelInput.POS_RESPONSE,
             ModelInput.NEG_RESPONSE,
             ModelInput.QUERY,
         )
     }
Beispiel #4
0
    def test_tokenize(self):
        """Default config lowercases tokens and keeps internal apostrophes."""
        featurizer = SimpleFeaturizer.from_config(
            SimpleFeaturizer.Config(), FeatureConfig())
        result = featurizer.featurize(InputRecord(raw_text="At eight o'clock"))
        self.assertEqual(['at', 'eight', "o'clock"], result.tokens)
Beispiel #5
0
 def preprocess_row(self, row_data: Dict[str, Any]) -> Dict[str, Any]:
     """Featurize one row into word/dict/char/pretrained features plus the
     document label and the raw text (kept as extra data).

     Missing text / dict-feature columns default to the empty string so the
     featurizer always receives a valid record.
     """
     features = self.featurizer.featurize(
         InputRecord(
             raw_text=row_data.get(RawData.TEXT, ""),
             raw_gazetteer_feats=row_data.get(RawData.DICT_FEAT, ""),
         ))
     gazetteer = (
         features.gazetteer_feats,
         features.gazetteer_feat_weights,
         features.gazetteer_feat_lengths,
     )
     return {
         # feature
         ModelInput.WORD_FEAT: self._get_tokens(features),
         ModelInput.DICT_FEAT: gazetteer,
         ModelInput.CHAR_FEAT: features.characters,
         ModelInput.PRETRAINED_MODEL_EMBEDDING:
         features.pretrained_token_embedding,
         ModelInput.DENSE_FEAT: row_data.get(ModelInput.DENSE_FEAT),
         # target
         DocLabelConfig._name: row_data.get(RawData.DOC_LABEL),
         # extra data
         ExtraField.RAW_TEXT: row_data.get(RawData.TEXT),
     }
Beispiel #6
0
 def test_convert_to_bytes(self):
     """convert_to_bytes=True yields one token per character, case kept."""
     featurizer = SimpleFeaturizer.from_config(
         SimpleFeaturizer.Config(convert_to_bytes=True,
                                 lowercase_tokens=False),
         FeatureConfig(),
     )
     result = featurizer.featurize(InputRecord(raw_text=self.sentence))
     # self.sentence is "Order me a coffee"; every character (including
     # spaces) becomes its own token.
     self.assertListEqual(result.tokens, list("Order me a coffee"))
    def preprocess_row(self, row_data: Dict[str, Any]) -> Dict[str, Any]:
        """Featurize one row; also records the token count and, when soft
        targets are configured, attaches the target probabilities.
        """
        features = self.featurizer.featurize(
            InputRecord(
                raw_text=row_data.get(RawData.TEXT, ""),
                raw_gazetteer_feats=row_data.get(RawData.DICT_FEAT, ""),
            ))
        tokens = self._get_tokens(features)
        gazetteer = (
            features.gazetteer_feats,
            features.gazetteer_feat_weights,
            features.gazetteer_feat_lengths,
        )
        res = {
            # feature
            ModelInput.WORD_FEAT: tokens,
            ModelInput.DICT_FEAT: gazetteer,
            ModelInput.CHAR_FEAT: self._get_chars(features),
            ModelInput.CONTEXTUAL_TOKEN_EMBEDDING:
            features.contextual_token_embedding,
            ModelInput.DENSE_FEAT: row_data.get(ModelInput.DENSE_FEAT),
            # target
            DocLabelConfig._name: row_data.get(RawData.DOC_LABEL),
            # extra data
            ExtraField.RAW_TEXT: row_data.get(RawData.TEXT),
            DatasetFieldName.NUM_TOKENS: len(tokens),
        }
        # Soft (probabilistic) targets are only present for
        # distillation-style training setups.
        if Target.TARGET_PROB_FIELD in self.labels:
            self._add_target_prob_to_res(res, row_data)

        return res
Beispiel #8
0
    def preprocess_row(self, row_data: Dict[str, Any]) -> Dict[str, Any]:
        """Featurize one row; when soft targets are configured, parse the
        JSON-encoded probability/label/logit columns into the result.
        """
        features = self.featurizer.featurize(
            InputRecord(
                raw_text=row_data.get(RawData.TEXT, ""),
                raw_gazetteer_feats=row_data.get(RawData.DICT_FEAT, ""),
            ))
        res = {
            # feature
            ModelInput.WORD_FEAT: self._get_tokens(features),
            ModelInput.DICT_FEAT: (
                features.gazetteer_feats,
                features.gazetteer_feat_weights,
                features.gazetteer_feat_lengths,
            ),
            ModelInput.CHAR_FEAT: features.characters,
            ModelInput.PRETRAINED_MODEL_EMBEDDING:
            features.pretrained_token_embedding,
            # target
            DocLabelConfig._name: row_data.get(RawData.DOC_LABEL),
            # extra data
            ExtraField.RAW_TEXT: row_data.get(RawData.TEXT),
        }
        if Target.TARGET_PROB_FIELD in self.labels:
            # Each soft-target column is a JSON array; decode all three.
            for field, column in (
                (Target.TARGET_PROB_FIELD, RawData.TARGET_PROBS),
                (Target.TARGET_LABEL_FIELD, RawData.TARGET_LABELS),
                (Target.TARGET_LOGITS_FIELD, RawData.TARGET_LOGITS),
            ):
                res[field] = parse_json_array(row_data[column])

        return res
 def test_tokenize_add_sentence_markers(self):
     """Configured sentence markers are added around the token sequence."""
     featurizer = SimpleFeaturizer.from_config(
         SimpleFeaturizer.Config(sentence_markers=("<s>", "</s>")),
         FeatureConfig())
     result = featurizer.featurize(InputRecord(raw_text=self.sentence))
     self.assertListEqual(
         result.tokens, ["<s>", "order", "me", "a", "coffee", "</s>"])
Beispiel #10
0
 def test_tokenize_dont_lowercase(self):
     """With lowercase_tokens=False the original casing is preserved."""
     featurizer = SimpleFeaturizer.from_config(
         SimpleFeaturizer.Config(lowercase_tokens=False), FeatureConfig()
     )
     features = featurizer.featurize(InputRecord(raw_text=self.sentence))
     wanted = ["Order", "me", "a", "coffee"]
     self.assertListEqual(features.tokens, wanted)
     # Character features mirror the tokens: one list of chars per token.
     self.assertListEqual(features.characters, [list(t) for t in wanted])
Beispiel #11
0
 def test_convert_to_bytes(self):
     """Byte mode: one token per character; char features wrap each token."""
     featurizer = SimpleFeaturizer.from_config(
         SimpleFeaturizer.Config(convert_to_bytes=True, lowercase_tokens=False),
         FeatureConfig(),
     )
     features = featurizer.featurize(InputRecord(raw_text=self.sentence))
     wanted = list("Order me a coffee")
     self.assertListEqual(features.tokens, wanted)
     # Each single-character token maps to a single-element char list.
     self.assertListEqual(features.characters, [list(ch) for ch in wanted])
Beispiel #12
0
    def preprocess_row(self, row_data: Dict[str, Any]) -> Dict[str, Any]:
        """Featurize one compositional (seqlogical) row.

        Tokenizes the utterance (plus gazetteer features) and, at training
        time, derives the parser action sequence from the seqlogical
        annotation. Returns an empty dict to skip the row when the
        featurizer's tokens and the seqlogical tokens disagree, since
        inconsistent tokens would produce inconsistent non-terminals /
        actions and crash the model's forward method.
        """
        utterance = row_data.get(DFColumn.UTTERANCE, "")
        features = self.featurizer.featurize(
            InputRecord(
                raw_text=utterance,
                raw_gazetteer_feats=row_data.get(DFColumn.DICT_FEAT, ""),
            ))
        actions = ""
        # training time
        if DFColumn.SEQLOGICAL in row_data:
            annotation = Annotation(row_data[DFColumn.SEQLOGICAL])
            actions = annotation.tree.to_actions()

            # Seqlogical format is required for building the tree representation of
            # compositional utterances and, it depends on tokenization.
            # Here during preprocessing, if the tokens produced from Featurizer
            # and those from the seqlogical format are not consistent, then it leads
            # to inconsistent non terminals and actions which in turn leads to
            # the model's forward method throwing an exception.
            # This should NOT happen but the check below is to make sure the
            # model training doesn't fail just in case there's inconsistency.
            # NOTE: explicit comparisons (not `assert`) so the guard still
            # runs when Python is started with -O, which strips assertions.
            tokens_from_seqlogical = annotation.tree.list_tokens()
            consistent = (
                len(features.tokens) == len(tokens_from_seqlogical) and all(
                    t1.lower() == t2.lower()
                    for t1, t2 in zip(features.tokens, tokens_from_seqlogical)
                ))
            if not consistent:
                print(
                    "\nTokens from Featurizer and Seqlogical format are not same "
                    + f'for the utterance "{utterance}"')
                print(
                    f"{len(features.tokens)} tokens from Featurizer: {features.tokens}"
                )
                print(
                    f"{len(tokens_from_seqlogical)} tokens from Seqlogical format: "
                    + f"{tokens_from_seqlogical}")
                return {}

        return {
            DatasetFieldName.TEXT_FIELD: features.tokens,
            DatasetFieldName.DICT_FIELD: (
                features.gazetteer_feats,
                features.gazetteer_feat_weights,
                features.gazetteer_feat_lengths,
            ),
            ACTION_FEATURE_FIELD: actions,
            # Deep copy so mutation of the feature field can never corrupt
            # the label field (or vice versa).
            ACTION_LABEL_FIELD: copy.deepcopy(actions),
            DatasetFieldName.TOKENS: features.tokens,
            DatasetFieldName.UTTERANCE_FIELD: utterance,
        }
    def preprocess_row(self, row_data: Dict[str, Any]) -> List[str]:
        """
        Preprocess steps for a single input row.

        Args:
            row_data (Dict[str, Any]): Dict representing the input row and
                columns.

        Returns:
            List[str]: List of tokens.
        """
        record = InputRecord(raw_text=row_data[DFColumn.UTTERANCE])
        return self.featurizer.featurize(record).tokens
    def preprocess_row(self, row_data: Dict[str, Any]) -> Dict[str, Any]:
        """Tokenize every utterance of a JSON-encoded utterance sequence.

        The text field becomes a list of token lists (one per utterance);
        the doc label and the raw utterance column are passed through.
        """
        utterances = data_utils.parse_json_array(row_data[DFColumn.UTTERANCE])
        token_lists = [
            self.featurizer.featurize(InputRecord(raw_text=text)).tokens
            for text in utterances
        ]
        return {
            # features
            DatasetFieldName.TEXT_FIELD: token_lists,
            # labels
            DatasetFieldName.DOC_LABEL_FIELD: row_data[DFColumn.DOC_LABEL],
            DatasetFieldName.UTTERANCE_FIELD: row_data[DFColumn.UTTERANCE],
        }
Beispiel #15
0
    def preprocess_row(self, row_data: Dict[str, Any]) -> Dict[str, Any]:
        """Tokenize a JSON-encoded utterance sequence; optionally pass
        through dense features when that column is present.
        """
        utterances = data.parse_json_array(row_data[DFColumn.UTTERANCE])
        token_lists = [
            self.featurizer.featurize(InputRecord(raw_text=text)).tokens
            for text in utterances
        ]
        res = {
            # features
            ModelInput.WORD_FEAT: token_lists,
            # labels
            DatasetFieldName.DOC_LABEL_FIELD: row_data[DFColumn.DOC_LABEL],
            DatasetFieldName.UTTERANCE_FIELD: row_data[DFColumn.UTTERANCE],
        }
        if DFColumn.DENSE_FEAT in row_data:
            res[ModelInput.DENSE_FEAT] = row_data.get(DFColumn.DENSE_FEAT)
        return res
Beispiel #16
0
    def preprocess_row(self, row_data: Dict[str, Any]) -> Dict[str, Any]:
        """
        Preprocess steps for a single input row.

        Args:
            row_data (Dict[str, Any]): Dict representing the input row and
                columns.

        Returns:
            Dict[str, Any]: Dictionary with feature names as keys and feature
                values.
        """
        utterance = row_data[DFColumn.UTTERANCE]
        features = self.featurizer.featurize(InputRecord(raw_text=utterance))
        return {
            # features
            DatasetFieldName.TEXT_FIELD: features.tokens,
            DatasetFieldName.UTTERANCE_FIELD: utterance,
        }
 def featurize(self, row_data: Dict[str, Any]):
     """Run the featurizer over a row's utterance and dict-feature columns,
     defaulting missing columns to the empty string.
     """
     record = InputRecord(
         raw_text=row_data.get(DFColumn.UTTERANCE, ""),
         raw_gazetteer_feats=row_data.get(DFColumn.DICT_FEAT, ""),
     )
     return self.featurizer.featurize(record)
    def preprocess_row(self, row_data: Dict[str, Any]) -> Dict[str, Any]:
        """Preprocess steps for a single input row: 1. apply tokenization to a
        sequence of utterances; 2. process dictionary features to align with
        the last utterance. 3. align word labels with the last utterance.

        Args:
            row_data (Dict[str, Any]): Dict of one row data with column names as keys.
                Keys includes "doc_label", "word_label", "text", "dict_feat",
                "word weight" and "doc weight".

        Returns:
            Dict[str, Any]: Preprocessed dict of one row data includes:

                "seq_word_feat" (list of list of string)
                    tokenized words of sequence of utterances
                "word_feat" (list of string)
                    tokenized words of last utterance
                "raw_word_label" (string)
                    raw word label
                "token_range" (list of tuple)
                    token ranges of word labels, each tuple contains the start
                    position index and the end position index
                "utterance" (list of string)
                    raw utterances
                "word_label" (list of string)
                    list of labels of words in last utterance
                "doc_label" (string)
                    doc label for intent classification
                "word_weight" (float)
                    weight of word label
                "doc_weight" (float)
                    weight of document label
                "dict_feat" (tuple, optional)
                    tuple of three lists, the first is the label of each words,
                    the second is the weight of the feature, the third is the
                    length of the feature.

        """
        # The text column holds a JSON array of utterances (conversation turns).
        sequence = data.parse_json_array(row_data[RawData.TEXT])

        # ignore dictionary feature for context sentences other than the last one
        features_list = [
            self.featurizer.featurize(InputRecord(raw_text=utterance))
            for utterance in sequence[:-1]
        ]

        # adding dictionary feature for the last (current) message
        features_list.append(
            self.featurizer.featurize(
                InputRecord(
                    raw_text=sequence[-1],
                    raw_gazetteer_feats=row_data.get(ModelInput.DICT, ""),
                )))

        res = {
            # features
            ModelInput.SEQ: [utterance.tokens for utterance in features_list],
            # Word-level features come from the last (current) utterance only.
            ModelInput.TEXT:
            features_list[-1].tokens,
            ModelInput.DICT: (
                features_list[-1].gazetteer_feats,
                features_list[-1].gazetteer_feat_weights,
                features_list[-1].gazetteer_feat_lengths,
            ),
            ModelInput.CHAR:
            features_list[-1].characters,
            ModelInput.PRETRAINED:
            features_list[-1].pretrained_token_embedding,
            # labels
            DocLabelConfig._name:
            row_data[RawData.DOC_LABEL],
            # extra data
            # TODO move the logic to FloatField
            # `or 1.0` also replaces falsy values (None, "", 0) with the
            # default weight, not just missing columns.
            ExtraField.DOC_WEIGHT:
            row_data.get(RawData.DOC_WEIGHT) or 1.0,
            ExtraField.WORD_WEIGHT:
            row_data.get(RawData.WORD_WEIGHT) or 1.0,
            ExtraField.RAW_WORD_LABEL:
            row_data[RawData.WORD_LABEL],
            ExtraField.UTTERANCE:
            row_data[RawData.TEXT],
            ExtraField.TOKEN_RANGE:
            features_list[-1].token_ranges,
        }
        if WordLabelConfig._name in self.labels:
            # TODO move it into word label field
            # Slot labels are aligned against the token ranges of the last
            # utterance only, matching the word features above.
            res[WordLabelConfig._name] = data.align_slot_labels(
                features_list[-1].token_ranges,
                row_data[RawData.WORD_LABEL],
                self.labels[WordLabelConfig._name].use_bio_labels,
            )
        return res
 def test_tokenize_dont_lowercase(self):
     """Tokens keep their original casing when lowercasing is disabled."""
     featurizer = SimpleFeaturizer.from_config(
         SimpleFeaturizer.Config(lowercase_tokens=False), FeatureConfig())
     result = featurizer.featurize(InputRecord(raw_text=self.sentence))
     self.assertListEqual(result.tokens, ["Order", "me", "a", "coffee"])