def test_split_with_regex(self):
    featurizer = SimpleFeaturizer.from_config(
        SimpleFeaturizer.Config(split_regex=r"[\s,;!.?\"\(\)\-]+"),
        FeatureConfig(),
    )
    sentence = """
        Your bones don't break, mine do. That's clear. Your cells react to
        bacteria and viruses differently than mine. You don't get sick,
        I do. That's also clear. But for some reason, you and I react the
        exact same way to water. We swallow it too fast, we choke. We get
        some in our lungs, we drown. However unreal it may seem, we are
        connected, you and I. We're on the same curve, just on opposite ends.
    """
    expected = """
        your bones don't break mine do that's clear your cells react to
        bacteria and viruses differently than mine you don't get sick
        i do that's also clear but for some reason you and i react the
        exact same way to water we swallow it too fast we choke we get
        some in our lungs we drown however unreal it may seem we are
        connected you and i we're on the same curve just on opposite ends
    """.split()
    tokens = featurizer.featurize(InputRecord(raw_text=sentence)).tokens
    self.assertListEqual(expected, tokens)

    sentence = '"Please, buy me a coffee?" He implored-in vain.'
    expected = "please buy me a coffee he implored in vain".split()
    tokens = featurizer.featurize(InputRecord(raw_text=sentence)).tokens
    self.assertListEqual(expected, tokens)

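# A quick standalone illustration (not part of the test suite) of what the
# split regex above does on its own: whitespace and punctuation both act as
# token separators, while apostrophes are untouched, so contractions like
# "don't" survive intact. Only the standard library is used; lowercasing is
# assumed to be applied separately by the featurizer.
import re

SPLIT_REGEX = r"[\s,;!.?\"\(\)\-]+"
sentence = '"Please, buy me a coffee?" He implored-in vain.'
parts = [t for t in re.split(SPLIT_REGEX, sentence) if t]
assert [p.lower() for p in parts] == (
    "please buy me a coffee he implored in vain".split()
)
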
def preprocess_row(self, row_data: Dict[str, Any]) -> Dict[str, Any]:
    return {
        ModelInput.TEXT1: self.featurizer.featurize(
            InputRecord(raw_text=row_data[RawData.TEXT1])
        ).tokens,
        ModelInput.TEXT2: self.featurizer.featurize(
            InputRecord(raw_text=row_data[RawData.TEXT2])
        ).tokens,
        DocLabelConfig._name: row_data[RawData.DOC_LABEL],
        ExtraField.UTTERANCE_PAIR: (
            f"{row_data[RawData.TEXT1]} | {row_data[RawData.TEXT2]}"
        ),
    }

def preprocess_row(self, row_data: Dict[str, Any]) -> Dict[str, Any]:
    return {
        ModelInput.POS_RESPONSE: self.featurizer.featurize(
            InputRecord(raw_text=row_data[ModelInput.POS_RESPONSE])
        ).tokens,
        ModelInput.NEG_RESPONSE: self.featurizer.featurize(
            InputRecord(raw_text=row_data[ModelInput.NEG_RESPONSE])
        ).tokens,
        ModelInput.QUERY: self.featurizer.featurize(
            InputRecord(raw_text=row_data[ModelInput.QUERY])
        ).tokens,
    }

def test_tokenize(self):
    featurizer = SimpleFeaturizer.from_config(
        SimpleFeaturizer.Config(), FeatureConfig()
    )
    tokens = featurizer.featurize(InputRecord(raw_text="At eight o'clock")).tokens
    self.assertEqual(["at", "eight", "o'clock"], tokens)

def preprocess_row(self, row_data: Dict[str, Any]) -> Dict[str, Any]:
    features = self.featurizer.featurize(
        InputRecord(
            raw_text=row_data.get(RawData.TEXT, ""),
            raw_gazetteer_feats=row_data.get(RawData.DICT_FEAT, ""),
        )
    )
    res = {
        # features
        ModelInput.WORD_FEAT: self._get_tokens(features),
        ModelInput.DICT_FEAT: (
            features.gazetteer_feats,
            features.gazetteer_feat_weights,
            features.gazetteer_feat_lengths,
        ),
        ModelInput.CHAR_FEAT: features.characters,
        ModelInput.PRETRAINED_MODEL_EMBEDDING: features.pretrained_token_embedding,
        ModelInput.DENSE_FEAT: row_data.get(ModelInput.DENSE_FEAT),
        # target
        DocLabelConfig._name: row_data.get(RawData.DOC_LABEL),
        # extra data
        ExtraField.RAW_TEXT: row_data.get(RawData.TEXT),
    }
    return res

def test_convert_to_bytes(self):
    featurizer = SimpleFeaturizer.from_config(
        SimpleFeaturizer.Config(convert_to_bytes=True, lowercase_tokens=False),
        FeatureConfig(),
    )
    tokens = featurizer.featurize(InputRecord(raw_text=self.sentence)).tokens
    self.assertListEqual(
        tokens,
        ["O", "r", "d", "e", "r", " ", "m", "e", " ", "a", " ",
         "c", "o", "f", "f", "e", "e"],
    )

def preprocess_row(self, row_data: Dict[str, Any]) -> Dict[str, Any]:
    features = self.featurizer.featurize(
        InputRecord(
            raw_text=row_data.get(RawData.TEXT, ""),
            raw_gazetteer_feats=row_data.get(RawData.DICT_FEAT, ""),
        )
    )
    tokens = self._get_tokens(features)
    res = {
        # features
        ModelInput.WORD_FEAT: tokens,
        ModelInput.DICT_FEAT: (
            features.gazetteer_feats,
            features.gazetteer_feat_weights,
            features.gazetteer_feat_lengths,
        ),
        ModelInput.CHAR_FEAT: self._get_chars(features),
        ModelInput.CONTEXTUAL_TOKEN_EMBEDDING: features.contextual_token_embedding,
        ModelInput.DENSE_FEAT: row_data.get(ModelInput.DENSE_FEAT),
        # target
        DocLabelConfig._name: row_data.get(RawData.DOC_LABEL),
        # extra data
        ExtraField.RAW_TEXT: row_data.get(RawData.TEXT),
        DatasetFieldName.NUM_TOKENS: len(tokens),
    }
    if Target.TARGET_PROB_FIELD in self.labels:
        self._add_target_prob_to_res(res, row_data)
    return res

def preprocess_row(self, row_data: Dict[str, Any]) -> Dict[str, Any]:
    features = self.featurizer.featurize(
        InputRecord(
            raw_text=row_data.get(RawData.TEXT, ""),
            raw_gazetteer_feats=row_data.get(RawData.DICT_FEAT, ""),
        )
    )
    res = {
        # features
        ModelInput.WORD_FEAT: self._get_tokens(features),
        ModelInput.DICT_FEAT: (
            features.gazetteer_feats,
            features.gazetteer_feat_weights,
            features.gazetteer_feat_lengths,
        ),
        ModelInput.CHAR_FEAT: features.characters,
        ModelInput.PRETRAINED_MODEL_EMBEDDING: features.pretrained_token_embedding,
        # target
        DocLabelConfig._name: row_data.get(RawData.DOC_LABEL),
        # extra data
        ExtraField.RAW_TEXT: row_data.get(RawData.TEXT),
    }
    if Target.TARGET_PROB_FIELD in self.labels:
        res[Target.TARGET_PROB_FIELD] = parse_json_array(
            row_data[RawData.TARGET_PROBS]
        )
        res[Target.TARGET_LABEL_FIELD] = parse_json_array(
            row_data[RawData.TARGET_LABELS]
        )
        res[Target.TARGET_LOGITS_FIELD] = parse_json_array(
            row_data[RawData.TARGET_LOGITS]
        )
    return res

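# For reference, a minimal sketch of a parse_json_array helper with the
# behavior assumed above: the raw column holds a JSON-encoded list (e.g.
# '[0.9, 0.1]' for target probabilities) and the helper decodes it into a
# Python list. This is an illustrative stand-in, not the library's actual
# implementation.
import json
from typing import Any, List


def parse_json_array(json_text: str) -> List[Any]:
    # json.loads raises ValueError on malformed input, which surfaces bad
    # rows early instead of silently misaligning targets.
    return json.loads(json_text)
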
def test_tokenize_add_sentence_markers(self):
    featurizer = SimpleFeaturizer.from_config(
        SimpleFeaturizer.Config(sentence_markers=("<s>", "</s>")),
        FeatureConfig(),
    )
    tokens = featurizer.featurize(InputRecord(raw_text=self.sentence)).tokens
    self.assertListEqual(tokens, ["<s>", "order", "me", "a", "coffee", "</s>"])

def test_tokenize_dont_lowercase(self):
    featurizer = SimpleFeaturizer.from_config(
        SimpleFeaturizer.Config(lowercase_tokens=False), FeatureConfig()
    )
    features = featurizer.featurize(InputRecord(raw_text=self.sentence))
    expected_tokens = ["Order", "me", "a", "coffee"]
    expected_chars = [list(tok) for tok in expected_tokens]
    self.assertListEqual(features.tokens, expected_tokens)
    self.assertListEqual(features.characters, expected_chars)

def test_convert_to_bytes(self):
    featurizer = SimpleFeaturizer.from_config(
        SimpleFeaturizer.Config(convert_to_bytes=True, lowercase_tokens=False),
        FeatureConfig(),
    )
    features = featurizer.featurize(InputRecord(raw_text=self.sentence))
    expected_tokens = list("Order me a coffee")
    expected_chars = [list(char) for char in expected_tokens]
    self.assertListEqual(features.tokens, expected_tokens)
    self.assertListEqual(features.characters, expected_chars)

def preprocess_row(self, row_data: Dict[str, Any]) -> Dict[str, Any]:
    utterance = row_data.get(DFColumn.UTTERANCE, "")
    features = self.featurizer.featurize(
        InputRecord(
            raw_text=utterance,
            raw_gazetteer_feats=row_data.get(DFColumn.DICT_FEAT, ""),
        )
    )
    actions = ""
    # training time
    if DFColumn.SEQLOGICAL in row_data:
        annotation = Annotation(row_data[DFColumn.SEQLOGICAL])
        actions = annotation.tree.to_actions()

        # The seqlogical format is required for building the tree
        # representation of compositional utterances, and it depends on
        # tokenization. If, during preprocessing, the tokens produced by the
        # Featurizer and those from the seqlogical format are inconsistent,
        # the non-terminals and actions become inconsistent too, which makes
        # the model's forward method throw an exception. This should NOT
        # happen, but the check below makes sure model training doesn't fail
        # in case there is an inconsistency.
        tokens_from_seqlogical = annotation.tree.list_tokens()
        try:
            assert len(features.tokens) == len(tokens_from_seqlogical)
            for t1, t2 in zip(features.tokens, tokens_from_seqlogical):
                assert t1.lower() == t2.lower()
        except AssertionError:
            print(
                "\nTokens from the Featurizer and the seqlogical format are "
                + f'not the same for the utterance "{utterance}"'
            )
            print(
                f"{len(features.tokens)} tokens from Featurizer: {features.tokens}"
            )
            print(
                f"{len(tokens_from_seqlogical)} tokens from seqlogical format: "
                + f"{tokens_from_seqlogical}"
            )
            return {}

    return {
        DatasetFieldName.TEXT_FIELD: features.tokens,
        DatasetFieldName.DICT_FIELD: (
            features.gazetteer_feats,
            features.gazetteer_feat_weights,
            features.gazetteer_feat_lengths,
        ),
        ACTION_FEATURE_FIELD: actions,
        ACTION_LABEL_FIELD: copy.deepcopy(actions),
        DatasetFieldName.TOKENS: features.tokens,
        DatasetFieldName.UTTERANCE_FIELD: utterance,
    }

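# For context: the seqlogical annotation is a bracketed tree over the
# utterance tokens, e.g. (format shown here as an assumption, following the
# TOP-style convention) "[IN:GET_DIRECTIONS directions to [SL:DESTINATION
# work ] ]". annotation.tree.list_tokens() recovers the leaf tokens of that
# tree, which is why they must agree with the featurizer's tokenization for
# the derived actions to line up.
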
def preprocess_row(self, row_data: Dict[str, Any]) -> List[str]:
    """
    Preprocess steps for a single input row.

    Args:
        row_data (Dict[str, Any]): Dict representing the input row and columns.

    Returns:
        List[str]: List of tokens.
    """
    return self.featurizer.featurize(
        InputRecord(raw_text=row_data[DFColumn.UTTERANCE])
    ).tokens

def preprocess_row(self, row_data: Dict[str, Any]) -> Dict[str, Any]:
    sequence = data_utils.parse_json_array(row_data[DFColumn.UTTERANCE])
    features_list = [
        self.featurizer.featurize(InputRecord(raw_text=utterance))
        for utterance in sequence
    ]
    return {
        # features
        DatasetFieldName.TEXT_FIELD: [
            utterance.tokens for utterance in features_list
        ],
        # labels
        DatasetFieldName.DOC_LABEL_FIELD: row_data[DFColumn.DOC_LABEL],
        DatasetFieldName.UTTERANCE_FIELD: row_data[DFColumn.UTTERANCE],
    }

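# Illustrative input/output for the sequence variant above, assuming the
# utterance column holds a JSON-encoded list of conversation turns (the
# column values and label below are made up for the example):
#
#   row_data = {
#       DFColumn.UTTERANCE: '["Hello there", "Order me a coffee"]',
#       DFColumn.DOC_LABEL: "coffee_order",
#   }
#
# preprocess_row would then return one token list per turn, e.g.
#   {TEXT_FIELD: [["hello", "there"], ["order", "me", "a", "coffee"]], ...}
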
def preprocess_row(self, row_data: Dict[str, Any]) -> Dict[str, Any]:
    sequence = data.parse_json_array(row_data[DFColumn.UTTERANCE])
    features_list = [
        self.featurizer.featurize(InputRecord(raw_text=utterance))
        for utterance in sequence
    ]
    res = {
        # features
        ModelInput.WORD_FEAT: [
            utterance.tokens for utterance in features_list
        ],
        # labels
        DatasetFieldName.DOC_LABEL_FIELD: row_data[DFColumn.DOC_LABEL],
        DatasetFieldName.UTTERANCE_FIELD: row_data[DFColumn.UTTERANCE],
    }
    if DFColumn.DENSE_FEAT in row_data:
        res[ModelInput.DENSE_FEAT] = row_data.get(DFColumn.DENSE_FEAT)
    return res

def preprocess_row(self, row_data: Dict[str, Any]) -> Dict[str, Any]:
    """
    Preprocess steps for a single input row.

    Args:
        row_data (Dict[str, Any]): Dict representing the input row and columns.

    Returns:
        Dict[str, Any]: Dictionary with feature names as keys and feature
            values.
    """
    raw_input = InputRecord(raw_text=row_data[DFColumn.UTTERANCE])
    features = self.featurizer.featurize(raw_input)
    return {
        # features
        DatasetFieldName.TEXT_FIELD: features.tokens,
        DatasetFieldName.UTTERANCE_FIELD: row_data[DFColumn.UTTERANCE],
    }

def featurize(self, row_data: Dict[str, Any]):
    return self.featurizer.featurize(
        InputRecord(
            raw_text=row_data.get(DFColumn.UTTERANCE, ""),
            raw_gazetteer_feats=row_data.get(DFColumn.DICT_FEAT, ""),
        )
    )

def preprocess_row(self, row_data: Dict[str, Any]) -> Dict[str, Any]:
    """Preprocess steps for a single input row:
    1. apply tokenization to a sequence of utterances;
    2. process dictionary features to align with the last utterance;
    3. align word labels with the last utterance.

    Args:
        row_data (Dict[str, Any]): Dict of one row of data with column names
            as keys. Keys include "doc_label", "word_label", "text",
            "dict_feat", "word_weight" and "doc_weight".

    Returns:
        Dict[str, Any]: Preprocessed dict of one row of data, including:
            "seq_word_feat" (list of list of string)
                tokenized words of the sequence of utterances
            "word_feat" (list of string)
                tokenized words of the last utterance
            "raw_word_label" (string)
                raw word label
            "token_range" (list of tuple)
                token ranges of word labels; each tuple contains the start
                position index and the end position index
            "utterance" (list of string)
                raw utterances
            "word_label" (list of string)
                list of labels of words in the last utterance
            "doc_label" (string)
                doc label for intent classification
            "word_weight" (float)
                weight of the word label
            "doc_weight" (float)
                weight of the document label
            "dict_feat" (tuple, optional)
                tuple of three lists: the first is the label of each word,
                the second is the weight of the feature, and the third is
                the length of the feature
    """
    sequence = data.parse_json_array(row_data[RawData.TEXT])

    # Ignore the dictionary feature for context sentences other than the
    # last one.
    features_list = [
        self.featurizer.featurize(InputRecord(raw_text=utterance))
        for utterance in sequence[:-1]
    ]
    # Add the dictionary feature for the last (current) message.
    features_list.append(
        self.featurizer.featurize(
            InputRecord(
                raw_text=sequence[-1],
                raw_gazetteer_feats=row_data.get(ModelInput.DICT, ""),
            )
        )
    )

    res = {
        # features
        ModelInput.SEQ: [utterance.tokens for utterance in features_list],
        ModelInput.TEXT: features_list[-1].tokens,
        ModelInput.DICT: (
            features_list[-1].gazetteer_feats,
            features_list[-1].gazetteer_feat_weights,
            features_list[-1].gazetteer_feat_lengths,
        ),
        ModelInput.CHAR: features_list[-1].characters,
        ModelInput.PRETRAINED: features_list[-1].pretrained_token_embedding,
        # labels
        DocLabelConfig._name: row_data[RawData.DOC_LABEL],
        # extra data
        # TODO move the logic to FloatField
        ExtraField.DOC_WEIGHT: row_data.get(RawData.DOC_WEIGHT) or 1.0,
        ExtraField.WORD_WEIGHT: row_data.get(RawData.WORD_WEIGHT) or 1.0,
        ExtraField.RAW_WORD_LABEL: row_data[RawData.WORD_LABEL],
        ExtraField.UTTERANCE: row_data[RawData.TEXT],
        ExtraField.TOKEN_RANGE: features_list[-1].token_ranges,
    }
    if WordLabelConfig._name in self.labels:
        # TODO move this into the word label field
        res[WordLabelConfig._name] = data.align_slot_labels(
            features_list[-1].token_ranges,
            row_data[RawData.WORD_LABEL],
            self.labels[WordLabelConfig._name].use_bio_labels,
        )
    return res

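# A minimal, self-contained sketch of the kind of slot-label alignment that
# align_slot_labels performs above, written only to illustrate the idea. It
# assumes raw word labels arrive as comma-separated "start:end:label"
# character spans and that token_ranges holds (start, end) character offsets
# per token; both the span format and the helper below are assumptions for
# illustration, not the library code (BIO tagging is omitted).
from typing import List, Tuple


def align_labels_sketch(
    token_ranges: List[Tuple[int, int]],
    raw_word_label: str,
    no_label: str = "NoLabel",
) -> List[str]:
    # Parse "start:end:label" spans from the raw annotation string.
    spans = []
    for part in raw_word_label.split(",") if raw_word_label else []:
        start, end, label = part.split(":", 2)
        spans.append((int(start), int(end), label))
    # A token gets a span's label when the two character ranges overlap.
    labels = []
    for tok_start, tok_end in token_ranges:
        label = no_label
        for span_start, span_end, span_label in spans:
            if tok_start < span_end and span_start < tok_end:
                label = span_label
                break
        labels.append(label)
    return labels


# Example: "order me a coffee" with "coffee" (characters 11-17) labeled as
# a hypothetical "drink" slot.
assert align_labels_sketch(
    [(0, 5), (6, 8), (9, 10), (11, 17)], "11:17:drink"
) == ["NoLabel", "NoLabel", "NoLabel", "drink"]
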
def test_tokenize_dont_lowercase(self):
    featurizer = SimpleFeaturizer.from_config(
        SimpleFeaturizer.Config(lowercase_tokens=False), FeatureConfig()
    )
    tokens = featurizer.featurize(InputRecord(raw_text=self.sentence)).tokens
    self.assertListEqual(tokens, ["Order", "me", "a", "coffee"])