def test_uppercase_tokens(self):
    """Verify tokens keep their original casing when lowercase_tokens is False."""
    handler = CompositionalDataHandler.from_config(
        CompositionalDataHandler.Config(),
        FeatureConfig(
            word_feat=WordFeatConfig(vocab_from_all_data=True, min_freq=1)
        ),
        featurizer=SimpleFeaturizer.from_config(
            SimpleFeaturizer.Config(lowercase_tokens=False), FeatureConfig()
        ),
    )
    handler.init_metadata_from_raw_data(
        self.train_data, self.eval_data, self.test_data
    )
    # Mixed-case tokens from the fixtures must survive unchanged.
    expected_vocab = {
        "<unk>",
        "What",
        "EVENTS",
        "can",
        "I",
        "go",
        "today",
        "Are",
        "there",
        "any",
        "adult",
        "events",
        "this",
        "weekend",
    }
    self.assertSetEqual(
        set(handler.features["word_feat"].vocab.stoi), expected_vocab
    )
def setUp(self):
    # Joint-model data handler with default feature settings and both
    # document- and word-level label configs.
    featurizer = SimpleFeaturizer.from_config(
        SimpleFeaturizer.Config(), FeatureConfig()
    )
    self.data_handler = JointModelDataHandler.from_config(
        JointModelDataHandler.Config(),
        FeatureConfig(),
        [DocLabelConfig(), WordLabelConfig()],
        featurizer=featurizer,
    )
def _init_data_handler(self):
    """Build an LM data handler over the test file; shuffle off for determinism."""
    handler = LanguageModelDataHandler.from_config(
        LanguageModelDataHandler.Config(),
        FeatureConfig(),
        WordLabelConfig(),
        featurizer=create_featurizer(SimpleFeaturizer.Config(), FeatureConfig()),
        shuffle=False,
    )
    # Same file serves as train/eval/test split for this fixture.
    handler.init_metadata_from_path(FILE_NAME, FILE_NAME, FILE_NAME)
    return handler
def test_intializing_embeds_from_config(self):
    """Embedding weights should honor the configured init strategy.

    RANDOM must produce a non-zero row; ZERO must produce an all-zero row.
    (NOTE: the "intializing" typo in the method name is kept deliberately —
    renaming would change the test's externally visible identifier.)
    """

    def embeds_for_strategy(strategy):
        # Build a data handler whose word feature uses `strategy` and return
        # the resulting pretrained-embedding weight tensor. This replaces two
        # duplicated ~20-line blocks in the original.
        feature_config = FeatureConfig(
            word_feat=WordFeatConfig(
                embedding_init_strategy=strategy,
                embed_dim=5,
                pretrained_embeddings_path=tests_module.TEST_BASE_DIR,
            )
        )
        data_handler = JointModelDataHandler.from_config(
            JointModelDataHandler.Config(),
            feature_config,
            [DocLabelConfig(), WordLabelConfig()],
            featurizer=SimpleFeaturizer.from_config(
                SimpleFeaturizer.Config(), feature_config
            ),
        )
        data_handler.init_metadata_from_path(TRAIN_FILE, EVAL_FILE, TEST_FILE)
        return data_handler.metadata.features[
            DatasetFieldName.TEXT_FIELD
        ].pretrained_embeds_weight

    # test random initialization (values should be non-0)
    random_row = embeds_for_strategy(EmbedInitStrategy.RANDOM)[11].numpy()
    np.testing.assert_array_less([0, 0, 0, 0, 0], np.absolute(random_row))
    # test zero initialization (values should all be 0)
    zero_row = embeds_for_strategy(EmbedInitStrategy.ZERO)[11].numpy()
    np.testing.assert_array_equal([0, 0, 0, 0, 0], zero_row)
def _create_dummy_data_handler(self):
    """Create a doc-classification handler whose vocab comes only from '<pad>'."""
    feat = WordFeatConfig(
        vocab_size=4,
        vocab_from_all_data=True,
        vocab_from_train_data=True,
        vocab_from_pretrained_embeddings=False,
        pretrained_embeddings_path=None,
    )
    data_handler = DocClassificationDataHandler.from_config(
        DocClassificationDataHandler.Config(),
        ModelInputConfig(word_feat=feat),
        TargetConfig(),
        featurizer=create_featurizer(
            SimpleFeaturizer.Config(), FeatureConfig(word_feat=feat)
        ),
    )
    # Train/eval/test all consist of the single literal "<pad>" row.
    train_data, eval_data, test_data = (
        data_handler.gen_dataset([{"text": "<pad>"}], include_label_fields=False)
        for _ in range(3)
    )
    data_handler.init_feature_metadata(train_data, eval_data, test_data)
    return data_handler
def test_split_with_regex(self):
    """A custom split_regex should break on punctuation/whitespace and drop it."""
    featurizer = SimpleFeaturizer.from_config(
        SimpleFeaturizer.Config(split_regex=r"[\s,;!.?\"\(\)\-]+"),
        FeatureConfig(),
    )
    # Multi-sentence input; the regex collapses any run of whitespace or
    # punctuation into a single split point.
    sentence = (
        " Your bones don't break, mine do. That's clear. Your cells react to "
        "bacteria and viruses differently than mine. You don't get sick, I do. "
        "That's also clear. But for some reason, you and I react the exact same "
        "way to water. We swallow it too fast, we choke. We get some in our "
        "lungs, we drown. However unreal it may seem, we are connected, you and "
        "I. We're on the same curve, just on opposite ends. "
    )
    expected = (
        "your bones don't break mine do that's clear your cells react to "
        "bacteria and viruses differently than mine you don't get sick i do "
        "that's also clear but for some reason you and i react the exact same "
        "way to water we swallow it too fast we choke we get some in our lungs "
        "we drown however unreal it may seem we are connected you and i we're "
        "on the same curve just on opposite ends"
    ).split()
    self.assertListEqual(
        expected, featurizer.featurize(InputRecord(raw_text=sentence)).tokens
    )
    # Apostrophes are preserved inside tokens; hyphens and quotes are split on.
    sentence = '"Please, buy me a coffee?" He implored-in vain.'
    expected = "please buy me a coffee he implored in vain".split()
    self.assertListEqual(
        expected, featurizer.featurize(InputRecord(raw_text=sentence)).tokens
    )
def test_read_partially_from_csv(self):
    """Reading with an explicit column map should yield only those columns, parsed."""
    file_name = tests_module.test_file("train_data_tiny.tsv")
    columns = {DFColumn.DOC_LABEL: 0, DFColumn.UTTERANCE: 2}
    feat = WordFeatConfig(
        vocab_from_all_data=True,
        vocab_from_train_data=False,
        vocab_from_pretrained_embeddings=False,
    )
    featurizer = create_featurizer(
        SimpleFeaturizer.Config(), FeatureConfig(word_feat=feat)
    )
    data_handler = DocClassificationDataHandler.from_config(
        DocClassificationDataHandler.Config(),
        ModelInputConfig(word_feat=feat),
        TargetConfig(),
        featurizer=featurizer,
    )
    data = list(data_handler.read_from_file(file_name, columns))
    for col in columns:
        # assertIn reports the container on failure, unlike assertTrue(col in ...);
        # also fixes the ungrammatical "must in the data" message.
        self.assertIn(col, data[0], "{} must be in the data".format(col))
    self.assertEqual("alarm/modify_alarm", data[0][DFColumn.DOC_LABEL])
    self.assertEqual(
        "change my alarm tomorrow to wake me up 30 minutes earlier",
        data[0][DFColumn.UTTERANCE],
    )
def get_feature_metadata(
    cls, feature_config: FeatureConfig, feature_meta: Dict[str, FieldMeta]
):
    """Collect export input names, dummy inputs, and vocab itos maps.

    The number of names in input_names *must* be equal to the number of
    tensors passed in dummy_model_input, so both are appended in lockstep.
    """
    input_names: List[str] = []
    dummy_model_input: List = []
    feature_itos_map = {}
    for name, feat_config in feature_config._asdict().items():
        if not isinstance(feat_config, ConfigBase):
            continue
        input_names.extend(feat_config.export_input_names)
        meta = feature_meta[name]
        if getattr(meta, "vocab", None):
            feature_itos_map[feat_config.export_input_names[0]] = meta.vocab.itos
        dummy_model_input.append(meta.dummy_model_input)
    # Length tensors follow their value tensors; order matters for export.
    for vals_name, lens_name in (
        ("tokens_vals", "tokens_lens"),
        ("seq_tokens_vals", "seq_tokens_lens"),
    ):
        if vals_name in input_names:
            dummy_model_input.append(torch.tensor([1, 1], dtype=torch.long))
            input_names.append(lens_name)
    return input_names, tuple(dummy_model_input), feature_itos_map
def test_tokenize(self):
    """Default featurization lowercases and splits on whitespace, keeping apostrophes."""
    featurizer = SimpleFeaturizer.from_config(
        SimpleFeaturizer.Config(), FeatureConfig()
    )
    result = featurizer.featurize(InputRecord(raw_text="At eight o'clock"))
    self.assertEqual(["at", "eight", "o'clock"], result.tokens)
def test_convert_to_bytes(self):
    """convert_to_bytes yields one token per character, spaces included, case kept."""
    featurizer = SimpleFeaturizer.from_config(
        SimpleFeaturizer.Config(convert_to_bytes=True, lowercase_tokens=False),
        FeatureConfig(),
    )
    tokens = featurizer.featurize(InputRecord(raw_text=self.sentence)).tokens
    # Equivalent to spelling out each character of "Order me a coffee".
    self.assertListEqual(tokens, list("Order me a coffee"))
class Config(ConfigBase):
    # Feature extraction settings for the task's model inputs.
    features: FeatureConfig = FeatureConfig()
    # Converts raw input rows into token-level features.
    featurizer: Featurizer.Config = SimpleFeaturizer.Config()
    # No default: each concrete task must supply its own data handler config.
    data_handler: DataHandler.Config
    trainer: Trainer.Config = Trainer.Config()
    # Adam is the default optimizer.
    optimizer: Optimizer.Config = Adam.Config()
    # Optional learning-rate scheduler; a default Scheduler.Config is provided.
    scheduler: Optional[Scheduler.Config] = Scheduler.Config()
    # When None, no exporter is created for this task.
    exporter: Optional[ModelExporter.Config] = None
def test_tokenize_add_sentence_markers(self):
    """Configured sentence markers should wrap the tokenized output."""
    featurizer = SimpleFeaturizer.from_config(
        SimpleFeaturizer.Config(sentence_markers=("<s>", "</s>")),
        FeatureConfig(),
    )
    result = featurizer.featurize(InputRecord(raw_text=self.sentence))
    self.assertListEqual(
        result.tokens, ["<s>", "order", "me", "a", "coffee", "</s>"]
    )
def test_freeze_all_embedding(self):
    """FeatureConfig(freeze=True) must freeze every embedding parameter."""
    model = create_model(
        DocModel_Deprecated.Config(),
        FeatureConfig(freeze=True),
        metadata=mock_metadata(),
    )
    for parameter in model.embedding.parameters():
        self.assertFalse(parameter.requires_grad)
def test_tokenize_dont_lowercase(self):
    """With lowercase_tokens=False, tokens and per-token characters keep case."""
    featurizer = SimpleFeaturizer.from_config(
        SimpleFeaturizer.Config(lowercase_tokens=False), FeatureConfig()
    )
    features = featurizer.featurize(InputRecord(raw_text=self.sentence))
    expected_tokens = ["Order", "me", "a", "coffee"]
    self.assertListEqual(features.tokens, expected_tokens)
    self.assertListEqual(
        features.characters, [list(token) for token in expected_tokens]
    )
def setUp(self):
    # Extend the default column list with the dense-feature column. BUG FIX:
    # the original passed a *fresh* DocClassificationDataHandler.Config() to
    # from_config, silently discarding the appended DENSE_FEAT column; the
    # mutated handler_config must be the one passed through.
    handler_config = DocClassificationDataHandler.Config()
    handler_config.columns_to_read.append(ModelInput.DENSE_FEAT)
    self.data_handler = DocClassificationDataHandler.from_config(
        handler_config,
        ModelInputConfig(),
        [],
        featurizer=SimpleFeaturizer.from_config(
            SimpleFeaturizer.Config(), FeatureConfig()
        ),
    )
def test_min_freq(self):
    """
    Test that UNKification is triggered when min_freq is 2: tokens seen only
    once collapse into the <unk> buckets, leaving "events" as the sole
    surviving regular token.
    """
    custom_dh = CompositionalDataHandler.from_config(
        CompositionalDataHandler.Config(),
        FeatureConfig(
            word_feat=WordFeatConfig(vocab_from_all_data=True, min_freq=2)
        ),
        featurizer=SimpleFeaturizer.from_config(
            SimpleFeaturizer.Config(lowercase_tokens=True), FeatureConfig()
        ),
    )
    custom_dh.init_metadata_from_raw_data(
        self.train_data, self.eval_data, self.test_data
    )
    # <unk>-NUM = <unk> for numeric tokens. The original set literal listed
    # "<unk>" three times; sets deduplicate, so the extra entries were dead
    # weight and are removed here (identical assertion).
    self.assertSetEqual(
        set(custom_dh.features["word_feat"].vocab.stoi),
        {"<unk>", "<unk>-NUM", "events"},
    )
def test_convert_to_bytes(self):
    """Byte mode: one token per character; characters field mirrors tokens."""
    featurizer = SimpleFeaturizer.from_config(
        SimpleFeaturizer.Config(convert_to_bytes=True, lowercase_tokens=False),
        FeatureConfig(),
    )
    features = featurizer.featurize(InputRecord(raw_text=self.sentence))
    expected_tokens = list("Order me a coffee")
    self.assertListEqual(features.tokens, expected_tokens)
    # Each single-character token maps to a one-element character list.
    self.assertListEqual(
        features.characters, [[char] for char in expected_tokens]
    )
def setUp(self):
    # Byte-level featurization: no token splitting, convert to bytes.
    featurizer_config = SimpleFeaturizer.Config()
    featurizer_config.split_regex = r""
    featurizer_config.convert_to_bytes = True
    self.data_handler = QueryDocumentPairwiseRankingDataHandler.from_config(
        QueryDocumentPairwiseRankingDataHandler.Config(),
        ModelInputConfig(),
        [],
        featurizer=SimpleFeaturizer.from_config(
            featurizer_config, FeatureConfig()
        ),
    )
def test_data_handler(self):
    """The BPTT LM handler should split the tiny dataset into two batches."""
    data_handler = BPTTLanguageModelDataHandler.from_config(
        BPTTLanguageModelDataHandler.Config(bptt_len=4),
        FeatureConfig(),
        WordLabelConfig(),
        featurizer=SimpleFeaturizer.from_config(
            SimpleFeaturizer.Config(), FeatureConfig()
        ),
    )
    data_handler.init_metadata_from_path(FILE_NAME, FILE_NAME, FILE_NAME)
    train_iter = data_handler.get_train_iter_from_path(FILE_NAME, BATCH_SIZE)
    batches = list(train_iter)
    # There are two batches in the tiny dataset
    self.assertEqual(len(batches), 2)
    # Each batch is tuple(input, target, context); input is
    # tuple(input_sequences, sequence_length):
    #   input_sequences -> tensor of dim (bsize, max_seq_length)
    #   sequence_length -> tensor of dim (bsize)
    first_input, first_target = batches[0][0], batches[0][1]
    np.testing.assert_array_equal(
        first_input[0],
        [[15, 19, 12, 16], [3, 13, 21, 8], [20, 7, 23, 4], [6, 5, 7, 22]],
    )
    np.testing.assert_array_equal(first_input[1], [4, 4, 4, 4])
    # target -> tensor of same dim as input_sequences (bsize, max_seq_length)
    np.testing.assert_array_equal(
        first_target[0],
        [[19, 12, 16, 14], [13, 21, 8, 3], [7, 23, 4, 3], [5, 7, 22, 10]],
    )
    second_input, second_target = batches[1][0], batches[1][1]
    np.testing.assert_array_equal(
        second_input[0], [[14, 17, 11], [3, 5, 18], [3, 8, 4], [10, 4, 9]]
    )
    np.testing.assert_array_equal(second_input[1], [3, 3, 3, 3])
    np.testing.assert_array_equal(
        second_target[0], [[17, 11, 4], [5, 18, 6], [8, 4, 3], [4, 9, 1]]
    )
def _create_dummy_model(self):
    """Build a deprecated doc model with save paths wired into each sub-module."""
    representation_config = BiLSTMDocAttention.Config(
        save_path=self.representation_path
    )
    decoder_config = MLPDecoder.Config(save_path=self.decoder_path)
    feature_config = FeatureConfig(
        word_feat=WordEmbedding.Config(
            embed_dim=300, save_path=self.word_embedding_path
        ),
        save_path=self.embedding_path,
    )
    return create_model(
        DocModel_Deprecated.Config(
            representation=representation_config, decoder=decoder_config
        ),
        feature_config,
        self._create_dummy_meta_data(),
    )
def setUp(self):
    # Each UTTERANCE holds a JSON-encoded list of conversation turns.
    self.train_data = [
        {
            DFColumn.DOC_LABEL: "cu:discuss_where",
            DFColumn.UTTERANCE: '["where do you wanna meet?", "MPK"]',
        }
    ]
    self.eval_data = [
        {
            DFColumn.DOC_LABEL: "cu:discuss_where",
            DFColumn.UTTERANCE: '["how about SF?", "sounds good"]',
        },
        {DFColumn.DOC_LABEL: "cu:other", DFColumn.UTTERANCE: '["lol"]'},
    ]
    self.test_data = [
        {
            DFColumn.DOC_LABEL: "cu:discuss_where",
            DFColumn.UTTERANCE: '["MPK sounds good to me"]',
        },
        {
            DFColumn.DOC_LABEL: "cu:other",
            DFColumn.UTTERANCE: '["great", "awesome"]',
        },
    ]
    self.dh = SeqModelDataHandler.from_config(
        SeqModelDataHandler.Config(),
        FeatureConfig(),
        DocLabelConfig(),
        featurizer=SimpleFeaturizer.from_config(
            SimpleFeaturizer.Config(), FeatureConfig()
        ),
    )
def setUp(self):
    # Knowledge-distillation fixture: reads soft targets alongside labels.
    file_name = tests_module.test_file("knowledge_distillation_test_tiny.tsv")
    handler_config = KDDocClassificationDataHandler.Config(
        columns_to_read=["text", "target_probs", "target_labels", "doc_label"]
    )
    self.data_handler = KDDocClassificationDataHandler.from_config(
        handler_config,
        ModelInputConfig(),
        TargetConfig(target_prob=True),
        featurizer=SimpleFeaturizer.from_config(
            SimpleFeaturizer.Config(), FeatureConfig()
        ),
    )
    self.data = self.data_handler.read_from_file(
        file_name, self.data_handler.raw_columns
    )
def create_language_model_data_handler(cls) -> LanguageModelDataHandler:
    """Construct an LM data handler directly (bypassing from_config).

    TODO: Refactor this after Shicong refactors PyText config and removes
    Thrift. After that directly use Data Handler's from config method
    with synthetic configs.
    """
    text_field = TextFeatureField(
        eos_token=VocabMeta.EOS_TOKEN, init_token=VocabMeta.INIT_TOKEN
    )
    features: Dict[str, Field] = {DatasetFieldName.TEXT_FIELD: text_field}
    return LanguageModelDataHandler(
        raw_columns=[DFColumn.UTTERANCE],
        features=features,
        labels={},
        featurizer=create_featurizer(SimpleFeaturizer.Config(), FeatureConfig()),
    )
def DISABLED_test_freeze_word_embedding(self):
    """freeze=True on word_feat freezes only the word lookup, not the MLP or dict feat."""
    model = create_model(
        DocModel.Config(),
        FeatureConfig(
            word_feat=WordFeatConfig(freeze=True, mlp_layer_dims=[4]),
            dict_feat=DictFeatConfig(),
        ),
        metadata=mock_metadata(),
    )
    word_embedding_module = model.embedding[0]
    # word embedding: frozen lookup table, trainable MLP on top
    for parameter in word_embedding_module.word_embedding.parameters():
        self.assertFalse(parameter.requires_grad)
    for parameter in word_embedding_module.mlp.parameters():
        self.assertTrue(parameter.requires_grad)
    # dict feat embedding stays trainable
    for parameter in model.embedding[1].parameters():
        self.assertTrue(parameter.requires_grad)
def setup_data(self):
    # Byte-level featurization over the pairwise-ranking fixture; shuffling
    # is disabled so iteration order stays reproducible.
    featurizer_config = SimpleFeaturizer.Config()
    featurizer_config.split_regex = r""
    featurizer_config.convert_to_bytes = True
    self.data_handler = QueryDocumentPairwiseRankingDataHandler.from_config(
        QueryDocumentPairwiseRankingDataHandler.Config(),
        ModelInputConfig(),
        [],
        featurizer=SimpleFeaturizer.from_config(
            featurizer_config, FeatureConfig()
        ),
    )
    self.file_name = tests_module.test_file(
        "query_document_pairwise_ranking_tiny.tsv"
    )
    self.data_handler.shuffle = False
    self.data_handler.init_metadata_from_path(
        self.file_name, self.file_name, self.file_name
    )
def _get_exportable_metadata(
    cls,
    exportable_filter: Callable,
    feature_config: FeatureConfig,
    feature_meta: Dict[str, FieldMeta],
) -> Tuple[List[str], List, Dict]:
    """Gather export names, dummy inputs, and itos maps for matching features.

    The number of names in input_names *must* be equal to the number of
    tensors passed in dummy_input, so both lists are grown in lockstep.
    """
    input_names: List[str] = []
    dummy_model_input: List = []
    feature_itos_map = {}
    for name, feat_config in feature_config._asdict().items():
        if not exportable_filter(feat_config):
            continue
        input_names.extend(feat_config.export_input_names)
        meta = feature_meta[name]
        if getattr(meta, "vocab", None):
            feature_itos_map[feat_config.export_input_names[0]] = meta.vocab.itos
        dummy_model_input.append(meta.dummy_model_input)
    return input_names, dummy_model_input, feature_itos_map
def create_sub_embs(
    cls, emb_config: FeatureConfig, metadata: CommonMetadata
) -> Dict[str, EmbeddingBase]:
    """
    Creates the embedding modules defined in the `emb_config`.

    Args:
        emb_config (FeatureConfig): Object containing all the sub-embedding
            configurations.
        metadata (CommonMetadata): Object containing features and label metadata.

    Returns:
        Dict[str, EmbeddingBase]: Named dictionary of embedding modules.
    """
    sub_embs = {}
    for name, config in emb_config._asdict().items():
        component_cls = getattr(config, "__COMPONENT__", object)
        if not issubclass(component_cls, EmbeddingBase):
            # Non-embedding entries (e.g. plain settings) are skipped loudly.
            print(f"{name} is not a config of embedding, skipping")
            continue
        sub_embs[name] = create_module(config, metadata=metadata.features[name])
    return sub_embs
def test_load_save(self):
    """Saved modules must load back identically and differ from a fresh model."""
    # Minimal metadata stubs for text feature and label.
    text_field_meta = FieldMeta()
    text_field_meta.vocab = VocabStub()
    text_field_meta.vocab_size = 4
    text_field_meta.unk_token_idx = 1
    text_field_meta.pad_token_idx = 0
    text_field_meta.pretrained_embeds_weight = None
    label_meta = FieldMeta()
    label_meta.vocab = VocabStub()
    label_meta.vocab_size = 3
    metadata = CommonMetadata()
    metadata.features = {DatasetFieldName.TEXT_FIELD: text_field_meta}
    metadata.target = label_meta

    saved_model = create_model(
        DocModel.Config(
            representation=BiLSTMDocAttention.Config(
                save_path=self.representation_path
            ),
            decoder=MLPDecoder.Config(save_path=self.decoder_path),
        ),
        FeatureConfig(save_path=self.embedding_path),
        metadata,
    )
    saved_model.save_modules()
    loaded_model = create_model(
        DocModel.Config(
            representation=BiLSTMDocAttention.Config(
                load_path=self.representation_path
            ),
            decoder=MLPDecoder.Config(load_path=self.decoder_path),
        ),
        FeatureConfig(load_path=self.embedding_path),
        metadata,
    )
    random_model = create_model(
        DocModel.Config(
            representation=BiLSTMDocAttention.Config(),
            decoder=MLPDecoder.Config(),
        ),
        FeatureConfig(),
        metadata,
    )
    # Loaded and saved modules should be equal. Neither should be equal to
    # a randomly initialised model. One loop per sub-module replaces the
    # original's three copy-pasted zip_longest loops.
    for module_name in ("embedding", "representation", "decoder"):
        for p_saved, p_loaded, p_random in itertools.zip_longest(
            getattr(saved_model, module_name).parameters(),
            getattr(loaded_model, module_name).parameters(),
            getattr(random_model, module_name).parameters(),
        ):
            self.assertTrue(p_saved.equal(p_loaded))
            self.assertFalse(p_random.equal(p_saved))
            self.assertFalse(p_random.equal(p_loaded))
def test_init_feature_metadata(self):
    """Vocab construction across data sources and pretrained embeddings."""
    # Specify data
    feat_name = ModelInput.WORD_FEAT
    train_text = "Hi there you"
    eval_text = ""
    test_text = "Go away"
    pretrained_embedding_file = tests_module.test_file("pretrained_embed_raw")
    # Tokens expected in the pretrained-embeddings fixture file.
    pretrained_tokens = {
        "</s>",
        "the",
        "to",
        "and",
        "a",
        "I",
        "you",
        "is",
        "aloha",
        "for",
    }
    # Specify test cases
    test_cases = (
        # Vocab from train / eval / test data
        {
            "feat": WordFeatConfig(
                vocab_from_all_data=True,
                vocab_from_train_data=False,
                vocab_from_pretrained_embeddings=False,
            ),
            # Lowercased tokens from all three splits plus special tokens.
            "expected_tokens": {
                "hi",
                "there",
                "you",
                "go",
                "away",
                VocabMeta.UNK_TOKEN,
                VocabMeta.PAD_TOKEN,
            },
            "expected_num_pretrained_tokens": 0,
        },
        # Vocab from train data or pretrained embeddings
        {
            "feat": WordFeatConfig(
                vocab_from_all_data=False,
                vocab_from_train_data=True,
                vocab_from_pretrained_embeddings=True,
                pretrained_embeddings_path=pretrained_embedding_file,
                embed_dim=5,
            ),
            "expected_tokens": pretrained_tokens.union(
                {"hi", "there", VocabMeta.UNK_TOKEN, VocabMeta.PAD_TOKEN}
            ),
            # +4 presumably accounts for extra rows beyond the pretrained
            # tokens (e.g. train/special tokens) — TODO confirm against
            # the embedding-initialisation logic.
            "expected_num_pretrained_tokens": len(pretrained_tokens) + 4,
        },
        # Vocab from limited number of pretrained embeddings
        {
            "feat": WordFeatConfig(
                vocab_from_all_data=False,
                vocab_from_train_data=False,
                vocab_from_pretrained_embeddings=True,
                pretrained_embeddings_path=pretrained_embedding_file,
                embed_dim=5,
                vocab_size=2,
            ),
            "expected_tokens": {
                "</s>",
                "the",
                VocabMeta.UNK_TOKEN,
                VocabMeta.PAD_TOKEN,
            },
            # special tokens excluded from vocab_size = 2
            "expected_num_pretrained_tokens": 4,
        },
    )
    for case in test_cases:
        # Setup data handler
        featurizer = create_featurizer(
            SimpleFeaturizer.Config(), FeatureConfig(word_feat=case["feat"])
        )
        data_handler = DocClassificationDataHandler.from_config(
            DocClassificationDataHandler.Config(),
            ModelInputConfig(word_feat=case["feat"]),
            TargetConfig(),
            featurizer=featurizer,
        )
        train_data = data_handler.gen_dataset(
            [{"text": train_text}], include_label_fields=False
        )
        eval_data = data_handler.gen_dataset(
            [{"text": eval_text}], include_label_fields=False
        )
        test_data = data_handler.gen_dataset(
            [{"text": test_text}], include_label_fields=False
        )
        data_handler.init_feature_metadata(train_data, eval_data, test_data)
        # Check created vocab
        meta = data_handler.metadata.features[feat_name]
        self.assertEqual(set(meta.vocab.stoi.keys()), case["expected_tokens"])
        if case["expected_num_pretrained_tokens"] == 0:
            self.assertIsNone(meta.pretrained_embeds_weight)
        else:
            self.assertEqual(
                meta.pretrained_embeds_weight.size(0),
                case["expected_num_pretrained_tokens"],
            )
def setUp(self):
    # Fixtures for compositional (nested intent/slot) parsing. Each row pairs
    # an utterance with its slot annotations and the seqlogical parse string.
    self.train_data = [
        {
            DFColumn.DOC_LABEL: "IN:GET_EVENT",
            DFColumn.WORD_LABEL: [
                {
                    "id": "SL:DATE_TIME",
                    # Character offsets into the utterance for the slot text.
                    "span": {"start": 21, "end": 26},
                    "text": "today",
                }
            ],
            DFColumn.UTTERANCE: "What EVENTS can I go today",
            DFColumn.DICT_FEAT: "",
            DFColumn.SEQLOGICAL: "[IN:GET_EVENT What EVENTS can I go [SL:DATE_TIME today ] ]",
        }
    ]
    self.eval_data = [
        {
            DFColumn.DOC_LABEL: "IN:GET_EVENT",
            DFColumn.WORD_LABEL: [
                {
                    "id": "SL:ATTRIBUTE_EVENT",
                    "span": {"start": 14, "end": 19},
                    "text": "adult",
                },
                {
                    "id": "SL:DATE_TIME",
                    "span": {"start": 27, "end": 39},
                    "text": "this weekend",
                },
            ],
            DFColumn.UTTERANCE: "Are there any adult events this weekend",
            DFColumn.DICT_FEAT: "",
            DFColumn.SEQLOGICAL: "[IN:GET_EVENT Are there any [SL:ATTRIBUTE_EVENT adult ] events [SL:DATE_TIME this weekend ] ]",
        }
    ]
    self.test_data = [
        {
            DFColumn.DOC_LABEL: "IN:GET_INFO_ROAD_CONDITION",
            DFColumn.WORD_LABEL: [
                {
                    "id": "SL:ROAD_CONDITION",
                    "span": {"start": 9, "end": 21},
                    "text": "any flooding",
                },
                {
                    "id": "SL:DESTINATION",
                    "span": {"start": 36, "end": 41},
                    "text": "Karen",
                    # Nested subframe: the destination slot contains its own
                    # intent with a contact slot (compositional annotation).
                    "subframe": {
                        "utterance": "Karen",
                        "domain": "",
                        "intent": "IN:GET_LOCATION_HOME",
                        "slots": [
                            {
                                "id": "SL:CONTACT",
                                "span": {"start": 0, "end": 5},
                                "text": "Karen",
                            }
                        ],
                        "span": {"start": 0, "end": 5},
                    },
                },
            ],
            DFColumn.UTTERANCE: "Is there any flooding on the way to Karen's?",
            DFColumn.DICT_FEAT: "",
            DFColumn.SEQLOGICAL: "[IN:GET_INFO_ROAD_CONDITION Is there [SL:ROAD_CONDITION any flooding ] on the way to [SL:DESTINATION [IN:GET_LOCATION_HOME [SL:CONTACT Karen 's ? ] ] ] ]",
        }
    ]
    # Handler under test: vocab built from all splits, lowercased tokens.
    self.dh = CompositionalDataHandler.from_config(
        CompositionalDataHandler.Config(),
        FeatureConfig(
            word_feat=WordFeatConfig(vocab_from_all_data=True, min_freq=1)
        ),
        featurizer=SimpleFeaturizer.from_config(
            SimpleFeaturizer.Config(lowercase_tokens=True), FeatureConfig()
        ),
    )